Automatic imputation and posterior predictive sampling with new data

I am trying to combine PyMC3s automatic imputation with the posterior predictive sampling for new data. I think a typical uses case would be a Bayesian regression with missing data where the result could be used to predict new observation based on new data. I took the linear regression model from @junpenglao here to test this:

Posterior Predictive

Generally to use new data in the posterior predictive check, e.g. in a regression model, the data must be passed with pm.Data.

import numpy as np
import pymc3 as pm

x = np.array([[1, 2], [3, 4], [1, 3]])
y = np.array([5, 6, 7])

new_x = np.array([[4, 1.1], [2, 2]])
y_placeholder = np.zeros(new_x.shape[0])

with pm.Model() as model:
    xd = pm.Data('x', x)
    yd = pm.Data('y', y)

    coeff = pm.Normal('beta', 0, 1, shape=x.shape[1])
    intercept = pm.Normal('c', 0, 10)
    noise = pm.HalfNormal('sigma', 5., shape=x.shape[0])
    observed = pm.Normal(
        'observed', 
        mu=intercept + pm.math.dot(xd, coeff),
        sigma=noise,
        observed=yd
    )
    
    trace = pm.sample(return_inferencedata=True)

    pm.set_data({
        'x':new_x,
        'y':y_placeholder
    })
    pred = pm.sample_posterior_predictive(trace, var_names=['observed'])

Then we can look at the posterior of y for the new data new_x in pred['observed'].

Automatic Imputation

Alternatively, if x has missing values I can use the PyMC3 automatic imputation:

import numpy as np
import pymc3 as pm

x = np.array([[np.nan, 2.1], [2, 4], [1, 5]])
x_with_missing = np.ma.masked_array(x, mask=x!=np.nan)
y = np.array([1, 6.4, 7])

with pm.Model() as model:
    # imputation
    all_x = pm.Normal('x_prior', 0, 1, observed=x_with_missing)

    coeff = pm.Normal('beta', 0, 1, shape=x.shape[1])
    intercept = pm.Normal('c', 0, 10)
    noise = pm.HalfNormal('sigma', 5.)
    observed = pm.Normal(
        'observed', 
        mu=intercept + pm.math.dot(all_x, coeff),
        sigma=noise,
        observed=y
    )
    
    trace = pm.sample()

And the inference runs smoothly.

Both Together

But when I try to combine the two, the inference fails:

import numpy as np
import pymc3 as pm

x = np.array([[np.nan, 2.1], [2, 4], [1, 5]])
x_with_missing = np.ma.masked_array(x, mask=x!=np.nan)
y = np.array([1, 6.4, 7])

with pm.Model() as model:
    xd = pm.Data('x', x_with_missing)
    yd = pm.Data('y', y)

    coeff = pm.Normal('beta', 0, 1, shape=x.shape[1])
    intercept = pm.Normal('c', 0, 10)
    noise = pm.HalfNormal('sigma', 5.)
    observed = pm.Normal(
        'observed', 
        mu=intercept + pm.math.dot(xd, coeff),
        sigma=noise,
        observed=yd
    )
    
    trace = pm.sample()
Bad initial energy, check any log probabilities that are inf or -inf, nan or very small:
observed   NaN
---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/parallel_sampling.py", line 191, in _start_loop
    point, stats = self._compute_point()
  File "/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/parallel_sampling.py", line 218, in _compute_point
    point, stats = self._step_method.step(self._point)
  File "/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/step_methods/arraystep.py", line 269, in step
    apoint, stats = self.astep(array)
  File "/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/step_methods/hmc/base_hmc.py", line 167, in astep
    raise SamplingError("Bad initial energy")
pymc3.exceptions.SamplingError: Bad initial energy
"""

The above exception was the direct cause of the following exception:

SamplingError                             Traceback (most recent call last)
SamplingError: Bad initial energy

The above exception was the direct cause of the following exception:

ParallelSamplingError                     Traceback (most recent call last)
<ipython-input-90-0e11d56bec9e> in <module>
     21     )
     22 
---> 23     trace = pm.sample()
     24 
     25     pm.set_data({

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/sampling.py in sample(draws, step, init, n_init, start, trace, chain_idx, chains, cores, tune, progressbar, model, random_seed, discard_tuned_samples, compute_convergence_checks, callback, return_inferencedata, idata_kwargs, mp_ctx, pickle_backend, **kwargs)
    540         _print_step_hierarchy(step)
    541         try:
--> 542             trace = _mp_sample(**sample_args, **parallel_args)
    543         except pickle.PickleError:
    544             _log.warning("Could not pickle model, sampling singlethreaded.")

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/sampling.py in _mp_sample(draws, tune, step, chains, cores, chain, random_seed, start, progressbar, trace, model, callback, discard_tuned_samples, mp_ctx, pickle_backend, **kwargs)
   1461         try:
   1462             with sampler:
-> 1463                 for draw in sampler:
   1464                     trace = traces[draw.chain - chain]
   1465                     if trace.supports_sampler_stats and draw.stats is not None:

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/parallel_sampling.py in __iter__(self)
    490 
    491         while self._active:
--> 492             draw = ProcessAdapter.recv_draw(self._active)
    493             proc, is_last, draw, tuning, stats, warns = draw
    494             self._total_draws += 1

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/parallel_sampling.py in recv_draw(processes, timeout)
    363             else:
    364                 error = RuntimeError("Chain %s failed." % proc.chain)
--> 365             raise error from old_error
    366         elif msg[0] == "writing_done":
    367             proc._readable = True

ParallelSamplingError: Bad initial energy

I know I will have to set the x_prior_missing RV to fit the shape of the missing values in the new_x in the posterior predictive sampling. But for now, I am stuck with the sampling already.

Is there a way to fix this? I was looking for an option to pass the mask to pm.Data but could not find it.

Yeah seems it is a problem of pm.Data (maybe even theano.shared in general). I have this pretty old blog post https://junpenglao.xyz/Blogs/posts/2017-10-23-OOS_missing.html dealing with this: basically creating a new RV that conditioned on the new predictors for posterior predictive samples.

2 Likes

Thanks @junpenglao I will read through the post and see if I can figure it out. I also suspect the issue may be, that the theano compute graph may change when the data x is replaced with new data. As the new data may have a different count of missing values at different places and the x_missing RV is linked differently to x. But this is just a hunch.

I thought I got it but something is still off. I first get the trace with imputation:

x = np.array([[np.nan, 2.1], [2, 4], [1, 5]])
y = np.array([1, 6.4, 7])

x_with_missing = np.ma.masked_array(x, mask=x!=np.nan)

with pm.Model() as model:
    all_x = pm.Normal('x_prior', 0, 1, observed=x_with_missing)

    coeff = pm.Normal('beta', 0, 1, shape=x.shape[1])
    intercept = pm.Normal('c', 0, 10)
    noise = pm.HalfNormal('sigma', 5., shape=x.shape[0])
    observed = pm.Normal(
        'observed', 
        mu=intercept + pm.math.dot(all_x, coeff),
        sigma=noise,
        observed=y
    )
    
    trace = pm.sample()

Then I make the new RVs for the posterior predictive sampling:

new_x = np.array([[4, 1.1], [np.nan, 2]])
new_x_with_missing = np.ma.masked_array(new_x, mask=new_x!=np.nan)

with model:
    all_x_new = pm.Normal('x_prior_new', 0, 1, observed=new_x_with_missing)
    new_noise = pm.HalfNormal('new_sigma', 5., shape=new_x.shape[0])
    observed_new = pm.Normal(
        'observed_new', 
        mu=intercept + pm.math.dot(all_x_new, coeff),
        sigma=new_noise,
        shape=(new_x.shape[0],)
    )

And then I tried

pred = pm.sample_posterior_predictive(trace, model=model)

but this dose not make samples for the new RVs, e.g. observed_new :

>>> pred.keys()
dict_keys(['x_prior', 'observed', 'x_prior_new'])

So I tried

pred = pm.sample_posterior_predictive(trace, var_names=['observed_new'], model=model)

and

pred = pm.sample_posterior_predictive(trace, vars=[observed_new], model=model)

but this fails with

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-132-f29805b9f1c1> in <module>
     11         shape=(new_x.shape[0],)
     12     )
---> 13     pred = pm.sample_posterior_predictive(trace, var_names=['observed_new'])

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/sampling.py in sample_posterior_predictive(trace, samples, model, vars, var_names, size, keep_size, random_seed, progressbar)
   1720                 param = _trace[idx % len_trace]
   1721 
-> 1722             values = draw_values(vars, point=param, size=size)
   1723             for k, v in zip(vars, values):
   1724                 ppc_trace_t.insert(k.name, v, idx)

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/distributions/distribution.py in draw_values(params, point, size)
    729                     # This may fail for autotransformed RVs, which don't
    730                     # have the random method
--> 731                     value = _draw_value(next_, point=point, givens=temp_givens, size=size)
    732                     givens[next_.name] = (next_, value)
    733                     drawn[(next_, size)] = value

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/distributions/distribution.py in _draw_value(param, point, givens, size)
    893             return point[param.name]
    894         elif hasattr(param, "random") and param.random is not None:
--> 895             return param.random(point=point, size=size)
    896         elif (
    897             hasattr(param, "distribution")

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/model.py in __call__(self, *args, **kwargs)
     97 
     98     def __call__(self, *args, **kwargs):
---> 99         return getattr(self.obj, self.method_name)(*args, **kwargs)
    100 
    101 

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/distributions/continuous.py in random(self, point, size)
    511         array
    512         """
--> 513         mu, tau, _ = draw_values([self.mu, self.tau, self.sigma], point=point, size=size)
    514         return generate_samples(
    515             stats.norm.rvs, loc=mu, scale=tau ** -0.5, dist_shape=self.shape, size=size

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/distributions/distribution.py in draw_values(params, point, size)
    772                                 if node.name not in givens and (node, size) in drawn:
    773                                     givens[node.name] = (node, drawn[(node, size)])
--> 774                         value = _draw_value(param, point=point, givens=givens.values(), size=size)
    775                         evaluated[param_idx] = drawn[(param, size)] = value
    776                         givens[param.name] = (param, value)

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/distributions/distribution.py in _draw_value(param, point, givens, size)
    944                 input_vals = []
    945             func = _compile_theano_function(param, input_vars)
--> 946             output = func(*input_vals)
    947             return output
    948     raise ValueError("Unexpected type in draw_value: %s" % type(param))

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/numpy/lib/function_base.py in __call__(self, *args, **kwargs)
   2106             vargs.extend([kwargs[_n] for _n in names])
   2107 
-> 2108         return self._vectorize_call(func=func, args=vargs)
   2109 
   2110     def _get_ufunc_and_otypes(self, func, args):

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/numpy/lib/function_base.py in _vectorize_call(self, func, args)
   2180         """Vectorized call to `func` over positional `args`."""
   2181         if self.signature is not None:
-> 2182             res = self._vectorize_call_with_signature(func, args)
   2183         elif not args:
   2184             res = func()

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/numpy/lib/function_base.py in _vectorize_call_with_signature(self, func, args)
   2210 
   2211         broadcast_shape, dim_sizes = _parse_input_dimensions(
-> 2212             args, input_core_dims)
   2213         input_shapes = _calculate_shapes(broadcast_shape, dim_sizes,
   2214                                          input_core_dims)

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/numpy/lib/function_base.py in _parse_input_dimensions(args, input_core_dims)
   1871     dim_sizes = {}
   1872     for arg, core_dims in zip(args, input_core_dims):
-> 1873         _update_dim_sizes(dim_sizes, arg, core_dims)
   1874         ndim = arg.ndim - len(core_dims)
   1875         dummy_array = np.lib.stride_tricks.as_strided(0, arg.shape[:ndim])

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/numpy/lib/function_base.py in _update_dim_sizes(dim_sizes, arg, core_dims)
   1837             '%d-dimensional argument does not have enough '
   1838             'dimensions for all core dimensions %r'
-> 1839             % (arg.ndim, core_dims))
   1840 
   1841     core_shape = arg.shape[-num_core_dims:]

ValueError: 0-dimensional argument does not have enough dimensions for all core dimensions ('i_1_0',)

What does this mean and what am I doing wrong?

I see, you have individual noise term for each row and also missing data from the predictor… Unfortunately I have no good answer for this, you can try putting hyper prior on noise, and plug the hyperprior into the sigma of observed_new, but new_x still need to be dense without missing. Basically, it only works if:

with model:
    observed_new = pm.Normal(
        'observed_new', 
        mu=intercept + pm.math.dot(all_x_new, coeff),
        sigma=noise,
        shape=(new_x.shape[0],)
    )

Where you are not adding new leafs to your Bayesian model that does not conditioned on additional unknown quantities.

1 Like

Thank you for the quick responses! The individual noise terms are not required but I indeed have missing values in the new data for which I hoped to use the imputation posterior all_x_new to sample. As you posted in OOS predictions with missing input values I could fill the gaps of x_with_missing by sampling them from an appropriate distribution and putting the result together with x_with_missing. That helps a lot!