 Automatic imputation and posterior predictive sampling with new data

I am trying to combine PyMC3's automatic imputation with posterior predictive sampling for new data. I think a typical use case would be a Bayesian regression with missing data, where the fitted model could be used to predict new observations from new data. I took the linear regression model from @junpenglao here to test this:

Posterior Predictive

Generally, to use new data in a posterior predictive check, e.g. in a regression model, the data must be passed via pm.Data.

import numpy as np
import pymc3 as pm

x = np.array([[1, 2], [3, 4], [1, 3]])
y = np.array([5, 6, 7])

new_x = np.array([[4, 1.1], [2, 2]])
y_placeholder = np.zeros(new_x.shape[0])

with pm.Model() as model:
    xd = pm.Data('x', x)
    yd = pm.Data('y', y)

    coeff = pm.Normal('beta', 0, 1, shape=x.shape[1])
    intercept = pm.Normal('c', 0, 10)
    noise = pm.HalfNormal('sigma', 5.)
    observed = pm.Normal(
        'observed',
        mu=intercept + pm.math.dot(xd, coeff),
        sigma=noise,
        observed=yd
    )

    trace = pm.sample(return_inferencedata=True)

    pm.set_data({
        'x': new_x,
        'y': y_placeholder
    })
    pred = pm.sample_posterior_predictive(trace, var_names=['observed'])

Then we can look at the posterior of y for the new data new_x in pred['observed'].

Automatic Imputation

Alternatively, if x has missing values, I can use PyMC3's automatic imputation:

import numpy as np
import pymc3 as pm

x = np.array([[np.nan, 2.1], [2, 4], [1, 5]])
y = np.array([1, 6.4, 7])

with pm.Model() as model:
    # imputation: the NaN entries in x get this prior
    all_x = pm.Normal('x_prior', 0, 1, observed=x)

    coeff = pm.Normal('beta', 0, 1, shape=x.shape[1])
    intercept = pm.Normal('c', 0, 10)
    noise = pm.HalfNormal('sigma', 5.)
    observed = pm.Normal(
        'observed',
        mu=intercept + pm.math.dot(all_x, coeff),
        sigma=noise,
        observed=y
    )

    trace = pm.sample()

And the inference runs smoothly.
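Under the hood, PyMC3 treats the NaN entries as missing: the observed array is converted to a NumPy masked array, and each masked entry becomes a free random variable (here named x_prior_missing) that is sampled alongside the other parameters. A minimal NumPy-only sketch of that masking step:

```python
import numpy as np

x = np.array([[np.nan, 2.1], [2, 4], [1, 5]])

# mask the NaN entries, as PyMC3 does internally for automatic imputation
x_masked = np.ma.masked_invalid(x)

print(x_masked.mask)        # True where a value is missing
print(x_masked.mask.sum())  # number of entries that will be imputed -> 1
```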

Both Together

But when I try to combine the two, the inference fails:

import numpy as np
import pymc3 as pm

x = np.array([[np.nan, 2.1], [2, 4], [1, 5]])
y = np.array([1, 6.4, 7])

with pm.Model() as model:
    xd = pm.Data('x', x)
    yd = pm.Data('y', y)

    coeff = pm.Normal('beta', 0, 1, shape=x.shape[1])
    intercept = pm.Normal('c', 0, 10)
    noise = pm.HalfNormal('sigma', 5.)
    observed = pm.Normal(
        'observed',
        mu=intercept + pm.math.dot(xd, coeff),
        sigma=noise,
        observed=yd
    )

    trace = pm.sample()
Bad initial energy, check any log probabilities that are inf or -inf, nan or very small:
observed   NaN
---------------------------------------------------------------------------
ParallelSamplingError                     Traceback (most recent call last)
<ipython-input-90-0e11d56bec9e> in <module>
---> 23     trace = pm.sample()

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/sampling.py in sample(draws, step, init, n_init, start, trace, chain_idx, chains, cores, tune, progressbar, model, random_seed, discard_tuned_samples, compute_convergence_checks, callback, return_inferencedata, idata_kwargs, mp_ctx, pickle_backend, **kwargs)
--> 542             trace = _mp_sample(**sample_args, **parallel_args)

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/sampling.py in _mp_sample(draws, tune, step, chains, cores, chain, random_seed, start, progressbar, trace, model, callback, discard_tuned_samples, mp_ctx, pickle_backend, **kwargs)
-> 1463                 for draw in sampler:
I know I will have to adjust the x_prior_missing RV to fit the shape of the missing values in new_x for the posterior predictive sampling. But for now, I am already stuck at the sampling step.

Is there a way to fix this? I was looking for an option to pass a mask to pm.Data but could not find one.

Yeah, it seems to be a problem with pm.Data (maybe even with theano.shared in general). I have this pretty old blog post https://junpenglao.xyz/Blogs/posts/2017-10-23-OOS_missing.html dealing with this: basically, create a new RV that is conditioned on the new predictors for the posterior predictive samples.


Thanks @junpenglao, I will read through the post and see if I can figure it out. I also suspect the issue may be that the theano compute graph changes when the data x is replaced with new data: the new data may have a different number of missing values in different places, so the x_missing RV would be wired differently to x. But this is just a hunch.
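That hunch can be made concrete with plain NumPy: the missing-value masks of x and new_x differ in both shape and position, so a graph compiled around x's mask cannot simply be fed new_x. The arrays below are the ones from the examples above:

```python
import numpy as np

x = np.array([[np.nan, 2.1], [2, 4], [1, 5]])
new_x = np.array([[4, 1.1], [np.nan, 2]])

# the masks have different shapes and different missing positions,
# so the imputed RV would need to be rewired for the new data
print(np.isnan(x))
print(np.isnan(new_x))
print(np.isnan(x).sum(), np.isnan(new_x).sum())
```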

I thought I got it but something is still off. I first get the trace with imputation:

x = np.array([[np.nan, 2.1], [2, 4], [1, 5]])
y = np.array([1, 6.4, 7])

with pm.Model() as model:
    all_x = pm.Normal('x_prior', 0, 1, observed=x)

    coeff = pm.Normal('beta', 0, 1, shape=x.shape[1])
    intercept = pm.Normal('c', 0, 10)
    noise = pm.HalfNormal('sigma', 5., shape=x.shape[0])
    observed = pm.Normal(
        'observed',
        mu=intercept + pm.math.dot(all_x, coeff),
        sigma=noise,
        observed=y
    )

    trace = pm.sample()

Then I make the new RVs for the posterior predictive sampling:

new_x = np.array([[4, 1.1], [np.nan, 2]])

with model:
    all_x_new = pm.Normal('x_prior_new', 0, 1, observed=new_x)
    new_noise = pm.HalfNormal('new_sigma', 5., shape=new_x.shape[0])
    observed_new = pm.Normal(
        'observed_new',
        mu=intercept + pm.math.dot(all_x_new, coeff),
        sigma=new_noise,
        shape=new_x.shape[0]
    )

And then I tried

pred = pm.sample_posterior_predictive(trace, model=model)

but this does not produce samples for the new RVs, e.g. observed_new:

>>> pred.keys()
dict_keys(['x_prior', 'observed', 'x_prior_new'])

So I tried

pred = pm.sample_posterior_predictive(trace, var_names=['observed_new'], model=model)

and

pred = pm.sample_posterior_predictive(trace, vars=[observed_new], model=model)

but this fails with

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-132-f29805b9f1c1> in <module>
---> 13     pred = pm.sample_posterior_predictive(trace, var_names=['observed_new'])

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/sampling.py in sample_posterior_predictive(trace, samples, model, vars, var_names, size, keep_size, random_seed, progressbar)
-> 1722             values = draw_values(vars, point=param, size=size)

/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/distributions/distribution.py in _draw_value(param, point, givens, size)
--> 946             output = func(*input_vals)

ValueError: 0-dimensional argument does not have enough dimensions for all core dimensions ('i_1_0',)

What does this mean and what am I doing wrong?

I see, you have an individual noise term for each row and also missing data in the predictors… Unfortunately I have no good answer for this. You could try putting a hyperprior on the noise and plugging that hyperprior into the sigma of observed_new, but new_x would still need to be dense, without missing values. Basically, it only works if:

with model:
    observed_new = pm.Normal(
        'observed_new',
        mu=intercept + pm.math.dot(all_x_new, coeff),
        sigma=noise,
        shape=new_x.shape[0]
    )

That is, it only works when you are not adding new leaves to your Bayesian model that are conditioned on additional unknown quantities.


Thank you for the quick responses! The individual noise terms are not required, but I indeed have missing values in the new data, for which I hoped to use the imputation posterior all_x_new. As you posted in OOS predictions with missing input values, I could fill the gaps in new_x by sampling them from an appropriate distribution and putting the result together with the observed values. That helps a lot!
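For reference, a minimal NumPy-only sketch of that fill-the-gaps step. The posterior draws here are random placeholders standing in for the imputation posterior from the trace:

```python
import numpy as np

rng = np.random.default_rng(42)
new_x = np.array([[4, 1.1], [np.nan, 2]])

# placeholder posterior draws for the single missing entry; in practice
# these would come from the imputation posterior in the trace
missing_draws = rng.normal(0, 1, size=1000)

# fill each gap, e.g. with the posterior mean (or loop over draws
# to propagate the imputation uncertainty)
filled = new_x.copy()
filled[np.isnan(new_x)] = missing_draws.mean()

print(np.isnan(filled).any())  # False: no gaps remain
```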