I am trying to combine PyMC3s automatic imputation with the posterior predictive sampling for new data. I think a typical uses case would be a Bayesian regression with missing data where the result could be used to predict new observation based on new data. I took the linear regression model from @junpenglao here to test this:
Posterior Predictive
Generally to use new data in the posterior predictive check, e.g. in a regression model, the data must be passed with pm.Data
.
import numpy as np
import pymc3 as pm
x = np.array([[1, 2], [3, 4], [1, 3]])
y = np.array([5, 6, 7])
new_x = np.array([[4, 1.1], [2, 2]])
y_placeholder = np.zeros(new_x.shape[0])
with pm.Model() as model:
xd = pm.Data('x', x)
yd = pm.Data('y', y)
coeff = pm.Normal('beta', 0, 1, shape=x.shape[1])
intercept = pm.Normal('c', 0, 10)
noise = pm.HalfNormal('sigma', 5., shape=x.shape[0])
observed = pm.Normal(
'observed',
mu=intercept + pm.math.dot(xd, coeff),
sigma=noise,
observed=yd
)
trace = pm.sample(return_inferencedata=True)
pm.set_data({
'x':new_x,
'y':y_placeholder
})
pred = pm.sample_posterior_predictive(trace, var_names=['observed'])
Then we can look at the posterior of y
for the new data new_x
in pred['observed']
.
Automatic Imputation
Alternatively, if x
has missing values I can use the PyMC3 automatic imputation:
import numpy as np
import pymc3 as pm
x = np.array([[np.nan, 2.1], [2, 4], [1, 5]])
x_with_missing = np.ma.masked_array(x, mask=x!=np.nan)
y = np.array([1, 6.4, 7])
with pm.Model() as model:
# imputation
all_x = pm.Normal('x_prior', 0, 1, observed=x_with_missing)
coeff = pm.Normal('beta', 0, 1, shape=x.shape[1])
intercept = pm.Normal('c', 0, 10)
noise = pm.HalfNormal('sigma', 5.)
observed = pm.Normal(
'observed',
mu=intercept + pm.math.dot(all_x, coeff),
sigma=noise,
observed=y
)
trace = pm.sample()
And the inference runs smoothly.
Both Together
But when I try to combine the two, the inference fails:
import numpy as np
import pymc3 as pm
x = np.array([[np.nan, 2.1], [2, 4], [1, 5]])
x_with_missing = np.ma.masked_array(x, mask=x!=np.nan)
y = np.array([1, 6.4, 7])
with pm.Model() as model:
xd = pm.Data('x', x_with_missing)
yd = pm.Data('y', y)
coeff = pm.Normal('beta', 0, 1, shape=x.shape[1])
intercept = pm.Normal('c', 0, 10)
noise = pm.HalfNormal('sigma', 5.)
observed = pm.Normal(
'observed',
mu=intercept + pm.math.dot(xd, coeff),
sigma=noise,
observed=yd
)
trace = pm.sample()
Bad initial energy, check any log probabilities that are inf or -inf, nan or very small:
observed NaN
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/parallel_sampling.py", line 191, in _start_loop
point, stats = self._compute_point()
File "/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/parallel_sampling.py", line 218, in _compute_point
point, stats = self._step_method.step(self._point)
File "/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/step_methods/arraystep.py", line 269, in step
apoint, stats = self.astep(array)
File "/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/step_methods/hmc/base_hmc.py", line 167, in astep
raise SamplingError("Bad initial energy")
pymc3.exceptions.SamplingError: Bad initial energy
"""
The above exception was the direct cause of the following exception:
SamplingError Traceback (most recent call last)
SamplingError: Bad initial energy
The above exception was the direct cause of the following exception:
ParallelSamplingError Traceback (most recent call last)
<ipython-input-90-0e11d56bec9e> in <module>
21 )
22
---> 23 trace = pm.sample()
24
25 pm.set_data({
/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/sampling.py in sample(draws, step, init, n_init, start, trace, chain_idx, chains, cores, tune, progressbar, model, random_seed, discard_tuned_samples, compute_convergence_checks, callback, return_inferencedata, idata_kwargs, mp_ctx, pickle_backend, **kwargs)
540 _print_step_hierarchy(step)
541 try:
--> 542 trace = _mp_sample(**sample_args, **parallel_args)
543 except pickle.PickleError:
544 _log.warning("Could not pickle model, sampling singlethreaded.")
/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/sampling.py in _mp_sample(draws, tune, step, chains, cores, chain, random_seed, start, progressbar, trace, model, callback, discard_tuned_samples, mp_ctx, pickle_backend, **kwargs)
1461 try:
1462 with sampler:
-> 1463 for draw in sampler:
1464 trace = traces[draw.chain - chain]
1465 if trace.supports_sampler_stats and draw.stats is not None:
/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/parallel_sampling.py in __iter__(self)
490
491 while self._active:
--> 492 draw = ProcessAdapter.recv_draw(self._active)
493 proc, is_last, draw, tuning, stats, warns = draw
494 self._total_draws += 1
/usr/local/Caskroom/miniconda/base/envs/pymcEnv/lib/python3.7/site-packages/pymc3/parallel_sampling.py in recv_draw(processes, timeout)
363 else:
364 error = RuntimeError("Chain %s failed." % proc.chain)
--> 365 raise error from old_error
366 elif msg[0] == "writing_done":
367 proc._readable = True
ParallelSamplingError: Bad initial energy
I know I will have to set the x_prior_missing
RV to fit the shape of the missing values in the new_x
in the posterior predictive sampling. But for now, I am stuck with the sampling already.
Is there a way to fix this? I was looking for an option to pass the mask to pm.Data
but could not find it.