I have read a lot about automatic missing-data imputation on the PyMC Discourse and elsewhere, but I'm still struggling to get the model right. I have 10 measurements for each of ou_length (~90) observed variables. The values are stored in a NumPy array measured with shape (10, ou_length). Some of the values in measured are missing (filled with NaN). The model is a simple Bayesian linear and RSM (response-surface) regression. With a full, complete data set (no NaNs) it runs correctly, but when missing data is injected and masked I just can't get it right: sampling completes, but post-processing the chains fails on an array-dimensionality problem. Below is a sample of the code and the error message. Any suggestions would be much appreciated.
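For context, here is first a minimal toy version of the data layout (the shape is real; ou_length's exact value, the measurements, and the NaN positions are invented for illustration):

import numpy as np

ou_length = 90                                    # roughly 90 observed variables in my case
rng = np.random.default_rng(0)

measured = rng.normal(size=(10, ou_length))       # 10 measurements per observed variable
measured[rng.random(measured.shape) < 0.05] = np.nan   # inject some missing values

masked_measured = np.ma.masked_invalid(measured)  # mask NaNs so PyMC3 imputes them
print(masked_measured.mask.sum(), "values masked")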
masked_measured = np.ma.masked_invalid(measured)

with pm.Model() as model:  # model specifications in PyMC3 are wrapped in a with-statement
    # Define priors for the unknown parameters
    priors = []
    for i, pri in enumerate(pri_inputs):
        # getting my priors sorted here (definitions omitted)
        ...
    priors = tt.stack(priors)[:, None]

    # Define the surrogate models for the outputs using the priors defined above
    linear = sm_linear * priors
    rsm = tt.sum(sm_rsm * priors * priors.T, axis=2)
    out_mu = sm_intercept + tt.sum(linear, axis=0) + tt.sum(rsm, axis=1)

    # Define loglikelihood as Multivariate Normal with defined covariance matrix and observations
    loglikelihood = pm.MvNormal(
        "loglikelihood", mu=out_mu, cov=true_cov, observed=masked_measured, shape=ou_length
    )

with model:
    # Inference
    step = pm.NUTS()  # using NUTS sampling
    trace = pm.sample(
        draws=samples[0],
        tune=samples[1],
        discard_tuned_samples=False,
        step=step,
        cores=2,
        progressbar=True,
    )
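Side note: since sampling itself finishes and the traceback below ends up inside the convergence checks, I assume (untested) that the crashing post-processing step can at least be skipped with the compute_convergence_checks flag that appears in the pm.sample signature, e.g.:

    # assumed workaround, not verified: skip the convergence checks that
    # crash while extracting the log-likelihood of the masked MvNormal
    trace = pm.sample(
        draws=samples[0],
        tune=samples[1],
        discard_tuned_samples=False,
        step=step,
        cores=2,
        progressbar=True,
        compute_convergence_checks=False,
    )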
Error message:
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [loglikelihood_missing, priors_4, priors_3, priors_2, priors_1, priors_0]
Sampling 2 chains, 0 divergences: 100%|█████████████████████| 5000/5000 [02:41<00:00, 30.91draws/s]
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-85-84299ce1f9f2> in <module>
2 # Inference
3 step = pm.NUTS() # using NUTS sampling
----> 4 trace = pm.sample(
5 draws=samples[0],
6 tune=samples[1],
d:\Anaconda\envs\jupyter-env\lib\site-packages\pymc3\sampling.py in sample(draws, step, init, n_init, start, trace, chain_idx, chains, cores, tune, progressbar, model, random_seed, discard_tuned_samples, compute_convergence_checks, **kwargs)
496 warnings.warn("The number of samples is too small to check convergence reliably.")
497 else:
--> 498 trace.report._run_convergence_checks(trace, model)
499
500 trace.report._log_summary()
d:\Anaconda\envs\jupyter-env\lib\site-packages\pymc3\backends\report.py in _run_convergence_checks(self, trace, model)
82 varnames.append(rv_name)
83
---> 84 self._ess = ess = ess(trace, var_names=varnames)
85 self._rhat = rhat = rhat(trace, var_names=varnames)
86
d:\Anaconda\envs\jupyter-env\lib\site-packages\pymc3\stats\__init__.py in wrapped(*args, **kwargs)
22 )
23 kwargs[new] = kwargs.pop(old)
---> 24 return func(*args, **kwargs)
25
26 return wrapped
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\stats\diagnostics.py in ess(data, var_names, method, relative, prob)
184 raise TypeError(msg)
185
--> 186 dataset = convert_to_dataset(data, group="posterior")
187 var_names = _var_names(var_names, dataset)
188
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\converters.py in convert_to_dataset(obj, group, coords, dims)
166 xarray.Dataset
167 """
--> 168 inference_data = convert_to_inference_data(obj, group=group, coords=coords, dims=dims)
169 dataset = getattr(inference_data, group, None)
170 if dataset is None:
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\converters.py in convert_to_inference_data(obj, group, coords, dims, **kwargs)
87 return from_pystan(**kwargs)
88 elif obj.__class__.__name__ == "MultiTrace": # ugly, but doesn't make PyMC3 a requirement
---> 89 return from_pymc3(trace=kwargs.pop(group), **kwargs)
90 elif obj.__class__.__name__ == "EnsembleSampler": # ugly, but doesn't make emcee a requirement
91 return from_emcee(sampler=kwargs.pop(group), **kwargs)
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in from_pymc3(trace, prior, posterior_predictive, coords, dims, model)
283 ):
284 """Convert pymc3 data into an InferenceData object."""
--> 285 return PyMC3Converter(
286 trace=trace,
287 prior=prior,
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in to_inference_data(self)
270 **{
271 "posterior": self.posterior_to_xarray(),
--> 272 "sample_stats": self.sample_stats_to_xarray(),
273 "posterior_predictive": self.posterior_predictive_to_xarray(),
274 **self.priors_to_xarray(),
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\base.py in wrapped(cls, *args, **kwargs)
34 if all([getattr(cls, prop_i) is None for prop_i in prop]):
35 return None
---> 36 return func(cls, *args, **kwargs)
37
38 return wrapped
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in sample_stats_to_xarray(self)
147 name = rename_key.get(stat, stat)
148 data[name] = np.array(self.trace.get_sampler_stats(stat, combine=False))
--> 149 log_likelihood, dims = self._extract_log_likelihood()
150 if log_likelihood is not None:
151 data["log_likelihood"] = log_likelihood
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\base.py in wrapped(cls, *args, **kwargs)
34 if all([getattr(cls, prop_i) is None for prop_i in prop]):
35 return None
---> 36 return func(cls, *args, **kwargs)
37
38 return wrapped
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\base.py in wrapped(cls, *args, **kwargs)
34 if all([getattr(cls, prop_i) is None for prop_i in prop]):
35 return None
---> 36 return func(cls, *args, **kwargs)
37
38 return wrapped
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in _extract_log_likelihood(self)
124 chain_likelihoods = []
125 for chain in self.trace.chains:
--> 126 log_like = [log_likelihood_vals_point(point) for point in self.trace.points([chain])]
127 chain_likelihoods.append(np.stack(log_like))
128 return np.stack(chain_likelihoods), coord_name
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in <listcomp>(.0)
124 chain_likelihoods = []
125 for chain in self.trace.chains:
--> 126 log_like = [log_likelihood_vals_point(point) for point in self.trace.points([chain])]
127 chain_likelihoods.append(np.stack(log_like))
128 return np.stack(chain_likelihoods), coord_name
d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in log_likelihood_vals_point(point)
118 log_like_val = utils.one_de(log_like(point))
119 if var.missing_values:
--> 120 log_like_val = log_like_val[~var.observations.mask]
121 log_like_vals.append(log_like_val)
122 return np.concatenate(log_like_vals)
IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed
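If it helps, my reading of the last frame is that MvNormal produces one pointwise log-likelihood per row (a 1-D array of length 10), while the missing-value mask of my observations is 2-D with shape (10, ou_length), so indexing one with the other fails. A toy reproduction of just that indexing error (all names and numbers are mine, not ArviZ's):

import numpy as np

log_like_val = np.zeros(10)            # one log-likelihood value per row, as MvNormal returns
mask = np.zeros((10, 90), dtype=bool)  # 2-D missing-value mask of the observed array
log_like_val[~mask]                    # IndexError: array is 1-dimensional, but 2 were indexed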