Automatic imputation - array dimension problem

I read a lot about automatic missing data imputation on PyMC discourse and the net but I’m still struggling to get the model right.

I have 10 measurements for each of ou_length (~90) observed variables. The values are stored in a np.array `measured` with shape (10, ou_length). Some of the values in `measured` are missing (filled with NaN). The model is a simple Bayesian linear and RSM regression. When I run it with a full, complete set of data (no NaN) it executes correctly, but when missing data is injected and masked I just can't get it right. The sampling completes, but it trips during post-processing of the chains due to an array dimensionality problem. Below is a sample of the code and the error message. Any suggestions would be much appreciated.

masked_measured = np.ma.masked_invalid(measured)

with pm.Model() as model:  # model specifications in PyMC3 are wrapped in a with-statement
    # Define priors for the unknown parameters
    priors = []
    for i, pri in enumerate(pri_inputs):
        # getting my priors sorted here                 
 
    priors = tt.stack(priors)[:, None]

    # Define the surrogate models for the outputs using the priors defined above
    linear = sm_linear * priors
    rsm = tt.sum(sm_rsm * priors * priors.T, axis=2)
    out_mu = sm_intercept + tt.sum(linear, axis=0) + tt.sum(rsm, axis=1)

    # Define loglikelihood as Multivariate Normal with defined covariance matrix and observations
    loglikelihood = pm.MvNormal(
        "loglikelihood", mu=out_mu, cov=true_cov, observed=masked_measured, shape=ou_length
    )

with model:
    # Inference
    step = pm.NUTS()  # using NUTS sampling
    trace = pm.sample(
        draws=samples[0],
        tune=samples[1],
        discard_tuned_samples=False,
        step=step,
        cores=2,
        progressbar=True,
    )

Error message:

Multiprocess sampling (2 chains in 2 jobs)
NUTS: [loglikelihood_missing, priors_4, priors_3, priors_2, priors_1, priors_0]
Sampling 2 chains, 0 divergences: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5000/5000 [02:41<00:00, 30.91draws/s]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-85-84299ce1f9f2> in <module>
      2     # Inference
      3     step = pm.NUTS()  # using NUTS sampling
----> 4     trace = pm.sample(
      5         draws=samples[0],
      6         tune=samples[1],

d:\Anaconda\envs\jupyter-env\lib\site-packages\pymc3\sampling.py in sample(draws, step, init, n_init, start, trace, chain_idx, chains, cores, tune, progressbar, model, random_seed, discard_tuned_samples, compute_convergence_checks, **kwargs)
    496             warnings.warn("The number of samples is too small to check convergence reliably.")
    497         else:
--> 498             trace.report._run_convergence_checks(trace, model)
    499 
    500     trace.report._log_summary()

d:\Anaconda\envs\jupyter-env\lib\site-packages\pymc3\backends\report.py in _run_convergence_checks(self, trace, model)
     82                 varnames.append(rv_name)
     83 
---> 84         self._ess = ess = ess(trace, var_names=varnames)
     85         self._rhat = rhat = rhat(trace, var_names=varnames)
     86 

d:\Anaconda\envs\jupyter-env\lib\site-packages\pymc3\stats\__init__.py in wrapped(*args, **kwargs)
     22                 )
     23                 kwargs[new] = kwargs.pop(old)
---> 24             return func(*args, **kwargs)
     25 
     26     return wrapped

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\stats\diagnostics.py in ess(data, var_names, method, relative, prob)
    184             raise TypeError(msg)
    185 
--> 186     dataset = convert_to_dataset(data, group="posterior")
    187     var_names = _var_names(var_names, dataset)
    188 

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\converters.py in convert_to_dataset(obj, group, coords, dims)
    166     xarray.Dataset
    167     """
--> 168     inference_data = convert_to_inference_data(obj, group=group, coords=coords, dims=dims)
    169     dataset = getattr(inference_data, group, None)
    170     if dataset is None:

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\converters.py in convert_to_inference_data(obj, group, coords, dims, **kwargs)
     87             return from_pystan(**kwargs)
     88     elif obj.__class__.__name__ == "MultiTrace":  # ugly, but doesn't make PyMC3 a requirement
---> 89         return from_pymc3(trace=kwargs.pop(group), **kwargs)
     90     elif obj.__class__.__name__ == "EnsembleSampler":  # ugly, but doesn't make emcee a requirement
     91         return from_emcee(sampler=kwargs.pop(group), **kwargs)

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in from_pymc3(trace, prior, posterior_predictive, coords, dims, model)
    283 ):
    284     """Convert pymc3 data into an InferenceData object."""
--> 285     return PyMC3Converter(
    286         trace=trace,
    287         prior=prior,

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in to_inference_data(self)
    270             **{
    271                 "posterior": self.posterior_to_xarray(),
--> 272                 "sample_stats": self.sample_stats_to_xarray(),
    273                 "posterior_predictive": self.posterior_predictive_to_xarray(),
    274                 **self.priors_to_xarray(),

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\base.py in wrapped(cls, *args, **kwargs)
     34                 if all([getattr(cls, prop_i) is None for prop_i in prop]):
     35                     return None
---> 36             return func(cls, *args, **kwargs)
     37 
     38         return wrapped

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in sample_stats_to_xarray(self)
    147             name = rename_key.get(stat, stat)
    148             data[name] = np.array(self.trace.get_sampler_stats(stat, combine=False))
--> 149         log_likelihood, dims = self._extract_log_likelihood()
    150         if log_likelihood is not None:
    151             data["log_likelihood"] = log_likelihood

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\base.py in wrapped(cls, *args, **kwargs)
     34                 if all([getattr(cls, prop_i) is None for prop_i in prop]):
     35                     return None
---> 36             return func(cls, *args, **kwargs)
     37 
     38         return wrapped

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\base.py in wrapped(cls, *args, **kwargs)
     34                 if all([getattr(cls, prop_i) is None for prop_i in prop]):
     35                     return None
---> 36             return func(cls, *args, **kwargs)
     37 
     38         return wrapped

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in _extract_log_likelihood(self)
    124         chain_likelihoods = []
    125         for chain in self.trace.chains:
--> 126             log_like = [log_likelihood_vals_point(point) for point in self.trace.points([chain])]
    127             chain_likelihoods.append(np.stack(log_like))
    128         return np.stack(chain_likelihoods), coord_name

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in <listcomp>(.0)
    124         chain_likelihoods = []
    125         for chain in self.trace.chains:
--> 126             log_like = [log_likelihood_vals_point(point) for point in self.trace.points([chain])]
    127             chain_likelihoods.append(np.stack(log_like))
    128         return np.stack(chain_likelihoods), coord_name

d:\Anaconda\envs\jupyter-env\lib\site-packages\arviz\data\io_pymc3.py in log_likelihood_vals_point(point)
    118                 log_like_val = utils.one_de(log_like(point))
    119                 if var.missing_values:
--> 120                     log_like_val = log_like_val[~var.observations.mask]
    121                 log_like_vals.append(log_like_val)
    122             return np.concatenate(log_like_vals)

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

I recommend reviewing the discussion at these threads:

In particular, the first thread recommends using `trace = pm.sample(return_inferencedata=False, compute_convergence_checks=False)` so that the post-processing doesn't execute.

2 Likes

Thank you very much — the above worked for me. I am constrained to PyMC3 v3.8, where the `return_inferencedata` argument was not implemented yet. I am really interested in a bit more detail on how automatic imputation is implemented in PyMC3, so I will do some digging.