Error with sample posterior predictive function

Hi all,

I am currently learning Bayesian Additive Regression Trees and would like to replicate the model which is in your website named Bayesian Additive Regression Trees: Introduction
(link: [Bayesian Additive Regression Trees: Introduction — PyMC example gallery]

When I am trying to do out-of-sample predictions, it gives error message related to different dimensions on data. I am using PyMC v5.10.2, so I am wondering if there is any update that causes it?

Following codes, I have a problem with pm.sample_posterior_predictive function.

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_SEED)

with pm.Model() as model_oos_regression:
    X = pm.MutableData("X", X_train)
    Y = Y_train
    α = pm.Exponential("α", 1)
    μ = pmb.BART("μ", X, np.log(Y))
    y = pm.NegativeBinomial("y", mu=pm.math.exp(μ), alpha=α, observed=Y, shape=μ.shape)
    idata_oos_regression = pm.sample(random_seed=RANDOM_SEED)
    posterior_predictive_oos_regression_train = pm.sample_posterior_predictive(
        trace=idata_oos_regression, random_seed=RANDOM_SEED
    )

Error messages:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[36], line 8
      6 y = pm.NegativeBinomial("y", mu=pm.math.exp(μ), alpha=α, observed=Y, shape=μ.shape)
      7 idata_oos_regression = pm.sample(random_seed=RANDOM_SEED)
----> 8 posterior_predictive_oos_regression_train = pm.sample_posterior_predictive(
      9     trace=idata_oos_regression, random_seed=RANDOM_SEED
     10 )

File /opt/conda/envs/python3/lib/python3.9/site-packages/pymc/sampling/forward.py:673, in sample_posterior_predictive(trace, model, var_names, sample_dims, random_seed, progressbar, return_inferencedata, extend_inferencedata, predictions, idata_kwargs, compile_kwargs)
    671         ikwargs.setdefault("inplace", True)
    672     return pm.predictions_to_inference_data(ppc_trace, **ikwargs)
--> 673 idata_pp = pm.to_inference_data(posterior_predictive=ppc_trace, **ikwargs)
    675 if extend_inferencedata and idata is not None:
    676     idata.extend(idata_pp)

File /opt/conda/envs/python3/lib/python3.9/site-packages/pymc/backends/arviz.py:512, in to_inference_data(trace, prior, posterior_predictive, log_likelihood, coords, dims, sample_dims, model, save_warmup, include_transformed)
    509 if isinstance(trace, InferenceData):
    510     return trace
--> 512 return InferenceDataConverter(
    513     trace=trace,
    514     prior=prior,
    515     posterior_predictive=posterior_predictive,
    516     log_likelihood=log_likelihood,
    517     coords=coords,
    518     dims=dims,
    519     sample_dims=sample_dims,
    520     model=model,
    521     save_warmup=save_warmup,
    522     include_transformed=include_transformed,
    523 ).to_inference_data()

File /opt/conda/envs/python3/lib/python3.9/site-packages/pymc/backends/arviz.py:433, in InferenceDataConverter.to_inference_data(self)
    423 def to_inference_data(self):
    424     """Convert all available data to an InferenceData object.
    425 
    426     Note that if groups can not be created (e.g., there is no `trace`, so
    427     the `posterior` and `sample_stats` can not be extracted), then the InferenceData
    428     will not have those groups.
    429     """
    430     id_dict = {
    431         "posterior": self.posterior_to_xarray(),
    432         "sample_stats": self.sample_stats_to_xarray(),
--> 433         "posterior_predictive": self.posterior_predictive_to_xarray(),
    434         "predictions": self.predictions_to_xarray(),
    435         **self.priors_to_xarray(),
    436         "observed_data": self.observed_data_to_xarray(),
    437     }
    438     if self.predictions:
    439         id_dict["predictions_constant_data"] = self.constant_data_to_xarray()

File /opt/conda/envs/python3/lib/python3.9/site-packages/arviz/data/base.py:65, in requires.__call__.<locals>.wrapped(cls)
     63     if all((getattr(cls, prop_i) is None for prop_i in prop)):
     64         return None
---> 65 return func(cls)

File /opt/conda/envs/python3/lib/python3.9/site-packages/pymc/backends/arviz.py:344, in InferenceDataConverter.posterior_predictive_to_xarray(self)
    342 data = self.posterior_predictive
    343 dims = {var_name: self.sample_dims + self.dims.get(var_name, []) for var_name in data}
--> 344 return dict_to_dataset(
    345     data, library=pymc, coords=self.coords, dims=dims, default_dims=self.sample_dims
    346 )

File /opt/conda/envs/python3/lib/python3.9/site-packages/arviz/data/base.py:307, in dict_to_dataset(data, attrs, library, coords, dims, default_dims, index_origin, skip_event_dims)
    305 data_vars = {}
    306 for key, values in data.items():
--> 307     data_vars[key] = numpy_to_data_array(
    308         values,
    309         var_name=key,
    310         coords=coords,
    311         dims=dims.get(key),
    312         default_dims=default_dims,
    313         index_origin=index_origin,
    314         skip_event_dims=skip_event_dims,
    315     )
    316 return xr.Dataset(data_vars=data_vars, attrs=make_attrs(attrs=attrs, library=library))

File /opt/conda/envs/python3/lib/python3.9/site-packages/arviz/data/base.py:254, in numpy_to_data_array(ary, var_name, coords, dims, default_dims, index_origin, skip_event_dims)
    252 # filter coords based on the dims
    253 coords = {key: xr.IndexVariable((key,), data=np.asarray(coords[key])) for key in dims}
--> 254 return xr.DataArray(ary, coords=coords, dims=dims)

File /opt/conda/envs/python3/lib/python3.9/site-packages/xarray/core/dataarray.py:450, in DataArray.__init__(self, data, coords, dims, name, attrs, indexes, fastpath)
    448 data = _check_data_shape(data, coords, dims)
    449 data = as_compatible_data(data)
--> 450 coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
    451 variable = Variable(dims, data, attrs, fastpath=True)
    453 if not isinstance(coords, Coordinates):

File /opt/conda/envs/python3/lib/python3.9/site-packages/xarray/core/dataarray.py:173, in _infer_coords_and_dims(shape, coords, dims)
    171     dims = tuple(dims)
    172 elif len(dims) != len(shape):
--> 173     raise ValueError(
    174         "different number of dimensions on data "
    175         f"and dims: {len(shape)} vs {len(dims)}"
    176     )
    177 else:
    178     for d in dims:

ValueError: different number of dimensions on data and dims: 3 vs 2

Thank you in advance for your help.

Might be dimension mismatch between arguments of the likelihood. What do you get when you print
μ.shape.eval() and Y.shape?

Thanks for the quick reply. I’ve printed them and it gave the same dimension. So it might be another problem there. Actually I copied the same code which you can find in the above link and it should have run without error message, right? That’s why I do not understand why it does not work for me.

print(μ.shape.eval())
print(Y.shape)
[278]
(278,)

Hmm I can’t replicate your error. When I run the following code

from pathlib import Path

import arviz as az
import numpy as np
import pandas as pd
import pymc as pm
import pymc_bart as pmb

from sklearn.model_selection import train_test_split


RANDOM_SEED = 5781
np.random.seed(RANDOM_SEED)
az.style.use("arviz-darkgrid")

try:
    bikes = pd.read_csv(Path("..", "data", "bikes.csv"))
except FileNotFoundError:
    bikes = pd.read_csv(pm.get_data("bikes.csv"))

features = ["hour", "temperature", "humidity", "workingday"]

X = bikes[features]
Y = bikes["count"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, 
                                                    random_state=RANDOM_SEED)

with pm.Model() as model_oos_regression:
    X = pm.MutableData("X", X_train)
    Y = Y_train
    α = pm.Exponential("α", 1)
    μ = pmb.BART("μ", X, np.log(Y))
    y = pm.NegativeBinomial("y", mu=pm.math.exp(μ), alpha=α, observed=Y, shape=μ.shape)
    idata_oos_regression = pm.sample(random_seed=RANDOM_SEED)
    posterior_predictive_oos_regression_train = pm.sample_posterior_predictive(
        trace=idata_oos_regression, random_seed=RANDOM_SEED
    )

It samples fine:

Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>NUTS: [α]
>PGBART: [μ]
 |████████████| 100.00% [8000/8000 00:49<00:00 Sampling 4 chains, 0 divergences]Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 50 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
█Sampling: [y]
 |████████████████████| 100.00% [4000/4000 00:00<00:00]

I ran this in a separate environment where I just installed pymc_bar, scikit-learn and spyder-kernels=2.5.0. The installed pymc version is 5.10.2. What is your version?

Thanks for your help. It works now:)

I think I installed the wrong packages (arviz, pymc3) that’s why it didn’t work before. It would be good if the website/document can mention about the packages to run the codes.