I have created a smallish example of my model. I’m trying to predict on new data for a specific group.
But after sampling, I’m getting conflicting sizes for dimension 'observation': length 33 on the data but length 2 on coordinate 'observation'
:
Toy Data
import pymc as pm
import arviz as az
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
n_groups = 3
group_intercept = 0.0 + np.random.normal(0, 0.1, n_groups)
group_trend = 1.0 + np.random.normal(0, 0.1, n_groups)
x = np.linspace(-1, 1, 11)
df = pd.DataFrame()
for i in np.arange(n_groups):
y_mu = group_intercept[i] - group_trend[i]*x
y = np.random.normal(y_mu, 0.01)
new_df = pd.DataFrame({'x': x, 'y': y, 'group': i})
df = pd.concat([df, new_df], ignore_index = True)
df['observation'] = np.arange(len(df))
Model
with pm.Model() as model:
model.add_coord('group', df['group'].unique(), mutable = True)
model.add_coord('observation', df['observation'], mutable = True)
x = pm.MutableData('x', df['x'], dims = 'observation')
y = pm.MutableData('y', df['y'], dims = 'observation')
group_idx = pm.MutableData('group_idx', df['group'], dims = 'observation')
intercept = pm.Normal('intercept', 0.0, 1.0)
trend = pm.HalfNormal('trend', 1.0)
error = pm.HalfNormal('error', 1.0)
group_intercept = pm.Normal('group_intercept', intercept, 1.0, dims = 'group')
group_trend = pm.HalfNormal('group_trend', trend, dims = 'group')
mu = pm.Deterministic('mu', group_intercept[group_idx] - group_trend[group_idx]*x, dims = 'observation')
likelihood = pm.Normal('likelihood', mu, error, observed = y, dims = 'observation')
print('Sample posterior...')
inference_data = pm.sample()
print('Sample prior predictive...')
inference_data.extend(pm.sample_prior_predictive())
print('Sample posterior predictive...')
inference_data.extend(pm.sample_posterior_predictive(inference_data))
Sample posterior...
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, trend, error, group_intercept, group_trend]
100.00% [8000/8000 00:07<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 8 seconds.
Sampling: [error, group_intercept, group_trend, intercept, likelihood, trend]
Sample prior predictive...
Sample posterior predictive...
Sampling: [likelihood]
100.00% [4000/4000 00:00<00:00]
Prediction
new_x = np.array([-2.0, 2.0])
new_group_idx = np.full_like(new_x, df['group'].max()).astype(int)
new_observation = df['observation'].max() + np.arange(len(new_x)) + 1
with model:
pm.set_data(new_data = {'x': new_x,
'group_idx': new_group_idx},
coords = {'observation': new_observation})
pred_inference_data = pm.sample_posterior_predictive(inference_data, predictions = True)
Sampling: [likelihood]
100.00% [4000/4000 00:00<00:00]
Output exceeds the size limit. Open the full output data in a text editor
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/mnt/c/Users/1437886/OneDrive - Danaher/Documents/Git Repositories/Bayesian-Velocity-Profiling/PyMC Toy Model.ipynb Cell 3 in <cell line: 5>()
5 with model:
6 pm.set_data(new_data = {'x': new_x,
7 'group_idx': new_group_idx},
8 coords = {'observation': new_observation})
---> 10 pred_inference_data = pm.sample_posterior_predictive(inference_data, predictions = True)
File ~/miniconda3/envs/pymc/lib/python3.10/site-packages/pymc/sampling.py:2041, in sample_posterior_predictive(trace, samples, model, var_names, keep_size, random_seed, progressbar, return_inferencedata, extend_inferencedata, predictions, idata_kwargs, compile_kwargs)
2039 ikwargs.setdefault("idata_orig", trace)
2040 ikwargs.setdefault("inplace", True)
-> 2041 return pm.predictions_to_inference_data(ppc_trace, **ikwargs)
2042 converter = pm.backends.arviz.InferenceDataConverter(posterior_predictive=ppc_trace, **ikwargs)
2043 converter.nchains = nchain
File ~/miniconda3/envs/pymc/lib/python3.10/site-packages/pymc/backends/arviz.py:656, in predictions_to_inference_data(predictions, posterior_trace, model, coords, dims, idata_orig, inplace)
654 aelem = next(iter(predictions.values()))
655 converter.nchains, converter.ndraws = aelem.shape[:2]
--> 656 new_idata = converter.to_inference_data()
657 if idata_orig is None:
658 return new_idata
File ~/miniconda3/envs/pymc/lib/python3.10/site-packages/pymc/backends/arviz.py:526, in InferenceDataConverter.to_inference_data(self)
516 id_dict = {
...
163 f"it has shape {v.shape!r} rather than expected shape {sizes[k]!r} "
164 "matching the dimension size"
165 )
ValueError: conflicting sizes for dimension 'observation': length 33 on the data but length 2 on coordinate 'observation'
The issue occurs in the part where it converts it to inference data, because setting return_inferencedata = False
lets it complete without issues.