Issue Summary
I have defined a hierarchical Bayesian model using PyMC and successfully trained it on the train_df
dataset. Sampling and posterior predictive checks on the training data work correctly.
However, when I attempt to update the model with test data using pm.set_data()
and run posterior predictions, I get a shape mismatch error.
Model Definition
import numpy as np
import pandas as pd
import pymc as pm
# Avoid zeros in the target variable
train_df['disease_count_per_100k'] += 1e-6
# Define features
district_features = [
'Population', 'Literacy Rate', 'income', 'Temperature', 'Relative Humidity',
'Lag_Temp_hum_Product', 'forest_count_per_100k', 'farmland_count_per_100k',
'water_bodies_per_100k', 'parks_per_100k', 'hospitals_per_100k',
'sanitation_per_100k', 'doctors_count_per_100k'
]
# Create hierarchical indices
train_df['district_month'] = train_df['District'].astype(str) + "_" + train_df['Month'].astype(str)
# Factorize indices
district_month_idx_train, district_months = pd.factorize(train_df['district_month'], sort=True)
district_idx_train, districts = pd.factorize(train_df['District'], sort=True)
# Define PyMC coordinates
coords = {
'district': districts, # Unique districts
'district_month': district_months, # Unique district-month combinations
'district_feature': district_features
}
with pm.Model(coords=coords) as Hierarchical_model:
# **Data Inputs**
X_district = pm.Data('X_district', train_df[district_features],mutable = True)
y = pm.Data('y', train_df['disease_count_per_100k'],mutable = True)
district_idx = pm.Data('district_idx', district_idx_train,mutable = True)
district_month_idx = pm.Data('district_month_idx', district_month_idx_train,mutable = True )
# **District-level Intercept**
sigma_a_district = pm.HalfNormal('sigma_a_district', sigma=0.5)
mu_a_district = pm.Normal('mu_a_district', mu=-1.5, sigma=0.5)
a_district = pm.Normal('a_district', mu=mu_a_district, sigma=sigma_a_district,
dims=['district'], shape=(len(districts),))
# **District-level Coefficients**
mu_b_district = pm.Normal('mu_b_district', mu=-0.5, sigma=0.6, dims=['district_feature'])
sigma_b_district = pm.HalfNormal('sigma_b_district', sigma=0.5)
b_district = pm.Normal('b_district', mu=mu_b_district, sigma=sigma_b_district,
dims=['district', 'district_feature'], shape=(len(districts), len(district_features)))
# **District-Month Intercept (nested under districts)**
sigma_a_monthly = pm.HalfNormal('sigma_a_monthly', sigma=0.1)
a_district_month = pm.Normal('a_district_month', mu=a_district[district_idx],
sigma=sigma_a_monthly, dims=['district_month'],
shape=(len(district_months),))
# **District-Month Feature Coefficients (nested under districts)**
sigma_b_monthly = pm.HalfNormal('sigma_b_monthly', sigma=0.05)
b_district_month = pm.Normal('b_district_month', mu=b_district[district_idx],
sigma=sigma_b_monthly, dims=['district_month', 'district_feature'],
shape=(len(district_months), len(district_features)))
# **Model error**
sigma = pm.HalfNormal('sigma', sigma=0.3) # Adjust sigma for tighter variance
# **Expected value**
district_contrib = (X_district * b_district_month[district_month_idx]).sum(axis=1)
mu = pm.math.maximum(a_district_month[district_month_idx] + district_contrib, 1e-6)
# **Likelihood (Normal Distribution)**
Y_obs = pm.Normal('Y_obs', mu=mu, sigma=sigma, observed=y)
after this i did sampling
with Hierarchical_model:
trace = pm.sample(
draws=1000,
tune=2000, # Increase tuning iterations
chains=4,
return_inferencedata=True, # Ensure InferenceData format
random_seed=42,
)
# Ensure log-likelihood is computed
trace.extend(pm.compute_log_likelihood(trace))
Prediction with Training Data
# with hierarchical_model:
# trace = pm.sample_posterior_predictive(trace,extend_inferencedata=True)
# Ensure `trace` is the result of prior MCMC sampling
with Hierarchical_model:
posterior_predictive = pm.sample_posterior_predictive(
trace,
extend_inferencedata=True # Add posterior predictive samples to the `trace`
)
train_pred = pd.DataFrame(posterior_predictive.posterior_predictive['Y_obs'].mean(axis=1)[3])
Setting Test Data
# 1. Create district_month identifier in test data (same as training)
test_df['district_month'] = test_df['District'].astype(str) + "_" + test_df['Month'].astype(str)
# 2. Align test data indices with training categories
district_month_idx_test = pd.Categorical(
test_df['district_month'],
categories=district_months # <-- Use training's district_months order
).codes
district_idx_test = pd.Categorical(
test_df['District'],
categories=districts # <-- Use training's district order
).codes
# 3. Verify no unknown categories (-1)
assert (district_month_idx_test != -1).all(), "New district_month in test data!"
assert (district_idx_test != -1).all(), "New district in test data!"
# 4. Set test data in PyMC model
with Hierarchical_model:
pm.set_data({
"X_district": test_df[district_features].values,
"y": test_df['disease_count_per_100k'],
"district_idx": district_idx_test,
"district_month_idx": district_month_idx_test
})
Posterior Prediction on Test Data (Error)
# Generate predictions
with Hierarchical_model:
post_pred_test = pm.sample_posterior_predictive(
trace,
var_names=["Y_obs"]
)
I get the following shape mismatch error:
_common.pyx in numpy.random._common.cont_broadcast_2()
__init__.cython-30.pxd in numpy.PyArray_MultiIterNew3()
ValueError: shape mismatch: objects cannot be broadcast to a single shape. Mismatch is between arg 0 with shape (441,) and arg 1 with shape (92,).
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
/usr/local/lib/python3.11/dist-packages/pytensor/tensor/random/op.py in rng_fn(self, rng, *args, **kwargs)
173 def rng_fn(self, rng, *args, **kwargs) -> int | float | np.ndarray:
174 """Sample a numeric random variate."""
--> 175 return getattr(rng, self.name)(*args, **kwargs)
176
177 def __str__(self):
numpy/random/_generator.pyx in numpy.random._generator.Generator.normal()
_common.pyx in numpy.random._common.cont()
_common.pyx in numpy.random._common.cont_broadcast_2()
__init__.cython-30.pxd in numpy.PyArray_MultiIterNew3()
ValueError: shape mismatch: objects cannot be broadcast to a single shape. Mismatch is between arg 0 with shape (441,) and arg 1 with shape (92,).
Apply node that caused the error: normal_rv{"(),()->()"}(RNG(<Generator(PCG64) at 0x7A31878C0D60>), [441], AdvancedSubtensor1.0, ExpandDims{axis=0}.0)
Toposort index: 9
Inputs types: [RandomGeneratorType, TensorType(int64, shape=(1,)), TensorType(float64, shape=(None,)), TensorType(float64, shape=(1,))]
Inputs shapes: ['No shapes', (1,), (92,), (1,)]
Inputs strides: ['No strides', (8,), (8,), (8,)]
Inputs values: [Generator(PCG64) at 0x7A31878C0D60, array([441]), 'not shown', array([0.1186301])]
Outputs clients: [[output[2](normal_rv{"(),()->()"}.0)], [AdvancedSubtensor1(a_district_month, district_month_idx)]]
Backtrace when the node is created (use PyTensor flag traceback__limit=N to make it longer):
File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
coro.send(None)
File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes
if (await self.run_code(code, result, async_=asy)):
File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-34-d757e5ad86c7>", line 52, in <cell line: 0>
a_district_month = pm.Normal('a_district_month', mu=a_district[district_idx],
File "/usr/local/lib/python3.11/dist-packages/pymc/distributions/distribution.py", line 511, in __new__
rv_out = cls.dist(*args, **kwargs)
File "/usr/local/lib/python3.11/dist-packages/pymc/distributions/continuous.py", line 495, in dist
return super().dist([mu, sigma], **kwargs)
File "/usr/local/lib/python3.11/dist-packages/pymc/distributions/distribution.py", line 580, in dist
rv_out = cls.rv_op(*dist_params, size=create_size, **kwargs)
HINT: Use the PyTensor flag `exception_verbosity=high` for a debug print-out and storage map footprint of this Apply node.
Here i am getting error please tell how to correct it
Data Information
train_df.shape
=(441, 31)
test_df.shape
=(92, 31)
- Number of features used =
14
- All unique district_month indices from test data exist in training data
i also triespm.MutableData()
but not working .
Please tell me why my test test data is not getting set and giving error. @ricardoV94 @jessegrabowski @cluhmann