I am running into the classic shape issue when sampling the posterior predictive distribution on out-of-sample data, as described in this PyMC Discourse video by @lucianopaz. The reproducible example is as follows:
import numpy as np
import pymc as pm
import pandas as pd
import arviz as az
from pydataset import data
import matplotlib as plt
# Load the 'faithful' dataset through pydataset's data() helper; the
# `waiting` and `eruptions` columns are the ones used further down.
df = data('faithful')
#define factory function
def rolling_regression_model_factory(coords, x, y):
    """Build a linear regression model whose slope follows a random walk.

    The model registers, in order: the predictor ``x`` as data along the
    ``obs_ind`` dimension, a ``GaussianRandomWalk`` slope ``beta_1`` along
    ``obs_ind``, a ``Normal(0, 10)`` intercept ``beta_0``, the response ``y``
    as data, a ``HalfCauchy`` noise scale ``sigma``, and a ``Normal``
    likelihood named ``observed`` along ``obs_ind``.

    Args:
        coords: Coordinates handed to ``pm.Model``; it has to define the
            ``obs_ind`` dimension used by the variables below.
        x: Predictor values, stored in the model as ``'x'``.
        y: Response values, stored in the model as ``'y'``.

    Returns:
        The constructed ``pm.Model``.

    NOTE(review): ``beta_1`` has one entry per ``obs_ind``, so its size is
    tied to the length of ``coords['obs_ind']`` the model was built with.
    """
    with pm.Model(coords=coords) as model:
        # Predictor, registered along the observation dimension.
        x_data = pm.Data('x', x, dims='obs_ind')

        # One slope value per observation, plus a single shared intercept.
        slope = pm.GaussianRandomWalk('beta_1', dims='obs_ind')
        intercept = pm.Normal('beta_0', mu=0, sigma=10)

        y_data = pm.Data('y', y)
        noise = pm.HalfCauchy('sigma', 10)

        # Likelihood; the call registers it on the model, so the returned
        # variable does not need to be kept.
        pm.Normal(
            'observed',
            mu=intercept + slope * x_data,
            sigma=noise,
            observed=y_data,
            dims='obs_ind',
        )
    return model
# Split into a training part and a held-out test part.
# NOTE(review): the slice bounds 200 and 201 appear to leave one row out of
# both splits -- confirm this is intended.
df_train = df[:200]
df_test = df[201:272]

x_train, y_train = df_train['waiting'], df_train['eruptions']
x_test, y_test = df_test['waiting'], df_test['eruptions']

# One `obs_ind` coordinate per row of the respective split.
coords_train = {'obs_ind': np.arange(df_train.shape[0])}
coords_test = {'obs_ind': np.arange(df_test.shape[0])}

# Sample the posterior on the training data.
with rolling_regression_model_factory(coords=coords_train, x=x_train, y=y_train):
    trace = pm.sample()

# Sample the posterior predictive on the test data, passing the training
# trace to a model freshly built from the test split.
with rolling_regression_model_factory(coords=coords_test, x=x_test, y=y_test):
    pp = pm.sample_posterior_predictive(trace)
ValueError: Input dimension mismatch. One other input has shape[0] = 200, but input[2].shape[0] = 71.
Apply node that caused the error: Elemwise{Composite{(i0 + (i1 * i2))}}(InplaceDimShuffle{x}.0, beta_1, x)
Toposort index: 1
Inputs types: [TensorType(float64, (1,)), TensorType(float64, (None,)), TensorType(int32, (None,))]
Inputs shapes: [(1,), (200,), (71,)]
Inputs strides: [(8,), (8,), (4,)]
Inputs values: [array([-4.42644186]), 'not shown', 'not shown']
Outputs clients: [[normal_rv{0, (0, 0), floatX, True}(RandomGeneratorSharedVariable(<Generator(PCG64) at 0x7F7ECA41D9E0>), TensorConstant{[]}, TensorConstant{11}, Elemwise{Composite{(i0 + (i1 * i2))}}.0, sigma)]]