Hello, I am making mixed effect model using Pymc ver5
This is my code:
def build_logistic_mixed_effect_model(X_train, X_test, y_train, y_test, predictor1, predictor2, predictor3, ar_structure=False, ar_lags=1, filename=‘model’):
print(X_train.values.shape)
print(X_test.values.shape)
# Set multi-level index for panel data
X_train.index = pd.MultiIndex.from_frame(X_train[['ID', 'Wave']])
time_idxs, times = pd.factorize(X_train.index.get_level_values(1))
region_idxs, regions = pd.factorize(X_train['HomeRegion'])
id_idxs, ids = pd.factorize(X_train['ID'])
# One-hot encoding for categorical predictors
X_train_encoded = pd.get_dummies(X_train, columns=predictor1 + predictor2 + predictor3, drop_first=True)
# Filter columns based on predictor groups
feature_tij_columns = X_train_encoded.columns[
X_train_encoded.columns.str.contains('|'.join(predictor1))
]
feature_ij_columns = X_train_encoded.columns[
X_train_encoded.columns.str.contains('|'.join(predictor2))
]
n_ids = len(np.unique(id_idxs))
n_regions = len(np.unique(region_idxs))
coords = {
'obs_idx': np.arange(len(X_train_encoded.index)),
'feature_tij': np.arange(len(feature_tij_columns)),
'feature_ij': np.arange(len(feature_ij_columns)),
'ID_dim': np.arange(n_ids),
'Region_dim': np.arange(n_regions)
}
with pm.Model(coords_mutable=coords) as model:
# Extracting predictors and response
X_tij = pm.Data("X_tij", X_train_encoded[feature_tij_columns].values, mutable=True, dims=["obs_idx", "feature_tij"])
X_ij = pm.Data("X_ij", X_train_encoded[feature_ij_columns].values, mutable=True, dims=["obs_idx", "feature_ij"])
# Updated random intercepts with dimensionality
intercept_ind = pm.Normal("intercept_ind", mu=0, sigma=1, dims=["ID_dim"])
intercept_region = pm.Normal("intercept_region", mu=0, sigma=1, dims=["Region_dim"])
# Fixed effects for predictors
beta_ij = pm.Normal("beta_ij", mu=0, sigma=1, shape=X_ij.shape[1], dims=['feature_ij'])
beta_tij = pm.Normal("beta_tij", mu=0, sigma=1, shape=X_tij.shape[1], dims=['feature_tij'])
# Linear predictor
linear_combination = (
intercept_ind[id_idxs] +
intercept_region[region_idxs] +
pm.math.dot(X_ij, beta_ij) +
pm.math.dot(X_tij, beta_tij)
)
# AR structure for residuals
if ar_structure:
error_ar = pm.AR("error_ar", rho=np.ones(max(ar_lags, 2)), sigma=1, constant=True, observed=y_train.values)
logistic_prob = pm.math.invlogit(linear_combination + error_ar)
else:
logistic_prob = pm.math.invlogit(linear_combination)
# Observed data
y_obs = pm.Bernoulli('y_obs', p=logistic_prob, observed=y_train.values, dims=['obs_idx'])
# Sampling
trace = pm.sample(3000, tune=1000, chains=4, return_inferencedata=True)
# Compute log-likelihood
pm.compute_log_likelihood(trace)
# Save model graph
pm.model_to_graphviz(model).render(filename, format='jpg', cleanup=True)
# Data preprocessing for test set
X_test.index = pd.MultiIndex.from_frame(X_test[['ID', 'Wave']])
time_idxs, times = pd.factorize(X_test.index.get_level_values(1))
region_idxs, regions = pd.factorize(X_test['HomeRegion'])
id_idxs, ids = pd.factorize(X_test['ID'])
X_test_encoded = pd.get_dummies(X_test, columns=predictor1 + predictor2 + predictor3, drop_first=True)
# Update coords for test data
test_feature_tij_columns = X_test_encoded.columns[
X_test_encoded.columns.str.contains('|'.join(predictor1))
]
test_feature_ij_columns = X_test_encoded.columns[
X_test_encoded.columns.str.contains('|'.join(predictor2))
]
n_ids = len(np.unique(id_idxs))
n_regions = len(np.unique(region_idxs))
test_coords = {
'obs_idx': np.arange(len(X_test_encoded.index)),
'feature_tij': np.arange(len(test_feature_tij_columns)),
'feature_ij': np.arange(len(test_feature_ij_columns)),
'ID_dim': np.arange(n_ids),
'Region_dim': np.arange(n_regions)
}
with model:
# Update model with test data
pm.set_data({
"X_tij": X_test_encoded[test_feature_tij_columns].values,
"X_ij": X_test_encoded[test_feature_ij_columns].values
}, coords=test_coords)
# Sample posterior predictive
posterior_predictive = pm.sample_posterior_predictive(trace, predictions=True).prediction
y_pred = np.where(posterior_predictive['y_obs'].mean(('chain', 'draw')) > 0.5, 1, 0)
posterior_prediction(y_test, y_pred, filename)
return model, trace
and this is my error code,
ValueError: Shape mismatch: A.shape[0] != y.shape[0]
Apply node that caused the error: CGemv{inplace}(AdvancedSubtensor1.0, 1.0, X_ij, beta_ij, 1.0)
Toposort index: 9
Inputs types: [TensorType(float64, shape=(None,)), TensorType(float64, shape=()), TensorType(float64, shape=(None, None)), TensorType(float64, shape=(None,)), TensorType(float64, shape=())]
Inputs shapes: [(9298,), (), (2325, 5), (5,), ()]
Inputs strides: [(8,), (), (8, 18600), (8,), ()]
Inputs values: [‘not shown’, array(1.), ‘not shown’, array([ 0.86050279, -0.08097759, -1.13584385, 1.33682547, -1.28805127]), array(1.)]
Outputs clients: [[Composite{sigmoid((i0 + i1))}(CGemv{inplace}.0, CGemv{inplace}.0)]]
I think this error occured by multiindex mismatch between Test and Train data, but I don’t have no clue to fix it.