Hey I am trying to set up a model, but I can not predict with it. The discussions before did not really helped me. I just wanna draw some samples from my posterior distribution. pm.set_data sets up my data correctly, but the posterior_predictive function works just on my training set. I briefed all possible mistakes, there are no invalid groups, both columns from my test dataset having the same length, but It wont work. Please help me It drives me crazy.
import pandas as pd
import numpy as np
import pymc3 as pm
#
dtype_dict = {
'FEATURE_1': 'float',
'CATEGORY': 'category',
'COUNT': 'int',
'TIME': 'int'
}
usecols_list = list(dtype_dict.keys())
data = pd.read_csv(
'path/to/your/data.csv',
sep=';',
header=0,
dtype=dtype_dict,
usecols=usecols_list
)
# Filter
data = data[data['FEATURE_1'].isin([0, 1]) & data['FEATURE_1'].notna()]
data_prior_analysis = data[(data['TIME'] >= 1994) & (data['TIME'] < 1996)]
data_train = data[(data['TIME'] > =1996) & (data['TIME'] <= 1998)]
data_test = data[data['TIME'] > 1999]
# function Prior knowledge
def observing_prior_knowledge(df, count_col, category_col, feature_col):
epsilon = 1e-8
selected_data = df[[count_col, category_col, feature_col]]
category_stats = selected_data.groupby(category_col)[count_col].agg(['mean', 'median', 'std'])
category_stats['log_mean'] = np.log(category_stats['mean'] + epsilon)
category_stats['log_median'] = np.log(category_stats['median'] + epsilon)
feature_stats = selected_data[feature_col].agg(['mean', 'median', 'std'])
feature_stats['log_mean'] = np.log(feature_stats['mean'] + epsilon)
feature_stats['log_median'] = np.log(feature_stats['median'] + epsilon)
feature_df = pd.DataFrame(feature_stats).transpose()
feature_df.index = ['FEATURE_STATS']
result_df = pd.concat([category_stats, feature_df])
return result_df
# prior knowledge
data_prior = observing_prior_knowledge(data_prior_analysis, 'COUNT', 'CATEGORY', 'FEATURE_1')
# preparing data train
cat_idx, groups = pd.factorize(data_train['CATEGORY'], sort=True)
feature_data = data_train['FEATURE_1'].values
counts = data_train['COUNT'].values
# Priors
group_means = data_prior.loc[groups, 'mean'].fillna(0).values
group_stds = data_prior.loc[groups, 'std'].fillna(1).values
beta_cont_mean = data_prior.loc['FEATURE_STATS', 'mean']
beta_cont_std = data_prior.loc['FEATURE_STATS', 'std']
# Model
coords = {"group": groups, "feature": feature_data}
with pm.Model(coords=coords) as frequency_model:
category_data = pm.Data("CATEGORY", cat_idx, mutable=True)
feature_data = pm.Data("FEATURE_DATA", feature_data, mutable=True)
# Priors
Intercept = pm.Normal("Intercept", mu=0, sigma=1)
beta_cat = pm.Normal("beta_cat", mu=group_means, sigma=group_stds, dims="group")
beta_cont = pm.Normal("beta_cont", mu=beta_cont_mean, sigma=beta_cont_std)
# Model
lambda_freq = pm.math.exp(Intercept + beta_cat[category_data] + beta_cont * feature_data)
# Likelihood
counts_obs = pm.Poisson("counts", mu=lambda_freq, observed=counts)
# Sampling
trace = pm.sample(1000, chains=4, tune=1000, return_inferencedata=True)
posterior_predictive = pm.sample_posterior_predictive(trace, extend_inferencedata=True)
Here starts the Problem. It sets up my data correctly but it does not sample from the posterior distrubtion. I just run into the same error again and again.
# Test data
feature_data_test = data_test[data_test['TIME'] == 1999]['FEATURE_1'].values
category_test = data_test[data_test['TIME'] == 1999]['CATEGORY'].values
#
cat_idx_test = pd.Categorical(category_test, categories=groups).codes
with frequency_model:
pm.set_data({"CATEGORY": cat_idx_test, "FEATURE_DATA": feature_data_test})
# Vorhersagen generieren
posterior_predictive_test = pm.sample_posterior_predictive(trace, extend_inferencedata=True)
This is the error I recive:
File: c:\...\site-packages\pytensor\tensor\random\op.py, line 378, in RandomVariable.perform
smpl_val = self.rng_fn(rng, *args + [size])
ValueError: Inputs values: [Generator(PCG64) at 0x253519E9D60, array([141352], dtype=int64), array([4], dtype=int64), 'not shown']
Outputs clients: [['output_1'], ['output_1']]```