Attached is a synthetic example showing where I am having trouble! (Table for the moment that a linear model is not the best fit for Likert-scale data).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import arviz as az
import theano
# generate Likert y; 1-7
obs_y = np.random.randint(7, size=(100, 1)) + 1
# generate treatment and control group
treatment = np.random.randint(3, size=(100,))
# convert to one-hot;
treatment = pd.get_dummies(data=treatment, drop_first=True)
# generate many categorical controls
covariates = np.random.randint(2, size=(100, 56))
# coords for pymc3
coords = {
'obs_id': np.arange(len(obs_y))
}
# specify model
with pm.Model(coords=coords) as main_lm_model:
'''
linear model replication
'''
# priors
alpha = pm.Normal('alpha', mu=0, sigma=1)
treatment_betas = pm.Normal("treatment_betas", mu=0, sigma=1, shape=treatment.shape[1])
covariates_betas = pm.Normal("covariates_betas", mu=0, sigma=1, shape=covariates.shape[1])
# model error
sigma = pm.HalfNormal("sigma", sigma=1)
# matrix-dot products
m1 = pm.math.matrix_dot(treatment, treatment_betas)
# i separate these here so its easier to just look at these values only
# through the var_names argument for different checks so I dont have to look
# at 56 covariates and draw their chains, etc
m2 = pm.math.matrix_dot(covariates, covariates_betas)
# expected value of y
mu = alpha + m1 + m2
# Likelihood: Normal
y = pm.Normal("y", mu=mu, sigma=sigma, observed=obs_y, dims='obs_id')
# set step and sampler
step = pm.NUTS([alpha, treatment_betas, covariates_betas, sigma], target_accept=0.9)
# prior analysis
with main_lm_model:
prior_pc = pm.sample_prior_predictive(500)
# Inference button (TM)!
with main_lm_model:
lm_trace = pm.sample(draws=1000,
step=step,
init='jitter+adapt_diag',
cores=4,
tune=500, # burn in
return_inferencedata=False) # seemingly have to do this to get idata?
# ppc
with main_lm_model:
ppc = pm.fast_sample_posterior_predictive(trace=lm_trace,
random_seed=1,
# have to specify the params here unlike prior
# or will only return 'y'
var_names=['alpha', 'treatment_betas', 'covariates_betas', 'y'])
# build idata - this will fail if return_inferencedata=True
lm_idata = az.from_pymc3(trace=lm_trace,
model=main_lm_model,
prior=prior_pc,
posterior_predictive=ppc)
# stack posterior samples
stacked = lm_idata.posterior.stack(draws=("chain", "draw"))
# try to calculate mu across the posterior like:
# https://docs.pymc.io/notebooks/posterior_predictive.html
mu_pp = ( stacked['alpha'] + stacked['covariates_betas'] + (stacked['treatment_betas'] * treatment) )
# yields dataframe coercion errors