# Group-level credible intervals in a hierarchical linear regression

I’ve written a couple of PyMC3 hierarchical linear regressions, and I want to check my intuition about how credible intervals expand and contract at the group level when making predictions.

## Data Generation

```
import numpy as np
import pandas as pd


def build_toy_data_set():
    def calc_line(x_length, y_int, slope, noise):
        x = np.arange(1, x_length)
        return y_int + np.log(x) * slope + np.random.normal(0, noise, len(x))

    def make_df(x_length, level_1, level_2, y_int, slope, noise):
        return pd.DataFrame({
            'level_1': level_1,
            'x': np.arange(1, x_length),
            'level_2': level_2,
            'y': calc_line(x_length, y_int, slope, noise),
        })

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([
        make_df(x_length=100, level_1=1, level_2=0, y_int=0.4, slope=-0.05, noise=0.04),
        make_df(x_length=100, level_1=1, level_2=1, y_int=0.5, slope=-0.10, noise=0.50),
        make_df(x_length=100, level_1=1, level_2=2, y_int=0.2, slope=-0.01, noise=0.02),
    ])
    df['x_log'] = np.log(df['x'])
    return df.reset_index(drop=True)
```
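As a quick sanity check (my addition, not from the original post), each group spans `np.arange(1, 100)`, i.e. 99 rows, so the combined frame should have 3 × 99 = 297 rows and five columns once `x_log` is added. A self-contained sketch of the same generation loop (parameter values taken from the post, `rng` seeding is illustrative):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
frames = []
# (y_int, slope, noise) per level_2 group, as in the post.
for level_2, (y_int, slope, noise) in enumerate(
        [(0.4, -0.05, 0.04), (0.5, -0.10, 0.50), (0.2, -0.01, 0.02)]):
    x = np.arange(1, 100)
    frames.append(pd.DataFrame({
        'level_1': 1,
        'x': x,
        'level_2': level_2,
        'y': y_int + slope * np.log(x) + rng.normal(0, noise, len(x)),
    }))
df = pd.concat(frames).reset_index(drop=True)
df['x_log'] = np.log(df['x'])

print(df.shape)  # (297, 5)
```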

Here is what the data look like for my run. For now, level 1 is simply ignored, and the model estimates parameters for level 2 only.

## First Model

```
import pymc3 as pm

level_2_count = len(df['level_2'].unique())
level_2_idx = df['level_2'].values

with pm.Model() as model:
    mu_a = pm.Normal('mu_a', mu=0., sigma=1)
    sigma_a = pm.HalfCauchy('sigma_a', beta=5)
    mu_b = pm.Normal('mu_b', mu=0., sigma=1)
    sigma_b = pm.HalfCauchy('sigma_b', beta=5)

    y_int = pm.Normal('y_int', mu=0, sigma=1, shape=level_2_count)
    slope = pm.Normal('slope', mu=0, sigma=1, shape=level_2_count)
    eps = pm.HalfCauchy('eps', 1.)

    y = y_int[level_2_idx] + slope[level_2_idx] * df['x_log'].values
    y_like = pm.Normal('y_like', mu=y, sigma=eps, observed=df['y'])
```
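The indexing in the mean, `y_int[df['level_2'].values]`, is NumPy-style fancy indexing: each observation's group index selects that group's parameter, expanding the per-group vector to a per-row vector. A plain-NumPy sketch of the idea (all values here are illustrative, not from the model):

```python
import numpy as np

# Per-group parameters: one entry per level_2 group.
y_int = np.array([0.4, 0.5, 0.2])
slope = np.array([-0.05, -0.10, -0.01])

# Group index for each observation, as in df['level_2'].values.
idx = np.array([0, 0, 1, 2, 2])
x_log = np.log(np.array([1.0, 2.0, 3.0, 4.0, 5.0]))

# Fancy indexing expands 3 group parameters to 5 per-row values,
# which is what the PyMC3 model does symbolically for mu.
mu = y_int[idx] + slope[idx] * x_log
print(mu.shape)  # (5,)
```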

When I predict with this model, here is what the posterior predictive outputs look like. It’s clear to me that the uncertainty isn’t being captured at the group level, so I decided to allow `eps` to vary by group:

## Second Model

```
import pymc3 as pm

level_2_count = len(df['level_2'].unique())
level_2_idx = df['level_2'].values

with pm.Model() as model:
    mu_a = pm.Normal('mu_a', mu=0., sigma=1)
    sigma_a = pm.HalfCauchy('sigma_a', beta=5)
    mu_b = pm.Normal('mu_b', mu=0., sigma=1)
    sigma_b = pm.HalfCauchy('sigma_b', beta=5)

    y_int = pm.Normal('y_int', mu=0, sigma=1, shape=level_2_count)
    slope = pm.Normal('slope', mu=0, sigma=1, shape=level_2_count)
    eps = pm.HalfCauchy('eps', 1., shape=level_2_count)

    y = y_int[level_2_idx] + slope[level_2_idx] * df['x_log'].values
    y_like = pm.Normal('y_like', mu=y, sigma=eps[level_2_idx], observed=df['y'])
```

This yields the kind of group-level spread I was hoping for. However, I’m not sure it’s the right way to do this: when I scale up to my full dataset, the model takes forever to fit, and sampling stalls out due to zero derivatives of RVs. For more context, my full data set is much noisier, and the groups are all of different sizes.

Any pointers would be greatly appreciated!

```
mu_a = pm.Normal('mu_a', mu=0., sigma=1)
sigma_a = pm.HalfCauchy('sigma_a', beta=5)
mu_b = pm.Normal('mu_b', mu=0., sigma=1)
sigma_b = pm.HalfCauchy('sigma_b', beta=5)
```

Those aren’t used in your model. Did you forget them when switching between the centered and non-centered parameterizations?
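For reference, the centered form would draw `y_int ~ Normal(mu_a, sigma_a)` directly, while the non-centered form draws a standardized offset and rescales it; the two are the same distribution. A minimal NumPy sketch of that identity (the `mu_a`/`sigma_a` values here are illustrative):

```python
import numpy as np

rng = np.random.default_rng(42)
mu_a, sigma_a = 0.3, 0.8

# Centered: sample the group parameter directly.
centered = rng.normal(mu_a, sigma_a, size=100_000)

# Non-centered: sample a standard-normal offset and rescale.
# In PyMC3 this is typically written as
#   offset = pm.Normal('offset', 0, 1, shape=...)
#   y_int = pm.Deterministic('y_int', mu_a + sigma_a * offset)
offset = rng.normal(0, 1, size=100_000)
non_centered = mu_a + sigma_a * offset

# Same distribution, but the non-centered form often samples better
# because the offsets are decoupled from the group-level scale.
print(non_centered.mean(), non_centered.std())  # roughly 0.3 and 0.8
```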

Shoot, I copied and pasted the wrong model. I’m getting a working version together now to illustrate my point. Thank you for catching that!