I’m learning PyMC. I’m trying to specify a hierarchical linear model, but something in my specification must be wrong, because I’m ending up with essentially the same values of parameters being estimated for every data group and every data subgroup.
I’m generating synthetic data - two explanatory variables and one explained variable that is a linear combination of the two plus intercept. There are two levels - observations are partitioned into 4 groups, each group is partitioned further into 3 subgroups, thus resulting in 12 clusters. The betas for particular clusters are generated as follows.
import pandas as pd
import numpy as np
import pymc3 as pm
import arviz as az
from sklearn.linear_model import LinearRegression
rng = np.random.default_rng(1231)
n = 200 #number of observations
m = 2 #number of explanatory variables
n_groups = 4 #four big groups
n_subgroups = 3 #three subgroups per group
n_clusters = n_groups * n_subgroups
clusters_sizes = [5, 5, 20, 9, 13, 14, 30, 17, 18, 19, 20, 30]
# clusters: 00, 01, 02, ..., 31, 32
## generate betas
global_beta_mean = 10
group_level_beta_var = 10
# sample mean betas for each of the 4 groups
group_level_beta_mean = rng.normal(global_beta_mean, group_level_beta_var, (n_groups, m+1))
subgroup_level_beta_var = 5
# sample betas for each of the clusters
beta_real = []
for item in group_level_beta_mean:
for i in range(n_subgroups):
beta_real.append(rng.normal(item, subgroup_level_beta_var))
beta_real = np.round(beta_real, 2)
And the (labeled) observations are generated as follows.
X = rng.normal(0, 12, (n,m))
df = pd.DataFrame({"y": np.nan,
"intercept": 1,
"x1": X[:,0],
"x2": X[:,1],
"group": np.nan,
"subgroup": np.nan,
"cluster": np.nan,
"cluster_n": np.nan})
group_ = []
subgroup_ = []
cluster_ = []
cluster_n_ = []
current_group = 0
current_subgroup = 0
for i in range(n_clusters):
current_cluster = "{}{}".format(current_group, current_subgroup)
current_cluster_size = clusters_sizes[i]
for j in range(current_cluster_size):
group_.append(current_group)
subgroup_.append(current_subgroup)
cluster_.append(current_cluster)
cluster_n_.append(i)
current_subgroup = (current_subgroup + 1)%3
if current_subgroup == 0:
# moving to the next group
current_group = (current_group + 1)%4
df["group"] = group_
df["subgroup"] = subgroup_
df["cluster"] = cluster_
df["cluster_n"] = cluster_n_
for i in range(n_clusters):
tempdf = df[df["cluster_n"]==i].copy()
df.loc[df["cluster_n"]==i, "y"] = np.matmul(tempdf[["intercept", "x1", "x2"]], beta_real[i])
df = df.sort_values(by="cluster")
I’m using globally fitted linear regression coefficients as prior global means, and my model specification is:
# get priors on global means
reg = LinearRegression().fit(df[["x1", "x2"]], df["y"])
intercept = np.round(reg.intercept_, 2)
prior_global_beta_mean = np.round(reg.coef_, 2)
prior_global_beta_mean = np.insert(prior_global_beta_mean, 0, intercept)
# create indices for broadcasting in models
cluster_idx = list(df["cluster_n"])
# map clusters on corresponding groups
cluster_on_group = {c: df[df["cluster"]==c]["group"].unique()[0] for c in df["cluster"].unique()}
group_indices_of_clusters = np.array([x for x in cluster_on_group.values()])
with pm.Model() as hierarchical_model:
# hyperpriors
beta_global = pm.Normal('beta_mu_global', mu=prior_global_beta_mean, sd=10, shape=m+1)
# priors: groups
beta_group = pm.Normal('beta_group', mu=beta_global, sd=5, shape=(n_groups, m+1))
# priors: clusters
beta_cluster = pm.Normal('beta_cluster', mu=beta_group[group_indices_of_clusters,:],
sd=5, shape=(n_clusters, m+1))
eps = pm.HalfCauchy('eps', 5)
nu = pm.Exponential('nu', 1/30)
mu = pm.math.dot(beta_cluster[cluster_idx,:], df[["intercept", "x1", "x2"]].T)
y_pred = pm.StudentT('y_pred',
mu=mu,
sd=eps, nu=nu, observed=df["y"])
trace_hmlm = pm.sample(2000, cores=2, chains=2)
I guess I’m not telling PyMC what I want to say, but I can’t see where the error is. Any help will be appreciated.