Summary
I am seeing an unexpected difference in posterior estimates when sampling from two hierarchical linear models that, as far as I understand, should be equivalent.
Non-vectorized model
In the first model, let’s call it the “non-vectorized” model, I am defining a separate slope parameter for each predictor, e.g.
beta1 = pm.Normal("beta1",...)
beta2 = pm.Normal("beta2",...)
...
as well as separate “across-groups” variabilities of the slopes, e.g.
tau1 = pm.Gamma("tau1",...)
tau2 = pm.Gamma("tau2",...)
...
and group-specific slopes
epsilon_beta1_g = pm.ZeroSumNormal("epsilon_beta1_g", dims="group")
beta1_g = beta1 + tau1 * epsilon_beta1_g
...
and the linear model specification is then
mu = beta1_g[g] * x1 + beta2_g[g] * x2 + ...
Vectorized model
In the second model, let’s call it the “vectorized” model, I am defining vectors of slopes and across-group slope variabilities as well as matrices (2d arrays) of (group, predictor)-specific slopes, e.g.
beta = pm.Normal("beta", ..., dims="predictor")
tau = pm.Gamma("tau", ..., dims="predictor")
epsilon_beta_g = pm.ZeroSumNormal("epsilon_beta_g",...,dims=["group", "predictor"])
beta_g = beta + tau * epsilon_beta_g
mu = (beta_g[g, :] * x).sum(axis=1)
...
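As a quick shape check of that last line (a toy sketch with made-up numbers, not taken from the data below): with N observations, G groups and P predictors, beta_g has shape (G, P), g has shape (N,) and x has shape (N, P), so the row-wise product-and-sum yields a length-N vector.
import numpy as np

# Toy shapes: N observations, G groups, P predictors.
N, G, P = 6, 2, 3
beta_g = np.arange(G * P, dtype=float).reshape(G, P)  # (G, P) group-specific slopes
g = np.array([0, 0, 0, 1, 1, 1])                      # (N,) group index per observation
x = np.ones((N, P))                                    # (N, P) predictor matrix
mu = (beta_g[g, :] * x).sum(axis=1)                    # row-wise dot product -> shape (N,)
print(mu.shape)  # (6,)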
Priors for the weakly informative setting
I am in a weakly informative setting, i.e. the likelihood does not provide a lot of evidence about the group-level parameters beta_g, but I am using a large number of groups for pooling in order to obtain (reasonably) informed estimates of my slopes (beta) and slope variabilities (tau). I am also using Gamma priors that restrict the range of a-priori likely values of tau in order to avoid funnel problems, as described for example in the excellent piece by M. Betancourt, cf. Hierarchical Modeling.
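As a quick check of what this prior implies, one can draw from it directly; the sketch below uses the Gamma(mu=10, sigma=5) prior that appears in the full code further down.
import numpy as np
import pymc as pm

# Inspect the bulk range of the Gamma(mu=10, sigma=5) prior placed on the tau parameters.
tau_prior = pm.Gamma.dist(mu=10.0, sigma=5.0)
draws = pm.draw(tau_prior, draws=10_000, random_seed=42)
print(np.percentile(draws, [3, 50, 97]))  # roughly 3, 9, 21 -> keeps tau away from zero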
Results: Qualitatively different posterior estimates
I’d expect the two model specifications to result in qualitatively similar posterior estimates (given identical data), but they actually differ a lot.
For the non-vectorized specification I obtain the following summaries, which seem to reasonably bracket the parameters of my data-generating process:
mean hdi_3% hdi_97% ess_bulk ess_tail r_hat
beta1 1.080 0.667 1.516 4046.0 2704.0 1.0
beta2 1.254 0.835 1.655 3377.0 2579.0 1.0
beta3 0.798 0.361 1.217 3442.0 2399.0 1.0
tau1 9.534 8.947 10.138 1255.0 1602.0 1.0
tau2 12.000 11.312 12.670 944.0 1612.0 1.0
tau3 7.744 7.140 8.318 1207.0 1573.0 1.0
sigma 99.930 99.657 100.255 3464.0 2603.0 1.0
For the vectorized specification, the estimate of tau[x3] comes out much too low (about 3.2 instead of roughly 8 in the data-generating process), and the effective sample sizes are significantly reduced:
mean hdi_3% hdi_97% ess_bulk ess_tail r_hat
beta[x1] 1.121 0.413 1.810 1148.0 1662.0 1.00
beta[x2] 1.197 0.390 2.068 827.0 1381.0 1.01
beta[x3] 0.850 0.389 1.305 2692.0 2814.0 1.00
tau[x1] 11.734 10.829 12.624 931.0 1608.0 1.00
tau[x2] 15.375 14.477 16.349 931.0 1507.0 1.00
tau[x3] 3.179 2.306 4.110 689.0 1167.0 1.01
sigma 100.256 99.938 100.547 4403.0 2917.0 1.00
Vectorized case: Choosing the right model specification
I am wondering whether pm.ZeroSumNormal is correctly configured here, i.e. does the above definition enforce a sum of zero across the “group” dimension and not across the “predictor” dimension? I tried n_zerosum_axes=2, but that did not “improve” the posterior estimates.
Also, I am not sure whether the specification mu = (beta_g[g, :] * x).sum(axis=1) is correct (and efficient) for a hierarchical linear regression model with a multivariate predictor x.
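One way to check this empirically, outside of the two models, is to draw from an unregistered ZeroSumNormal with the same (group, predictor) layout and see along which axis the draws actually sum to zero. A minimal sketch (not part of the original models, shapes chosen arbitrarily):
import numpy as np
import pymc as pm

# Stand-in for epsilon_beta_g: a (group, predictor)-shaped draw with default settings.
z = pm.ZeroSumNormal.dist(shape=(1000, 3))
draw = pm.draw(z, random_seed=42)
print(np.abs(draw.sum(axis=0)).max())  # how far the sums across the "group" axis are from zero
print(np.abs(draw.sum(axis=1)).max())  # how far the sums across the "predictor" axis are from zero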
I am attaching the full code examples below. Note that at least 1000 groups are needed to observe the qualitative effect, due to the weak information in each group.
Non-vectorized model - full code
import time
import pytensor
pytensor.config.gcc__cxxflags += " -isysroot /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk"
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
def generate_data(num_groups: int = 5, num_samples_per_group: int = 20) -> pd.DataFrame:
    rng = np.random.default_rng(seed=42)
    # True population-level slopes
    beta_1 = 1
    beta_2 = 1.2
    beta_3 = 0.8
    # True across-group slope standard deviations
    tau_1 = 10
    tau_2 = 12
    tau_3 = 8
    # Observation noise
    sigma = 100
    result = {"group": [], "y": [], "x1": [], "x2": [], "x3": []}
    for i in range(num_groups):
        # Group-specific slopes
        beta_1g = rng.normal(loc=beta_1, scale=tau_1)
        beta_2g = rng.normal(loc=beta_2, scale=tau_2)
        beta_3g = rng.normal(loc=beta_3, scale=tau_3)
        x1 = rng.normal(size=num_samples_per_group)
        x2 = rng.normal(size=num_samples_per_group)
        x3 = rng.normal(size=num_samples_per_group)
        eps = rng.normal(size=num_samples_per_group)
        y = beta_1g * x1 + beta_2g * x2 + beta_3g * x3 + sigma * eps
        result["group"] += [i] * num_samples_per_group
        result["y"] += y.tolist()
        result["x1"] += x1.tolist()
        result["x2"] += x2.tolist()
        result["x3"] += x3.tolist()
    return pd.DataFrame(result)
def make_model(frame: pd.DataFrame) -> pm.Model:
    coords = {"group": frame["group"].unique()}
    with pm.Model(coords=coords) as model:
        # Data
        g = pm.MutableData("g", frame["group"])
        x1 = pm.MutableData("x1", frame["x1"])
        x2 = pm.MutableData("x2", frame["x2"])
        x3 = pm.MutableData("x3", frame["x3"])
        y = pm.ConstantData("y", frame["y"])
        # Population level
        beta1 = pm.Normal("beta1", sigma=2.0)
        beta2 = pm.Normal("beta2", sigma=2.0)
        beta3 = pm.Normal("beta3", sigma=2.0)
        tau1 = pm.Gamma("tau1", mu=10.0, sigma=5.0)
        tau2 = pm.Gamma("tau2", mu=10.0, sigma=5.0)
        tau3 = pm.Gamma("tau3", mu=10.0, sigma=5.0)
        sigma = pm.HalfNormal("sigma", sigma=200.0)
        # Group level
        epsilon_beta1_g = pm.ZeroSumNormal("epsilon_beta1_g", dims="group")
        beta1_g = beta1 + tau1 * epsilon_beta1_g
        epsilon_beta2_g = pm.ZeroSumNormal("epsilon_beta2_g", dims="group")
        beta2_g = beta2 + tau2 * epsilon_beta2_g
        epsilon_beta3_g = pm.ZeroSumNormal("epsilon_beta3_g", dims="group")
        beta3_g = beta3 + tau3 * epsilon_beta3_g
        # Linear model
        mu = beta1_g[g] * x1 + beta2_g[g] * x2 + beta3_g[g] * x3
        # Likelihood
        pm.Normal("y_obs", mu=mu, sigma=sigma, observed=y, shape=mu.shape)
    return model
if __name__ == "__main__":
    frame = generate_data(num_groups=1000, num_samples_per_group=200)
    model = make_model(frame)
    t0 = time.time()
    with model:
        trace = pm.sample(nuts_sampler="nutpie")
    t = time.time() - t0
    print(f"Time for sampling is {t=:.3f}s.")
    var_names = ["beta1", "beta2", "beta3", "tau1", "tau2", "tau3", "sigma"]
    summary = az.summary(trace, var_names=var_names).loc[:, ["mean", "hdi_3%", "hdi_97%", "ess_bulk", "ess_tail", "r_hat"]]
    print(summary)
Vectorized model - full code
import time
import pytensor
pytensor.config.gcc__cxxflags += " -isysroot /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk"
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
def generate_data(num_groups: int = 5, num_samples_per_group: int = 20) -> pd.DataFrame:
    rng = np.random.default_rng(seed=42)
    # True population-level slopes
    beta_1 = 1
    beta_2 = 1.2
    beta_3 = 0.8
    # True across-group slope standard deviations
    tau_1 = 10
    tau_2 = 12
    tau_3 = 8
    # Observation noise
    sigma = 100
    result = {"group": [], "y": [], "x1": [], "x2": [], "x3": []}
    for i in range(num_groups):
        # Group-specific slopes
        beta_1g = rng.normal(loc=beta_1, scale=tau_1)
        beta_2g = rng.normal(loc=beta_2, scale=tau_2)
        beta_3g = rng.normal(loc=beta_3, scale=tau_3)
        x1 = rng.normal(size=num_samples_per_group)
        x2 = rng.normal(size=num_samples_per_group)
        x3 = rng.normal(size=num_samples_per_group)
        eps = rng.normal(size=num_samples_per_group)
        y = beta_1g * x1 + beta_2g * x2 + beta_3g * x3 + sigma * eps
        result["group"] += [i] * num_samples_per_group
        result["y"] += y.tolist()
        result["x1"] += x1.tolist()
        result["x2"] += x2.tolist()
        result["x3"] += x3.tolist()
    return pd.DataFrame(result)
def make_model(frame: pd.DataFrame) -> pm.Model:
    predictors = [col for col in frame.columns if col.startswith("x")]
    coords = {"group": frame["group"].unique(), "predictor": predictors}
    with pm.Model(coords=coords) as model:
        # Data
        g = pm.MutableData("g", frame["group"])
        x = pm.MutableData("x", frame[predictors])
        y = pm.ConstantData("y", frame["y"])
        # Population level
        beta = pm.Normal("beta", sigma=2.0, dims="predictor")
        tau = pm.Gamma("tau", mu=10.0, sigma=5.0, dims="predictor")
        sigma = pm.HalfNormal("sigma", sigma=200.0)
        # Group level
        epsilon_beta_g = pm.ZeroSumNormal("epsilon_beta_g", dims=["group", "predictor"])
        beta_g = beta + tau * epsilon_beta_g
        # Linear model
        mu = (beta_g[g] * x).sum(axis=1)
        # Likelihood
        pm.Normal("y_obs", mu=mu, sigma=sigma, observed=y, shape=mu.shape)
    return model
if __name__ == "__main__":
    frame = generate_data(num_groups=1000, num_samples_per_group=200)
    model = make_model(frame)
    t0 = time.time()
    with model:
        trace = pm.sample(nuts_sampler="nutpie")
    t = time.time() - t0
    print(f"Time for sampling is {t=:.3f}s.")
    var_names = ["beta", "tau", "sigma"]
    summary = az.summary(trace, var_names=var_names).loc[:, ["mean", "hdi_3%", "hdi_97%", "ess_bulk", "ess_tail", "r_hat"]]
    print(summary)