I previously asked about improving model sampling speed fitting about 14K observations to about 200 groups.
I am fitting a hierarchical model with Negative Binomial(alpha, mu) distributed likelihood (observations underneath level_2). The priors for the parameters alpha and mu (level_2) are Gamma(hyper_alpha_mu, hyper_alpha_sd) and Gamma(hyper_mu_mu, hyper_mu_sd) and the priors for the two levels above (level_1, level_0) are weakly informative HalfNormal distributions.
I am now fitting approximately 120K observations with 2000 classes at level_2 and 7 classes at level_1. The observations are count data on the order of magnitude of 10^5 - 10^7.
On a modestly powered laptop, the model goes through about 40 iterations a second during the initialization and it takes 3 hours to achieve 3000 samples using NUTS. When I try the sampling on a more powerful AWS instance, the sampling would take even longer.
Is there anything I can change in the model or the data to improve sampling speed?
See below for model specification.
Here is the dataset.
anonymized_dataset.csv (3.0 MB)
import pandas as pd
import pymc3 as pm
def create_association(df, parent_col, child_col):
renamed = df.rename(
columns={parent_col: 'parent', child_col: 'child'}
)
association = renamed[['parent', 'child']].drop_duplicates()
association.set_index('child', inplace=True)
association.sort_index(inplace=True)
return association
data = pd.read_csv('anonymized_dataset.csv')
associations = {
'level_0': {'parent_col': 'level_0', 'child_col': 'level_1'},
'level_1': {'parent_col': 'level_1', 'child_col': 'level_2'},
'level_2': {'parent_col': 'level_2', 'child_col': 'observation'}
}
for level in associations:
associations[level] = create_association(data, **associations[level])
print('number of groups at level_0: ', len(associations['level_0']['parent'].unique()))
print('number of associations of level_1 to level_0: ', len(associations['level_0']['parent'].values))
print('number of groups at level_1: ', len(associations['level_1']['parent'].unique()))
print('number of associations of level_2 to level_1: ', len(associations['level_1']['parent'].values))
print('number of groups at level_2: ', len(associations['level_2']['parent'].unique()))
print('number of observations associated to level_2: ', len(associations['level_2']['parent'].values))
observed = pd.merge(
associations['level_2'], data[['observation', 'counts']],
left_index=True, right_on='observation'
)['counts']
with pm.Model() as model:
hyper_hyper_alpha_mu = pm.HalfNormal(
'hyper_hyper_alpha_mu',
sd=10,
shape=len(associations['level_0']['parent'].unique())
)
hyper_hyper_alpha_sd = pm.HalfNormal(
'hyper_hyper_alpha_sd',
sd=10,
shape=len(associations['level_0']['parent'].unique())
)
hyper_hyper_mu_mu = pm.HalfNormal(
'hyper_hyper_mu_mu',
sd=10,
shape=len(associations['level_0']['parent'].unique())
)
hyper_hyper_mu_sd = pm.HalfNormal(
'hyper_hyper_mu_sd',
sd=10,
shape=len(associations['level_0']['parent'].unique())
)
hyper_alpha_mu = pm.HalfNormal(
'hyper_alpha_mu',
hyper_hyper_alpha_mu[associations['level_0']['parent'].values],
shape=len(associations['level_1']['parent'].unique())
)
hyper_alpha_sd = pm.HalfNormal(
'hyper_alpha_sd',
hyper_hyper_alpha_sd[associations['level_0']['parent'].values],
shape=len(associations['level_1']['parent'].unique())
)
hyper_mu_mu = pm.HalfNormal(
'hyper_mu_mu',
hyper_hyper_mu_mu[associations['level_0']['parent'].values],
shape=len(associations['level_1']['parent'].unique())
)
hyper_mu_sd = pm.HalfNormal(
'hyper_mu_sd',
hyper_hyper_mu_sd[associations['level_0']['parent'].values],
shape=len(associations['level_1']['parent'].unique())
)
mu = pm.Gamma(
'mu',
mu=hyper_mu_mu[associations['level_1']['parent'].values],
sd=hyper_mu_sd[associations['level_1']['parent'].values],
shape=len(associations['level_2']['parent'].unique())
)
alpha = pm.Gamma(
'alpha',
mu=hyper_alpha_mu[associations['level_1']['parent'].values],
sd=hyper_alpha_sd[associations['level_1']['parent'].values],
shape=len(associations['level_2']['parent'].unique())
)
y_est = pm.NegativeBinomial(
'y_est',
mu=mu[associations['level_2']['parent'].values],
alpha=alpha[associations['level_2']['parent'].values],
observed=observed
)
with model:
hierarchical_trace = pm.sample(3000, progressbar=True)