Hi! I’m trying to train a model very similar to what @AlexAndorra described in his blog post when he modeled latent presidential popularity across time using a random walk. The model definition is described below. However, the training set train_df
has around 27 million rows and even trying to do pm.sample_prior_predictive()
fails. Are there any approaches that I should consider? In a previous question, @ckrapu mentioned that he:
Not sure if this is a viable option or if there are any other references out there on how to handle large datasets.
with pm.Model(coords=COORDS) as hierarchical_model:
baseline = pm.Normal("baseline", mu=1, sigma=0.9)
artifact_effect = pm.Normal("artifact_effect", sigma=0.15, dims="artifact")
region_effect = pm.Normal("region_effect", sigma=0.15, dims="region")
holiday_effect = pm.Normal("holiday_effect", sigma=0.15, dims=("region", "holiday"))
dow_effect = pm.Normal("dow_effect", sigma=0.15, dims=("region", "dow"))
woy_effect = pm.Normal("woy_effect", sigma=0.15, dims=("region", "woy"))
# need the cumsum parametrization to properly control the init of the GRW
region_rw_init = pt.zeros(shape=(len(COORDS["artifact"]), len(COORDS["region"]), 1))
region_drifts = pm.Normal(
"region_drifts",
sigma=0.05,
dims=("artifact", "region", "week_origin"),
)
region_rw_innovations = pm.Normal(
"region_rw_innovations",
dims=("artifact", "region", "week_minus_origin"),
)
region_rw_raw = pm.Deterministic(
"region_rw_raw",
pt.cumsum(pt.concatenate([region_rw_init, region_drifts + region_rw_innovations], axis=-1), axis=-1),
dims=("artifact", "region", "week")
)
region_sd = pm.HalfNormal("region_shrinkage_pop", 0.1)
artifact_region_week_effect = pm.Deterministic(
"region_week_effect",
region_rw_raw * region_sd,
dims=("artifact", "region", "week")
)
expected_qty = pm.math.exp(
baseline
+ artifact_effect[artifact_id]
+ region_effect[region_id]
+ holiday_effect[region_id, holiday_id]
+ dow_effect[region_id, dow_id]
+ woy_effect[region_id, woy_id]
+ artifact_region_week_effect[artifact_id, region_id, week_id]
)
α = pm.Exponential("alpha", 10)
# Negative Binomial likelihood
qty = pm.NegativeBinomial("qty", mu=expected_qty, alpha=α, observed=train_df.qty.values, dims='observation')