I think you might be running into compilation time issues.
You can avoid all recompilation if you use nutpie to sample:
import pymc as pm
import numpy as np
import pytensor.tensor as pt
import nutpie
dataBaseline = np.random.randn(8000)
dataTreatment = np.random.randn(8000)
with pm.Model() as model:
    dataBaseline = pm.Data("dataBaseline", dataBaseline)
    dataTreatment = pm.Data("dataTreatment", dataTreatment)

    # Priors for the mean of treatment and baseline groups
    muBaseline = pm.Normal('muBaseline', mu=pt.mean(dataBaseline), sigma=10)
    muTreatment = pm.Normal('muTreatment', mu=pt.mean(dataTreatment), sigma=10)
    # Priors for the standard deviation of treatment and baseline groups
    sigmaTreatment = pm.HalfNormal('sigmaTreatment', sigma=10)
    sigmaBaseline = pm.HalfNormal('sigmaBaseline', sigma=10)

    Treatment_likelihood = pm.Normal('Treatment_likelihood', mu=muTreatment, sigma=sigmaTreatment, observed=dataTreatment)
    Baseline_likelihood = pm.Normal('Baseline_likelihood', mu=muBaseline, sigma=sigmaBaseline, observed=dataBaseline)

    diff_means = pm.Deterministic('diff_means', muTreatment - muBaseline)
# Compile once (takes about 8s on my machine)
compiled = nutpie.compile_pymc_model(model)

# Run with some new dataset (takes about 80ms)
compiled_data1 = compiled.with_data(dataBaseline=np.random.randn(1000), dataTreatment=np.random.randn(1000))
tr = nutpie.sample(compiled_data1)
You can then change the dataset in the loop and resample without having to recompile.
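For example, the resampling loop could look roughly like this (just a sketch; the datasets dict mapping a label to a pair of arrays is hypothetical):

# Reuse the compiled model for each new dataset; only sampling runs here
traces = {}
for label, (baseline, treatment) in datasets.items():
    compiled_with_data = compiled.with_data(dataBaseline=baseline, dataTreatment=treatment)
    traces[label] = nutpie.sample(compiled_with_data)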
Ignoring the technical side of how to do this, maybe it is worth thinking a bit about why you want to do this in the first place. I don’t think this is wrong in any way, but it is a bit unusual.
If you are interested, for instance, in how the treatment effect changes over time, would it not be better to fit the whole dataset but allow the treatment effect to vary?
For example, you could do something like this:
import pymc as pm
import numpy as np
import pytensor.tensor as pt
import pandas as pd
data = pd.DataFrame({
    "week": ["week1", "week1", "week1", "week2", "week2"],
    "batch": list(range(5)),
    "group": ["treatment", "baseline", "baseline", "treatment", "baseline"],
    "yield_value": np.random.rand(5) * 100,
})
week_idx, weeks = pd.factorize(data["week"])
is_treatment = (data["group"] == "treatment").values
coords = {
    "week": weeks,
}
with pm.Model(coords=coords) as model:
    # What is the mean of the baseline value over time?
    intercept = pm.Normal("intercept", sigma=10)

    # How much does the baseline differ between weeks?
    week_effect_sd = pm.HalfNormal("week_effect_sd", sigma=3)

    # How much does each individual week's baseline differ from the mean?
    # This uses a non-centered parametrization
    week_effect_centered = pm.ZeroSumNormal("week_effect_centered", dims="week")
    week_effect = pm.Deterministic(
        "week_effect",
        week_effect_sd * week_effect_centered,
        dims="week",
    )
    # This would be a centered parametrization. Sometimes one of those
    # samples faster and more stably, sometimes the other
    #week_effect = pm.ZeroSumNormal("week_effect", sigma=week_effect_sd, dims="week")

    # How much bigger is the treatment value on average?
    treatment_effect = pm.Normal("treatment_effect", sigma=5)

    # How does the per-week value differ from that average?
    week_treatment_effect_sd = pm.HalfNormal("week_treatment_effect_sd", sigma=3)
    week_treatment_effect_centered = pm.ZeroSumNormal("week_treatment_effect_centered", dims="week")
    week_treatment_effect = pm.Deterministic(
        "week_treatment_effect",
        week_treatment_effect_sd * week_treatment_effect_centered,
        dims="week",
    )
    mu = (
        intercept
        + week_effect[week_idx]
        + is_treatment * treatment_effect
        + is_treatment * week_treatment_effect[week_idx]
    )

    sigma = pm.HalfNormal("sigma")
    pm.Normal("yield_value", mu=mu, sigma=sigma, observed=data["yield_value"])
If your observations are percentages, maybe you also want to change the likelihood?
Assuming the yield is always strictly between 0 and 100, you could for instance use
sigma = pm.HalfNormal("sigma")
pm.LogitNormal("yield_ratio", mu=mu, sigma=sigma, observed=data["yield_value"] / 100)