I agree that the model is mis-specificied somehow. If I set p0 and just try to infer p1, it still cannot estimate it properly. I assume I am not mixing them properly in pm.Mixture?
With p0 set to 1e-4, and mixing_fraction fixed, the only free parameter is p1, and that should be fittable from the data. p1 is getting fit to 0.3
%%capture --no-display
scores = simulated_data.reset_index(drop=True)
coords = {"observation": scores.index.values}
for mode in ["fit","predict"]:
model = pm.Model(coords=coords)
with model:
n = pm.ConstantData("n",value=scores["N"],dims="observation")
mix_frac = pm.ConstantData("mix_frac",value=scores["fraction"],dims="observation")
if mode == "fit":
p_type0 = 1e-4
p_type1 = pm.Uniform("p_type1")
observed = scores["observed_counts"]
dims = "observation"
else:
p_type0 = 1e-4
p_type1 = params["p_type1"]
observed = None
dims = None
component_t0 = pm.Binomial.dist(
n = n,
p = p_type0,
)
component_t1 = pm.Binomial.dist(
n = n,
p = p_type1,
)
observed_counts = pm.Mixture(
"counts",
w = pt.stack([1-mix_frac,mix_frac],axis=1),
comp_dists = [
component_t0,
component_t1
],
observed = observed,
)
idata = pm.sample()
pm.sample_posterior_predictive(idata, extend_inferencedata=True)
params = save_model_params(idata)
display(params)
if "fit" in mode:
display(az.plot_ppc(idata))
else:
predicted_counts = list(idata.posterior.data_vars["counts"].mean(axis=(0,1)).astype(int).values+1)
observed_counts = list(scores["observed_counts"])
tf = list(scores["fraction"])
df = scores.copy()
df["observed"] = observed_counts
df["predicted"] = predicted_counts
df["sample_type"] = df["sample_type"].astype(str)
df["obs-pred_sign"] = (((df["observed"] - df["predicted"]))>0).apply(lambda x: 1 if x else -1)
df["obs-pred_value"] = ((df["observed"] - df["predicted"]).abs()+1).apply(np.log10)
df["obs-pred"] = df["obs-pred_sign"] * df["obs-pred_value"]
fig = px.scatter(
data_frame = df,
x = "observed",
y = "predicted",
color = "sample_type",
width = 750,
height = 750,
log_x=True,
log_y=True,
)
fig.add_shape(type="line",line=dict(dash="dash",color="black"),x0=1,y0=1,x1=10_000,y1=10_000)
display(fig)
fig = px.scatter(
data_frame = df,
x = "observed",
y = "fraction",
color = "sample_type",
width = 750,
height = 750,
log_x=True,
log_y=True,
)
display(fig)
plt.figure(figsize=(7,5))
display(sns.distplot(df["obs-pred"]))
display(df["obs-pred"].abs().describe(percentiles=[i/100 for i in range(0,100,10)]))
