In performing some diagnostics on a model, I realized a gap in my conceptual understanding.
# Observed values that are passed to the mixture model as its data.
data_array = np.array(
    [
        1.726184,
        1.567740,
        1.637396,
        1.584314,
    ]
)
def build_model_and_return_stats(obs_data, *, random_seed=None):
    """Fit a two-component Normal mixture and compare predictive medians.

    Builds the mixture model, samples the posterior with ``pm.sample``, and
    then draws prior predictive and posterior predictive samples of the
    observed variable ``'like'``.

    Args:
        obs_data: Observed values passed to ``pm.Mixture`` as ``observed``.
        random_seed: Optional seed forwarded to all three sampling calls so
            repeated runs produce the same medians. ``None`` (the default)
            leaves the samplers unseeded.

    Returns:
        A tuple ``(prior_median, posterior_median)``: the median of all prior
        predictive draws of ``'like'`` and the median of all posterior
        predictive draws of ``'like'``, each flattened over every dimension.
    """
    with pm.Model() as model_mix:
        # Priors for the first mixture component.
        mu1 = pm.Normal('mu1', mu=1.73, sigma=0.035)
        sigma1 = pm.HalfNormal('sigma1', .03)

        # Priors for the second mixture component.
        mu2 = pm.Normal('mu2', mu=1.85, sigma=0.025)
        sigma2 = pm.HalfNormal('sigma2', .02)

        # Unregistered component distributions consumed by pm.Mixture.
        norm1 = pm.Normal.dist(mu=mu1, sigma=sigma1)
        norm2 = pm.Normal.dist(mu=mu2, sigma=sigma2)

        # Dirichlet(1, 1) prior on the two mixture weights.
        w = pm.Dirichlet('w', a=np.array([1, 1]))
        pm.Mixture('like', w=w, comp_dists=[norm1, norm2], observed=obs_data)

        trace_mix = pm.sample(random_seed=random_seed)

        # Attach both predictive groups to the same inference-data object so
        # they can be read back by group name below.
        trace_mix.extend(pm.sample_prior_predictive(random_seed=random_seed))
        trace_mix.extend(
            pm.sample_posterior_predictive(trace_mix, random_seed=random_seed)
        )

    # Flatten every draw of 'like' into a 1-D array before taking the median.
    prior_pred = trace_mix['prior_predictive']['like'].to_numpy().reshape(-1)
    post_pred = trace_mix['posterior_predictive']['like'].to_numpy().reshape(-1)
    return np.median(prior_pred), np.median(post_pred)
# Fit the model to the observed data and report both predictive medians.
prior_pred_frag, post_pred_frag = build_model_and_return_stats(data_array)
print(prior_pred_frag)
print(post_pred_frag)
# Relative shift of the posterior predictive median with respect to the prior
# predictive median, scaled by 100 so the number matches the "%" label.
# A positive value means the posterior predictive median is lower.
print(f'% change: {100 * (1 - post_pred_frag / prior_pred_frag):.2f}')
We see that just 4 data points make the median of the posterior predictive distribution 6.2% lower than the median of the prior predictive distribution. This seems like a pretty drastic influence from just 4 points.
That got me thinking about what the actual likelihood distribution would look like. Since there are only 4 data points, does that mean that all values that don't appear among these 4 will have a 0% probability? Is there a way to visualize what the likelihood distribution looks like? If we had 100 points, how would our distribution and the conclusions change?