Hi everyone! I am writing again because I am going crazy with this model. I am trying to test the performance of a hierarchical model on a simulated dataset. I create a set of measurements with a hurdle distribution, already described here. Here is the code that builds the simulated dataset:
import numpy as np
import pandas as pd
import pymc3 as pm
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns

def simulate_mixed_data(pi, mean, sigma, tot_size=1000, rs=1, shuffle=True):
    """Simulate a hurdle-lognormal sample: a fraction pi of exact zeros,
    the rest drawn from LogNormal(mean, sigma)."""
    n_zeros = int(tot_size * pi)
    zeros = np.zeros(n_zeros)
    np.random.seed(rs)
    non_zeros = np.random.lognormal(mean=mean, sigma=sigma, size=tot_size - n_zeros)
    mix = np.concatenate((zeros, non_zeros))
    if shuffle:
        np.random.shuffle(mix)
    return mix

def extract_bias(rev_n, biases):
    """Draw one bias value for a data point rated by rater rev_n."""
    return np.random.choice(biases[rev_n])

# true parameters of the real (unbiased) values
pi_real_data = 0.3
mu_real_data = 1.3
sigma_real_data = 0.8
n_raters = 3
n_data = 2000
rs = 2
np.random.seed(rs)
real_values = simulate_mixed_data(pi_real_data, mu_real_data, sigma_real_data, n_data, rs=rs)

# true parameters of each rater's multiplicative bias
mu_bias_data = np.array([0.5, 0.8, 1.2])
sigma_bias_data = np.array([0.8, 0.1, 0.3])
pi_bias_data = np.array([0.08, 0.1, 0.02])
biases = np.array([simulate_mixed_data(pi=pi_bias_data[i], mean=mu_bias_data[i],
                                        sigma=sigma_bias_data[i], tot_size=n_data, rs=rs)
                   for i in range(n_raters)])

# assign a random rater to each real value and apply that rater's bias
df = pd.DataFrame(columns=['real'], data=real_values)
df.loc[:, 'rater'] = np.random.randint(0, n_raters, size=len(real_values))
df.loc[:, 'bias'] = df.loc[:, 'rater'].apply(extract_bias, biases=biases)
df.loc[:, 'biased'] = df.bias * df.real

raters_idxs = df.rater.values
obs_values = df.biased.values
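As a quick (optional) sanity check of the simulation, the zero fraction and the log-moments of real_values should roughly match the parameters above:

print('zero fraction: %.2f (expected %.2f)' % ((real_values == 0).mean(), pi_real_data))
nz_real = real_values[real_values > 0]
print('log-mean: %.2f (expected %.2f), log-sd: %.2f (expected %.2f)'
      % (np.log(nz_real).mean(), mu_real_data, np.log(nz_real).std(), sigma_real_data))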
So basically I have a distribution of real_values rated by 3 different raters, each of which "biases" the set of real_values. I also assume that this bias follows the same kind of hurdle distribution and that it is multiplicative, so observations = real_values * bias. I build the model using the analytical relationship between the variables: the observed values are the product of two hurdle-lognormal variables, each of which is 0 with probability \pi and, with probability 1-\pi, follows a lognormal distribution with log-mean \mu and log-sd \sigma. So, if
\begin{equation}
real\_values \sim HLN(\pi_r, \mu_r, \sigma_r), \qquad bias \sim HLN(\pi_\beta, \mu_\beta, \sigma_\beta)
\end{equation}
then obs (= real\_values \times bias) \sim HLN(\pi_o, \mu_o, \sigma_o)
with
\begin{equation}
\pi_o = 1-(1-\pi_r)(1-\pi_\beta), \qquad
\mu_o = \mu_r + \mu_\beta, \qquad
\sigma_o = \sqrt{\sigma_r^2+\sigma_\beta^2}
\end{equation}
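As a quick numerical check of these relations on the simulated data (using the true parameters and the data frame defined above):

pi_o_true = 1 - (1 - pi_real_data) * (1 - pi_bias_data)
mu_o_true = mu_real_data + mu_bias_data
sigma_o_true = np.sqrt(sigma_real_data**2 + sigma_bias_data**2)
for r in range(n_raters):
    nz = df.loc[(df.rater == r) & (df.biased > 0), 'biased'].values
    zero_frac = (df.loc[df.rater == r, 'biased'] == 0).mean()
    print('rater %d: analytic pi=%.2f mu=%.2f sigma=%.2f | empirical pi=%.2f mu=%.2f sigma=%.2f'
          % (r, pi_o_true[r], mu_o_true[r], sigma_o_true[r],
             zero_frac, np.log(nz).mean(), np.log(nz).std()))

The model below encodes exactly these relationships between the priors and the observation-level parameters.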
with pm.Model() as bias_model:
    positiveNormal = pm.Bound(pm.Normal, lower=0.)
    confinedNormal = pm.Bound(pm.Normal, lower=0.001, upper=.999)
    # priors on real values
    pi_real = pm.Beta('pi_real', alpha=1.5, beta=3., shape=1)
    mu_real = positiveNormal('mu_real', mu=2., sigma=1., shape=1)
    sigma_real = pm.HalfNormal('sigma_real', sigma=1., shape=1)
    # priors on raters' bias
    pi_bias = pm.Beta('pi_bias', alpha=1.5, beta=3., shape=n_raters)
    mu_bias = positiveNormal('mu_bias', mu=2., sigma=1., shape=n_raters)
    sigma_bias = pm.HalfNormal('sigma_bias', sigma=1., shape=n_raters)
    # combination of parameters (product of lognormally distributed variables)
    pi_obs = confinedNormal('pi_obs', mu=1 - ((1 - pi_real) * (1 - pi_bias)), sigma=1., shape=n_raters)
    mu_obs = positiveNormal('mu_obs', mu=mu_real + mu_bias, sigma=1., shape=n_raters)
    sigma_obs = positiveNormal('sigma_obs', mu=pm.math.sqrt(pm.math.sqr(sigma_real) + pm.math.sqr(sigma_bias)),
                               sigma=1., shape=n_raters)
    # Likelihood
    obs = custom_dists.HurdleLognormal('obs', pi=pi_obs[raters_idxs], mu=mu_obs[raters_idxs],
                                       sigma=sigma_obs[raters_idxs], observed=obs_values)
    data = pm.sample(500, chains=3, tune=2000, return_inferencedata=True)
(the custom_dists.HurdleLognormal distribution is defined in my previous post)
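For completeness, such a hurdle-lognormal likelihood can be sketched with pm.DensityDist roughly as follows (the implementation in my previous post may differ in the details; the zero-guard on value only keeps the unused branch of the switch finite):

import theano.tensor as tt

def hurdle_lognormal_logp(pi, mu, sigma):
    # element-wise log-likelihood: P(value = 0) = pi,
    # positive values get weight (1 - pi) times a LogNormal(mu, sigma) density
    def logp(value):
        is_zero = tt.eq(value, 0)
        safe_value = tt.switch(is_zero, 1., value)  # avoid log(0) in the unused branch
        logp_pos = (tt.log1p(-pi)
                    - tt.log(safe_value * sigma * np.sqrt(2 * np.pi))
                    - 0.5 * ((tt.log(safe_value) - mu) / sigma) ** 2)
        return tt.switch(is_zero, tt.log(pi), logp_pos)
    return logp

# usage, in place of custom_dists.HurdleLognormal:
# obs = pm.DensityDist('obs',
#                      hurdle_lognormal_logp(pi_obs[raters_idxs], mu_obs[raters_idxs],
#                                            sigma_obs[raters_idxs]),
#                      observed=obs_values)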
Here is the issue: the model infers the combined parameters (pi_obs, mu_obs, sigma_obs) quite well, even though it seems to learn nothing about their parents (pi/mu/sigma_real and pi/mu/sigma_bias), as you can see from the traceplots.
pi_obs = 1 - ((1 - pi_real_data) * (1 - pi_bias_data))
mu_obs = mu_real_data + mu_bias_data
sigma_obs = np.sqrt(sigma_real_data**2 + sigma_bias_data**2)

lines = [('pi_obs', {}, pi_obs), ('mu_obs', {}, mu_obs), ('sigma_obs', {}, sigma_obs)]
with bias_model:
    az.plot_trace(data, compact=True, var_names=['pi_obs', 'mu_obs', 'sigma_obs'],
                  legend=True, lines=lines)

lines = [('pi_bias', {}, pi_bias_data), ('mu_bias', {}, mu_bias_data), ('sigma_bias', {}, sigma_bias_data)]
with bias_model:
    az.plot_trace(data, compact=True, var_names=['pi_bias', 'mu_bias', 'sigma_bias'],
                  legend=True, lines=lines)

lines = [('pi_real', {}, pi_real_data), ('mu_real', {}, mu_real_data), ('sigma_real', {}, sigma_real_data)]
with bias_model:
    az.plot_trace(data, compact=True, var_names=['pi_real', 'mu_real', 'sigma_real'],
                  legend=True, lines=lines)
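One rough way to check whether these marginals simply mirror the priors is to compare the posterior summaries against the analytical prior moments, e.g. for the Beta(1.5, 3) priors on pi_real and pi_bias:

a, b = 1.5, 3.
print('Beta(1.5, 3) prior: mean=%.2f, sd=%.2f'
      % (a / (a + b), np.sqrt(a * b / ((a + b)**2 * (a + b + 1)))))
print(az.summary(data, var_names=['pi_real', 'pi_bias'], kind='stats'))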
To me it looks like the traces for the real and bias parameters essentially reproduce the priors I set: they are hardly updated at all. Despite this, the model seems to solve the problem, because a posterior predictive check shows that it reproduces the distribution of the observed values quite well, also rater by rater:
with bias_model:
    ppc = pm.sample_posterior_predictive(data, samples=1)

for n_rater in range(n_raters):
    print('rev:%d, pi=%.2f mu=%.2f sigma=%.2f' % (n_rater, 1 - (1 - pi_real_data) * (1 - pi_bias_data[n_rater]),
                                                  mu_real_data + mu_bias_data[n_rater],
                                                  np.sqrt(sigma_real_data**2 + sigma_bias_data[n_rater]**2)))
    idxs = np.where(raters_idxs == n_rater)
    observed = obs_values[idxs]
    reproduced = ppc['obs'][0, idxs].flatten()
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.histplot(observed, label='observed', stat='density')
    sns.histplot(reproduced, label='reproduced', color='orange', alpha=0.4, stat='density')
    print('reproduced proportion of zeros: %.2f' % (len(reproduced[reproduced == 0]) / len(reproduced)))
    non_zeros = reproduced[reproduced > 0]
    print('reproduced mean and sigma of non zeros: mu=%.2f sigma=%.2f' % (np.mean(np.log(non_zeros)),
                                                                          np.std(np.log(non_zeros))))
    plt.xlabel('rates')
    plt.legend()
    plt.title('rater %d' % n_rater)
    plt.show()
Given that I am mostly interested in the bias parameters, and that I'd like to introduce another source of error later, this is a non-negligible issue for me.
Do you have any idea about:
- how it is possible that the model infers the obs parameters with high accuracy while learning nothing about their "parents" (the real and bias parameters) in the hierarchical model?
- how I could solve this issue and get reliable estimates of the parameters I am actually interested in?
Thank you so much if anyone can help!!