Hi all. I’m experiencing a strange issue when trying to predict out-of-sample unobserved data with pm.sample_posterior_predicitve. I have the following model (sorry I cannot provide data atm):
coords = {"year":list(year_look.keys()),
"country":list(country_look.keys()),
"zone":list(zone_look.keys()),
"location":df.index.values,
"feature":["longitude", "latitude"]}
with pm.Model(coords=coords) as mod:
c_idx = pm.ConstantData("country_idx", country_idx, dims="location")
t_idx = pm.ConstantData("year_idx", year_idx, dims="location")
z_idx = pm.ConstantData("zone_idx", zone_idx, dims="location")
X = pm.ConstantData("X", xy_country.T, dims=("country", "feature"))
T = pm.ConstantData("T", years, dims="year")
zeta_z = pm.Normal("zeta_z", 0, 1, dims="zone")
zeta_l = pm.Normal("zeta_l", 0, 1)
zeta_s = pm.HalfNormal("zeta_s", 1)
zeta = pm.Deterministic("zeta", zeta_l + zeta_z*zeta_s, dims="zone") #intercept space
alpha = pm.Normal("alpha", 0, 1)
s = pm.Gamma("s", mu=300, sigma=100)
k = Matern32Chordal(input_dim=2, ls=s)
latent_s = pm.gp.Latent(cov_func=k,)
chi = latent_s.prior("chi", X, dims="country")
l = pm.HalfNormal("l", 1)
c = pm.gp.cov.ExpQuad(input_dim=1, ls=l)
latent_t = pm.gp.Latent(cov_func=c,)
tau = latent_t.prior("tau", T[:,None], dims="year")
psi = pm.Deterministic("psi", pm.math.invlogit(chi[c_idx] + tau[t_idx]))
p = pm.Deterministic("p", pm.math.invlogit(alpha + zeta[z_idx]*areas ))
y = pm.Bernoulli('y', p=p*psi, observed=detected, dims="location")
with mod:
idata = pm.sample(1000, tune=1000, chains=4, cores=12, random_seed=33)
with mod:
z = pm.Bernoulli("z", p=psi)
preds = pm.sample_posterior_predictive(idata, var_names=["psi", "p", "y","z"])
This samples with no issues and the posterior predictive checks are quite okay:
pred_y_year = np.array([pred_y[yea_idx[i]].sum(axis=0) for i in range(len(year_look))])
pred_z_year = np.array([pred_z[yea_idx[i]].sum(axis=0) for i in range(len(year_look))])
pred_y_country = np.array([pred_y[con_idx[i]].sum(axis=0) for i in range(len(country_look))])
pred_z_country = np.array([pred_z[con_idx[i]].sum(axis=0) for i in range(len(country_look))])
samps = np.random.randint(pred_y.shape[1], size=100)
...code for plotting below...etc...
From this model I get the unobserved data predictions as follow (all very standard):
with mod:
mod.add_coords({"year_u":list(year_u_look.keys()), "country_u":list(country_u_look.keys()),
"zone_u":list(zone_u_look.keys()), "location_u":u_idx})
c_u_idx = pm.ConstantData("country_u_idx", country_u_idx, dims="location_u")
t_u_idx = pm.ConstantData("year_u_idx", year_u_idx, dims="location_u")
z_u_idx = pm.ConstantData("zone_u_idx", zone_u_idx, dims="location_u")
Xu = pm.ConstantData("Xu", xy_country_u.T, dims=("country_u", "feature"))
Tu = pm.ConstantData("Tu", years_u, dims="year_u")
zeta_z_u = pm.Normal("zeta_z_u", 0, 1, dims="zone_u")
zeta_u = pm.Deterministic("zeta_u", zeta_l + zeta_s * zeta_z_u, dims="zone_u")
alpha_u = pm.Normal("alpha_u", 0, 1)
chi_u = latent_s.conditional("chi_u", Xu, dims="country_u")
tau_u = latent_t.prior("tau_u", Tu[:,None], dims="year_u")
psi_u = pm.Deterministic("psi_u", pm.math.invlogit(chi_u[c_u_idx] + tau_u[t_u_idx]))
p_u = pm.Deterministic("p_u", pm.math.invlogit(alpha_u + zeta_u[z_u_idx]*areas_u))
z_u = pm.Bernoulli("z_u", p=psi_u)
y_u = pm.Bernoulli('y_u', p=p_u*z_u, dims="location_u")
with mod:
preds_u = pm.sample_posterior_predictive(idata, predictions=True, random_seed=33,
var_names=["y_u", "p_u", "psi_u", "z_u"])
But somehow the predictions seem to be in reverse:
The values should be increasing rather than decreasing, seems that if the shape of the image above (especially over time) is flipped, the predictions would be capturing the data more appropriately.
I know it’s not ideal without example data, I’ll try to come up with a toy example when I can. But, just in case someone has experienced this issue or a similar one before and has some suggestions. Many thanks in advance.