# Encode each player name as an integer code; `players` holds the unique names.
player_idxs, players = pd.factorize(stats.PLAYER)

# Coordinate labels for the model's named dimensions.
n_obs = len(player_idxs)
coords = dict(
    player=players,  # 464 unique players
    obs_id=np.arange(n_obs),  # 6899 rows
)

# Preview the columns of interest.
stats[['PLAYER', 'EXPECTED', 'OBSERVED', 'RESIDUAL']].head()
From this data I would like to be able to get the posterior distribution of the residuals for a given player.
# Per-player model of the residuals.
# NOTE(review): despite the model's name, `mu_a` and `sigma_a` below are given
# fixed, independent priors for every player; no shared hyperprior ties the
# players together. Confirm whether pooling across players was intended.
with pm.Model(coords=coords) as partially_pooled_model:
    # Integer code of the player that each observation row belongs to.
    player_idx = pm.Data("player_idx", player_idxs, dims="obs_id")

    # Independent priors for each player's mean and standard deviation.
    mu_a = pm.Normal("mu_a", mu=0.0, sigma=7, dims="player")
    sigma_a = pm.HalfNormal("sigma_a", 5.0, dims="player")

    # Data likelihood: one entry per observation row. The length comes from
    # the "obs_id" coordinate instead of a separate DataFrame column, so it
    # always agrees with `player_idx`, and the named dimension lets the
    # observations of a single player be selected by their row indices.
    y_pred = pm.Normal(
        "y_pred",
        mu=mu_a[player_idx],
        sigma=sigma_a[player_idx],
        observed=stats.RESIDUAL,
        dims="obs_id",
    )

    # Sampling runs inside the model context so no explicit `model=` is needed.
    posterior = pm.sample()
    posterior_pred = pm.sample_posterior_predictive(posterior)
The shape of `posterior_pred['y_pred']` is (4000, 6899). To get the posterior distribution for a single player, would I have to iterate through the 4000 arrays and collect the values at every index that corresponds to that player? For example, the indices for the player “Joel Embiid” in the `stats` DataFrame are:
[0, 179, 674, 1087, 1399, 1520, 1842, 2280, 2442, 2777, 3090, 3461, 3676, 4082, 4394, 4694, 4988, 5866, 6124, 6397, 6628]