Sorry for the slow response. I think I have something that works better. It exploits the fact that you have multiple observations for each combination of book and rating value.
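The key trick is to collapse identical (book_id, rating) rows into counts and weight each category log-probability by its count, so the likelihood only needs one term per unique combination instead of one per observation. A toy illustration of just the aggregation step (made-up data, not the model below):

import pandas as pd

toy = pd.DataFrame({"book_id": [0, 0, 0, 1, 1], "rating": [3, 3, 5, 1, 1]})
counts = toy.groupby(["book_id", "rating"]).size().to_frame("count").reset_index()
print(counts)
# Five rows collapse to three (book_id, rating) pairs; the model then multiplies each
# log-probability by its count instead of repeating the likelihood term.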
Full code:
import arviz as az
import numpy as np
import pandas as pd
import pymc as pm
import pytensor.tensor as pt
from pymc.sampling.jax import sample_numpyro_nuts

def calc_rating(book_id_intercept):
    b01, b02, b03, b04 = -1, -0.5, 0.5, 1
    # Calculate log odds
    logodds1 = book_id_intercept - b01
    logodds2 = book_id_intercept - b02
    logodds3 = book_id_intercept - b03
    logodds4 = book_id_intercept - b04
    # Convert log odds to cumulative probabilities
    inv_logit = lambda logit: np.exp(logit) / (1 + np.exp(logit))
    prob_2to5 = inv_logit(logodds1)
    prob_3to5 = inv_logit(logodds2)
    prob_4to5 = inv_logit(logodds3)
    prob_5 = inv_logit(logodds4)
    # Calculate individual category probabilities
    prob_1 = 1 - prob_2to5
    prob_2 = prob_2to5 - prob_3to5
    prob_3 = prob_3to5 - prob_4to5
    prob_4 = prob_4to5 - prob_5
    return np.random.choice([1, 2, 3, 4, 5], p=[prob_1, prob_2, prob_3, prob_4, prob_5])

np.random.seed(42)
num_books = 200 # Only 200 to test quicker, but you can try with the full data size
df_agg = pd.DataFrame({
    'book_id': [i for i in range(num_books)],
    'a': np.random.normal(0, 1, num_books),
    'n': np.random.exponential(20, num_books).astype('int') + 1
})
df = df_agg.loc[df_agg.index.repeat(df_agg['n'])].reset_index(drop=True).drop(columns='n')
df['rating'] = df['a'].apply(calc_rating)
# Count number of observations for every book_id and rating
df_agg = df.groupby(["book_id", "rating"]).size().to_frame("count").reset_index()

# Create the model, using a Potential so we can weight by the counts (a CustomDist could be used too)
with pm.Model() as model:
    abar = pm.Normal("abar", 0, 1)
    sigma = pm.Gamma("sigma", alpha=2, beta=5)  # Changed it so the prior doesn't put its mode at zero
    za = pm.Normal("za", 0, 1, shape=df["book_id"].nunique())
    a = abar + za * sigma  # Avoid Deterministics
    cutpoints = pm.Normal(
        "cutpoints", mu=0, sigma=1, shape=4,
        transform=pm.distributions.transforms.ordered,
        initval=np.linspace(-1, 1, 4)
    )
    eta = a[df_agg["book_id"].to_numpy()]
    cumulative_probs = pm.math.sigmoid(cutpoints - pt.shape_padright(eta))
    p = pt.concatenate(
        [
            pt.shape_padright(cumulative_probs[..., 0]),
            cumulative_probs[..., 1:] - cumulative_probs[..., :-1],
            pt.shape_padright(1 - cumulative_probs[..., -1]),
        ],
        axis=-1,
    )
    pm.Potential(
        "likelihood",
        df_agg["count"].to_numpy() * pm.Categorical.logp(df_agg["rating"].to_numpy() - 1, p)
    )
    idata = sample_numpyro_nuts(draws=1000, tune=1000, chains=2, random_seed=1234)

idata.posterior["a"] = idata.posterior["abar"] + idata.posterior["za"] * idata.posterior["sigma"]
az.plot_trace(idata, var_names="a");
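As an optional sanity check (just a suggestion on top of the code above, not part of the model itself), you can compare the posterior means of a against the true per-book intercepts used to simulate the data; the correlation should be close to 1:

# Compare posterior means of `a` with the simulated intercepts (purely a check)
true_a = df.groupby("book_id")["a"].first().values
post_a = idata.posterior["a"].mean(dim=("chain", "draw")).values
print(np.corrcoef(true_a, post_a)[0, 1])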
Edit
Last updated: Mon Oct 02 2023
Python implementation: CPython
Python version : 3.10.10
IPython version : 8.12.0
numpy : 1.24.2
pymc : 5.8.2
pandas : 1.5.3
pytensor: 2.16.3
arviz : 0.15.1
Watermark: 2.4.3
Edit 2
For the record, on my computer with 1,000 books, the original model takes about 10 minutes to sample, while the new implementation takes about 2 minutes 15 seconds.
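Aside: in case it helps to see the cumulative-logit math that both calc_rating and the model's p block implement, here is a small standalone numpy version (illustration only, not part of the model; the cutpoint values are the ones used in the simulation):

import numpy as np

def ordered_probs(eta, cutpoints):
    # P(rating <= k) = sigmoid(cutpoint_k - eta), same as sigmoid(cutpoints - eta) in the model
    cum = 1 / (1 + np.exp(-(cutpoints - eta)))
    # First category, interior differences, last category -- mirrors the pt.concatenate block
    return np.concatenate([[cum[0]], np.diff(cum), [1 - cum[-1]]])

probs = ordered_probs(0.3, np.array([-1, -0.5, 0.5, 1]))
print(probs, probs.sum())  # five category probabilities that sum to 1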