I’m trying to understand the role of a particular line of code in a model that includes data with missing categorical values. This is taken from the Code 15.30 section of the Statistical Rethinking 2 repo.
import warnings
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
from numpy.random import default_rng
from scipy.special import expit as invlogit
N_houses = 100 # Number of houses to simulate
alpha = 5 # Average number of notes when cat is absent
beta = -3 # Difference in number of notes when cat is present
k_true = 0.5 # Probability of cat present
r = 0.2 # Probability of not knowing whether cat present/absent
cat_true = rng.binomial(n=1, p=k_true, size=N_houses)
notes = rng.poisson(lam=alpha + beta * cat_true, size=N_houses)
R_C = rng.binomial(n=1, p=r, size=N_houses)
cat = cat_true.copy()
cat[R_C == 1] = -9
with pm.Model() as m15_8:
# priors
a = pm.Normal("a", 0, 1)
b = pm.Normal("b", 0, 0.5)
k = pm.Beta("k", 2, 2)
custom_logp = pm.math.logsumexp(
pm.math.log(k)
+ pm.logp(pm.Poisson.dist(pm.math.exp(a + b)), notes[cat == -9])
+ pm.math.log(1 - k)
+ pm.logp(pm.Poisson.dist(pm.math.exp(a)), notes[cat == -9])
)
# Using pm.Potential to add custom term to model logp
notes_RC_1 = pm.Potential("notes|RC==1", custom_logp)
cat_RC_0 = pm.Bernoulli("cat|RC==0", k, observed=cat[cat != -9]) # Note how k will be updated
lam = pm.math.exp(a + b * cat_RC_0)
notes_RC_0 = pm.Poisson("notes|RC==0", lam, observed=notes[cat != -9])
idata_m15_8 = pm.sample(return_inferencedata=True, random_seed=RANDOM_SEED)
az.summary(idata_m15_8, var_names=["a", "b", "k"], round_to=2)
mean sd hdi_5.5% hdi_94.5% mcse_mean mcse_sd ess_bulk ess_tail r_hat
a 1.53 0.07 1.42 1.64 0.0 0.0 3549.82 3093.47 1.0
b -0.93 0.14 -1.16 -0.72 0.0 0.0 3590.55 2995.64 1.0
k 0.45 0.05 0.37 0.53 0.0 0.0 4435.70 2852.56 1.0
My confusion is from notes_RC_1 not being used further in the code. I understand that this is how we model the categorical missing values, but we do we need it if we don’t use it further on?
Also, when we comment out the custom_logp and notes_RC_1 definition, we get different values for b and k (not a). Why is that?