How to marginalize latent discrete variables in CDMs (cognitive diagnosis models)?

Hi! I’m trying to marginalize the DINA and higher-order DINA models, but the results look terrible. I think I’m marginalizing them in the wrong way.
DINA is a classification model for educational assessment. See:
https://mc-stan.org/documentation/case-studies/dina_independent.html
The code for the simulated data:

import theano.tensor as tt
import pymc3 as pm
import numpy as np
import itertools
import arviz as az


N = 150   # student count
I = 14    # item count
K = 3     # skill count
# all class
all_class = np.array(list(itertools.product(*[[0,1] for k in range(K)])))
# random sample from all possible class
class_idx = np.random.randint(0,len(all_class),size=N)
student_attribute = all_class[class_idx]
# Q-matrix
Q = np.concatenate([all_class[1:] for i in range(2)],axis=0)
# ideal response
eta = (student_attribute.reshape(N,1,K)>=Q.reshape(1,I,K)).prod(axis=-1)
s = np.random.uniform(0,0.2,(1,I))
g = np.random.uniform(0,0.2,(1,I))
Y = np.random.binomial(1,eta*(1-s-g)+g)

DINA model

The original version

In the original version, the probability of a student belonging to each category has to be converted into a discrete category assignment (a latent Categorical variable).


# ideal response for all classes
all_c_eta = (all_class.reshape(-1,1,K)>=Q.reshape(1,I,K)).prod(axis=-1)
all_c_eta = tt.as_tensor(all_c_eta)
with pm.Model() as DINA:

    student_class_p = pm.Dirichlet("student_class_p",tt.ones(8),shape=(N,8))
    student_class = pm.Categorical("student_class",student_class_p,shape=N)
   
   
    s_sample = pm.Beta("s",alpha=1,beta=1,shape=(1,I))
    g_sample = pm.Beta("g",1,1,shape=(1,I))
    
    # probability of a correct response for every possible attribute-mastery pattern, shape=C*I
    all_c_p = pm.Deterministic("all_class_p",all_c_eta*(1-s_sample-g_sample)+g_sample)
    p_sample = pm.Deterministic("p_sample",all_c_p[student_class])
    
    pm.Bernoulli("Y",p_sample,observed=Y)

with DINA:
    tr = pm.sample(1000,tune=3000,chains=2,return_inferencedata=True)

The posterior of student_class

Marginalized version

Then I tried to marginalize student_class.

P(Y) = \sum_{c=1}^{C} \pi_c P(Y \mid c)

\pi_c is the probability that a student falls into category c (student_class_p in the code), and P(Y \mid c) is the probability of a correct answer given category c (all_class_p in the code).
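For concreteness, here is a small NumPy sketch of what I understand the formula to compute for a single student (the function and argument names are only illustrative and not part of the model code below):

import numpy as np

def marginal_likelihood(y, pi, all_class_p):
    # y: (I,) responses of one student; pi: (C,) class probabilities;
    # all_class_p: (C, I) correct-response probabilities per class
    # P(y | c) = prod_i p_ci^{y_i} * (1 - p_ci)^{1 - y_i}, one value per class
    lik_given_c = np.prod(all_class_p**y * (1 - all_class_p)**(1 - y), axis=1)
    # P(y) = sum_c pi_c * P(y | c)
    return np.sum(pi * lik_given_c)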

all_c_eta = (all_class.reshape(-1,1,K)>=Q.reshape(1,I,K)).prod(axis=-1)
all_c_eta = tt.as_tensor(all_c_eta)
with pm.Model() as DINA:
    # The probability that a student falls into a certain category
    student_class_p = pm.Dirichlet("student_class_p",tt.ones(8),shape=(N,8))
 
   
    s_sample = pm.Beta("s",alpha=1,beta=1,shape=(1,I))
    g_sample = pm.Beta("g",1,1,shape=(1,I))
    
    # Probability of correct answer for all categories
    all_c_p = pm.Deterministic("all_class_p",all_c_eta*(1-s_sample-g_sample)+g_sample)

    p_sample = pm.Deterministic("p_sample",student_class_p.dot(all_c_p))
    
    pm.Bernoulli("Y",p_sample,observed=Y)

The posterior mean of student_class_p


After my attempt at marginalization, the classification seems to work poorly. The probabilities of a student falling into the different categories are all very close to each other.

Higher-order DINA

In addition, I also tried the higher-order DINA model, in which whether a student masters each skill is determined by a higher-order ability:

\text{skill\_p}_{nk} = \mathrm{logit}^{-1}(\lambda_k \theta_n + \lambda_{0k})

where \lambda_{0k} and \lambda_k are the intercept and slope for skill k.

In the original (non-marginalized) version, the probability of mastering each skill has to be converted into a dichotomous variable (skill_p → skill) before the probability of a correct response is calculated. In the marginalized version, the skill-mastery probabilities are instead converted into the probability of the student belonging to each category, and the probability of a correct response is then computed from those category probabilities.
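In NumPy terms, the conversion I mean is roughly the following sketch (assuming the skills are conditionally independent given theta; the names mirror the model below and are only illustrative):

import numpy as np

def class_probs_from_skill_probs(skill_p, all_class):
    # skill_p: (N, K) mastery probabilities; all_class: (C, K) attribute patterns
    # P(class c) = prod_k skill_p_k^{alpha_ck} * (1 - skill_p_k)^{1 - alpha_ck}
    log_p = np.log(skill_p) @ all_class.T + np.log1p(-skill_p) @ (1 - all_class.T)
    return np.exp(log_p)  # (N, C), each row sums to 1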

all_c_eta = (all_class.reshape(-1,1,K)>=Q.reshape(1,I,K)).prod(axis=-1)
all_c_eta = tt.as_tensor(all_c_eta)
with pm.Model() as HODINA:
    # higher-order ability
    theta_sample = pm.Normal("theta",0,1,shape=(N,1))
    # intercept and slope
    lambda_0_sample = pm.Normal("lambda0",0,1,shape=(1,K))
    lambda_k_sample = pm.TruncatedNormal("lambdak",1,1,lower=0,shape=(1,K))
    
    # The probability of mastering a particular skill
    skill_p_sample = pm.Deterministic("skill_p",pm.math.sigmoid(lambda_k_sample*theta_sample+lambda_0_sample))
    
    # Convert the probability that students belong to a 
    # certain skill into the probability that students belong to a certain category
    student_class_p =pm.Deterministic("student_class_p",tt.exp(tt.log(skill_p_sample).dot(all_class.T)
                                        +tt.log(1-skill_p_sample).dot(1-all_class.T)))
    
    s_sample = pm.Beta("s",alpha=1,beta=1,shape=(1,I))
    g_sample = pm.Beta("g",1,1,shape=(1,I))
    
    
    all_p_p = pm.Deterministic("all_pattern_p",all_c_eta*(1-s_sample-g_sample)+g_sample)
    p_sample = pm.Deterministic("p_sample",student_class_p.dot(all_p_p))
    
    pm.Bernoulli("Y",p_sample,observed=Y)

However, the classification results of this model do not look good either: for some students the probabilities of falling into the different categories are very close to each other.

The posterior mean of student_class_p:

However, when I added a residual term to skill_p, the classification results improved:

\text{skill\_p}_{nk} = \mathrm{logit}^{-1}(\lambda_k \theta_n + \lambda_{0k} + \epsilon_{nk})

with pm.Model() as HODINA:
    # higher-order ability
    theta_sample = pm.Normal("theta",0,1,shape=(N,1))
    # intercept and slope
    lambda_0_sample = pm.Normal("lambda0",0,1,shape=(1,K))
    lambda_k_sample = pm.TruncatedNormal("lambdak",1,1,lower=0,shape=(1,K))
    
    # Skill_p residuals
    sig = pm.HalfNormal("sig",2)
    epsilon = pm.Normal("epsilon",0,sig,shape=(N,K))
    
    # The probability of mastering a particular skill
    skill_p_sample = pm.Deterministic("skill_p",pm.math.sigmoid(lambda_k_sample*theta_sample
                                 +lambda_0_sample+epsilon))

    # (the rest of the model definition, i.e. student_class_p, s, g, all_pattern_p,
    # and the Bernoulli likelihood, follows the HODINA model above)

The posterior mean of student_class_p:


And the residuals are very large


In short, I have 2 questions:
Q1: How do I correctly marginalize the latent discrete parameters of the DINA model?
Q2: Why does adding a residual term improve classification performance, and is it necessary to add one when marginalizing? Are there any references on this?

Thanks for your time!
Any help would be appreciated!

I don’t see where you marginalized the categorical variable. You would have to enumerate over all the possibilities and collect the respective conditional probabilities (or use a helper that does this like pm.Mixture), but you seem to have simply dropped that variable?
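As a minimal sketch of what enumerating the possibilities means, here is a toy model with a single binary latent z marginalized out by hand (toy data and names, PyMC3 syntax to match your code; pm.Mixture does essentially this for you):

import numpy as np
import pymc3 as pm
import theano.tensor as tt

y_toy = np.array([1, 0, 1, 1])  # toy responses of one student

with pm.Model() as toy:
    pi_z = pm.Beta("pi_z", 1, 1)  # prior probability that the latent z equals 1
    p0 = pm.Beta("p0", 1, 1)      # P(correct | z = 0)
    p1 = pm.Beta("p1", 1, 1)      # P(correct | z = 1)

    # conditional log-likelihood of the data under each value of z
    logp_z0 = pm.Bernoulli.dist(p=p0).logp(y_toy).sum()
    logp_z1 = pm.Bernoulli.dist(p=p1).logp(y_toy).sum()

    # marginal log-likelihood: log((1 - pi_z) * exp(logp_z0) + pi_z * exp(logp_z1))
    log_terms = tt.stack([tt.log1p(-pi_z) + logp_z0, tt.log(pi_z) + logp_z1])
    pm.Potential("marginal_loglike", pm.math.logsumexp(log_terms))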

I think I misunderstood marginalization. I will try pm.Mixture().

The code I rewrote is as follows

# ideal response for all student categories
all_c_eta = (all_class.reshape(-1,1,K)>=Q.reshape(1,I,K)).prod(axis=-1)
all_c_eta = tt.as_tensor(all_c_eta)
with pm.Model() as DINA:
    # number of categories is 8
    C = 8
    # students categories, shape=(N,C)
    student_class_p = pm.Dirichlet("student_class_p",tt.ones(C),shape=(N,C))
    
    # repeat student_class_p I times and reshape it to (N,I,C)
    # mixture weights: the probability that each student belongs to category c
    w = tt.repeat(student_class_p,I).reshape((N,C,I)).transpose(0,2,1)
    
    s_sample = pm.Beta("s",alpha=1,beta=1,shape=(1,I))
    g_sample = pm.Beta("g",1,1,shape=(1,I))
    
    # probability of a correct response for all possible categories, shape=(C,I)
    all_c_p = pm.Deterministic("all_class_p",all_c_eta*(1-s_sample-g_sample)+g_sample)
    
    # repeat all_c_p N times and reshape it to (N,I,C)
    # component probabilities of a correct response
    n_i_c = tt.repeat(all_c_p,N).reshape((C,I,N)).transpose(2,1,0)
    compond_dist = pm.Bernoulli.dist(p=n_i_c,shape=(N,I,C))
    
    pm.Mixture("Y",w=w,comp_dists=compond_dist,observed=Y)
    # Y.shape=(N,I)

But judging from the results, something still seems to be wrong. :thinking:
The correct classification rate is 78.7%, but it should be closer to 85.3% (the rate obtained by directly sampling the latent discrete variables).
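(For reference, a sketch of how the classification rate can be computed, assuming tr is the InferenceData from sampling the Mixture model above and class_idx is the true class from the simulation:)

import numpy as np

post_w = tr.posterior["student_class_p"].mean(dim=("chain", "draw")).values  # (N, C)
predicted_class = post_w.argmax(axis=-1)
print("correct classification rate:", (predicted_class == class_idx).mean())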

Thanks for your help and time :grin:~

To marginalize the latent discrete variable, I recently translated a DINA model from the Stan tutorial (the DINA Independent case study) to PyMC. However, I am still seeing a significant difference in the results: the correlation between the estimated ‘slip’ (s) and ‘guess’ (g) parameters and their true values is nearly zero, which diverges from the Stan model’s output. Here is the crux of my PyMC model:

import numpy as np
import pymc as pm
from pytensor import tensor as pt
import itertools
import arviz as az

# Define the number of students, items, and skills
N = 150   # Total number of students
I = 30    # Total number of items
K = 3     # Total number of skills

# Generate all possible combinations of skill mastery (2^K possible classes)
all_class = np.array(list(itertools.product(*[[0, 1] for _ in range(K)])))

# Randomly assign students to one of the possible classes
class_idx = np.random.randint(0, len(all_class), size=N)
student_attribute = all_class[class_idx]

# Generate a Q-matrix that maps items to the skills they require
Q = np.concatenate([all_class[1:] for _ in range(10)], axis=0)[:I]

# Calculate the ideal response for each student based on their attribute mastery and the Q-matrix
eta = (student_attribute.reshape(N, 1, K) >= Q.reshape(1, I, K)).prod(axis=-1)

# Define slip and guess parameters for each item
s = np.random.uniform(0, 0.1, (1, I))
g = np.random.uniform(0, 0.1, (1, I))

# Generate observed data (Y) based on the ideal response with some noise from slip and guess parameters
Y = np.random.binomial(1, eta * (1 - s - g) + g)

# Compute the ideal response for all possible classes
all_c_eta = (all_class.reshape(-1, 1, K) >= Q.reshape(1, I, K)).prod(axis=-1)
all_c_eta = pt.as_tensor(all_c_eta)

# Begin PyMC model definition
with pm.Model() as DINA:

    # Define the prior for the probability of each student belonging to each class using a Dirichlet distribution
    student_class_p = pm.Dirichlet("student_class_p", pt.ones(2**K), shape=(1, 2**K))
    
    # Define slip parameter for each item with a Beta distribution
    s_sample = pm.Beta("s", alpha=5, beta=25, shape=(1, I))
    
    # Define guess parameter for each item with a Beta distribution
    g_sample = pm.Beta("g", alpha=5, beta=25, shape=(1, I))
    
    # Calculate the probability for all classes, considering the slip and guess parameters
    all_c_p = pm.Deterministic("all_class_p", all_c_eta * (1 - s_sample - g_sample) + g_sample)
    
    # Calculate the log likelihood of the observed data (Y) given the model
    logL_pY = pm.logp(pm.Bernoulli.dist(all_c_p.reshape((2**K, 1, I))), Y.reshape((1, N, I)))
    
    # Add log likelihoods as potentials to the model to inform the posterior
    pm.Potential("logL_Y", logL_pY)
    pm.Potential("logL_C", pt.log(student_class_p))
    
    # Compute the (unnormalized) joint probability of the data and each class, then normalize to get class membership probabilities
    joint_logP = pt.exp(logL_pY.sum(axis=-1)).T * student_class_p
    prob_resp_class = pm.Deterministic("prob_resp_class", joint_logP / joint_logP.sum(axis=-1, keepdims=True))
    
    # Calculate the probability of each student mastering each skill
    pm.Deterministic("att_p", prob_resp_class.dot(all_class))

# Perform sampling from the model
with DINA:
    tr = pm.sample(1000, tune=3000, chains=2, return_inferencedata=True)

I got the result:


Additionally, the prob_resp_class estimates suggest that the probabilities of a student belonging to any class are very similar, which doesn’t seem right.
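For reference, the correlation check I mention is along these lines (a sketch, assuming tr is the trace from the sampling call above and s, g are the true simulated values):

import numpy as np

s_hat = tr.posterior["s"].mean(dim=("chain", "draw")).values.ravel()
g_hat = tr.posterior["g"].mean(dim=("chain", "draw")).values.ravel()
print("corr(s, s_hat):", np.corrcoef(s.ravel(), s_hat)[0, 1])
print("corr(g, g_hat):", np.corrcoef(g.ravel(), g_hat)[0, 1])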

Could I have possibly missed something vital in my translation that might account for this discrepancy? I would be deeply grateful for any advice or insights that could steer me toward a resolution.

Thanks for the time and help in advance!

I would suggest you try and write the correct model for a single participant and a single question. That should be easy to check if you are getting the right thing. Then gradually expand to multiple questions for a single participant, and finally to the whole dataset.

It will also be easier for folks here to provide feedback on the intermediate versions.

An advantage of the smaller scenario is that you can include the discrete variables before any marginalization. You should be able to sample such a model.

If you write them as separate variables (instead of as a vector), you can even use MarginalModel from pymc_experimental (see the pymc_experimental 0.0.14 documentation). This allows you to check whether the logp you get with your manual marginalization matches the one from the MarginalModel.

Here is a simplified scenario notebook: Google Colab

import pymc as pm
import pymc_experimental as pmx
import numpy as np

q_skills = np.array([
    [1, 0, 1],
    [1, 1, 0],
    [0, 0, 1],
    [1, 1, 1],
])  # Skills needed to answer q1-q4 correctly

with pmx.MarginalModel(coords={"skills": range(3), "q": range(4)}) as m:
    # We need to create separate alpha's so that we can use `sum` and still marginalize the variables
    alphas = [pm.Bernoulli(f"alpha_{i}", p=0.3) for i in range(3)]
    s = pm.Beta("s", 1, 5)
    g = pm.Beta("g", 2, 1)

    # shapes: sum((q, skills) * (skills,), axis=-1) -> (q,)
    all_skills = pm.math.sum(pm.math.stack([alphas]) * q_skills, axis=-1)
    prob_correct = pm.math.where(all_skills, s, g)
    obs = pm.Bernoulli("obs", p=prob_correct, observed=[1, 1, 0, 1])

# Unmarginalized probability
print(m.point_logps())
# {'alpha_0': -0.36, 'alpha_1': -0.36, 'alpha_2': -0.36, 's': -1.09, 'g': -1.22, 'obs': -2.32}

m.marginalize(alphas)

# Marginalized probability: alphas are gone
print(m.point_logps()) 
# {'s': -1.09, 'g': -1.22, 'obs': -3.27}

MarginalModel will not be able to cope with the full model here, because of the interaction between the distinct Bernoulli variables (that’s why we had to create them separately). However, it can be useful for checking that you are defining the right model.


I’m deeply grateful for the invaluable advice I received. Following the guidance provided, I successfully amended my code and achieved the desired results. Below is my revised code:

import pytensor
import pymc as pm
from pytensor import tensor as pt

import numpy as np
import itertools
import arviz as az

N = 500   # student count
I = 10    # item count
K = 3     # skill count
# all class
all_class = np.array(list(itertools.product(*[[0,1] for k in range(K)])))
# random sample from all possible class
class_idx = np.random.randint(0,len(all_class),size=N)
student_attribute = all_class[class_idx]
# Q-matrix
Q = np.concatenate([all_class[1:] for i in range(10)],axis=0)[:I]
# ideal response
eta = (student_attribute.reshape(N,1,K)>=Q.reshape(1,I,K)).prod(axis=-1)
s = np.random.uniform(0,0.3,(1,I))
g = np.random.uniform(0,0.3,(1,I))
Y = np.random.binomial(1,eta*(1-s-g)+g)


# ideal response for all classes
all_c_eta = (all_class.reshape(-1,1,K)>=Q.reshape(1,I,K)).prod(axis=-1)
all_c_eta = pt.as_tensor(all_c_eta)


with pm.Model() as DINA:

    student_class_p = pm.Dirichlet("student_class_p", pt.ones(2**K), shape=( N, 2**K))
    s_sample = pm.Beta("s",alpha=5,beta=25,shape=(1,I))
    g_sample = pm.Beta("g",5,25,shape=(1,I))
    
    # probability of a correct response for each class, shape=(C,I)
    all_c_p = pm.Deterministic("all_class_p",all_c_eta*(1-s_sample-g_sample)+g_sample)
    logL_pY = pm.logp(pm.Bernoulli.dist(all_c_p.reshape((2**K,1,I))), Y.reshape((1,N,I))) # (C,N,I)
    
    # per-student joint log-probability of class membership and responses, shape=(N,C)
    ps = pt.log(student_class_p) + logL_pY.sum(axis=-1).T
    pm.Potential("logL_Y", pt.log(pt.exp(ps).sum(axis=-1)))
    
    
    prob_resp_class = pm.Deterministic("prob_resp_class",pt.exp(ps)/pt.exp(ps).sum(axis=-1, keepdims=True))
    pm.Deterministic("att_p", prob_resp_class.dot(all_class))

with DINA:
    tr = pm.sample(1000,tune=3000,chains=2,return_inferencedata=True)
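One note on the marginalization line: pt.log(pt.exp(ps).sum(axis=-1)) can underflow as the number of items grows, so a log-sum-exp form may be safer. A sketch of that alternative (not what I ran above) for the corresponding lines inside the model:

    # equivalent marginal log-likelihood via log-sum-exp
    marginal_loglike = pm.math.logsumexp(ps, axis=-1, keepdims=True)  # (N, 1)
    # pm.Potential("logL_Y", marginal_loglike)
    # prob_resp_class = pt.exp(ps - marginal_loglike)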

And the results:

