Latent Variable Approach to Modeling Missing Not at Random (MNAR) Data

I am toying with a model that has a missing-not-at-random (MNAR) structure. The outcome variable takes values 0, 1, and -1 (where -1 indicates missing). I am testing a basic model with fake data. The model looks like this:

```python
# Imports

import pymc3 as pm
import numpy as np
np.random.seed(seed=42)
import theano.tensor as tt


# Fake data
Xtrain = np.random.rand(1000, 10)
missing = np.random.choice([0, 1], size=[1000], p=[.1, .9])  # missingness indicator (~10% zeros)

nclass = 3
y_train = np.random.choice([0, 1, -1], size=[1000], p=[.1, .8, .1])

M = Xtrain.shape[1]  # number of predictors
N = Xtrain.shape[0]  # number of observations


# Model
with pm.Model() as model:
    pred = pm.Data("pred", Xtrain)
    miss = pm.Data("miss", missing)
    
    σ_ = pm.HalfNormal("σ_", 2.5)
    β_ = pm.Normal('β_', mu=0, sd=1, shape=M)
    β0_ = pm.Normal("β0_", mu=0, sd=10, shape=1)
    mu_ = pm.Deterministic('mu_', β0_ + pm.math.dot(pred, β_))
    
    
    # Latent variable shared by the missingness and outcome models
    latent = pm.Normal("latent", mu=mu_, sd=σ_, shape=N)
    
    
    β_missing = pm.Normal('β_missing', mu=0, sd=1, shape=M)
    β_missing_lat = pm.Normal('β_missing_lat', mu=0, sd=1, shape=1)

    β0_missing = pm.Normal("β0_missing", 0, 10.)
    mu_missing = pm.Deterministic('mu_missing', β0_missing + pm.math.dot(pred, β_missing) + β_missing_lat * latent)
    
   
    # Logistic regression for the missingness indicator
    missing_obs = pm.Bernoulli("missing_obs", p=pm.math.sigmoid(mu_missing), observed=miss)
    
    alfa = pm.Normal('alfa', mu=0, sd=1, shape=nclass)
    beta = pm.Normal('beta', mu=0, sd=1, shape=(nclass, 1))
    gamma = pm.Normal('gamma', mu=0, sd=1, shape=nclass)
    mu_1 = beta[0] * mu_missing + alfa[0] * latent + gamma[0]
    mu_2 = beta[1] * mu_missing + alfa[1] * latent + gamma[1]
    mu_3 = beta[2] * mu_missing + alfa[2] * latent + gamma[2]
    # latent has length 1000 and we want the softmax applied across columns
    # within each row, so p should have shape (1000, 3)
    p = tt.nnet.softmax(pm.math.stack([mu_1, mu_2, mu_3], axis=1))
    obs = pm.Categorical('obs', p=p, observed=y_train)
```
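
As a quick sanity check on the shape comment above, here is a standalone NumPy sketch (not part of the model) of a row-wise softmax over a stacked (1000, 3) array:

```python
# Standalone NumPy check of the intended softmax shape
import numpy as np

m = np.random.randn(1000, 3)                   # stand-in for the three stacked mu's
e = np.exp(m - m.max(axis=1, keepdims=True))   # numerically stabilized exponentials
p_check = e / e.sum(axis=1, keepdims=True)     # softmax across columns, within each row

print(p_check.shape)             # (1000, 3)
print(p_check.sum(axis=1)[:3])   # each row sums to 1
```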

I sample from the model like this:

```python
SEED = 123456789  # for reproducibility

rng = np.random.default_rng(SEED)
CHAINS = 2

SAMPLE_KWARGS = {
    'cores': CHAINS,
    'target_accept': 0.99,
    'max_treedepth': 15,
    'random_seed': [SEED + i for i in range(CHAINS)],
    'return_inferencedata': True,
    'tune': 1000,
    'draws': 1000,
    'discard_tuned_samples': True,
    # 'callback': MyCallback()
}

with pm.Model() as model:
    trace = pm.sample(**SAMPLE_KWARGS)
```

Sampling fails with this error:

```
ValueError: The model does not contain any free variables.
```

q1: Any help tracking down the issue would be appreciated.
q2: Is there an easy way to get the shape of RVs?

Thanks for looking - this feels like a simple model.

Hi, maybe you should transform the observed data from [-1, 0, 1] to [0, 1, 2], since draws from a Categorical distribution must be >= 0, e.g. `obs = pm.Categorical('obs', p=p, observed=y_train + 1)`.
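
A minimal sketch of that remap, reusing `y_train` and `np` from the post (shifting by 1 is one choice; any mapping to the contiguous labels 0..nclass-1 works):

```python
# Shift labels {-1, 0, 1} -> {0, 1, 2} so they are valid Categorical outcomes
y_train_shifted = y_train + 1
print(np.unique(y_train_shifted))  # [0 1 2]
```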

For q2, you can get the shape of an RV with `RV.tag.test_value.shape`.
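
For example, for the free RVs defined in the model above (a sketch; in PyMC3 the test value's shape matches the RV's shape):

```python
# Inspect RV shapes via their test values (PyMC3)
print(latent.tag.test_value.shape)  # (1000,)
print(beta.tag.test_value.shape)    # (3, 1)
```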
