Sampler gets stuck in hierarchical logistic regression

Hi there,

I’m using PyMC v5.22 to estimate the coefficients of a hierarchical logistic regression. However, the sampler sometimes gets stuck in some sort of degeneracy at the start of a run and rushes through the sampling steps with a step size of 0, so that every draw of the affected parameters is the same value. Not all starting points behave like this; some chains sample just fine. My guess is that the sampler fails at points where the posterior gradient is zero, or perhaps there is an over/underflow issue caused by the inverse logit. I’ve tried tweaking my prior variances to steer away from these regions, with no effect.
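Here is a minimal NumPy sketch of the kind of saturation I have in mind (just the raw inverse-logit formula, not the model itself):

import numpy as np

#The naive formula saturates to exactly 0 once np.exp overflows
#(arguments beyond ~709 in float64)
eta = np.array([-30.0, -800.0])
with np.errstate(over="ignore", divide="ignore"):
    p = 1 / (1 + np.exp(-eta))
    print(p)          #[~9.4e-14, 0.0] -- the second entry is exactly 0
    print(np.log(p))  #the log-likelihood term for y=1 becomes -inf where p == 0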

Here is some simulated data and model code that reproduces the issue:

import pymc as pm
import numpy as np
from scipy import stats

#################
##Simulate Data##
#################

#Population params
alpha_mu = -2
alpha_sigma = 0.2

beta_1_mu = 0.1
beta_1_sigma = 0.1

beta_2_mu = -0.2
beta_2_sigma = 0.1

levels = 5
N = 1000

#group indices
idx = np.repeat(np.arange(levels), repeats=N // levels)

#Coefficients for each group
coef_0 = stats.norm(alpha_mu, alpha_sigma).rvs(levels)
coef_1 = stats.norm(beta_1_mu, beta_1_sigma).rvs(levels)
coef_2 = stats.norm(beta_2_mu, beta_2_sigma).rvs(levels)

X = stats.norm(0, 10).rvs((N, 2))

#Sentinel values so any row the loop misses would fail the assert below
p_sim = np.ones(N) * -5

#Compute p for each observation from its group's coefficients
for i in range(N):
    l = idx[i]
    p_sim[i] = 1 / (1 + np.exp(-1 * (coef_0[l] + coef_1[l] * X[i, 0] + coef_2[l] * X[i, 1])))

assert np.all((p_sim>=0) & (p_sim<=1))
obs = stats.binom(n=1, p=p_sim).rvs(N)
#################
#Simulation Done#
#################
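(As an aside, the per-row loop above can be replaced with a vectorized version; scipy.special.expit is a numerically stable inverse logit. A minimal sketch using the same arrays:)

from scipy.special import expit

#Pick each row's group coefficients via idx, then apply the inverse logit
eta = coef_0[idx] + coef_1[idx] * X[:, 0] + coef_2[idx] * X[:, 1]
p_sim = expit(eta)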

with pm.Model(coords={"group": np.arange(levels)}) as model:
    X_0 = pm.Data("X_0", X[:, 0].flatten())
    X_1 = pm.Data("X_1", X[:, 1].flatten())
    convert = pm.Data("convert", obs.flatten())
    index = pm.Data("index", idx)

    mu_beta_X_0 = pm.Normal("mu_beta_X_0", 0, 1)
    sigma_beta_X_0 = pm.Exponential("sigma_beta_X_0", 1)
    z_beta_X_0 = pm.Normal("z_beta_X_0", 0, 1, dims="group")

    mu_beta_X_1 = pm.Normal("mu_beta_X_1", 0, 1)
    sigma_beta_X_1 = pm.Exponential("sigma_beta_X_1", 1)
    z_beta_X_1 = pm.Normal("z_beta_X_1", 0, 1, dims="group")

    mu_alpha = pm.Normal("mu_alpha", 0, 1)
    sigma_alpha = pm.Exponential("sigma_alpha", 1)
    z_alpha = pm.Normal("z_alpha", 0, 1, dims="group")

    beta_X_0 = pm.Deterministic(
        "beta_X_0", mu_beta_X_0 + z_beta_X_0 * sigma_beta_X_0, dims="group"
    )  

    beta_X_1 = pm.Deterministic(
        "beta_X_1", mu_beta_X_1 + z_beta_X_1 * sigma_beta_X_1, dims="group"
    )  
    alpha = pm.Deterministic(
        "alpha", mu_alpha + z_alpha * sigma_alpha, dims="group"
    )  

    # Index with the pm.Data container (the original used the raw numpy
    # array idx, leaving the "index" data variable unused)
    p = pm.Deterministic(
        "p",
        pm.math.invlogit(
            alpha[index]
            + beta_X_0[index] * X_0
            + beta_X_1[index] * X_1
        ),
    )
    likelihood = pm.Bernoulli("likelihood", p=p, observed=convert)

    samples = pm.sample(draws=1000, tune=1000, chains=4)

The model graph is: [model graph image]
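For anyone who wants to render the graph themselves, PyMC has a built-in helper for this (it requires the graphviz package):

#Render the model's directed graph
pm.model_to_graphviz(model)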

When I run this code, there are chains where the mu_beta_..., mu_alpha, and sigma_... variables produce samples that do not vary at all across the iterations of the sampler.
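A quick way to confirm the stuck chains (a sketch assuming arviz is installed, using the same samples object):

import arviz as az

#Flat (stuck) chains are obvious in the trace plots
az.plot_trace(samples, var_names=["mu_alpha", "sigma_alpha"])

#Per-chain standard deviation of the draws; a stuck chain shows ~0
print(samples.posterior["mu_alpha"].std(dim="draw"))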

Does anyone know why this might be happening?

Thanks

Hey @severian,

I think the priors you have defined in your model are putting too much density on the extremes (near 0 and 1) of your probability p. Below you can see the prior predictive check for p in your original model.

with pm.Model(coords={"group": np.arange(levels)}) as model:
    X_0 = pm.Data("X_0", X[:, 0].flatten())
    X_1 = pm.Data("X_1", X[:, 1].flatten())
    convert = pm.Data("convert", obs.flatten())
    index = pm.Data("index", idx)

    mu_beta_X_0 = pm.Normal("mu_beta_X_0", 0, 1)
    sigma_beta_X_0 = pm.Exponential("sigma_beta_X_0", 1)
    z_beta_X_0 = pm.Normal("z_beta_X_0", 0, 1, dims="group")

    mu_beta_X_1 = pm.Normal("mu_beta_X_1", 0, 1)
    sigma_beta_X_1 = pm.Exponential("sigma_beta_X_1", 1)
    z_beta_X_1 = pm.Normal("z_beta_X_1", 0, 1, dims="group")

    mu_alpha = pm.Normal("mu_alpha", 0, 1)
    sigma_alpha = pm.Exponential("sigma_alpha", 1)
    z_alpha = pm.Normal("z_alpha", 0, 1, dims="group")

    beta_X_0 = pm.Deterministic(
        "beta_X_0", mu_beta_X_0 + z_beta_X_0 * sigma_beta_X_0, dims="group"
    )  

    beta_X_1 = pm.Deterministic(
        "beta_X_1", mu_beta_X_1 + z_beta_X_1 * sigma_beta_X_1, dims="group"
    )  
    alpha = pm.Deterministic(
        "alpha", mu_alpha + z_alpha * sigma_alpha, dims="group"
    )  

    # As above, index with the pm.Data container rather than the raw array
    p = pm.Deterministic(
        "p",
        pm.math.invlogit(
            alpha[index]
            + beta_X_0[index] * X_0
            + beta_X_1[index] * X_1
        ),
    )
    likelihood = pm.Bernoulli("likelihood", p=p, observed=convert)

    prior_predictive_sample = pm.sample_prior_predictive()

    # samples = pm.sample(draws=1000, tune=1000, chains=4)
import arviz as az

az.plot_dist(prior_predictive_sample.prior["p"])

[prior predictive plot for p]
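As a rough numeric check (using the same prior_predictive_sample from above), you can measure how much prior mass for p lands at the extremes:

import numpy as np

#Fraction of prior draws of p within 0.01 of the boundaries; a large value
#means the prior strongly favors near-deterministic outcomes
p_prior = prior_predictive_sample.prior["p"].values
print(np.mean((p_prior < 0.01) | (p_prior > 0.99)))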

EDITED - for improved priors:

I think you want to spread that density more evenly across the (0, 1) interval instead of piling it up at the extremes. You can try this model with tighter priors:

with pm.Model(coords={"group": np.arange(levels)}) as model:
    X_0 = pm.Data("X_0", X[:, 0].flatten())
    X_1 = pm.Data("X_1", X[:, 1].flatten())
    convert = pm.Data("convert", obs.flatten())
    index = pm.Data("index", idx)

    mu_beta_X_0 = pm.Normal("mu_beta_X_0", 0, 0.05)
    sigma_beta_X_0 = pm.HalfNormal("sigma_beta_X_0", 1e-2)
    z_beta_X_0 = pm.Normal("z_beta_X_0", 0, 1, dims="group")

    mu_beta_X_1 = pm.Normal("mu_beta_X_1", 0, 0.05)
    sigma_beta_X_1 = pm.HalfNormal("sigma_beta_X_1", 1e-2)
    z_beta_X_1 = pm.Normal("z_beta_X_1", 0, 1, dims="group")

    mu_alpha = pm.Normal("mu_alpha", 0, 0.5)
    sigma_alpha = pm.HalfNormal("sigma_alpha", 1e-2)
    z_alpha = pm.Normal("z_alpha", 0, 1, dims="group")

    beta_X_0 = pm.Deterministic(
        "beta_X_0", mu_beta_X_0 + z_beta_X_0 * sigma_beta_X_0, dims="group"
    )  

    beta_X_1 = pm.Deterministic(
        "beta_X_1", mu_beta_X_1 + z_beta_X_1 * sigma_beta_X_1, dims="group"
    )  
    alpha = pm.Deterministic(
        "alpha", mu_alpha + z_alpha * sigma_alpha, dims="group"
    )  

    # As above, index with the pm.Data container rather than the raw array
    p = pm.Deterministic(
        "p",
        pm.math.invlogit(
            alpha[index]
            + beta_X_0[index] * X_0
            + beta_X_1[index] * X_1
        ),
    )
    likelihood = pm.Bernoulli("likelihood", p=p, observed=convert)

    prior_predictive_sample = pm.sample_prior_predictive()

    samples = pm.sample(draws=1000, tune=1000, chains=4)

Which results in this prior predictive check over p: [prior predictive plot for p under the tighter priors]

Again, that might be placing a bit more density around the 0.5 region than you’d like for your problem, so you may want to tweak the priors further for your use case.

Also, you can increase target_accept, e.g. pm.sample(target_accept=0.9), to try to reduce any divergences.
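In full, something like this (keeping the sampling arguments from your original call):

with model:
    #Higher target_accept forces NUTS to take smaller, more careful steps
    samples = pm.sample(draws=1000, tune=1000, chains=4, target_accept=0.9)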