Choice-Based Conjoint Data Using A Hierarchical Logit Model

Hello, I am trying to write an implementation of this model for the camera dataset from the bayesm Overview. My model already runs for 2 clients, but for 332 clients it takes too long to start the sampling. Can you give me a hint so that I can write the model in a more efficient way?
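
For reference, the model I am trying to implement (as encoded in the code below): utilities u_klj = x_klj' β_k for customer k, question l, and alternative j; choice probabilities via a softmax over the 5 alternatives of each question; and individual-level coefficients β_k ~ MvNormal(μ_β, V_β), with an LKJ prior on the Cholesky factor of V_β.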
This is my code:

import json
import numpy as np
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm  # Import tqdm
import networkx as nx
figsize = (16, 10)
plt.style.use('dark_background')
palette = "Spectral"
sns.set_palette(palette)

# Load the JSON file
with open("data/camera.json", "r") as f:
    data = json.load(f)

columns = [
     "canon"
    ,"sony"
    ,"nikon"
    ,"panasonic"
    ,"pixels"
    ,"zoom"
    ,"video"
    ,"swivel"
    ,"wifi"
    ,"price"
]

print(len(data))
data = data[:2]  # Keep only the first 2 customers for debugging
# Convert the flat design matrices to a NumPy array of shape
# (n_customers, n_questions, 5 alternatives, n_features)
a = np.array([[data[k]["X"][l*5:(l+1)*5] for l in range(len(data[k]["y"]))] for k in range(len(data))])

# Define Bayesian Model in PyMC
n_features = len(columns)
nvar = n_features  # Number of random coefficients
nu = nvar + 3  # Degrees of freedom for the IW prior (not used below)

# Get observed choices (shift from 1-based to 0-based indexing)
y_obs = np.array([np.array(data[k]["y"])-1 for k in range(len(data))])

with pm.Model() as model:
    a_mu = 0.01  # Small precision factor; the prior mean of the scales is 1/a_mu = 100
    st_dist = pm.MvNormal.dist(mu=np.ones(n_features) * 1/a_mu, cov=np.eye(n_features), shape=n_features)

    chol, corr, stds = pm.LKJCholeskyCov(
        "chol", n=n_features, eta=4.0,
        sd_dist=st_dist,
    )

    # Recover the full covariance matrix
    v_beta = pm.Deterministic("v_beta", chol @ chol.T)

    # Define the mean of the random coefficients
    mu_bar = np.zeros(nvar)  # Prior mean
    mu_beta = pm.MvNormal("mu_beta", mu=mu_bar, chol=chol)

    beta = pm.MvNormal("beta", mu=mu_beta, cov=v_beta, shape=(len(data), len(columns)))

    # Track progress using tqdm (note: this only times graph construction, not sampling)
    y = []
    p_ = []
    for k in tqdm(range(len(data)), desc="Processing Customers", colour="yellow"):
        p = []
        for l in tqdm(range(len(data[k]["y"])), desc=f"Processing questions for customer {k}", colour="green"):
            utilities = pm.math.dot(a[k, l], beta[k])  # Compute utilities
            # p.append(pm.math.exp(utilities) / pm.math.sum(pm.math.exp(utilities), axis=0))  # Convert to probabilities
            p.append(pm.math.softmax(utilities))
            # Observed choices
        p_.append(p)
        y.append(pm.Categorical(f"y_{k}", p=p, observed=y_obs[k]))
    p_per_user = pm.Deterministic("p", pm.math.stack(p_, axis=0))
    y_per_user = pm.Deterministic("y", pm.math.stack(y, axis=0))
    model.debug()
    # Sample from the posterior with a progress bar
    #idata = pm.sample(1000, tune=1000, target_accept=0.95, progressbar=True)
    idata = pm.sample(2000, tune=1000, target_accept=0.95, progressbar=True, nuts_sampler="numpyro", chain_method="vectorized")

# Convert model to networkx graph
graph = pm.model_to_networkx(model)
# Create plot
plt.figure(figsize=(10,8))
nx.draw(graph, with_labels=True, node_color='lightblue',
       node_size=2000, arrowsize=20)
plt.savefig("app/data/graph.png")
plt.close()
# Display results
print(az.summary(idata))
az.plot_trace(idata, figsize=(10, 7), var_names=["mu_beta", "beta", "y"], combined=True)
plt.savefig("app/data/plot.png")
plt.close()

I fed your question into a PyMC assistant we’re toying around with, and this was the response:

Based on the provided information, here are the key recommendations to improve the efficiency of the hierarchical logit model implementation:

  1. Remove Loops and Vectorize:
  • Never use loops inside a PyMC model definition
  • Vectorize any computations over the whole dataset in one shot
  2. Use Proper PyTensor Functions:
  • Replace numpy functions with PyTensor functions, which mirror the numpy API
  • Never use .eval() on random variables inside a PyMC model definition
  3. Consider Alternative Backends:
  • Try using the numpyro or JAX backend through PyTensor for faster execution
  4. Model Structure:
  • Include more groups per model instead of one model per group
  • Use chol instead of cov for MvNormals for better performance
  5. Hardware Considerations:
  • For large hierarchical models (1700+ observations, 30+ variables), significant memory (100+ GB RAM) may be required

The current implementation uses nested loops to process customers and questions; these should be vectorized for better performance. The code already uses the numpyro sampler and the vectorized chain method, which is good practice.
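
One concrete way to act on point 4, in addition to the rewrite below, is a non-centered parametrization that uses the Cholesky factor directly and never builds the dense covariance matrix. A minimal sketch (not tested on this dataset), assuming chol, mu_beta, n_customers, and n_features as defined in the model:

z = pm.Normal("z", 0.0, 1.0, shape=(n_customers, n_features))
beta = pm.Deterministic("beta", mu_beta + z @ chol.T)

Since the rows of z are standard normal, each row of beta has covariance chol @ chol.T, so this is equivalent to the MvNormal formulation but typically mixes better in hierarchical models.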

Vectorization attempt:

import json
import numpy as np
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

# Load and prepare data
with open("data/camera.json", "r") as f:
    data = json.load(f)

columns = ["canon", "sony", "nikon", "panasonic", "pixels", 
           "zoom", "video", "swivel", "wifi", "price"]

# Prepare data arrays (vectorized)
n_customers = len(data)
n_questions = len(data[0]["y"])  # Usually 16
n_alternatives = 5  # Number of alternatives per choice set
n_features = len(columns)

# Reshape X data into (n_customers, n_questions, n_alternatives, n_features)
X = np.zeros((n_customers, n_questions, n_alternatives, n_features))
for i in range(n_customers):
    for j in range(n_questions):
        X[i, j] = np.array(data[i]["X"][j*n_alternatives:(j+1)*n_alternatives])

# Prepare Y data (n_customers, n_questions)
Y = np.array([np.array(data[i]["y"])-1 for i in range(n_customers)])
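
# Optional sanity check (shapes assumed for the camera data:
# 332 customers, 16 questions, 5 alternatives, 10 features)
assert X.shape == (n_customers, n_questions, n_alternatives, n_features)
assert Y.shape == (n_customers, n_questions)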

# Model specification
with pm.Model() as model:
    # Prior for variance components
    a_mu = 0.01
    st_dist = pm.MvNormal.dist(
        mu=np.ones(n_features) * 1/a_mu, 
        cov=np.eye(n_features), 
        shape=n_features
    )

    # LKJ prior for correlation matrix
    chol, corr, stds = pm.LKJCholeskyCov(
        "chol", 
        n=n_features, 
        eta=4.0,
        sd_dist=st_dist,
    )

    # Covariance matrix
    v_beta = pm.Deterministic("v_beta", chol @ chol.T)

    # Population-level parameters
    mu_beta = pm.MvNormal(
        "mu_beta", 
        mu=np.zeros(n_features),
        chol=chol
    )

    # Individual-level parameters
    beta = pm.MvNormal(
        "beta", 
        mu=mu_beta, 
        cov=v_beta, 
        shape=(n_customers, n_features)
    )

    # Compute utilities (vectorized)
    # Reshape beta to broadcast correctly: (n_customers, 1, 1, n_features)
    beta_expanded = beta[:, None, None, :]
    
    # X shape: (n_customers, n_questions, n_alternatives, n_features)
    # Result shape: (n_customers, n_questions, n_alternatives)
    utilities = pm.math.sum(X * beta_expanded, axis=-1)

    # Apply softmax across alternatives
    # Result shape: (n_customers, n_questions, n_alternatives)
    p = pm.math.softmax(utilities, axis=-1)

    # Likelihood
    y = pm.Categorical("y", p=p, observed=Y)
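
    # Note (assumption, recent PyMC versions): Categorical also accepts the
    # raw utilities as logits via `logit_p`, which skips the explicit softmax
    # and is numerically more stable:
    #   y = pm.Categorical("y", logit_p=utilities, observed=Y)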

    # Sample from the posterior
    idata = pm.sample(
        2000, 
        tune=1000, 
        target_accept=0.95, 
        progressbar=True,
        nuts_sampler="numpyro", 
        chain_method="vectorized"
    )

# Visualize results
graph = pm.model_to_networkx(model)
plt.figure(figsize=(10,8))
nx.draw(graph, with_labels=True, node_color='lightblue',
       node_size=2000, arrowsize=20)
plt.savefig("app/data/graph.png")
plt.close()

# Display results
print(az.summary(idata))
az.plot_trace(idata, figsize=(10, 7), var_names=["mu_beta", "beta"], combined=True)
plt.savefig("app/data/plot.png")
plt.close()

@twiecki Thank you very much for your help! It was not clear to me how to vectorize in this case. Is your assistant available in any way?

Yeah, I’m working on that.

Did the model from the AI actually work?

Yes, indeed! I was testing many priors, and finally the model converged.

Damn, Claude 3.5 just single-shotted this 🤯

That’s really impressive!