Multinomial logistic regression on conjoint choice data

Dear all, I am a new user of PyMC (previously using Stan). I am in the process of converting what I had before with Stan to PyMC. Now I am trying to write the PyMC model to estimate the partworth utilities based on a conjoint choice data.

Specifically, the input data would be a tensor with shape (N, C, K), where N is the total number of tasks; C is the number of alternatives/options within one task; K is the number of attributes of one alternative/option.

The process I am thinking is that:

  1. theta is a matrix such that \theta \in \mathcal{R}^{K}, where K is the number of attribtues in each choice task.
  2. X is a tensor, such that X \in \mathcal{R}^{N, C, K}, where N is the total number of tasks; C is the number of alternatives in each choice task.
  3. For each task x_i\in X where x_i \in \mathcal{R}^{C, K}, we can compute the utility of each alternative as u_i = x_i \theta ; in this case, u_i\in \mathcal{R}^{C}
  4. Then I can just input u_i into a softmax function p_i = \text{softmax}(u_i) such that \sum p_i = 1
  5. Finally, these would be used as the probablity in a Multinomial distribution

Based on this, I wrote this following code:

with pm.Model() as multi_logit:
    # priors: 
    theta = pm.Normal(name='theta', mu=0, sigma=1, shape=(1,K))
    # use scan to compute xi*θ
    U, updates = pytensor.scan(fn=lambda xi:,
    # softmax of each row
    U_normed = pm.Deterministic('U_normed', pt.special.softmax(U, axis=0))
    # distribution of the choice
    y_obs = pm.Categorical('y_obs', p=U_normed, observed=y)

where X is the choice data;

array([[[1, 1, 1],
        [1, 0, 0]],

       [[1, 1, 1],
        [0, 0, 0]],

       [[1, 1, 0],
        [1, 0, 1]],


       [[0, 1, 0],
        [1, 0, 0]],

       [[0, 0, 1],
        [0, 1, 1]],

       [[0, 0, 0],
        [1, 0, 0]]])

and y is the observed choices:

array([0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, ..., 1, 1])

However, I was getting the following errors:

ValueError                                Traceback (most recent call last)
Cell In[82], line 12
      9 print(U_normed.shape)
     11 # sample multinomial
---> 12 y_obs = pm.Categorical('y_obs', p=U_normed, observed=y)

File /opt/homebrew/lib/python3.11/site-packages/pymc/distributions/, in Discrete.__new__(cls, name, *args, **kwargs)
    455 if kwargs.get("transform", None):
    456     raise ValueError("Transformations for discrete distributions")
--> 458 return super().__new__(cls, name, *args, **kwargs)

File /opt/homebrew/lib/python3.11/site-packages/pymc/distributions/, in Distribution.__new__(cls, name, rng, dims, initval, observed, total_size, transform, *args, **kwargs)
    307     elif observed is not None:
    308         kwargs["shape"] = tuple(observed.shape)
--> 310 rv_out = cls.dist(*args, **kwargs)
    312 rv_out = model.register_rv(
    313     rv_out,
    314     name,
    319     initval=initval,
    320 )
    322 # add in pretty-printing support

File /opt/homebrew/lib/python3.11/site-packages/pymc/distributions/, in Categorical.dist(cls, p, logit_p, **kwargs)
   1115         p_ = p_ / at.sum(p_, axis=-1, keepdims=True)
   1116         p = at.as_tensor_variable(p_)
-> 1117 return super().dist([p], **kwargs)

File /opt/homebrew/lib/python3.11/site-packages/pymc/distributions/, in Distribution.dist(cls, dist_params, shape, **kwargs)
    387     ndim_supp = cls.rv_op(*dist_params, **kwargs).owner.op.ndim_supp
    388 create_size = find_size(shape=shape, size=size, ndim_supp=ndim_supp)
--> 389 rv_out = cls.rv_op(*dist_params, size=create_size, **kwargs)
    391 rv_out.logp = _make_nice_attr_error("rv.logp(x)", "pm.logp(rv, x)")
    392 rv_out.logcdf = _make_nice_attr_error("rv.logcdf(x)", "pm.logcdf(rv, x)")

File /opt/homebrew/lib/python3.11/site-packages/pytensor/tensor/random/, in CategoricalRV.__call__(self, p, size, **kwargs)
   1828 def __call__(self, p, size=None, **kwargs):
   1829     r"""Draw samples from a discrete categorical distribution.
   1831     Signature
   1846     """
-> 1847     return super().__call__(p, size=size, **kwargs)

File /opt/homebrew/lib/python3.11/site-packages/pytensor/tensor/random/, in RandomVariable.__call__(self, size, name, rng, dtype, *args, **kwargs)
    288 def __call__(self, *args, size=None, name=None, rng=None, dtype=None, **kwargs):
--> 289     res = super().__call__(rng, size, dtype, *args, **kwargs)
    291     if name is not None:
    292 = name

File /opt/homebrew/lib/python3.11/site-packages/pytensor/graph/, in Op.__call__(self, *inputs, **kwargs)
    253 r"""Construct an `Apply` node using :meth:`Op.make_node` and return its outputs.
    255 This method is just a wrapper around :meth:`Op.make_node`.
    293 """
    294 return_list = kwargs.pop("return_list", False)
--> 295 node = self.make_node(*inputs, **kwargs)
    297 if config.compute_test_value != "off":
    298     compute_test_value(node)

File /opt/homebrew/lib/python3.11/site-packages/pytensor/tensor/random/, in RandomVariable.make_node(self, rng, size, dtype, *dist_params)
    329 elif not isinstance(rng.type, RandomType):
    330     raise TypeError(
    331         "The type of rng should be an instance of either RandomGeneratorType or RandomStateType"
    332     )
--> 334 shape = self._infer_shape(size, dist_params)
    335 _, static_shape = infer_static_shape(shape)
    336 dtype = self.dtype or dtype

File /opt/homebrew/lib/python3.11/site-packages/pytensor/tensor/random/, in RandomVariable._infer_shape(self, size, dist_params, param_shapes)
    199     param_batched_dims = getattr(param, "ndim", 0) - param_ndim_supp
    200     if param_batched_dims > size_len:
--> 201         raise ValueError(
    202             f"Size length is incompatible with batched dimensions of parameter {i} {param}:\n"
    203             f"len(size) = {size_len}, len(batched dims {param}) = {param_batched_dims}. "
    204             f"Size length must be 0 or >= {param_batched_dims}"
    205         )
    207 if self.ndim_supp == 0:
    208     return size

ValueError: Size length is incompatible with batched dimensions of parameter 0 U_normed:
len(size) = 1, len(batched dims U_normed) = 2. Size length must be 0 or >= 2

I tested scan with following code, which worked out fine so I was confused. May I ask for help on this? Thank you!

X = pt.tensor3('X')
theta = pt.vector("theta")
results, updates = pytensor.scan(fn=lambda xi:,
compute_utility = pytensor.function(inputs=[X, theta], outputs=results)

# test values
X_1 = np.array([[[1, 0], [0, 1]], [[1, 1], [0, 1]]], dtype=np.int32)
theta = np.array([-1, 1], dtype=np.int32)

# test outcome
compute_utility(X_1, theta)

[[[1 0]
  [0 1]]

 [[1 1]
  [0 1]]]
[-1  1]
array([[-1.,  1.],
       [ 0.,  1.]])

The error you’re getting arises because the observed variable y is one-dimensional, but the shape inferred by the parameter U_normed is 2d. You could fix this by turning y into a column vector, but I don’t think that’s the root of things.

I think the core problem is that you are adding an extra dimension to theta. doesn’t conform, because xi should have shape (C, K), while theta is (1, K). change:

theta = pm.Normal(name='theta', mu=0, sigma=1, shape=K)

And I think it will work as expected.

You also don’t need the scan at all, because there’s no recursive computation going on. You can just use broadcast multiplication, followed by summation over the K dimension. This is equivalent to np.einsum('nck, k -> nc', X, theta):

U = (X * theta).sum(axis=-1)

1 Like

Thanks @jessegrabowski for your help! I now got another error, which I was confused.

The initial evaluation results {'theta': -2.88, 'y_obs': -inf} treated theta as a single number while the starting values are {'theta': array([-0.19010312, 0.01581751, -0.46704099])}. I did not know what this meant… Could you give me some hints? Thanks!

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
SamplingError                             Traceback (most recent call last)
Cell In[110], line 2
      1 with multi_logit:
----> 2     multi_logit_trace = pm.sample()

File /opt/homebrew/lib/python3.11/site-packages/pymc/sampling/, in sample(draws, tune, chains, cores, random_seed, progressbar, step, nuts_sampler, initvals, init, jitter_max_retries, n_init, trace, discard_tuned_samples, compute_convergence_checks, keep_warning_stat, return_inferencedata, idata_kwargs, callback, mp_ctx, model, **kwargs)
    606 ip: Dict[str, np.ndarray]
    607 for ip in initial_points:
--> 608     model.check_start_vals(ip)
    609     _check_start_shape(model, ip)
    611 # Create trace backends for each chain

File /opt/homebrew/lib/python3.11/site-packages/pymc/, in Model.check_start_vals(self, start)
   1776 initial_eval = self.point_logps(point=elem)
   1778 if not all(np.isfinite(v) for v in initial_eval.values()):
-> 1779     raise SamplingError(
   1780         "Initial evaluation of model at starting point failed!\n"
   1781         f"Starting values:\n{elem}\n\n"
   1782         f"Initial evaluation results:\n{initial_eval}"
   1783     )

SamplingError: Initial evaluation of model at starting point failed!
Starting values:
{'theta': array([-0.19010312,  0.01581751, -0.46704099])}

Initial evaluation results:
{'theta': -2.88, 'y_obs': -inf}

This was the code that I updated based on your suggestion:

with pm.Model() as multi_logit:
    # priors: 
    theta = pm.Normal(name='beta', mu=0, sigma=1, shape=K_syn)
    # use scan to compute xi*θ
    U = (X_syn * theta).sum(axis=-1)
    # softmax of each row
    #U_normed = pm.Deterministic('U_normed', pt.special.softmax(U, axis=0))
    U_normed = np.exp(U) / np.exp(U).sum()
    # distribution of the choice
    y_obs = pm.Categorical('y_obs', p=U_normed, observed=y_syn)
    #y_obs = pm.Multinomial('y_obs', n=1, p=U_normed, observed=y_syn_multi)

The initial evaluations are the log probabilities of each variable; that’s why theta is a single number there. y_obs is negative infinity because some of the observed data is impossible according to your starting values. My guess is that after applying the softmax you get exactly 0 for the probability of a class label that corresponds to the observed class (the observed class is impossible).

You can manually provide initial values to pm.sample using the initvals argument to try to avoid this (perhaps by setting initial theta to all zeros; that should give a uniform distribution over class labels i think).

That said, I would use pt.special.softmax instead of doing it “by hand”. Pytensor has graph optimizations that help induce numerical stability when using softmax that you won’t get otherwise. That might solve your problem without fussing with initial values. For the record, I try to never set initial values except as an absolute last resort.

A couple other comments:

  1. You print statements probably aren’t helpful, because they just show the name of the operation being performed. You can do U.shape.eval() to tell pytensor to actually compute what the value should be. .eval() is very nice in general for doing ad-hoc debugging.

  2. While pytensor is smart enough to automatically overload most numpy functions, it’s still best practice to directly call the pytensor equivalent. This will prevent you from being surprised when you stumble onto a gap in the overload coverage. So I’d replace np.exp with pt.exp.

  3. Look into named coordinates instead of using the shape argument on your random variables for nicer outputs and more readable code.

Thank you so much! It worked out well!

For the sake of completeness, I put the code of both the synthetic dataset generation and the model code below:

  • Dataset:
import numpy as np
import scipy as sp
from itertools import product, combinations
# ground truth theta's
theta_syn = np.array([0.5, 1, 0.1])

# possible values of each attribute: binary
attribute_vals = np.array([0, 1])
# 3 attributes
K_syn = 3
# 100 tasks
N_syn = 1000
# Each task has two alternatives
C_syn = 2

# Generate the possible alternatives: pick one of the two values (1/0) from each attribute
alternatives_syn = np.array(list(product(attribute_vals, repeat=K_syn)))

# All possible choice tasks: combination of two distinct alternatives
tasks_syn = np.array(list(product(alternatives_syn, repeat=2)))
# Drop those identical choice tasks
tasks_syn = np.array([[choice_A, choice_B] for choice_A, choice_B in tasks_syn if not np.all(choice_A==choice_B)])

# Generate choice task (two alternatives each)
## generate the index first
tasks_syn_idx = np.random.choice(np.arange(tasks_syn.shape[0]), size=N_syn, replace=True)
## indexing
X_syn = tasks_syn[tasks_syn_idx]

# compute utility
U_syn = (X_syn * theta_syn).sum(axis=2)
P_syn = sp.special.softmax(U_syn, axis=1)
y_syn_multi = np.array([np.random.multinomial(n=1, pvals=p_syn) for p_syn in P_syn])
y_syn = np.array([yi.argmax() for yi in y_syn_multi])
  • Model Code:
import pymc as pm
import pytensor
import pytensor.tensor as pt
import arviz as az
%config InlineBackend.figure_format = 'retina''arviz-darkgrid')

with pm.Model() as multi_logit:
    # priors: 
    theta = pm.Normal(name='theta', mu=0, sigma=1, shape=K_syn)
    # use scan to compute xi*θ
    U = (X_syn * theta).sum(axis=2)
    # softmax of each row
    U_normed = pt.special.softmax(U, axis=1)
    # distribution of the choice
    y_obs = pm.Categorical('y_obs', p=U_normed, observed=y_syn)

with multi_logit:
    multi_logit_trace = pm.sample()
  • Results:
az.plot_trace(multi_logit_trace, figsize=(6,2), combined=True);


az.summary(multi_logit_trace, round_to=2)
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
theta[0] 0.40 0.09 0.24 0.57 0.0 0.0 6129.47 3133.85 1.0
theta[1] 0.90 0.09 0.73 1.08 0.0 0.0 6586.45 3247.84 1.0
theta[2] 0.09 0.09 -0.07 0.26 0.0 0.0 6607.24 3475.27 1.0

We should rewrite that initial value failed message to emphasize it’s showing log probabilities. I’ve seen other users confused.