Create model matrix

Hello,

I’ve built a GLM with a Wald response in Bambi and I wanted to replicate it in PyMC3. The first version of the model below is more or less what Bambi does under the hood, and the second version is my attempt to write it more concisely in PyMC3 (which does not work).

EDIT: I’m tagging @agustinaarroyuelo here since she originally tried to fit the same model with both libraries and told me about the problem.
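For reference, the Bambi model I’m trying to replicate is roughly this (a sketch only; the family and link match the PyMC3 models below, but the exact Model/fit signature depends on your Bambi version, and the formula terms are my reconstruction from the columns used later):

import bambi as bmb
import pandas as pd

data = pd.read_csv("https://courses.ms.ut.ee/2020/glm/spring/uploads/Main/carclaims.csv")
data = data[data["claimcst0"] > 0]

# Hypothetical formula; C() treats the numeric agecat as categorical
model = bmb.Model("claimcst0 ~ C(agecat) + area + gender", data, family="wald", link="log")
idata_bambi = model.fit()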

Setup


import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt

data = pd.read_csv("https://courses.ms.ut.ee/2020/glm/spring/uploads/Main/carclaims.csv")
data = data[data["claimcst0"] > 0]
# data.head()

# Get components of the model matrix
intercept = np.ones((len(data), 1))
age = pd.get_dummies(data["agecat"], drop_first=True).to_numpy()
area = pd.get_dummies(pd.Categorical(data.area), drop_first=True).to_numpy()
gender = pd.get_dummies(pd.Categorical(data.gender), drop_first=True).to_numpy()
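A quick sanity check on the shapes (the column counts are exactly what the priors below assume: one intercept column, five age levels, five areas, and one gender column after dropping the first level of each):

print(intercept.shape, age.shape, area.shape, gender.shape)
# (n, 1) (n, 5) (n, 5) (n, 1), with n the number of rows left after filtering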

Model 1:

with pm.Model() as model_1:
    mu = 0.0
    
    # Build predictors
    β_0 = pm.Normal(
        "β_0", 
        mu=np.array(7.6), 
        sigma=np.array(2.7),
    )
    mu += pm.math.dot(intercept, β_0)[:, None]
    
    β_age = pm.Normal(
        "β_age",       
        mu=np.array([0] * 5), 
        sigma=np.array([0.32, 6.94, 1.13, 5.44, 9.01]),
        shape=5
    )
    mu += pm.math.dot(age, β_age)[:, None]
    
    β_gender = pm.Normal(
        "β_gender", 
        mu=np.array(0), 
        sigma=np.array(1.30),
    )
    mu += pm.math.dot(gender, β_gender)[:, None]
    
    β_area = pm.Normal(
        "β_area", 
        mu=np.array([0] * 5),  
        sigma=np.array([0.86, 0.25, 1.30, 0.76, 5.33]), 
        shape=5
    )
    
    mu += pm.math.dot(area, β_area)[:, None]
    # Log link
    mu = tt.exp(mu)
    
    # Get response
    response = np.array([data["claimcst0"]]).T
    
    pm.Wald("claim", mu=mu, lam = pm.HalfCauchy("claim_lam", beta=1), observed=response)
    idata_1 = pm.sample(return_inferencedata=True)

This model works: it takes around 30 seconds on my computer and the results are the same as the ones I get with Bambi (as expected, since I replicated what happens within Bambi).

Model 2

Here I thought: “OK, what if I concatenate the betas and do just one matrix multiplication?” The code would be something like this:


with pm.Model() as model_2:    
    # Same as before
    β_0 = pm.Normal(
        "β_0", 
        mu=np.array(7.6), 
        sigma=np.array(2.7),
    )
    β_age = pm.Normal(
        "β_age",       
        mu=np.array([0] * 5), 
        sigma=np.array([0.32, 6.94, 1.13, 5.44, 9.01]),
        shape=5
    )
    β_gender = pm.Normal(
        "β_gender", 
        mu=np.array(0), 
        sigma=np.array(1.30),
    )
    β_area = pm.Normal(
        "β_area", 
        mu=np.array([0] * 5),  
        sigma=np.array([0.86, 0.25, 1.30, 0.76, 5.33]), 
        shape=5
    )
    
    # HERE somehow concatenate the betas to get a single column vector β
    # ...
    # mu = pm.math.dot(X, β)
    # Log link
    mu = tt.exp(mu)
    
    # Get response
    response = np.array([data["claimcst0"]]).T
    
    pm.Wald("claim", mu=mu, lam = pm.HalfCauchy("claim_lam", beta=1), observed=response)
    idata_1 = pm.sample(return_inferencedata=True)

What I tried:

  • tt.concatenate with shape=(5, 1), as suggested here, but it returned the following:
TypeError: ('Non-unit value on shape on a broadcastable dimension.', (5, 5), (False, True))
  • tt.inc_subtensor, as here (a rough sketch follows this list), but my computer crashed.

  • I also tried simply removing the [:, None] in model_1 to see what happened, and my computer crashed as well.
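For reference, the inc_subtensor attempt looked roughly like this (a sketch; the indices assume the 1 + 5 + 1 + 5 = 12 coefficients above, and X is the horizontally stacked design matrix that appears explicitly further down):

# Pre-allocate a symbolic coefficient vector and fill it slice by slice
β = tt.zeros(12)
β = tt.inc_subtensor(β[0], β_0)
β = tt.inc_subtensor(β[1:6], β_age)
β = tt.inc_subtensor(β[6], β_gender)
β = tt.inc_subtensor(β[7:12], β_area)
mu = tt.exp(pm.math.dot(X, β))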

So I don’t really know where to go next with this problem. I would really appreciate any suggestions.

%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Wed May 12 2021

Python implementation: CPython
Python version       : 3.8.5
IPython version      : 7.18.1

numpy : 1.20.1
pymc3 : 3.11.2
theano: 1.0.5
pandas: 1.2.2
json  : 2.0.9

Watermark: 2.1.0

I got this working, though I still don’t know whether it is the right approach.

import arviz as az
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt

# Import and subset data
data = pd.read_csv("docs/notebooks/data/carclaims.csv")
data = data[data["claimcst0"] > 0]

# Prepare data
intercept = np.ones((len(data), 1))
age = pd.get_dummies(data["agecat"], drop_first=True).to_numpy()
area = pd.get_dummies(pd.Categorical(data.area), drop_first=True).to_numpy()
gender = pd.get_dummies(pd.Categorical(data.gender), drop_first=True).to_numpy()

X = np.hstack([intercept, age, area, gender])
X = tt.as_tensor_variable(X)

# Create model and sample posterior
with pm.Model() as model_no_interaction:  
    # Build predictors
    β_0 = pm.Normal(
        "β_0", 
        mu=np.array([[7.60807869]]), 
        sigma=np.array([[2.73386528]]),
        shape=(1, 1)
    )  
    β_age = pm.Normal(
        "β_age",       
        mu=np.array([[0], [0], [0], [0], [0]]), 
        sigma=np.array([[0.31867117], [6.93840801], [1.13150523], [5.43977033], [9.01140827]]),
        shape=(5, 1)
    )
    β_gender = pm.Normal(
        "β_gender", 
        mu=np.array([[0]]), 
        sigma=np.array([[1.304491]]),
        shape=(1, 1)
    )
    β_area = pm.Normal(
        "β_area", 
        mu=np.array([[0], [0], [0], [0], [0]]), 
        sigma=np.array([[0.85686228], [0.25352232], [1.29795299], [0.76293516], [5.32647894]]), 
        shape=(5, 1)
    )
    # Compute linear predictor
    β = pm.math.concatenate([β_0, β_age, β_gender, β_area], axis=0)
    mu = tt.exp(X.dot(β))
    
    response = np.array([data["claimcst0"]]).T
    
    pm.Wald("claim", mu=mu, lam = pm.HalfCauchy("claim_lam", beta=1), observed=response)
    idata = pm.sample(return_inferencedata=True)

# Obtain summary
az.summary(idata)


I’m thinking that if we could convert β into a vector instead of a (p, 1) matrix, the second dimension would disappear, but I don’t know how to do that. β.flatten() and β.reshape((β.shape[0],)) work, but the sampler does not move, and β.squeeze() does not work.

Did you try changing shape=(1, 1) to shape=(1,) and shape=(5, 1) to shape=(5,)? I think that might work.
Otherwise, what you are doing is correct (i.e. the same as model 1).

Using shape=(1,) and shape=(5,) produces the same effect as using flatten() and reshape(). It seems to start working, but the sampler estimates it is going to take ~30 minutes, while the dot product using two objects with ndim=2 takes ~10 seconds.

I think I’m going to keep the dot product with two-dimensional objects and then squeeze the posterior in the InferenceData object.
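Something like this should do for the squeeze (a sketch; xarray’s Dataset.squeeze drops the length-1 dimensions that the (p, 1)-shaped RVs introduce):

# Drop the singleton dimensions from the posterior before summarizing
post = idata.posterior.squeeze(drop=True)
az.summary(post)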

Thanks for the input anyway!

I am slightly surprised by the slowness. Even with a vector β you can use a dot product, right?

Yes, I can still use dot product, but it is very slow (and sometimes my kernel is killed).

Example


import arviz as az
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt

# Import and subset data
data = pd.read_csv("docs/notebooks/data/carclaims.csv")

data = data[data["claimcst0"] > 0]
# Prepare data
intercept = np.ones((len(data), 1))
age = pd.get_dummies(data["agecat"], drop_first=True).to_numpy()
area = pd.get_dummies(pd.Categorical(data.area), drop_first=True).to_numpy()
gender = pd.get_dummies(pd.Categorical(data.gender), drop_first=True).to_numpy()

# IMPORTANT! Convert X to tensor
X = np.hstack([intercept, age, area, gender])
X = tt.as_tensor_variable(X)

# Create model and sample posterior
with pm.Model() as model_no_interaction:  
    # Build predictors
    β_0 = pm.Normal(
        "β_0", 
        mu=np.array([7.60807869]), 
        sigma=np.array([2.73386528]),
        shape=(1,)
    )  
    β_age = pm.Normal(
        "β_age",       
        mu=np.array([0] * 5), 
        sigma=np.array([0.31867117, 6.93840801, 1.13150523, 5.43977033, 9.01140827]),
        shape=(5,)
    )
    β_gender = pm.Normal(
        "β_gender", 
        mu=np.array([0]), 
        sigma=np.array([1.304491]),
        shape=(1,)
    )
    β_area = pm.Normal(
        "β_area", 
        mu=np.array([0] * 5), 
        sigma=np.array([0.85686228, 0.25352232, 1.29795299, 0.76293516, 5.32647894]), 
        shape=(5,)
    )
    # Compute linear predictor
    β = pm.math.concatenate([β_0, β_age, β_gender, β_area], axis=0)
    mu = tt.exp(X.dot(β))
    response = np.array([data["claimcst0"]]).T
    pm.Wald("claim", mu=mu, lam = pm.HalfCauchy("claim_lam", beta=1), observed=response)
    idata = pm.sample(return_inferencedata=True)
    
# Obtain summary
az.summary(idata)

An update here

This simplified version works the same way as the one doing matrix multiplications:

response = data.claimcst0.values

# Create model and sample posterior
with pm.Model() as model_no_interaction:  
    # Build predictors
    β_0 = pm.Normal(
        "β_0", 
        mu=7.60807869, 
        sigma=2.73386528,
        shape=1
    )  
    β_age = pm.Normal(
        "β_age",       
        mu=np.array([0] * 5), 
        sigma=np.array([0.31867117, 6.93840801, 1.13150523, 5.43977033, 9.01140827]),
        shape=5
    )
    β_gender = pm.Normal(
        "β_gender", 
        mu=0, 
        sigma=1.304491,
        shape=1
    )
    β_area = pm.Normal(
        "β_area", 
        mu=np.array([0] * 5), 
        sigma=np.array([0.85686228, 0.25352232, 1.29795299, 0.76293516, 5.32647894]), 
        shape=5
    )
    
    # Compute linear predictor
    β = tt.concatenate([β_0, β_age, β_gender, β_area])
    mu = tt.exp(pm.math.dot(X, β))
    
    pm.Wald("claim", mu=mu, lam = pm.HalfCauchy("claim_lam", beta=1), observed=response)
    idata = pm.sample(return_inferencedata=True)

What changed?

It looks like my mistake was using a two-dimensional response when the random variables were one-dimensional: a mu of shape (n,) broadcast against an (n, 1) response presumably produces an (n, n) likelihood, which would explain both the slowness and the crashes. Now that everything is one-dimensional, the sampler runs like a charm.
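A plain NumPy illustration of the failure mode (hypothetical tiny shapes; the point is that a 1-d array and an (n, 1) array broadcast pairwise):

import numpy as np

mu_1d = np.zeros(4)             # stands in for a mu of shape (n,)
response_2d = np.zeros((4, 1))  # stands in for a response of shape (n, 1)
print((mu_1d - response_2d).shape)  # (4, 4): n**2 terms instead of n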

Take-home message

Make sure all the variables in the model have the same number of dimensions (i.e. the .ndim values of all RVs match), including the observed one.
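A minimal check, using the variables from the model above:

# These should all print the same number of dimensions (1 in this model)
print(β.ndim, mu.ndim, response.ndim)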
