Issue passing data to a model with Custom Likelihood

Hello,

I’m ultimately trying to make a hierarchical model with a custom likelihood function and I’ve followed this great post as a guideline. To make my life easier, I’m going with an unpooled model to begin with, but I am having an issue pushing data into the custom likelihood function.

I’m fitting a beta geometric model to my data and wanted to start by doing several groups at the same time (unpooled). I replicated the data from Fader & Hardie a few times as a test. I’ve tried making a Pandas DataFrame to store and access the data, but that failed, so I’ve moved on to using a dictionary. My code is:

import numpy as np
import pandas as pd
import pymc3 as pm
import arviz as az
from scipy.special import beta as beta_f
import theano.tensor as tt
import theano
from pymc3.distributions.dist_math import betaln

test_dict = {}
for i in range(n_groups):
    test_dict[i] = {}
    test_dict[i]['survived'] = np.array([869, 743, 653, 593, 551, 517, 491])
    test_dict[i]['churned'] = np.array([131, 126, 90, 60, 42, 34, 26])

n_groups = len(test_dict.keys())

with pm.Model() as unpooled_sbg_model:
    alpha = pm.Uniform('alpha', 0.0001, 100, shape = n_groups)
    beta = pm.Uniform('beta', 0.0001, 100, shape = n_groups)
    
    def logp(data_dict, alpha = alpha, beta = beta):
        log_like = tt.cast(0., 'float64')
        
        for i in range(n_groups):
            survived = data_dict[i]['survived']
            churned = data_dict[i]['churned']
            
            n_obs = np.shape(survived)[0]
            
            ii = theano.shared(i)  # From the otherr link I saw this needs to be a shared variable
            # Calculate the final survival probability (eq. 6)
            ln_surv_prob = betaln(alpha[ii], beta[ii] + n_obs) - betaln(alpha[ii], beta[ii])

            # Find the probability of churn for all values prior
            ln_prob_vec = []
            
            for j in range(1, n_obs + 1):
                ln_prob_vec.append(betaln(alpha[ii] + 1, beta[ii] + j - 1) - betaln(alpha[ii], beta[ii]))
            ln_prob_vec = tt.as_tensor_variable(ln_prob_vec, 'ln_prob_vec', ndim = 1)
            
            log_like += pm.math.dot(churned, ln_prob_vec) + survived[-1] * ln_surv_prob 

        return log_like
    
    likelihood = pm.DensityDist('likelihood', logp, observed = {'data_dict': test_dict})
    unpooled_sbg_trace = pm.sample(tune = 1000, draws = 200, chains = 4, cores = 4)

However I get the error:


TypeErrorTraceback (most recent call last)
<ipython-input-34-2cba9497e014> in <module>
     44         return log_like
     45 
---> 46     likelihood = pm.DensityDist('likelihood', logp, observed = {'data_dict': test_dict})
     47     unpooled_sbg_trace = pm.sample(tune = 1000, draws = 200, chains = 4, cores = 4)

/usr/local/lib/python3.8/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
    120         else:
    121             dist = cls.dist(*args, **kwargs)
--> 122         return model.Var(name, dist, data, total_size, dims=dims)
    123 
    124     def __getnewargs__(self):

/usr/local/lib/python3.8/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size, dims)
   1160         elif isinstance(data, dict):
   1161             with self:
-> 1162                 var = MultiObservedRV(
   1163                     name=name,
   1164                     data=data,

/usr/local/lib/python3.8/site-packages/pymc3/model.py in __init__(self, name, data, distribution, total_size, model)
   1864         """
   1865         self.name = name
-> 1866         self.data = {
   1867             name: as_tensor(data, name, model, distribution) for name, data in data.items()
   1868         }

/usr/local/lib/python3.8/site-packages/pymc3/model.py in <dictcomp>(.0)
   1865         self.name = name
   1866         self.data = {
-> 1867             name: as_tensor(data, name, model, distribution) for name, data in data.items()
   1868         }
   1869 

/usr/local/lib/python3.8/site-packages/pymc3/model.py in as_tensor(data, name, model, distribution)
   1745 def as_tensor(data, name, model, distribution):
   1746     dtype = distribution.dtype
-> 1747     data = pandas_to_array(data).astype(dtype)
   1748 
   1749     if hasattr(data, "mask"):

/usr/local/lib/python3.8/site-packages/pymc3/model.py in pandas_to_array(data)
   1740     # needed for uses of this function other than with pm.Data:
   1741     else:
-> 1742         return pm.floatX(ret)
   1743 
   1744 

/usr/local/lib/python3.8/site-packages/pymc3/theanof.py in floatX(X)
     81     """
     82     try:
---> 83         return X.astype(theano.config.floatX)
     84     except AttributeError:
     85         # Scalar passed

TypeError: float() argument must be a string or a number, not 'dict'

It seems the error is coming from the observed part of the pm.DensityDist call, but I can’t understand what the issue is and more importantly how to fix it.

Any assistance would be greatly appreciated!

I would assume that the observed data needs to be a passed in as a numpy array that could then be converted into a tt/aesara shared variable within the likelihood function. Have you tried that?

Hi there! Thanks for the suggestion. It sort of worked in the sense that it changed the error. I created an array of arrays:

test_array = np.array(test_dict[0])
for i in range (1, n_groups):
    test_array = np.append(test_array, test_dict[i])

and then modified the code to accept this and run it:

with pm.Model() as unpooled_sbg_model:
    alpha = pm.Uniform('alpha', 0.0001, 100, shape = n_groups)
    beta = pm.Uniform('beta', 0.0001, 100, shape = n_groups)
    
    def logp(data_array, alpha = alpha, beta = beta):
        log_like = tt.cast(0., 'float64')
        
        for i, group in enumerate(group_cats):
    
            survived = data_array[i]['survived']
            churned = data_array[i]['churned']
            
            n_obs = np.shape(survived)[0]
            
            ii = theano.shared(i)
            # Calculate the final surivival probability (eq. 6)
            ln_surv_prob = betaln(alpha[ii], beta[ii] + n_obs) - betaln(alpha[ii], beta[ii])

            # Find the probability of churn for all values prior
            ln_prob_vec = []
            
            for j in range(1, n_obs + 1):
                ln_prob_vec.append(betaln(alpha[ii] + 1, beta[ii] + j - 1) - betaln(alpha[ii], beta[ii]))
            ln_prob_vec = tt.as_tensor_variable(ln_prob_vec, 'ln_prob_vec', ndim = 1)
            
            log_like += pm.math.dot(churned, ln_prob_vec) + survived[-1] * ln_surv_prob 

        return log_like
    
    likelihood = pm.DensityDist('likelihood', logp, observed = {'data_array': test_array})
    unpooled_sbg_trace = pm.sample(tune = 1000, draws = 200, chains = 4, cores = 4)

Only now I get:

TypeErrorTraceback (most recent call last)
<ipython-input-40-a27502279da0> in <module>
     28         return log_like
     29 
---> 30     likelihood = pm.DensityDist('likelihood', logp, observed = {'data_array': test_array})
     31     unpooled_sbg_trace = pm.sample(tune = 12000, draws = 2000, target_accept = 0.99, chains = 4, cores = 4) # just a sanity check to see if it even runs...

/usr/local/lib/python3.8/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
    120         else:
    121             dist = cls.dist(*args, **kwargs)
--> 122         return model.Var(name, dist, data, total_size, dims=dims)
    123 
    124     def __getnewargs__(self):

/usr/local/lib/python3.8/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size, dims)
   1160         elif isinstance(data, dict):
   1161             with self:
-> 1162                 var = MultiObservedRV(
   1163                     name=name,
   1164                     data=data,

/usr/local/lib/python3.8/site-packages/pymc3/model.py in __init__(self, name, data, distribution, total_size, model)
   1864         """
   1865         self.name = name
-> 1866         self.data = {
   1867             name: as_tensor(data, name, model, distribution) for name, data in data.items()
   1868         }

/usr/local/lib/python3.8/site-packages/pymc3/model.py in <dictcomp>(.0)
   1865         self.name = name
   1866         self.data = {
-> 1867             name: as_tensor(data, name, model, distribution) for name, data in data.items()
   1868         }
   1869 

/usr/local/lib/python3.8/site-packages/pymc3/model.py in as_tensor(data, name, model, distribution)
   1745 def as_tensor(data, name, model, distribution):
   1746     dtype = distribution.dtype
-> 1747     data = pandas_to_array(data).astype(dtype)
   1748 
   1749     if hasattr(data, "mask"):

/usr/local/lib/python3.8/site-packages/pymc3/model.py in pandas_to_array(data)
   1716         else:
   1717             # already a ndarray, but not masked
-> 1718             mask = np.isnan(data)
   1719             if np.any(mask):
   1720                 ret = np.ma.MaskedArray(data, mask)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

but I have no idea why it’s calling pandas to array and hating np.isnan. Do you have ideas?

Sorry, I guess what I meant was a numpy array of “raw” values (floats, ints, etc.). Right now, you’ve packaged your dictionaries into a numpy array, but then you still have to deal with dictionaries as the entries in that array (and run into trouble converting those arrays into tensors).

Brilliant! That seems to have fixed it generally (I did get an error where n_obs was somehow being converted into a Theano shared variable), but it is currently running.

I appreciate all your help!