Issue passing data to a model with Custom Likelihood

Hello,

I’m ultimately trying to make a hierarchical model with a custom likelihood function and I’ve followed this great post as a guideline. To make my life easier, I’m going with an unpooled model to begin with, but I am having an issue pushing data into the custom likelihood function.

I’m fitting a beta-geometric model to my data and want to start by fitting several groups at the same time (unpooled); as a test, I replicated the data from Fader & Hardie a few times. I tried a Pandas DataFrame to store and access the data, but that failed, so I’ve moved on to a dictionary.
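For reference, the two quantities my likelihood computes are Fader & Hardie’s per-period churn probability and their final survival probability (their eq. 6); here is a plain-NumPy sketch of that algebra, separate from the model code (the function name is just illustrative):

import numpy as np
from scipy.special import betaln as sp_betaln

# P(churn at period t)     = B(alpha + 1, beta + t - 1) / B(alpha, beta)
# P(survive past period n) = B(alpha, beta + n) / B(alpha, beta)   (eq. 6)
def bg_log_probs(alpha, beta, n_obs):
    ln_churn = np.array([sp_betaln(alpha + 1, beta + t - 1) - sp_betaln(alpha, beta)
                         for t in range(1, n_obs + 1)])
    ln_surv = sp_betaln(alpha, beta + n_obs) - sp_betaln(alpha, beta)
    return ln_churn, ln_surv

My model code is: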

import numpy as np
import pandas as pd
import pymc3 as pm
import arviz as az
from scipy.special import beta as beta_f
import theano.tensor as tt
import theano
from pymc3.distributions.dist_math import betaln

n_groups = 3  # number of replicated test groups (any small number works)

test_dict = {}
for i in range(n_groups):
    test_dict[i] = {}
    test_dict[i]['survived'] = np.array([869, 743, 653, 593, 551, 517, 491])
    test_dict[i]['churned'] = np.array([131, 126, 90, 60, 42, 34, 26])

with pm.Model() as unpooled_sbg_model:
    alpha = pm.Uniform('alpha', 0.0001, 100, shape = n_groups)
    beta = pm.Uniform('beta', 0.0001, 100, shape = n_groups)
    
    def logp(data_dict, alpha = alpha, beta = beta):
        log_like = tt.cast(0., 'float64')
        
        for i in range(n_groups):
            survived = data_dict[i]['survived']
            churned = data_dict[i]['churned']
            
            n_obs = np.shape(survived)[0]
            
            ii = theano.shared(i)  # From the other link I saw, this needs to be a shared variable
            # Calculate the final survival probability (eq. 6)
            ln_surv_prob = betaln(alpha[ii], beta[ii] + n_obs) - betaln(alpha[ii], beta[ii])

            # Find the probability of churn for all values prior
            ln_prob_vec = []
            
            for j in range(1, n_obs + 1):
                ln_prob_vec.append(betaln(alpha[ii] + 1, beta[ii] + j - 1) - betaln(alpha[ii], beta[ii]))
            ln_prob_vec = tt.as_tensor_variable(ln_prob_vec, 'ln_prob_vec', ndim = 1)
            
            log_like += pm.math.dot(churned, ln_prob_vec) + survived[-1] * ln_surv_prob 

        return log_like
    
    likelihood = pm.DensityDist('likelihood', logp, observed = {'data_dict': test_dict})
    unpooled_sbg_trace = pm.sample(tune = 1000, draws = 200, chains = 4, cores = 4)

However, I get the error:

TypeError                                 Traceback (most recent call last)
<ipython-input-34-2cba9497e014> in <module>
     44         return log_like
     45 
---> 46     likelihood = pm.DensityDist('likelihood', logp, observed = {'data_dict': test_dict})
     47     unpooled_sbg_trace = pm.sample(tune = 1000, draws = 200, chains = 4, cores = 4)

/usr/local/lib/python3.8/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
    120         else:
    121             dist = cls.dist(*args, **kwargs)
--> 122         return model.Var(name, dist, data, total_size, dims=dims)
    123 
    124     def __getnewargs__(self):

/usr/local/lib/python3.8/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size, dims)
   1160         elif isinstance(data, dict):
   1161             with self:
-> 1162                 var = MultiObservedRV(
   1163                     name=name,
   1164                     data=data,

/usr/local/lib/python3.8/site-packages/pymc3/model.py in __init__(self, name, data, distribution, total_size, model)
   1864         """
   1865         self.name = name
-> 1866         self.data = {
   1867             name: as_tensor(data, name, model, distribution) for name, data in data.items()
   1868         }

/usr/local/lib/python3.8/site-packages/pymc3/model.py in <dictcomp>(.0)
   1865         self.name = name
   1866         self.data = {
-> 1867             name: as_tensor(data, name, model, distribution) for name, data in data.items()
   1868         }
   1869 

/usr/local/lib/python3.8/site-packages/pymc3/model.py in as_tensor(data, name, model, distribution)
   1745 def as_tensor(data, name, model, distribution):
   1746     dtype = distribution.dtype
-> 1747     data = pandas_to_array(data).astype(dtype)
   1748 
   1749     if hasattr(data, "mask"):

/usr/local/lib/python3.8/site-packages/pymc3/model.py in pandas_to_array(data)
   1740     # needed for uses of this function other than with pm.Data:
   1741     else:
-> 1742         return pm.floatX(ret)
   1743 
   1744 

/usr/local/lib/python3.8/site-packages/pymc3/theanof.py in floatX(X)
     81     """
     82     try:
---> 83         return X.astype(theano.config.floatX)
     84     except AttributeError:
     85         # Scalar passed

TypeError: float() argument must be a string or a number, not 'dict'

It seems the error is coming from the observed part of the pm.DensityDist call, but I can’t understand what the issue is or, more importantly, how to fix it.

Any assistance would be greatly appreciated!

I would assume that the observed data needs to be passed in as a NumPy array that could then be converted into a tt/aesara shared variable within the likelihood function. Have you tried that?
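Something like this rough single-group sketch is what I have in mind (the model and variable names are just illustrative, not your code): pass the raw NumPy arrays via observed so that DensityDist turns them into tensors before they reach logp, and keep n_obs as a plain Python int computed outside the graph.

import numpy as np
import pymc3 as pm
import theano.tensor as tt
from pymc3.distributions.dist_math import betaln

survived = np.array([869, 743, 653, 593, 551, 517, 491], dtype='float64')
churned = np.array([131, 126, 90, 60, 42, 34, 26], dtype='float64')
n_obs = survived.shape[0]  # plain Python int, fixed before the graph is built

with pm.Model() as single_group_sketch:
    alpha = pm.Uniform('alpha', 0.0001, 100)
    beta = pm.Uniform('beta', 0.0001, 100)

    def logp(survived, churned):
        # survived / churned arrive here as tensors built from the arrays above
        ln_surv_prob = betaln(alpha, beta + n_obs) - betaln(alpha, beta)
        ln_prob_vec = tt.stack([betaln(alpha + 1, beta + j - 1) - betaln(alpha, beta)
                                for j in range(1, n_obs + 1)])
        return tt.dot(churned, ln_prob_vec) + survived[-1] * ln_surv_prob

    likelihood = pm.DensityDist('likelihood', logp,
                                observed={'survived': survived, 'churned': churned})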

Hi there! Thanks for the suggestion. It sort of worked, in the sense that it changed the error. I created an array from the per-group dictionaries:

# Pack each group's dict into a NumPy object array (one dict per entry)
test_array = np.array(test_dict[0])
for i in range(1, n_groups):
    test_array = np.append(test_array, test_dict[i])

and then modified the code to accept this and run it:

with pm.Model() as unpooled_sbg_model:
    alpha = pm.Uniform('alpha', 0.0001, 100, shape = n_groups)
    beta = pm.Uniform('beta', 0.0001, 100, shape = n_groups)
    
    def logp(data_array, alpha = alpha, beta = beta):
        log_like = tt.cast(0., 'float64')
        
        for i in range(n_groups):
            survived = data_array[i]['survived']
            churned = data_array[i]['churned']
            
            n_obs = np.shape(survived)[0]
            
            ii = theano.shared(i)
            # Calculate the final survival probability (eq. 6)
            ln_surv_prob = betaln(alpha[ii], beta[ii] + n_obs) - betaln(alpha[ii], beta[ii])

            # Find the probability of churn for all values prior
            ln_prob_vec = []
            
            for j in range(1, n_obs + 1):
                ln_prob_vec.append(betaln(alpha[ii] + 1, beta[ii] + j - 1) - betaln(alpha[ii], beta[ii]))
            ln_prob_vec = tt.as_tensor_variable(ln_prob_vec, 'ln_prob_vec', ndim = 1)
            
            log_like += pm.math.dot(churned, ln_prob_vec) + survived[-1] * ln_surv_prob 

        return log_like
    
    likelihood = pm.DensityDist('likelihood', logp, observed = {'data_array': test_array})
    unpooled_sbg_trace = pm.sample(tune = 1000, draws = 200, chains = 4, cores = 4)

Only now I get:

TypeError                                 Traceback (most recent call last)
<ipython-input-40-a27502279da0> in <module>
     28         return log_like
     29 
---> 30     likelihood = pm.DensityDist('likelihood', logp, observed = {'data_array': test_array})
     31     unpooled_sbg_trace = pm.sample(tune = 12000, draws = 2000, target_accept = 0.99, chains = 4, cores = 4) # just a sanity check to see if it even runs...

/usr/local/lib/python3.8/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
    120         else:
    121             dist = cls.dist(*args, **kwargs)
--> 122         return model.Var(name, dist, data, total_size, dims=dims)
    123 
    124     def __getnewargs__(self):

/usr/local/lib/python3.8/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size, dims)
   1160         elif isinstance(data, dict):
   1161             with self:
-> 1162                 var = MultiObservedRV(
   1163                     name=name,
   1164                     data=data,

/usr/local/lib/python3.8/site-packages/pymc3/model.py in __init__(self, name, data, distribution, total_size, model)
   1864         """
   1865         self.name = name
-> 1866         self.data = {
   1867             name: as_tensor(data, name, model, distribution) for name, data in data.items()
   1868         }

/usr/local/lib/python3.8/site-packages/pymc3/model.py in <dictcomp>(.0)
   1865         self.name = name
   1866         self.data = {
-> 1867             name: as_tensor(data, name, model, distribution) for name, data in data.items()
   1868         }
   1869 

/usr/local/lib/python3.8/site-packages/pymc3/model.py in as_tensor(data, name, model, distribution)
   1745 def as_tensor(data, name, model, distribution):
   1746     dtype = distribution.dtype
-> 1747     data = pandas_to_array(data).astype(dtype)
   1748 
   1749     if hasattr(data, "mask"):

/usr/local/lib/python3.8/site-packages/pymc3/model.py in pandas_to_array(data)
   1716         else:
   1717             # already a ndarray, but not masked
-> 1718             mask = np.isnan(data)
   1719             if np.any(mask):
   1720                 ret = np.ma.MaskedArray(data, mask)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

but I have no idea why it’s going through pandas_to_array or why np.isnan is choking on the data. Do you have any ideas?

Sorry, I guess what I meant was a NumPy array of “raw” values (floats, ints, etc.). Right now you’ve packaged your dictionaries into a NumPy array, but you still have dictionaries as the entries in that array (and run into trouble converting those entries into tensors).
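Concretely, here’s a rough sketch of what I mean, assuming every group shares the same observation window (the stacked arrays and their names are mine, not from your code):

import numpy as np
import pymc3 as pm
import theano.tensor as tt
from pymc3.distributions.dist_math import betaln

# Stack the per-group dicts into plain 2-D float arrays (groups x periods)
survived_mat = np.vstack([test_dict[i]['survived'] for i in range(n_groups)]).astype('float64')
churned_mat = np.vstack([test_dict[i]['churned'] for i in range(n_groups)]).astype('float64')
n_obs = survived_mat.shape[1]  # plain int; the same window length for every group

with pm.Model() as unpooled_sketch:
    alpha = pm.Uniform('alpha', 0.0001, 100, shape=n_groups)
    beta = pm.Uniform('beta', 0.0001, 100, shape=n_groups)

    def logp(survived, churned):
        log_like = tt.cast(0., 'float64')
        for i in range(n_groups):
            ln_surv_prob = betaln(alpha[i], beta[i] + n_obs) - betaln(alpha[i], beta[i])
            ln_prob_vec = tt.stack([betaln(alpha[i] + 1, beta[i] + j - 1) - betaln(alpha[i], beta[i])
                                    for j in range(1, n_obs + 1)])
            log_like += tt.dot(churned[i], ln_prob_vec) + survived[i, -1] * ln_surv_prob
        return log_like

    likelihood = pm.DensityDist('likelihood', logp,
                                observed={'survived': survived_mat, 'churned': churned_mat})

A plain Python index into alpha and beta works inside logp, so I don’t think you need the theano.shared(i) wrapper either.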


Brilliant! That seems to have mostly fixed it (I did get an error where n_obs was somehow being converted into a Theano shared variable), but it is currently running.
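For anyone who hits the same n_obs hiccup: I think it came from calling np.shape on the tensor inside logp, which returns a symbolic shape rather than a plain number. Computing the length from the raw NumPy array before building the model keeps it an ordinary int, e.g. (hypothetical names, reusing the stacked array from the sketch above):

survived_mat = np.vstack([test_dict[i]['survived'] for i in range(n_groups)])
n_obs = int(survived_mat.shape[1])  # plain Python int, safe to use in range() and betaln()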

I appreciate all your help!
