Hello,
I’m ultimately trying to make a hierarchical model with a custom likelihood function and I’ve followed this great post as a guideline. To make my life easier, I’m going with an unpooled model to begin with, but I am having an issue pushing data into the custom likelihood function.
I’m fitting a beta geometric model to my data and wanted to start by doing several groups at the same time (unpooled). I replicated the data from Fader & Hardie a few times as a test. I’ve tried making a Pandas DataFrame to store and access the data, but that failed, so I’ve moved on to using a dictionary. My code is:
import numpy as np
import pandas as pd
import pymc3 as pm
import arviz as az
from scipy.special import beta as beta_f
import theano.tensor as tt
import theano
from pymc3.distributions.dist_math import betaln
test_dict = {}
for i in range(n_groups):
test_dict[i] = {}
test_dict[i]['survived'] = np.array([869, 743, 653, 593, 551, 517, 491])
test_dict[i]['churned'] = np.array([131, 126, 90, 60, 42, 34, 26])
n_groups = len(test_dict.keys())
with pm.Model() as unpooled_sbg_model:
alpha = pm.Uniform('alpha', 0.0001, 100, shape = n_groups)
beta = pm.Uniform('beta', 0.0001, 100, shape = n_groups)
def logp(data_dict, alpha = alpha, beta = beta):
log_like = tt.cast(0., 'float64')
for i in range(n_groups):
survived = data_dict[i]['survived']
churned = data_dict[i]['churned']
n_obs = np.shape(survived)[0]
ii = theano.shared(i) # From the otherr link I saw this needs to be a shared variable
# Calculate the final survival probability (eq. 6)
ln_surv_prob = betaln(alpha[ii], beta[ii] + n_obs) - betaln(alpha[ii], beta[ii])
# Find the probability of churn for all values prior
ln_prob_vec = []
for j in range(1, n_obs + 1):
ln_prob_vec.append(betaln(alpha[ii] + 1, beta[ii] + j - 1) - betaln(alpha[ii], beta[ii]))
ln_prob_vec = tt.as_tensor_variable(ln_prob_vec, 'ln_prob_vec', ndim = 1)
log_like += pm.math.dot(churned, ln_prob_vec) + survived[-1] * ln_surv_prob
return log_like
likelihood = pm.DensityDist('likelihood', logp, observed = {'data_dict': test_dict})
unpooled_sbg_trace = pm.sample(tune = 1000, draws = 200, chains = 4, cores = 4)
However I get the error:
TypeErrorTraceback (most recent call last)
<ipython-input-34-2cba9497e014> in <module>
44 return log_like
45
---> 46 likelihood = pm.DensityDist('likelihood', logp, observed = {'data_dict': test_dict})
47 unpooled_sbg_trace = pm.sample(tune = 1000, draws = 200, chains = 4, cores = 4)
/usr/local/lib/python3.8/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
120 else:
121 dist = cls.dist(*args, **kwargs)
--> 122 return model.Var(name, dist, data, total_size, dims=dims)
123
124 def __getnewargs__(self):
/usr/local/lib/python3.8/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size, dims)
1160 elif isinstance(data, dict):
1161 with self:
-> 1162 var = MultiObservedRV(
1163 name=name,
1164 data=data,
/usr/local/lib/python3.8/site-packages/pymc3/model.py in __init__(self, name, data, distribution, total_size, model)
1864 """
1865 self.name = name
-> 1866 self.data = {
1867 name: as_tensor(data, name, model, distribution) for name, data in data.items()
1868 }
/usr/local/lib/python3.8/site-packages/pymc3/model.py in <dictcomp>(.0)
1865 self.name = name
1866 self.data = {
-> 1867 name: as_tensor(data, name, model, distribution) for name, data in data.items()
1868 }
1869
/usr/local/lib/python3.8/site-packages/pymc3/model.py in as_tensor(data, name, model, distribution)
1745 def as_tensor(data, name, model, distribution):
1746 dtype = distribution.dtype
-> 1747 data = pandas_to_array(data).astype(dtype)
1748
1749 if hasattr(data, "mask"):
/usr/local/lib/python3.8/site-packages/pymc3/model.py in pandas_to_array(data)
1740 # needed for uses of this function other than with pm.Data:
1741 else:
-> 1742 return pm.floatX(ret)
1743
1744
/usr/local/lib/python3.8/site-packages/pymc3/theanof.py in floatX(X)
81 """
82 try:
---> 83 return X.astype(theano.config.floatX)
84 except AttributeError:
85 # Scalar passed
TypeError: float() argument must be a string or a number, not 'dict'
It seems the error is coming from the observed
part of the pm.DensityDist
call, but I can’t understand what the issue is and more importantly how to fix it.
Any assistance would be greatly appreciated!