Custom likelihood function from kernel density estimation

I’m trying to use pymc3 with a likelihood function derived from some observed data. The data doesn’t fit any nice, standard distribution, so I want to define my own distribution based on the observations.

One approach is to use kernel density estimation over the observations. This was possible in pymc2, but doesn’t play nicely with the Theano variables in pymc3.

In my code below I’m just generating some dummy data that is normally distributed. As the prior on the latent variable, I’m assuming a uniform distribution.

Here’s my code:

from scipy import stats
import numpy as np
import pymc3 as pm
from sklearn.neighbors import KernelDensity

# Dummy data: 1000 draws from a standard normal
data = np.sort(stats.norm.rvs(loc=0, scale=1, size=1000))
kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(data.reshape(-1, 1))

def get_log_likelihood(x):
    # score_samples returns the log-density at each point in x;
    # it expects a NumPy array of shape (n_samples, n_features)
    return kde.score_samples(x)

with pm.Model() as test_model:
    x = pm.Uniform('prior rv', lower=-10, upper=10)
    obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})

    step = pm.Metropolis()
    trace = pm.sample(200, step=step)

The error I receive comes from kde.score_samples blowing up: it expects a NumPy array, but x is a symbolic Theano variable (see the workaround sketch after the traceback).

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-49-4efbbe7376dc> in <module>()
      1 with pm.Model() as test_model:
      2     x = pm.Uniform('prior rv', lower=-10, upper=10)
----> 3     obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})
      4 
      5     step = pm.Metropolis()

~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
     40             total_size = kwargs.pop('total_size', None)
     41             dist = cls.dist(*args, **kwargs)
---> 42             return model.Var(name, dist, data, total_size)
     43         else:
     44             raise TypeError("Name needs to be a string but got: {}".format(name))

~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size)
    825             with self:
    826                 var = MultiObservedRV(name=name, data=data, distribution=dist,
--> 827                                       total_size=total_size, model=self)
    828             self.observed_RVs.append(var)
    829             if var.missing_values:

~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in __init__(self, name, data, distribution, total_size, model)
   1372         self.missing_values = [datum.missing_values for datum in self.data.values()
   1373                                if datum.missing_values is not None]
-> 1374         self.logp_elemwiset = distribution.logp(**self.data)
   1375         # The logp might need scaling in minibatches.
   1376         # This is done in `Factor`.

<ipython-input-48-535f58ce543b> in get_log_likelihood(x)
      1 def get_log_likelihood(x):
----> 2     return kde.score_samples(x)

~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/neighbors/kde.py in score_samples(self, X)
    150         # For it to be a probability, we must scale it.  For this reason
    151         # we'll also scale atol.
--> 152         X = check_array(X, order='C', dtype=DTYPE)
    153         N = self.tree_.data.shape[0]
    154         atol_N = self.atol * N

~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    431                                       force_all_finite)
    432     else:
--> 433         array = np.array(array, dtype=dtype, order=order, copy=copy)
    434 
    435         if ensure_2d:

ValueError: setting an array element with a sequence.
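
One direction I’ve considered (an untested sketch, reusing the data and kde fitted above; kde_logp, logp_func, and wrapped_model are just placeholder names) is wrapping the KDE call in a Theano op with theano.compile.ops.as_op, so that score_samples only ever sees concrete NumPy values. Since as_op defines no gradient, only gradient-free steppers like Metropolis would apply:

import numpy as np
import theano.tensor as tt
from theano.compile.ops import as_op

@as_op(itypes=[tt.dscalar], otypes=[tt.dscalar])
def kde_logp(value):
    # By the time Theano executes this op, `value` is a concrete
    # NumPy value, so score_samples can evaluate it. Return the
    # log-density as a 0-d float64 array to match otypes.
    return np.asarray(kde.score_samples(np.array(value).reshape(-1, 1))[0])

def logp_func(x):
    # Plain wrapper so the argument name matches the 'x' key
    # in observed={'x': x}
    return kde_logp(x)

with pm.Model() as wrapped_model:
    x = pm.Uniform('prior rv', lower=-10, upper=10)
    obs = pm.DensityDist('observed likelihood', logp_func, observed={'x': x})
    step = pm.Metropolis()
    trace = pm.sample(200, step=step)

Still, this feels clunky, and I’d prefer a cleaner approach.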

Any help would be greatly appreciated. Thanks!

IIUC, you take the histogram of the observed data, kernel-smooth it, use that as the likelihood, and evaluate a prior? It seems to me you can do the density estimation with a mixture model: http://docs.pymc.io/notebooks/dp_mix.html. Otherwise, you can try the Interpolated distribution; there's an example here: http://docs.pymc.io/notebooks/updating_priors.html
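
For the mixture route, a minimal sketch (a plain K-component normal mixture rather than the full Dirichlet-process version in the notebook; K, the hyperparameters, and the model names are arbitrary choices, and data is the array from your question):

import numpy as np
import pymc3 as pm

K = 5  # number of mixture components (arbitrary choice)
with pm.Model() as mix_model:
    w = pm.Dirichlet('w', a=np.ones(K))           # mixture weights
    mu = pm.Normal('mu', mu=0., sd=5., shape=K)   # component means
    sd = pm.HalfNormal('sd', sd=2., shape=K)      # component scales
    obs = pm.NormalMixture('obs', w=w, mu=mu, sd=sd, observed=data)
    trace = pm.sample(1000)

And for the Interpolated route, evaluate your fitted KDE on a fixed grid and let pm.Interpolated build a distribution by linear interpolation of those density values (grid bounds and resolution here are arbitrary):

x_points = np.linspace(data.min() - 1, data.max() + 1, 500)
pdf_points = np.exp(kde.score_samples(x_points.reshape(-1, 1)))

with pm.Model() as interp_model:
    # The KDE-smoothed density now acts as the distribution of x
    x = pm.Interpolated('x', x_points, pdf_points)
    trace = pm.sample(1000)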