I’m trying to use pymc3 with a likelihood function derived from some observed data. This observed data doesn’t fit any nice, standard distribution, so I want to define my own, based on these observations.
One approach is to use kernel density estimation over the observations. This was possible in pymc2, but doesn’t play nicely with the Theano variables in pymc3.
In my code below I’m just generating some dummy data that is normally distributed. As my prior, I’m essentially assuming a uniform distribution for my observations.
Here’s my code:
from scipy import stats
import numpy as np
import pymc3 as pm
from sklearn.neighbors.kde import KernelDensity
data = np.sort(stats.norm.rvs(loc=0, scale=1, size=1000))
kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(data.reshape(-1, 1))
def get_log_likelihood(x):
return kde.score_samples(x)
with pm.Model() as test_model:
x = pm.Uniform('prior rv', lower=-10, upper=10)
obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})
step = pm.Metropolis()
trace = pm.sample(200, step=step)
The error I receive seems to be the kde score_samples
function blowing up as it expects an array, but x
is a Theano variable.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-49-4efbbe7376dc> in <module>()
1 with pm.Model() as test_model:
2 x = pm.Uniform('prior rv', lower=0.0, upper=1e6)
----> 3 obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})
4
5 step = pm.Metropolis()
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
40 total_size = kwargs.pop('total_size', None)
41 dist = cls.dist(*args, **kwargs)
---> 42 return model.Var(name, dist, data, total_size)
43 else:
44 raise TypeError("Name needs to be a string but got: {}".format(name))
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size)
825 with self:
826 var = MultiObservedRV(name=name, data=data, distribution=dist,
--> 827 total_size=total_size, model=self)
828 self.observed_RVs.append(var)
829 if var.missing_values:
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in __init__(self, name, data, distribution, total_size, model)
1372 self.missing_values = [datum.missing_values for datum in self.data.values()
1373 if datum.missing_values is not None]
-> 1374 self.logp_elemwiset = distribution.logp(**self.data)
1375 # The logp might need scaling in minibatches.
1376 # This is done in `Factor`.
<ipython-input-48-535f58ce543b> in get_log_likelihood(x)
1 def get_log_likelihood(x):
----> 2 return kde.score_samples(x)
~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/neighbors/kde.py in score_samples(self, X)
150 # For it to be a probability, we must scale it. For this reason
151 # we'll also scale atol.
--> 152 X = check_array(X, order='C', dtype=DTYPE)
153 N = self.tree_.data.shape[0]
154 atol_N = self.atol * N
~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
431 force_all_finite)
432 else:
--> 433 array = np.array(array, dtype=dtype, order=order, copy=copy)
434
435 if ensure_2d:
ValueError: setting an array element with a sequence.
Any help would be greatly appreciated. Thanks!