I’m trying to fit a mixture of dirichlet distributions but am running into difficulties. While I don’t get any errors, the fits that I get are obviously incorrect. Here is my code:
%pylab inline
import numpy as np
import pymc3 as pm
import theano.tensor as tt
import pandas as pd
import random
import math
def create_data():
dir1 = pm.distributions.multivariate.Dirichlet.dist(np.array([1, 5, 2]))
dir2 = pm.distributions.multivariate.Dirichlet.dist(np.array([7, .5, 1]))
dir3 = pm.distributions.multivariate.Dirichlet.dist(np.array([2, 3, 3]))
data = np.concatenate((dir1.random(size=700), dir1.random(size=200), dir1.random(size=100)), axis=0)
return data
def dirichlet(n_dim, suffix=""):
if not isinstance(suffix, str):
suffix = str(suffix)
b = pm.HalfNormal("b" + suffix, sigma=10)
a = pm.Dirichlet("a" + suffix, np.ones(n_dim))
c = pm.Deterministic("c" + suffix, a * b)
return pm.Dirichlet.dist(c, shape=3)
def stick_breaking(beta):
portion_remaining = tt.concatenate([[1], tt.extra_ops.cumprod(1 - beta)[:-1]])
return beta * portion_remaining
def estimate_model(data, n_clusters, n_features):
with pm.Model() as model:
alpha = pm.Gamma('alpha', 1., 1.)
beta = pm.Beta('beta', 1, alpha, shape=n_clusters)
w = pm.Dirichlet('w', stick_breaking(beta), shape=n_clusters)
obs = pm.Mixture('obs', w, [dirichlet(3, k) for k in range(n_clusters)], observed=data)
trace = pm.sample(50000, tune=10000)
pm.traceplot(trace, ["w", "a0", "b0", "c0", "a1", "b1", "c1"])
return pm.summary(trace)
data = create_data()
summ = estimate_model(data, 10, 3)
print(summ)
To match the data generating process, I should end up with 3 w’s with significant weight, at about .7, .2, and .1, and the c’s should reflect the vectors in the create_data method. I never get anything close to this result.
Any suggestions?