Could you help me with dirichlet process mixture model?


#1

I built a hierarchical model, but there was an error. If anyone can give me some advice, I will be very grateful.
The code is:

def stick_breaking(beta):
     portion_remaining = tt.concatenate([[1], tt.extra_ops.cumprod(1 - beta)[:-1]])
     eturn beta * portion_remaining

K = 10
pm Model() as model:
    alpha = pm.Gamma('alpha', 1., 1.)
    beta = pm.Beta('beta', 1., alpha, shape=K)
    w = pm.Deterministic('w', stick_breaking(beta))
    
    Phi = []
    Delte = []
    aj = []
    Mu = []
    Eta = []
    bj = []
    comp_dist = []
for i in range(K):`
        Phi.append(pm.InverseGamma('Phi%i'%i, alpha=1, beta=1))
        Delte.append(pm.Exponential('Delte%i'%i, lam=1/4))
        aj.append(pm.InverseGamma('aj%i'%i, alpha=Phi[i], beta=Delte[i], shape=2))
        Mu.append(pm.InverseGamma('Mu%i'%i, alpha=1, beta=1))
        Eta.append(pm.Uniform('Eta%i'%i, lower=0.0, upper=1.0))
        bj.append(bj = pm.Beta('bj%i'%i, alpha=Mu[i]*Eta[i], beta=Mu[i]*(1-Eta[i]), shape=2))
        comp_dist.append(pm.Beta.dist(alpha=aj[i], beta=bj[i], shape=2))

x_obs = pm.Mixture('x_obs', w, com_dist, observed=X)
trace = pm.sample(2000, tune=1000, chains=1)

where X.shape=(1000, 2)

The error is:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-10-123d410582be> in <module>()
     24         bj.append(pm.Beta('bj%i'%i, alpha=Mu[i]*Eta[i], beta=Mu[i]*(1-Eta[i]), shape=2))
     25         comp_dist.append(pm.Beta.dist(alpha=aj[i], beta=bj[i]))
---> 26     x_obs = pm.Mixture('x_obs', w, comp_dist, observed=X)
     27 
     28     trace = pm.sample(2000, tune=1000, chains=1)

~\Anaconda3\lib\site-packages\pymc3-3.5-py3.6.egg\pymc3\distributions\distribution.py in __new__(cls, name, *args, **kwargs)
     40             total_size = kwargs.pop('total_size', None)
     41             dist = cls.dist(*args, **kwargs)
---> 42             return model.Var(name, dist, data, total_size)
     43         else:
     44             raise TypeError("Name needs to be a string but got: {}".format(name))

~\Anaconda3\lib\site-packages\pymc3-3.5-py3.6.egg\pymc3\model.py in Var(self, name, dist, data, total_size)
    836                 var = ObservedRV(name=name, data=data,
    837                                  distribution=dist,
--> 838                                  total_size=total_size, model=self)
    839             self.observed_RVs.append(var)
    840             if var.missing_values:

~\Anaconda3\lib\site-packages\pymc3-3.5-py3.6.egg\pymc3\model.py in __init__(self, type, owner, index, name, data, distribution, total_size, model)
   1317 
   1318             self.missing_values = data.missing_values
-> 1319             self.logp_elemwiset = distribution.logp(data)
   1320             # The logp might need scaling in minibatches.
   1321             # This is done in `Factor`.

~\Anaconda3\lib\site-packages\pymc3-3.5-py3.6.egg\pymc3\distributions\mixture.py in logp(self, value)
    143         w = self.w
    144 
--> 145         return bound(logsumexp(tt.log(w) + self._comp_logp(value), axis=-1),
    146                      w >= 0, w <= 1, tt.allclose(w.sum(axis=-1), 1),
    147                      broadcast_conditions=False)

~\Anaconda3\lib\site-packages\theano\tensor\var.py in __add__(self, other)
    126     def __add__(self, other):
    127         try:
--> 128             return theano.tensor.basic.add(self, other)
    129         # We should catch the minimum number of exception here.
    130         # Otherwise this will convert error when Theano flags

~\Anaconda3\lib\site-packages\theano\gof\op.py in __call__(self, *inputs, **kwargs)
    672                 thunk.outputs = [storage_map[v] for v in node.outputs]
    673 
--> 674                 required = thunk()
    675                 assert not required  # We provided all inputs
    676 

~\Anaconda3\lib\site-packages\theano\gof\op.py in rval()
    860 
    861         def rval():
--> 862             thunk()
    863             for o in node.outputs:
    864                 compute_map[o][0] = True

~\Anaconda3\lib\site-packages\theano\gof\cc.py in __call__(self)
   1733                 print(self.error_storage, file=sys.stderr)
   1734                 raise
-> 1735             reraise(exc_type, exc_value, exc_trace)
   1736 
   1737 

~\Anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
    691             if value.__traceback__ is not tb:
    692                 raise value.with_traceback(tb)
--> 693             raise value
    694         finally:
    695             value = None

ValueError: Input dimension mis-match. (input[0].shape[2] = 7, input[1].shape[2] = 2)

Could you give me some advice about this modle?
Could you give me some advice about this modle?
#2

Did you check the Dirichlet process example here? I would definitely recommend replacing the loop with vector operations as in the example and checking if your problems persist.
http://docs.pymc.io/notebooks/dp_mix.html


#3

Thanks for your reply! I have read this document, but the example is about one-dimensional, I want to model high-dimensional data and refer to this link.https://github.com/junpenglao/Planet_Sakaar_Data_Science/blob/master/WIP/%5BWIP%5D%20Bayesian%20GMM.ipynb
Then I ran into this problem.:joy:


#4

High-dimensional mixture could be quite tricky to get the shape right, as the mixture distribution was not implemented with high-dimension in mind. The safest way to start is writing down the mixture logp using pm.DensityDist. You can do a search here, there are a few similar previous discussion on this.


#5

Thank a lot!
I wil try it again.


#6

I am very sorry, I will reply you so late.
I use pm.DensityDist and the dimension of the model is reduced to one-dimensional data, then the experiment is re-run.
But there was an error.
Here is my code:

K = 8
n_samples = N
from pymc3.math import logsumexp
def KMM_logp(weight, aj, bj):
    def logp(value):
        Ncomps = 8
        logps = []
        for i in range(Ncomps):
            a = aj[i]*bj[i]
            b = aj[i]*(1 - bj[i])
            result = tt.gammaln(a + b) - tt.gammaln(a) - tt.gammaln(b) + (a - 1)*tt.log(value) + (b - 1)*tt.log(1 - value)
            logps.append(tt.log(weight[i]) + result)
        return tt.sum(logsumexp(tt.stacklists(logps)[:, :n_samples], axis=0))
    return logp

with pm.Model() as model:
    alpha = pm.Gamma('alpha', 1., 1.)
    beta = pm.Beta('beta', 1., alpha, shape=K)
    w = pm.Deterministic('w', stick_breaking(beta))
    
    Phi = pm.InverseGamma('Phi', alpha=1, beta=1)
    Delte = pm.Exponential('Delte', lam=1/8)
    Mu = pm.InverseGamma('Mu', alpha=1, beta=1)
    Eta = pm.Uniform('Eta', lower=0.0, upper=1.0)
    aj = pm.InverseGamma('aj', alpha=Phi, beta=Delte, shape=K)
    bj = pm.Beta('bj', alpha=Mu*Eta, beta=Mu*(1-Eta), shape=K)
    
    obs = pm.DensityDist('obs', KMM_logp(w, aj, bj), observed=X)

I use MCMC to inference, the error is:

AttributeError: Can't pickle local object 'KMM_logp.&lt;locals&gt;.logp'

while I use variational inference, no error, but has a bad result.


#7

This used to be a problem as we use joblib to pickle function. I think this is fixed in the newest release. If you still have error, setting cores=1 should work for sure.