@junpenglao I am trying to implement the LDA model for a dataset from Scikit learn. For a dummy dataset my model works fine but in this case it gives MemoryError: None .
My code is :
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pymc3 as pm, theano.tensor as t
from theano import shared
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
no_features = 1000
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()
# we have sparse dataset. It's better to have dence batch so that all words accure there
#minibatch_size = 128
# defining minibatch
#doc_t_minibatch = pm.Minibatch(tf.toarray(), minibatch_size)
#doc_t = shared(tf.toarray()[:minibatch_size])
K = 20 # number of topics
V = 1000 # number of words
alpha = np.ones((1, K))
beta = np.ones((1, V))
model = pm.Model()
(D, W) = tf.shape
with model:
theta = pm.Dirichlet("thetas", a=alpha, shape=(D, K))
phi = pm.Dirichlet("phis", a=beta, shape=(K, V))
z = pm.Categorical("zx", p=theta, shape=(W,D))
w = pm.Categorical("wx",
p=t.reshape(phi[z.T], (D*W, V)),
observed=tf.reshape(D*W))
with model:
tr = pm.sample(1000,chains = 1)
pm.plots.traceplot(tr, ['thetas','phis']);
Can you please help me with this? What could go wrong?