LDA implementation with pymc3

@junpenglao I am trying to implement the LDA model for a dataset from Scikit learn. For a dummy dataset my model works fine but in this case it gives MemoryError: None .

My code is :

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pymc3 as pm, theano.tensor as t
from theano import shared

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

no_features = 1000

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

# we have sparse dataset. It's better to have dence batch so that all words accure there
#minibatch_size = 128

# defining minibatch
#doc_t_minibatch = pm.Minibatch(tf.toarray(), minibatch_size)
#doc_t = shared(tf.toarray()[:minibatch_size])

K = 20 # number of topics
V = 1000 # number of words

alpha = np.ones((1, K))
beta = np.ones((1, V))
model = pm.Model()
(D, W) = tf.shape
        
with model: 
    theta = pm.Dirichlet("thetas", a=alpha, shape=(D, K))
    phi = pm.Dirichlet("phis", a=beta, shape=(K, V))
    z = pm.Categorical("zx", p=theta, shape=(W,D))
    w = pm.Categorical("wx", 
                       p=t.reshape(phi[z.T], (D*W, V)), 
                       observed=tf.reshape(D*W))
    
   
with model:    
    tr = pm.sample(1000,chains = 1)
    pm.plots.traceplot(tr, ['thetas','phis']);

Can you please help me with this? What could go wrong?