I am trying to build a hierarchical model, where records are divided into two categories: score classes and topics. I have a dataset of 239417 with 4 score classes and 12 topics. The model I am trying to build is raising and exception:
import pymc3 as pm
topic_score_model_2level = pm.Model()
n_score_classes = 4
n_topics = len(topics) * n_score_classes
batch_size = 64
data = indexed_score_topics_df
# X = pm.Minibatch(data.score.astype('float32').values, batch_size=batch_size)
# idx_topic = pm.Minibatch(data.index_topic.values, batch_size=batch_size)
# idx_score_class = pm.Minibatch(data.index_score_class.values, batch_size=batch_size)
X = data.score.astype('float32').values
idx_topic = data.index_topic.values
idx_score_class = data.index_score_class.values
with topic_score_model_2level:
global_mu = pm.Normal('global_mu', mu=10, sigma=25)
global_sd = pm.Uniform('global_sd', lower=0, upper=50)
score_class_mu = pm.Normal(
'score_class_mu',
mu=global_mu,
sd=global_sd,
shape=n_score_classes
)
score_class_sd = pm.Uniform(
'score_class_sd',
lower=100,
upper=200,
shape=n_score_classes
)
topic = pm.Normal(
'topic_mu',
mu=score_class_mu[idx_score_class],
sd=score_class_sd[idx_score_class],
shape=n_topics
)
topic_intercept = pm.Flat("topic_intercept")
theta = np.exp(topic_intercept + topic[idx_topic])
y = pm.Poisson('y', mu=theta, observed=X, total_size=len(data))
advi_fit = pm.fit(n=10000, method='fullrank_advi')
The exception is the following:
ValueError Traceback (most recent call last)
<ipython-input-321-1510f670ffea> in <module>
34 )
35
---> 36 topic = pm.Normal(
37 'topic_mu',
38 mu=score_class_mu[idx_score_class],
~/miniconda3/envs/ds38/lib/python3.8/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
120 else:
121 dist = cls.dist(*args, **kwargs)
--> 122 return model.Var(name, dist, data, total_size, dims=dims)
123
124 def __getnewargs__(self):
~/miniconda3/envs/ds38/lib/python3.8/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size, dims)
1135 if getattr(dist, "transform", None) is None:
1136 with self:
-> 1137 var = FreeRV(name=name, distribution=dist, total_size=total_size, model=self)
1138 self.free_RVs.append(var)
1139 else:
~/miniconda3/envs/ds38/lib/python3.8/site-packages/pymc3/model.py in __init__(self, type, owner, index, name, distribution, total_size, model)
1666 self.distribution = distribution
1667 self.tag.test_value = (
-> 1668 np.ones(distribution.shape, distribution.dtype) * distribution.default()
1669 )
1670 self.logp_elemwiset = distribution.logp(self)
ValueError: operands could not be broadcast together with shapes (48,) (239417,)
How should change the indexing so that pymc3 won’t try to broadcast my data sample to number of topic distributions? 48 is the total number of topic distributions = 4 score classes (level 1) * 12 topics in each (level 2). I thought that total_size=len(data)
should correct this behaviour. data.index_topic
and data.index_score_class.values
are simple indices with category ids.