Hello,
I’m trying to implement the multivariate hypergeometric distribution in PyMC3. My latest attempt runs fine, but doesn’t seem to sample correctly. Here is my implementation of MvHypergeometric:
from pymc3.distributions import Discrete, draw_values, generate_samples
import theano.tensor as tt
from pymc3.util import get_variable_name
from pymc3.distributions.dist_math import bound, factln, binomln, betaln, logpow, random_choice
from pymc3.math import tround, sigmoid, logaddexp, logit, log1pexp
import pymc3 as pm
import numpy as np
class MvHypergeometric(Discrete):
def __init__(self, K, n, *args, **kwargs):
super(MvHypergeometric, self).__init__(*args, **kwargs)
N = tt.sum(K, axis=-1, keepdims=True)
n = np.squeeze(n) # works also if n is an array with a single element
self.N = N
if len(self.shape) > 1:
m = self.shape[-2]
# try:
# assert n.shape == (m,)
# except (AttributeError, AssertionError):
# n = n * tt.ones(m)
self.n = tt.shape_padright(n)
self.K = K if K.ndim > 1 else tt.shape_padleft(K)
elif n.ndim == 1:
self.n = tt.shape_padright(n)
self.K = K if K.ndim > 1 else tt.shape_padleft(K)
else:
# n is a scalar, K is a 1d array
self.n = tt.as_tensor_variable(n)
self.K = tt.as_tensor_variable(K)
self.mean = self.n * self.K / self.N
mode = tt.cast(tt.round(self.mean), 'int32')
diff = self.n - tt.sum(mode, axis=-1, keepdims=True)
inc_bool_arr = tt.abs_(diff) > 0
mode = tt.inc_subtensor(mode[inc_bool_arr.nonzero()],
diff[inc_bool_arr.nonzero()])
self.mode = mode
def _random(self, K, n, size=None):
original_dtype = K.dtype
# Set float type to int64 for numpy. This change is related to numpy issue #8317 (
# https://github.com/numpy/numpy/issues/8317)
K = K.astype('int64')
# (Unlike the Multinomial implementation this was adapted from, no
# re-normalization of K is needed here.)
# random_mv_hypergeom needs `n` to be a scalar int and `K` a
# sequence
if K.ndim == 1 and (n.ndim == 0 or (n.ndim == 1 and n.shape[0] == 1)):
# If `n` is already a scalar and `K` is a sequence, then just
# call random_mv_hypergeom with some size handling
if size is not None:
if size == K.shape:
size = None
elif size[-len(K.shape):] == K.shape:
size = size[:len(size) - len(K.shape)]
randnum = random_mv_hypergeom(K, n, size=size)
return randnum.astype(original_dtype)
# The shapes of `K` and `n` must be broadcast by hand depending on
# their ndim. We will assume that the last axis of the `K` array is
# the sequence to feed into random_mv_hypergeom. The other axes
# only have to be iterated over.
if n.ndim == K.ndim:
# K and n have the same ndim, so n.shape[-1] must be 1
if n.shape[-1] != 1:
raise ValueError('If n and K have the same number of '
'dimensions, the last axis of n must '
'have length 1. Got {} instead.\n'
'n.shape = {}\n'
'K.shape = {}.'.format(n.shape[-1],
n.shape,
K.shape))
n_p_shape = np.broadcast(np.empty(K.shape[:-1]),
np.empty(n.shape[:-1])).shape
K = np.broadcast_to(K, n_p_shape + (K.shape[-1],))
n = np.broadcast_to(n, n_p_shape + (1,))
elif n.ndim == K.ndim - 1:
# n has as many dimensions as the iteration part of K; these must
# broadcast together
n_p_shape = np.broadcast(np.empty(K.shape[:-1]),
n).shape
K = np.broadcast_to(K, n_p_shape + (K.shape[-1],))
n = np.broadcast_to(n, n_p_shape + (1,))
elif K.ndim == 1:
# K only has the sequence array. We extend it with the dimensions
# of n
n_p_shape = n.shape
K = np.broadcast_to(K, n_p_shape + (K.shape[-1],))
n = np.broadcast_to(n, n_p_shape + (1,))
elif n.ndim == 0 or (n.ndim == 1 and n.shape[0] == 1):
# n is a scalar. We extend it with the dimensions of K
n_p_shape = K.shape[:-1]
n = np.broadcast_to(n, n_p_shape + (1,))
else:
# There is no clear rule to broadcast K and n so we raise an error
raise ValueError('Incompatible shapes of n and K.\n'
'n.shape = {}\n'
'K.shape = {}'.format(n.shape, K.shape))
# Check what happens with size
if size is not None:
if size == K.shape:
size = None
_size = 1
elif size[-len(K.shape):] == K.shape:
size = size[:len(size) - len(K.shape)]
_size = np.prod(size)
else:
_size = np.prod(size)
else:
_size = 1
# We now flatten K and n up to the last dimension
K_shape = K.shape
K = np.reshape(K, (np.prod(n_p_shape), -1))
n = np.reshape(n, (np.prod(n_p_shape), -1))
# (The Multinomial re-normalization of K is not needed here.)
# We iterate calls to random_mv_hypergeom
randnum = np.asarray([
random_mv_hypergeom(KK, nn, size=_size)
for (nn, KK) in zip(n, K)
])
# We swap the iteration axis with the _size axis
randnum = np.moveaxis(randnum, 1, 0)
# We reshape the random numbers to the corresponding size + p_shape
if size is None:
randnum = np.reshape(randnum, K_shape)
else:
randnum = np.reshape(randnum, size + K_shape)
# We cast back to the original dtype
return randnum.astype(original_dtype)
def random(self, point=None, size=None):
n, K = draw_values([self.n, self.K], point=point, size=size)
samples = generate_samples(self._random, K, n,
dist_shape=self.shape,
size=size)
return samples
def logp(self, x):
n = self.n
K = self.K
N = self.N
# print('--- Call to logp ')
# print(self._repr_latex_())
# for varname in ['x', 'n', 'K', 'N']:
# val = locals()[varname]
# print('{0} = {1}'.format(varname, val))
return bound(
tt.sum(binomln(K, x)) - binomln(N,tt.sum(x, axis=-1, keepdims=True)),
tt.all(x >= 0),
tt.all(tt.eq(tt.sum(x, axis=-1, keepdims=True), n)),
tt.all(K >= 0),
tt.all(tt.eq(tt.sum(K), N)),
tt.all(tt.le(n, N)),
broadcast_conditions=False
)
def _repr_latex_(self, name=None, dist=None):
if dist is None:
dist = self
n = dist.n
K = dist.K
return r'${} \sim \text{{MVHypergeometric}}(\mathit{{K}}={}, \mathit{{n}}={})$'.format(
name,
get_variable_name(K),
get_variable_name(n))
Essentially, I copied the Multinomial implementation from PyMC3 and changed the random() and logp() methods.
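For reference, the log-probability above is (up to the bound checks) the standard multivariate hypergeometric pmf,
$$P(X = x \mid K, n) = \frac{\prod_i \binom{K_i}{x_i}}{\binom{N}{n}}, \qquad N = \sum_i K_i, \quad \sum_i x_i = n,$$
which is why logp() sums binomln(K, x) over the categories and subtracts binomln(N, n).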
Since NumPy doesn’t yet have a function for generating random multivariate hypergeometric samples, I implemented one as well:
def random_mv_hypergeom(K, n, size=1):
"""
Parameters
----------
K : one-dimensional array with the total count of each category in the population
n : sample size (number of items drawn without replacement)
size : number of independent samples to draw (default 1)
Returns
-------
Array of shape (size, len(K)); each row is one random sample, giving the number
of items drawn from each category
"""
if size is None:
size = 1
if not isinstance(size, int):
size = size[0]
n = np.repeat(a=n, repeats=size)
remaining = np.cumsum(K[::-1])[::-1]
result = np.zeros((len(K), size), dtype=int)
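# Draw the counts one category at a time: conditional on the counts already drawn,
# the count for category i is univariate hypergeometric with ngood = K[i] and
# nbad = the number of items in the remaining categories (remaining[i + 1]).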
for i, k in enumerate(K[:-1]):
n_true = n[:]
ignore_results = n<1
n[n<1] = 0
result[i] = np.random.hypergeometric(k, remaining[i + 1], n)
n -= result[i]
result[i, ignore_results] = 0
n = n_true[:]
result[-1] = n
return result.transpose()
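As a quick standalone sanity check of this sampler (the test values here are just illustrative), every draw should sum to n and the per-category means should be close to n*K/N:
check_K = np.asarray([40, 60])
check_n = 90
draws = random_mv_hypergeom(K=check_K, n=check_n, size=5000)
print(np.all(draws.sum(axis=1) == check_n))  # each draw uses exactly n items
print(draws.mean(axis=0))                    # should be close to n*K/N = [36, 54]
If those checks pass, the random draws themselves are probably not the culprit.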
I created a simple model to test my implementation and compared it to an equivalent Binomial model. Here are both models:
if __name__ == '__main__':
trueK = np.asarray([40, 60])
N = trueK.sum()
fn = 0.9
n = (N*fn).astype('int32')
obs = random_mv_hypergeom(K=trueK, n=n)[0]
p = obs/n
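# choose a (given b) so that the Beta(a, b) behind the Beta-Binomial prior on K
# has its mode at the observed fraction p (the mode of Beta(a, b) is (a-1)/(a+b-2))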
b = 10
a = (1+(b-2)*p)/(1-p)
with pm.Model() as ex:
K = pm.BetaBinomial('K', n=N, alpha=a, beta=b, testval=(N*p).astype('int32'), shape=p.shape)
p = pm.Deterministic('p', K/N)
hyp = MvHypergeometric(name='hypergeom', K=K, n=n, observed=obs)
tex1 = pm.sample(draws=10000, nchains=2)
and
with pm.Model() as ex2:
p_bin = pm.Beta(name='p_bin', alpha=1, beta=1)
binom = pm.Binomial(name='binomial', n=n, p=p_bin, observed=obs[0])
tex2 = pm.sample(draws=10000, nchains=2)
Both models compile and run fine (albeit with a warning about a small number of effective samples for the first one):
Multiprocess sampling (2 chains in 4 jobs)
Metropolis: [K]
Sampling 2 chains: 100%|██████████| 21000/21000 [00:03<00:00, 5508.65draws/s]
The number of effective samples is smaller than 10% for some parameters.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [p_bin]
Sampling 2 chains: 100%|██████████| 21000/21000 [00:07<00:00, 2783.64draws/s]
However, the summary is strange:
In[3]: pm.summary(tex1)
Out[3]:
mean sd mc_error ... hpd_97.5 n_eff Rhat
K__0 43.425600 5.464317 0.215351 ... 54.00 560.442691 1.001975
K__1 64.322450 6.868856 0.291459 ... 77.00 439.016363 1.005313
p__0 0.434256 0.054643 0.002154 ... 0.54 560.442691 1.001975
p__1 0.643224 0.068689 0.002915 ... 0.77 439.016363 1.005313
[4 rows x 7 columns]
In[4]: pm.summary(tex2)
Out[4]:
mean sd mc_error ... hpd_97.5 n_eff Rhat
p_bin 0.401749 0.051078 0.000642 ... 0.5006 8164.413999 0.999957
[1 rows x 7 columns]
Specifically, the standard deviation of p__0 in the first model is almost the same as that of p_bin in the second model. Since the sample size (n = 90) in the multivariate hypergeometric model is so close to the population size (N = 100), I would expect the standard deviation of p in the first model to be much smaller than in the second (binomial) model.
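To put a number on that expectation: the hypergeometric count carries the finite population correction
$$\mathrm{Var}_{\mathrm{hypergeom}}(x) = n\,p\,(1-p)\,\frac{N-n}{N-1} \qquad \text{vs.} \qquad \mathrm{Var}_{\mathrm{binomial}}(x) = n\,p\,(1-p),$$
so with n = 90 and N = 100 the correction factor is (100 - 90)/(100 - 1) ≈ 0.10, i.e. roughly a threefold reduction in standard deviation, whereas the two summaries above show almost identical spread.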
Also, the trace is inconsistent with the distribution parameters. The population size is N = 100, so the K values in each sample should sum to 100, and yet they frequently do not:
In[25]: plt.hist(tex1['K'].sum(axis=1))
The logp() method assigns zero posterior probability to any sample in which the K values do not sum to N, so I don’t see how this is possible.
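One way to check this directly (a rough sketch, assuming the ex model, the tex1 trace and N from above are still in scope, and that the model's compiled logp accepts a point dictionary) is to evaluate the model log-probability at one of the offending draws:
bad = np.nonzero(tex1['K'].sum(axis=1) != N)[0][0]  # a draw whose K values do not sum to N
print(ex.logp({'K': tex1['K'][bad]}))               # I would expect -inf here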
Finally, the traceplot shows what seems like a lot of autocorrelation in the hypergeometric model.
What can I do to improve my implementation of the multivariate hypergeometric distribution?