Implementation of Multivariate Hypergeometric distribution not sampling correctly

Hello,

I’m trying to implement the Multivariate Hypergeometric distribution in PyMC3. My latest attempt runs fine, but doesn’t seem to sample correctly. Here is my implementation of MvHypergeometric:

from pymc3.distributions import Discrete, draw_values, generate_samples
import theano.tensor as tt
from pymc3.util import get_variable_name
from pymc3.distributions.dist_math import bound, factln, binomln, betaln, logpow, random_choice
from pymc3.math import tround, sigmoid, logaddexp, logit, log1pexp
import pymc3 as pm
import numpy as np

class MvHypergeometric(Discrete):

    def __init__(self, K, n, *args, **kwargs):
        super(MvHypergeometric, self).__init__(*args, **kwargs)

        N = tt.sum(K, axis=-1, keepdims=True)
        n = np.squeeze(n)  # works also if n is a sequence

        self.N = N
        if len(self.shape) > 1:
            m = self.shape[-2]
            # try:
            #     assert n.shape == (m,)
            # except (AttributeError, AssertionError):
            #     n = n * tt.ones(m)
            self.n = tt.shape_padright(n)
            self.K = K if K.ndim > 1 else tt.shape_padleft(K)
        elif n.ndim == 1:
            self.n = tt.shape_padright(n)
            self.K = K if K.ndim > 1 else tt.shape_padleft(K)
        else:
            # n is a scalar, K is a 1d array
            self.n = tt.as_tensor_variable(n)
            self.K = tt.as_tensor_variable(K)

        self.mean = self.n * self.K / self.N
        mode = tt.cast(tt.round(self.mean), 'int32')
        diff = self.n - tt.sum(mode, axis=-1, keepdims=True)
        inc_bool_arr = tt.abs_(diff) > 0
        mode = tt.inc_subtensor(mode[inc_bool_arr.nonzero()],
                                diff[inc_bool_arr.nonzero()])
        self.mode = mode

    def _random(self, K, n, size=None):
        original_dtype = K.dtype
        # Cast K to int64 for numpy. This change is related to numpy issue #8317
        # (https://github.com/numpy/numpy/issues/8317)
        K = K.astype('int64')

        # random_mv_hypergeom needs `n` to be a scalar int and `K` a
        # sequence
        if K.ndim == 1 and (n.ndim == 0 or (n.ndim == 1 and n.shape[0] == 1)):
            # If `n` is already a scalar and `K` is a sequence, then just
            # call random_mv_hypergeom with some size handling
            if size is not None:
                if size == K.shape:
                    size = None
                elif size[-len(K.shape):] == K.shape:
                    size = size[:len(size) - len(K.shape)]
            randnum = random_mv_hypergeom(K, n, size=size)
            return randnum.astype(original_dtype)
        # The shapes of `K` and `n` must be broadcast by hand depending on
        # their ndim. We will assume that the last axis of the `K` array is
        # the sequence to feed into random_mv_hypergeom. The other axes
        # will only have to be iterated over.
        if n.ndim == K.ndim:
            # K and n have the same ndim, so n.shape[-1] must be 1
            if n.shape[-1] != 1:
                raise ValueError('If n and K have the same number of '
                                 'dimensions, the last axis of n must '
                                 'have length 1. Got {} instead.\n'
                                 'n.shape = {}\n'
                                 'K.shape = {}.'.format(n.shape[-1],
                                                        n.shape,
                                                        K.shape))
            n_p_shape = np.broadcast(np.empty(K.shape[:-1]),
                                     np.empty(n.shape[:-1])).shape
            K = np.broadcast_to(K, n_p_shape + (K.shape[-1],))
            n = np.broadcast_to(n, n_p_shape + (1,))
        elif n.ndim == K.ndim - 1:
            # n has the number of dimensions of K needed for the iteration;
            # these must broadcast together
            n_p_shape = np.broadcast(np.empty(K.shape[:-1]),
                                     n).shape
            K = np.broadcast_to(K, n_p_shape + (K.shape[-1],))
            n = np.broadcast_to(n, n_p_shape + (1,))
        elif K.ndim == 1:
            # K only has the sequence array. We extend it with the dimensions
            # of n
            n_p_shape = n.shape
            K = np.broadcast_to(K, n_p_shape + (K.shape[-1],))
            n = np.broadcast_to(n, n_p_shape + (1,))
        elif n.ndim == 0 or (n.ndim == 1 and n.shape[0] == 1):
            # n is a scalar. We extend it with the dimensions of K
            n_p_shape = K.shape[:-1]
            n = np.broadcast_to(n, n_p_shape + (1,))
        else:
            # There is no clear rule to broadcast K and n so we raise an error
            raise ValueError('Incompatible shapes of n and K.\n'
                             'n.shape = {}\n'
                             'K.shape = {}'.format(n.shape, K.shape))

        # Check what happens with size
        if size is not None:
            if size == K.shape:
                size = None
                _size = 1
            elif size[-len(K.shape):] == K.shape:
                size = size[:len(size) - len(K.shape)]
                _size = np.prod(size)
            else:
                _size = np.prod(size)
        else:
            _size = 1

        # We now flatten K and n up to the last dimension
        K_shape = K.shape
        K = np.reshape(K, (np.prod(n_p_shape), -1))
        n = np.reshape(n, (np.prod(n_p_shape), -1))
        # We iterate calls to random_mv_hypergeom
        randnum = np.asarray([
            random_mv_hypergeom(KK, nn, size=_size)
            for (nn, KK) in zip(n, K)
        ])
        # We swap the iteration axis with the _size axis
        randnum = np.moveaxis(randnum, 1, 0)
        # We reshape the random numbers to the corresponding size + K_shape
        if size is None:
            randnum = np.reshape(randnum, K_shape)
        else:
            randnum = np.reshape(randnum, size + K_shape)
        # We cast back to the original dtype
        return randnum.astype(original_dtype)

    def random(self, point=None, size=None):
        n, K = draw_values([self.n, self.K], point=point, size=size)
        samples = generate_samples(self._random, K, n,
                                   dist_shape=self.shape,
                                   size=size)
        return samples

    def logp(self, x):
        n = self.n
        K = self.K
        N = self.N

        # print('--- Call to logp ')
        # print(self._repr_latex_())
        # for varname in ['x', 'n', 'K', 'N']:
        #     val = locals()[varname]
        #     print('{0} = {1}'.format(varname, val))

        return bound(
            tt.sum(binomln(K, x)) - binomln(N,tt.sum(x, axis=-1, keepdims=True)),
            tt.all(x >= 0),
            tt.all(tt.eq(tt.sum(x, axis=-1, keepdims=True), n)),
            tt.all(K >= 0),
            tt.all(tt.eq(tt.sum(K), N)),
            tt.all(tt.le(n, N)),
            broadcast_conditions=False
        )

    def _repr_latex_(self, name=None, dist=None):
        if dist is None:
            dist = self
        n = dist.n
        K = dist.K
        return r'${} \sim \text{{MVHypergeometric}}(\mathit{{K}}={}, \mathit{{n}}={})$'.format(
            name,
            get_variable_name(K),
            get_variable_name(n))

Essentially, I copied the Multinomial implementation from PyMC3 and changed the random() and logp() methods.

Since NumPy doesn’t yet have a function for generating random multivariate hypergeometric samples, I implemented one as well:

def random_mv_hypergeom(K, n, size=1):
    """
    Parameters
    ----------
    K : one-dimensional array containing the total amount of each category in the population
    n : sample size

    Returns
    -------
    One-dimensional array with the same length as `K` containing a random sample of the
    population
    """

    if size is None:
        size = 1

    if not isinstance(size, (int, np.integer)):
        size = size[0]

    n = np.repeat(a=n, repeats=size)

    remaining = np.cumsum(K[::-1])[::-1]
    result = np.zeros((len(K), size), dtype=int)
    for i, k in enumerate(K[:-1]):
        # Rows whose sample is already exhausted get a count of 0. A placeholder
        # of 1 keeps np.random.hypergeometric happy (it requires nsample >= 1);
        # the corresponding results are zeroed out afterwards.
        ignore = n < 1
        n_safe = np.where(ignore, 1, n)
        result[i] = np.random.hypergeometric(k, remaining[i + 1], n_safe)
        result[i, ignore] = 0
        n = n - result[i]

    result[-1] = n
    return result.transpose()
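
For example, here is a quick sanity check (the counts are made up): every row of the output should sum to n, and no category should ever exceed its population count.

K_test = np.array([3, 4, 5])
draws = random_mv_hypergeom(K=K_test, n=10, size=5)
print(draws.shape)              # (5, 3)
print(draws.sum(axis=1))        # every row sums to 10
print((draws <= K_test).all())  # True: no category is oversampled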

I created a simple model to test my implementation, and I compared it to a similar Multinomial model. Here are both models:

if __name__ == '__main__':

    trueK = np.asarray([40, 60])
    N = trueK.sum()

    fn = 0.9
    n = (N*fn).astype('int32')
    obs = random_mv_hypergeom(K=trueK, n=n)[0]
    p = obs/n

    b = 10
    a = (1+(b-2)*p)/(1-p)
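    # this sets the mode of the underlying Beta(a, b) prior to the observed proportion p:
    # (a - 1)/(a + b - 2) = p  =>  a = (1 + (b - 2) p)/(1 - p)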

    with pm.Model() as ex:
        K = pm.BetaBinomial('K', n=N, alpha=a, beta=b, testval=(N*p).astype('int32'), shape=p.shape)
        p = pm.Deterministic('p', K/N)
        hyp = MvHypergeometric(name='hypergeom', K=K, n=n, observed=obs)

        tex1 = pm.sample(draws=10000, nchains=2)

and

    with pm.Model() as ex2:
        p_bin = pm.Beta(name='p_bin', alpha=1, beta=1)
        binom = pm.Binomial(name='binomial', n=n, p=p_bin, observed=obs[0])

        tex2 = pm.sample(draws=10000, nchains=2)

Both these models compile and run fine (albeit with warnings about small effective samples):

Multiprocess sampling (2 chains in 4 jobs)
Metropolis: [K]
Sampling 2 chains: 100%|██████████| 21000/21000 [00:03<00:00, 5508.65draws/s]
The number of effective samples is smaller than 10% for some parameters.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [p_bin]
Sampling 2 chains: 100%|██████████| 21000/21000 [00:07<00:00, 2783.64draws/s]

However, the summary is strange:

 In[3]: pm.summary(tex1)
Out[3]: 
           mean        sd  mc_error    ...     hpd_97.5       n_eff      Rhat
K__0  43.425600  5.464317  0.215351    ...        54.00  560.442691  1.001975
K__1  64.322450  6.868856  0.291459    ...        77.00  439.016363  1.005313
p__0   0.434256  0.054643  0.002154    ...         0.54  560.442691  1.001975
p__1   0.643224  0.068689  0.002915    ...         0.77  439.016363  1.005313
[4 rows x 7 columns]

 In[4]: pm.summary(tex2)
Out[4]: 
           mean        sd  mc_error    ...     hpd_97.5        n_eff      Rhat
p_bin  0.401749  0.051078  0.000642    ...       0.5006  8164.413999  0.999957
[1 rows x 7 columns]

Specifically, the standard deviation of the variable p__0 in the first model is almost the same as that of the p_bin variable in the second model. Since the number of samples (n = 90) in the multivariate hypergeometric model is so close to the population size (N = 100), I would expect the standard deviation of p in the first model to be significantly smaller than in the second (binomial) model.
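
As a rough back-of-the-envelope check: the hypergeometric variance carries a finite-population correction factor of (N - n)/(N - 1) relative to the binomial, so with n = 90 draws out of N = 100 the spread should shrink by roughly a factor of three:

print(np.sqrt((100 - 90) / (100 - 1)))  # ~0.32: rough expected shrinkage of the sd relative to the binomial model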

Also, the trace is inconsistent with the distribution parameters. The population size is N = 100, so the sum of the Ks in each sample should be 100, and yet it frequently is not:

In[25]: plt.hist(tex1['K'].sum(axis=1))

The logp() method puts zero posterior probability on any sample where the sum of the Ks is different from N, so I don’t see how this is possible.
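
A quick way to quantify the violation directly from the trace:

frac_valid = (tex1['K'].sum(axis=1) == N).mean()
print('fraction of draws with sum(K) == N:', frac_valid)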

Finally, the traceplot shows what seems like a lot of autocorrelation in the hypergeometric model.

What can I do to improve my implementation of the multivariate hypergeometric distribution?

Traceplot from the hypergeometric model:

Bumping. Any tips, anyone?

Sorry for the prolonged silence. Your distribution implementation looks really involved and detailed, so I can’t spot problems with it at a glance. So my recommendations will mostly be about testing for correctness.

The weird behavior you see could be caused by a multitude of things:

  1. An error in the logp
  2. An error in random; most of these come from bad shape handling, but since you also wrote _random, that could be another source of errors.
  3. Bad sampling of the distribution
  4. An error in the pymc3 code base that is lurking around

You have to make sure that points 1 and 2 are not the cause. How? Write a small test suite that compares the distribution’s logp and random with reference implementations. Do this for a variety of parameter values and shapes. If all tests pass, then maybe it is caused by poor mixing. Try changing the step method properties to facilitate transitions and see if you mix better.
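
For example, one possible check for the logp (just a sketch: in the two-category case your distribution reduces to the ordinary hypergeometric, so scipy.stats.hypergeom can serve as a reference):

from scipy import stats

K = np.array([35, 43])
n = 20
dist = MvHypergeometric.dist(K=K, n=n, shape=K.shape)
for k1 in range(n + 1):
    x = np.array([k1, n - k1])
    lp_pymc3 = dist.logp(x).eval()
    # scipy's hypergeom takes (M, n, N) = (population size, number of "good" items, sample size)
    lp_scipy = stats.hypergeom.logpmf(k1, K.sum(), K[0], n)
    assert np.isclose(lp_pymc3, lp_scipy)

A similar comparison can be made for random() against np.random.hypergeometric, again restricted to two categories.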

If everything works so far, then the problem may be in the codebase itself. Try to narrow the problematic part of the code down to a minimal example, and we’ll see what we can do to help you from there.

Hope these pointers help you out a bit.


Thanks for the pointers, I’ll try them out.

Most of the “involvedness” in the code comes from the original Multinomial, so if I can’t get it to work no matter what, I’ll try a simpler implementation, with just random() and logp().

One difficulty is that I don’t have any reference implementations. Neither NumPy nor SciPy has an implementation of the multivariate hypergeometric distribution, so I’ll have to do hand calculations.

Yeah, the only reference for comparison is the two-category case, which reduces to the ordinary hypergeometric. It looks like the random method is the real challenge.


Yeah, I used a clever implementation that I found on Stack Overflow. I checked it against a couple of values that I got doing hand calculations, and they are okay. I’ll do a more thorough validation now.

Once again, thanks for the pointers. I checked the possible implementation mistakes you pointed out in 1. and 2.

  1. An error in the logp

So I wrote the logp function for 1000 random values of K and k in both Excel and Python (neither has a native implementation of the multivariate version of the hypergeometric distribution), based on the formulation on Wikipedia. I also ran the same values through the implementation used in the logp() of the MvHypergeometric distribution. All three matched exactly, so the logp() function seems fine.
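
For reference, that formulation boils down to the following (a NumPy/SciPy sketch of the same formula; the function names are just for illustration):

from scipy.special import gammaln

def binomln_ref(n, k):
    # log of the binomial coefficient C(n, k), computed via log-gamma
    return gammaln(n + 1) - gammaln(k + 1) - gammaln(n - k + 1)

def logp_ref(x, K):
    # log P(x | K) = sum_i log C(K_i, x_i) - log C(sum_i K_i, sum_i x_i)
    x, K = np.asarray(x), np.asarray(K)
    return binomln_ref(K, x).sum() - binomln_ref(K.sum(), x.sum())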

  2. An error in random; most of these come from bad shape handling, but since you also wrote _random, that could be another source of errors.

I did something similar for the random sampling function. I took 10⁵ random draws from MvHypergeometric(K=[35, 43], n=20) (which is equivalent to a bivariate hypergeometric with ngood = 35 and nbad = 43, since there are only two categories), and compared them to the NumPy random generator (numpy.random.hypergeometric) and to another function that implements the hypergeometric RNG in a different way. I was then able to generate this graph, comparing the (cumulative) histograms for the three ways of generating hypergeometric-distributed samples:

So it seems the random() function is also working fine. Here is the code to generate the plot:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

def compare_random(size_stat=10 ** 4, n_sample=20, n_cats=2, K_max=50):

    Ki = range(n_cats)
    K = np.random.randint(low=1, high=K_max + 1, size=n_cats)
    n_actual = min(n_sample, K.sum())

    if n_actual != n_sample:
        print('Actual sample size is {0:n}'.format(n_actual))

    df_cols = ['n'] + ['k_{0:n}'.format(idx + 1) for idx in range(K.shape[-1])]

    df_k_python = pd.DataFrame([], columns=df_cols, index=range(size_stat))
    df_k_np = pd.DataFrame([], columns=df_cols, index=range(size_stat))
    df_k_pymc3 = pd.DataFrame([], columns=df_cols, index=range(size_stat))

    dfs = { 'python': df_k_python,
            'numpy': df_k_np,
            'pymc3': df_k_pymc3
            }

    for s in tqdm(range(size_stat), 'Sample'):
        K_it = K.copy()
        k = np.zeros_like(K)
        for i in range(n_actual):
            rand_idx = np.random.choice(Ki, p=K_it / K_it.sum())
            K_it[rand_idx] -= 1
            k[rand_idx] += 1

        df_k_python.loc[s] = [n_actual] + list(k)

    df_k_np['n'] = df_k_python['n']
    df_k_np['k_1'] = np.random.hypergeometric(ngood=K[0], nbad=K.sum() - K[0], nsample=df_k_python['n'])
    df_k_np['k_2'] = df_k_np['n'] - df_k_np['k_1']

    df_k_pymc3['n'] = df_k_python['n']
    with pm.Model() as ex4:
        hyp = MvHypergeometric('hyp', K=K, n=n_actual, shape=K.shape)
    df_k_pymc3[['k_1', 'k_2']] = hyp.random(size=size_stat)

    # plot
    fig = plt.figure(figsize=(12, 8))
    ax1, ax2 = fig.subplots(nrows=2)

    for n, df in dfs.items():
        sns.distplot(a=df['k_1'], kde=False,
                     hist_kws={'cumulative': True, 'histtype': 'step', "linewidth": 3
                     }, ax=ax1, label=n)
        sns.distplot(a=df['k_2'], kde=False,
                     hist_kws={'cumulative': True, 'histtype': 'step', "linewidth": 3,
                     }, ax=ax2, label=n)

    labeldict = {1: 'good', 2: 'bad'}
    for ax in [ax1, ax2]:
        ax.legend()
        i = int(ax.get_xlabel().split('_')[-1])
        label = labeldict[i]
        ax.set_xlabel(r'$k_' + '{0:n}'.format(i) + r' = k_{' + label + r'}$')
        ax.set_ylabel('Frequency (cumulative)')

    ax1.figure.suptitle('{0:.2e} draws from MvHypergeom with two categories (equal to Hypergeom)'.format(size_stat))

    ax1.set_title(r' Distribution: $\vec{k}\sim MvHypergeometric(n_{good} = ' + \
                   '{0:n}'.format(K[0]) + r', n_{bad} = ' + '{0:n}'.format(K[1]) + r', n = ' + \
                   '{0:n}'.format(n_actual) + r')$')

    ax1.figure.tight_layout(rect=(0, 0, 1, 0.96))

    return [ax1, ax2], dfs
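
It can be called, for example, like this (the parameters are illustrative):

axes, dfs = compare_random(size_stat=10 ** 5, n_sample=20, n_cats=2, K_max=50)
plt.show()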

Next I’ll try implementing a simpler version of the MvHypergeometric class (just random() and logp(), simple shape handling) just to see if I can get it to work.


I just figured out what the problem was. When I set up the model, I chose to represent each K as a univariate BetaBinomial:

if __name__ == '__main__':

    trueK = np.asarray([40, 60])
    N = trueK.sum()

    fn = 0.9
    n = (N*fn).astype('int32')
    obs = random_mv_hypergeom(K=trueK, n=n)[0]
    p = obs/n

    b = 10
    a = (1+(b-2)*p)/(1-p)

    with pm.Model() as ex:
        K = pm.BetaBinomial('K', n=N, alpha=a, beta=b, testval=(N*p).astype('int32'), shape=p.shape)
        p = pm.Deterministic('p', K/N)
        hyp = MvHypergeometric(name='hypergeom', K=K, n=n, observed=obs)

        tex1 = pm.sample(draws=10000, nchains=2)

It is important that the Ks sum to N, but since the BetaBinomials are independent of one another, there is no guarantee of that. Changing the specification so that K has a Multinomial prior solved the issue:

    with pm.Model() as ex:
        pp = pm.Dirichlet('p', a = N*np.repeat(1/len(trueK), repeats=len(trueK)), shape=trueK.shape,
                          testval=obs)
        KK = pm.Multinomial('K', n=N, p=pp, shape=trueK.shape)
        hyp = MvHypergeometric(name='hypergeom', K=KK, n=n,
                               observed=obs)
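
As a quick sanity check (tex3 below is just an illustrative name for a trace drawn from this model), the sum-to-N constraint can now be verified directly:

    tex3 = pm.sample(draws=10000, nchains=2, model=ex)
    assert (tex3['K'].sum(axis=1) == N).all()  # every draw of K sums to N by construction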

There are warnings about divergences and the number of effective samples, and the traces for the K variable don’t look very well behaved, but that seems outside the scope of this question. I’ll start another topic on the forum about that problem.

Anyway, thanks for the attention and the tips!
