Advice for speeding up large multivariate models

Hello all,

I have the following model. Conceptually it is quite simple, but there is a lot of data. Using the default options (i.e., ADVI initialisation followed by NUTS), it takes about an hour to run on my desktop Mac. I'd like to speed it up as much as possible. Does anyone have suggestions, perhaps using new features from 3.1, GPU integration, re-parametrisation, etc.?

import numpy as np
import pymc3 as pm
from theano.tensor import dot


with pm.Model():

    # knowns; example data loaded here

    y = np.load('y.npy')  # 1,000 data points
    x = np.load('x.npy')  # covariate
    A = np.load('A.npy')  # 1,000 x 1,000 correlation matrix
    I = np.identity(len(x))

    # priors

    β = pm.Cauchy(name='β', alpha=0., beta=1)
    σ_A = pm.HalfCauchy(name='σ_A', beta=1)
    σ_E = pm.HalfCauchy(name='σ_E', beta=1)

    # intermediate variables

    μ = dot(x, β)
    Σ = σ_A * A + σ_E * I

    # likelihood

    y_ = pm.MvNormal(name='y_', mu=μ, cov=Σ, observed=y)

    pm.sample()

The most expensive part of this should be the Cholesky decomposition at each step. But since A is constant, you could compute its eigenvalue decomposition A = Q @ D @ Q.T once, and get Sigma = Q @ (sigma_A * D + sigma_E * I) @ Q.T. With this representation of Sigma it should be possible to solve in O(n^2), I think, and also to compute the logdet cheaply. It would still require a bit of trickery to get MvNormal to do that (or maybe just write your own version of it for this special case). I can give this a try tomorrow if you need more help.
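Roughly, the idea looks like this in plain numpy (just a sketch to check the algebra, not the PyMC3 implementation):

import numpy as np

n = 5
W = np.random.randn(n, n)
A = np.dot(W, W.T)                     # stand-in for the fixed correlation matrix
sigma_A, sigma_E = 0.7, 0.3

# one-off eigendecomposition of the constant matrix
eigs, Q = np.linalg.eigh(A)            # A = Q @ diag(eigs) @ Q.T

# Sigma = sigma_A * A + sigma_E * I = Q @ diag(sigma_A * eigs + sigma_E) @ Q.T
d = sigma_A * eigs + sigma_E

y = np.random.randn(n)

# solve Sigma @ x = y with matrix-vector products only: O(n^2) per evaluation
x = np.dot(Q, np.dot(Q.T, y) / d)
# the log-determinant comes straight from the diagonal
logdet = np.log(d).sum()

Sigma = sigma_A * A + sigma_E * np.identity(n)
print(np.allclose(x, np.linalg.solve(Sigma, y)))         # True
print(np.isclose(logdet, np.linalg.slogdet(Sigma)[1]))   # True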

That sounds great, but unfortunately somewhat beyond my skill level. Any further help would be greatly appreciated!

This should be faster:

import numpy as np
import pymc3 as pm
import theano.tensor as tt


class MvNormalReg(pm.Continuous):
    """Multivariate normal for a regularized decomposed covariance.
    
    The covariance matrix has the form
    
        Sigma = σ1(A + σ2 I)
        
    where A = Q @ np.diag(diag) @ Q.T for orthogonal Q.
    """
    def __init__(self, mu, Q, diag, sigma1, sigma2, *args, **kwargs):
        self._mu = tt.as_tensor_variable(mu)
        self._Q = tt.as_tensor_variable(Q)
        self._diag = tt.as_tensor_variable(diag)
        self._sigma1 = tt.as_tensor_variable(sigma1)
        self._sigma2 = tt.as_tensor_variable(sigma2)
        
        self.mean = self.median = self._mu
        
        super(MvNormalReg, self).__init__(*args, **kwargs)

    def logp(self, value):
        if value.ndim != 1:
            raise ValueError('Only one-dimensional values are supported.')
        y = value - self._mu
        Q = self._Q
        n = value.shape[-1]
        
        d = self._diag + self._sigma2

        logdet = tt.sum(tt.log(d))
        logdet += n * tt.log(self._sigma1)

        trans = tt.dot(Q.T, tt.dot(Q, y) / d) / self._sigma1
        
        quaddist = (y * trans).sum()

        norm = - 0.5 * pm.floatX(np.log(2 * np.pi))
        return norm - 0.5 * (quaddist + logdet)


with pm.Model() as model:
    # A, x, y loaded as in the original post
    eigs, Q = np.linalg.eigh(A)
    
    β = pm.Cauchy(name='β', alpha=0., beta=1)
    σ_A = pm.HalfCauchy(name='σ_A', beta=1)
    σ_E = pm.HalfCauchy(name='σ_E', beta=1)

    μ = tt.dot(x, β)
    MvNormalReg('y', μ, Q, eigs, sigma1=σ_A, sigma2=σ_E/σ_A, observed=y)

This is much faster! But for some reason it doesn't converge on the correct values of σ_A and σ_E. Specifically, σ_A always bottoms out near zero and all of the variance gets attributed to σ_E. I'm at a loss to explain this: these kinds of models are routinely fitted by MLE, so I don't see why a Bayesian version wouldn't work.

import numpy as np
import pymc3 as pm
import theano.tensor as tt
import matplotlib.pyplot as plt


class MvNormalReg(pm.Continuous):
    """Multivariate normal for a regularized decomposed covariance.

    The covariance matrix has the form

        Sigma = σ1(A + σ2 I)

    where A = Q @ np.diag(diag) @ Q.T for orthogonal Q.
    """

    def __init__(self, mu, Q, diag, sigma1, sigma2, *args, **kwargs):

        self._mu = tt.as_tensor_variable(mu)
        self._Q = tt.as_tensor_variable(Q)
        self._diag = tt.as_tensor_variable(diag)
        self._sigma1 = tt.as_tensor_variable(sigma1)
        self._sigma2 = tt.as_tensor_variable(sigma2)
        self.mean = self.median = self._mu

        super(MvNormalReg, self).__init__(*args, **kwargs)

    def logp(self, value):

        if value.ndim != 1:
            raise ValueError('Only one-dimensional values are supported.')

        y = value - self._mu
        Q = self._Q
        n = value.shape[-1]
        d = self._diag + self._sigma2
        logdet = tt.sum(tt.log(d))
        logdet += n * tt.log(self._sigma1)
        trans = tt.dot(Q.T, tt.dot(Q, y) / d) / self._sigma1
        quaddist = (y * trans).sum()
        norm = - 0.5 * pm.floatX(np.log(2 * np.pi))

        return norm - 0.5 * (quaddist + logdet)


with pm.Model():

    A = np.load('A.npy')  # using my correlation matrix

    # or using a randomly generated matrix; huge so there is plenty of evidence
    d, k = 3000, 1500
    W = np.random.randn(d, k)
    S = np.dot(W, W.T) + np.diag(np.random.rand(d))  # random covariance matrix
    # rescale to a correlation matrix
    A = np.dot(np.diag(1. / np.sqrt(np.diag(S))),
               np.dot(S, np.diag(1. / np.sqrt(np.diag(S)))))

    X = np.random.normal(size=(A.shape[-1], 2))  # fixed-effect covariates

    true_β = [.1, .5]
    true_σ_A = .5
    true_σ_E = .5

    true_μ = np.dot(X, true_β)
    true_Σ = true_σ_A * A + true_σ_E * np.identity(A.shape[-1])
    y = np.random.multivariate_normal(true_μ, true_Σ)

    eigs, Q = np.linalg.eigh(A)
    β = pm.Cauchy(name='β', alpha=0., beta=10., shape=2)
    σ_A = pm.HalfCauchy(name='σ_A', beta=10.)
    σ_E = pm.HalfCauchy(name='σ_E', beta=10.)
    MvNormalReg('y', tt.dot(X, β), Q, eigs, sigma1=σ_A, sigma2=σ_E / σ_A,
                observed=y)
    trace = pm.sample()
    pm.traceplot(trace)
    plt.savefig('traceplot.png')

This produces the following trace plot:

The generated cov matrix A has a very strong diagonal:

plt.imshow(A);

As a result, your model is unidentifiable, and most of the variance gets attributed to the identity matrix (controlled by σ_E): when A is close to I, σ_A * A + σ_E * I is close to (σ_A + σ_E) * I, so the data mostly constrain the sum of the two variances rather than each one separately.
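A quick toy illustration of the problem (not your exact matrix): when A is nearly the identity, different splits of the total variance give almost the same log-likelihood.

import numpy as np
from scipy import stats

n = 200
A = np.identity(n) + 0.01 * np.ones((n, n))   # nearly an identity matrix
y = stats.multivariate_normal(np.zeros(n), 0.5 * A + 0.5 * np.identity(n)).rvs()

for s_A, s_E in [(0.9, 0.1), (0.5, 0.5), (0.1, 0.9)]:
    cov = s_A * A + s_E * np.identity(n)
    print(s_A, s_E, stats.multivariate_normal(np.zeros(n), cov).logpdf(y))
# the log-likelihoods differ only slightly, so the data say very little
# about how to split the variance between σ_A and σ_E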

@sammosummo The near-identity matrix might be a problem, but more to the point, I messed up the logp function a bit. It should be:

    def logp(self, value):
        if value.ndim != 1:
            raise ValueError('Only one-dimensional values are supported.')
        y = value - self._mu
        Q = self._Q
        n = value.shape[-1]
        
        d = self._diag + self._sigma2

        logdet = tt.sum(tt.log(d))
        logdet += n * tt.log(self._sigma1)

        trans = tt.dot(Q, tt.dot(Q.T, y) / d) / self._sigma1
        
        quaddist = (y * trans).sum()

        norm = n * pm.floatX(np.log(2 * np.pi))
        return - 0.5 * (norm + quaddist + logdet)

I switched around Q and Q.T in the line with trans =. Sorry about that.
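For reference, a quick numpy check of why the order matters (since A = Q @ diag(eigs) @ Q.T, the inverse of A + σ2 * I is Q @ diag(1 / (eigs + σ2)) @ Q.T, not the other way around):

import numpy as np

np.random.seed(0)
n = 4
W = np.random.randn(n, n)
A = np.dot(W, W.T)
eigs, Q = np.linalg.eigh(A)
sigma2 = 0.3
d = eigs + sigma2
y = np.random.randn(n)

exact = np.linalg.solve(A + sigma2 * np.identity(n), y)
correct = np.dot(Q, np.dot(Q.T, y) / d)    # matches the exact solve
wrong = np.dot(Q.T, np.dot(Q, y) / d)      # what the first version computed

print(np.allclose(correct, exact))   # True
print(np.allclose(wrong, exact))     # False in general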

LOL my reasoning is wrong - yep you CAN recover the underlying σ_A and σ_E with the correct logp :wink:

I think this is closer, but still not quite right. logp doesn't return the same values as scipy.stats.multivariate_normal.logpdf, which it should, right? Here is an example.

Mean posterior estimates of σ_A are now non-zero but still smaller than their MLEs.

I’m not too concerned about the strong diagonal in A. There is a big literature in quantitative genetics using exactly these kinds of models, and they estimate non-zero values of σ_A just fine even when most of the off-diagonals are << 1.

I really appreciate your help with this problem!

Edit: fig labels.

I also changed the normalization factor in the last post. Forgot to mention it, as it doesn’t actually change the behaviour of the sampler. Did you take that into account?

It’s still not quite right:

I also tried changing n to -0.5*n, which gets close but still not identical:

Strange. I get the same values for both:

stats.multivariate_normal(true_μ, true_Σ).logpdf(y)
MvNormalReg.dist(true_μ, Q, eigs, true_σ_A, true_σ_E / true_σ_A).logp(y).eval({})
-812.890846728
-812.8908467283757

Can you post the code you used to create the plot?

I must be doing something dumb then:

import gzip
import numpy as np
import pymc3 as pm
import pandas as pd
import theano.tensor as tt
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal


class MvNormalReg(pm.Continuous):
    """Multivariate normal for a regularized decomposed covariance.

    The covariance matrix has the form

        Sigma = σ1(A + σ2 I)

    where A = Q @ np.diag(diag) @ Q.T for orthogonal Q.
    """

    def __init__(self, mu, Q, diag, sigma1, sigma2, *args, **kwargs):

        self._mu = tt.as_tensor_variable(mu)
        self._Q = tt.as_tensor_variable(Q)
        self._diag = tt.as_tensor_variable(diag)
        self._sigma1 = tt.as_tensor_variable(sigma1)
        self._sigma2 = tt.as_tensor_variable(sigma2)
        self.mean = self.median = self._mu

        super(MvNormalReg, self).__init__(*args, **kwargs)

    def logp(self, value):

        if value.ndim != 1:
            raise ValueError('Only one-dimensional values are supported.')

        y = value - self._mu
        Q = self._Q
        n = value.shape[-1]
        d = self._diag + self._sigma2
        logdet = tt.sum(tt.log(d))
        logdet += n * tt.log(self._sigma1)
        trans = tt.dot(Q, tt.dot(Q.T, y) / d) / self._sigma1
        quaddist = (y * trans).sum()
        norm = n * pm.floatX(np.log(2 * np.pi))

        return norm - 0.5 * (quaddist + logdet)


with pm.Model():

    """# my data; ignore this
    
    df = pd.read_csv(gzip.open('phi2.gz'), delim_whitespace=True)
    df.columns = ('id1', 'id2', 'phi2', 'd7')
    df = pd.pivot_table(df, values='phi2', index='id1', columns='id2')
    df = df.fillna(0)
    A = df.values
    """

    # randomly generated A

    d, k = 1000, 1500
    W = np.random.randn(d, k)
    S = np.dot(W, W.T) + np.diag(np.random.rand(d))  # random covariance matrix
    # rescale to a correlation matrix
    A = np.dot(np.diag(1. / np.sqrt(np.diag(S))),
               np.dot(S, np.diag(1. / np.sqrt(np.diag(S)))))

    n = A.shape[-1]
    X = np.random.normal(size=(n, 2))  # fixed-effect covariates

    true_β = [0.1, 0.5]
    true_σ_A = .5
    true_σ_E = .5

    eigs, Q = np.linalg.eigh(A)

    true_μ = np.dot(X, true_β)
    true_Σ = true_σ_A * A + true_σ_E * np.identity(A.shape[-1])
    y = multivariate_normal.rvs(true_μ, true_Σ)

    print(multivariate_normal.logpdf(y, true_μ, true_Σ))
    print(MvNormalReg.dist(true_μ, Q, eigs, true_σ_A, true_σ_E / true_σ_A).logp(y).eval({}))

    β = pm.Cauchy(name='β', alpha=0., beta=10., shape=2)
    σ_A = pm.HalfCauchy(name='σ_A', beta=10.)
    σ_E = pm.HalfCauchy(name='σ_E', beta=10.)
    MvNormalReg('y', tt.dot(X, β), Q, eigs, sigma1=σ_A, sigma2=σ_E / σ_A,
                observed=y)
    trace = pm.sample()
    pm.traceplot(trace)
    plt.savefig('traceplot.png')

Obviously this returns something different every time because the data are different, but here is the most recent output:

-1377.36542449
1379.4501627243912

The last line of the logp function is wrong. It should be -0.5 * (norm + quaddist + logdet). Maybe just copy the whole function to be sure.
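In other words, the last two lines of the function should read:

        norm = n * pm.floatX(np.log(2 * np.pi))
        return -0.5 * (norm + quaddist + logdet)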

Whoops, sorry! It produces the same values as the scipy function now.

Now the model seems to recover all parameters well with random data. When I load my real data, σ_A is still always biased downward! But I expect this is my problem and not one inherent to PyMC.

No problem, glad I could help.

I’ve returned to this after some time away. @aseyboldt, is there some way to rewrite the regularised multivariate normal so that σ1 = 0 is a valid value? My original model, where Σ = σ_A * A + σ_E * I, allowed σ_A = 0, but the new parametrization doesn’t if σ1 = σ_A (since σ2 = σ_E / σ_A is undefined there).
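Something like this is what I have in mind — an untested sketch adapted from the class above, where d is built as σ_A * eigs + σ_E inside logp, so σ_A = 0 no longer involves a division by zero (the class name is just a placeholder):

class MvNormalEig(pm.Continuous):
    """Multivariate normal with covariance Sigma = σ_A * A + σ_E * I,
    where A = Q @ np.diag(eigs) @ Q.T is fixed.  (Untested sketch.)"""

    def __init__(self, mu, Q, eigs, sigma_A, sigma_E, *args, **kwargs):
        self._mu = tt.as_tensor_variable(mu)
        self._Q = tt.as_tensor_variable(Q)
        self._eigs = tt.as_tensor_variable(eigs)
        self._sigma_A = tt.as_tensor_variable(sigma_A)
        self._sigma_E = tt.as_tensor_variable(sigma_E)
        self.mean = self.median = self._mu
        super(MvNormalEig, self).__init__(*args, **kwargs)

    def logp(self, value):
        y = value - self._mu
        Q = self._Q
        n = value.shape[-1]
        # eigenvalues of Sigma itself; σ_A = 0 is fine as long as σ_E > 0
        d = self._sigma_A * self._eigs + self._sigma_E
        logdet = tt.sum(tt.log(d))
        trans = tt.dot(Q, tt.dot(Q.T, y) / d)
        quaddist = (y * trans).sum()
        norm = n * pm.floatX(np.log(2 * np.pi))
        return -0.5 * (norm + quaddist + logdet)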