Vectorized autoregressive model

Hey,

i currently try to reproduce the hierarchical model from Chapados, Nicolas. “Effective Bayesian modeling of groups of related count time series.” arXiv preprint arXiv:1405.3738 (2014)..

In the paper they show an approximate inference method but also state, that they where able to get the posteriors by using Stan. I wanted to achieve the same with PyMC3. Right now I have the following model.

with pm.Model() as ar_model_hier:
    # Global Hyperpriors
    alpha = pm.Uniform("alpha", lower=0.001, upper=0.1)
    kappa_tau = pm.Uniform("kappa_tau", lower=5, upper=10)
    beta_tau = pm.Uniform("beta_tau", lower=2, upper=25)
    mu_mu = pm.Normal("mu_mu", mu=0, sd=2**2)
    tau_mu = pm.Uniform("tau_mu", lower=1, upper=10)
    phi_plus = pm.Uniform("phi_plus", lower=1, upper=600)
    phi_minus = pm.Uniform("phi_minus", lower=1, upper=50)

    # AR part of the model
    mu_l = pm.Normal("mu_l", mu_mu, 1. / tau_mu, shape=Y_hat.shape[1])
    phi_l = pm.Beta("phi_l", phi_plus + phi_minus, phi_minus, shape=Y_hat.shape[1])
    c_l = mu_l * (1 - phi_l)
    tau_l = pm.Gamma("tau_l", kappa_tau, beta_tau, shape=Y_hat.shape[1])

    # Binomial part of the model
    alpha_l = pm.Exponential("alpha_l", alpha, shape=Y_hat.shape[1])

    for i in range(Y_hat.shape[1]):
       y_hat = Y_hat[:, i]
       eta_l = pm.AR("eta_%d" % i, rho=[c_l[i], phi_l[i]], tau=tau_l[i], constant=True, shape=len(y_hat))
       y_t_l = pm.NegativeBinomial("y_t_%d" % i, tt.exp(eta_l), alpha_l[i], observed=y_hat)

Where Y_hat is a matrix, with observed counts in rows, and different series in columns. The above model works great as long as I just have ~ 10 columns. The datasets from the paper actually have > 1000 column/series. So currently I also tried to get rid of the for loop, for the autoregressive model, but was not successful so far.

Are there any examples for a vectorized autoregressive model?

Cheers.
Joachim

Unfortunately the current implementation of pm.AR does not allow multidimensional rho:

A bit of rewrite of the pm.AR the model seems to run:

import theano.tensor as tt
from theano import scan
from pymc3.distributions.distribution import Continuous
from pymc3.distributions.continuous import get_tau_sd, Normal, Flat


class AR2d(Continuous):
    R"""
    Autoregressive process with p lags.
    .. math::
       x_t = \rho_0 + \rho_1 x_{t-1} + \ldots + \rho_p x_{t-p} + \epsilon_t,
       \epsilon_t \sim N(0,\sigma^2)
    The innovation can be parameterized either in terms of precision
    or standard deviation. The link between the two parametrizations is
    given by
    .. math::
       \tau = \dfrac{1}{\sigma^2}
    Parameters
    ----------
    rho : tensor
        Vector of autoregressive coefficients.
    sd : float
        Standard deviation of innovation (sd > 0). (only required if tau is not specified)
    tau : float
        Precision of innovation (tau > 0). (only required if sd is not specified)
    constant: bool (optional, default = False)
        Whether to include a constant.
    init : distribution
        distribution for initial values (Defaults to Flat())
    """

    def __init__(self, rho, sd=None, tau=None,
                 constant=False, init=Flat.dist(),
                 *args, **kwargs):

        super(AR2d, self).__init__(*args, **kwargs)
        tau, sd = get_tau_sd(tau=tau, sd=sd)
        self.sd = tt.as_tensor_variable(sd)
        self.tau = tt.as_tensor_variable(tau)

        self.mean = tt.as_tensor_variable(0.)

        rho = tt.as_tensor_variable(rho, ndim=2)
        if constant:
            self.p = rho.shape[0] - 1
        else:
            self.p = rho.shape[0]

        self.constant = constant
        self.rho = rho
        self.init = init

    def logp(self, value):
        if self.constant:
            results, _ = scan(lambda l_, obs, p, rho: rho[l_ - 1] * obs[p - l_:-l_],
                              outputs_info=None, sequences=[tt.arange(1, self.p + 1)],
                              non_sequences=[value, self.p, self.rho[1:]])
            x = tt.sum(results, axis=0)
            eps = value[self.p:] - self.rho[0] - x
        else:
            results, _ = scan(lambda l_, obs, p, rho: rho[l_ - 1] * obs[p - l_:-l_],
                              outputs_info=None, sequences=[tt.arange(1, self.p + 1)],
                              non_sequences=[value, self.p, self.rho])
            x = tt.sum(results, axis=0)
            eps = value[self.p:] - x

        innov_like = Normal.dist(mu=0.0, tau=self.tau).logp(eps)
        init_like = self.init.logp(value[:self.p])

        return tt.sum(innov_like) + tt.sum(init_like)

You can have a look at this small notebook as a demonstration:

However, the implementation doesnt seems to improve the speed a lot, as you can see from the notebook.

Actually, the scan part seems unnecessary, try this:

import theano.tensor as tt
from theano import scan
from pymc3.distributions.distribution import Continuous
from pymc3.distributions.continuous import get_tau_sd, Normal, Flat


class AR(Continuous):
    R"""
    Autoregressive process with p lags.
    .. math::
       x_t = \rho_0 + \rho_1 x_{t-1} + \ldots + \rho_p x_{t-p} + \epsilon_t,
       \epsilon_t \sim N(0,\sigma^2)
    The innovation can be parameterized either in terms of precision
    or standard deviation. The link between the two parametrizations is
    given by
    .. math::
       \tau = \dfrac{1}{\sigma^2}
    Parameters
    ----------
    rho : tensor
        Vector of autoregressive coefficients.
    sd : float
        Standard deviation of innovation (sd > 0). (only required if tau is not specified)
    tau : float
        Precision of innovation (tau > 0). (only required if sd is not specified)
    constant: bool (optional, default = False)
        Whether to include a constant.
    init : distribution
        distribution for initial values (Defaults to Flat())
    """

    def __init__(self, rho, sd=None, tau=None,
                 constant=False, init=Flat.dist(),
                 *args, **kwargs):

        super(AR, self).__init__(*args, **kwargs)
        tau, sd = get_tau_sd(tau=tau, sd=sd)
        self.sd = tt.as_tensor_variable(sd)
        self.tau = tt.as_tensor_variable(tau)

        self.mean = tt.as_tensor_variable(0.)
        
        if isinstance(rho, list):
            p = len(rho)
        else:
            try:
                p = rho.shape.tag.test_value[0]
            except AttributeError:
                p = rho.shape[0]

        if constant:
            self.p = p - 1
        else:
            self.p = p

        self.constant = constant
        self.rho = rho = tt.as_tensor_variable(rho)
        self.init = init

    def logp(self, value):
        if self.constant:
            x = tt.add(*[self.rho[i+1]*value[self.p-(i+1):-(i+1)] for i in range(self.p)])
            eps = value[self.p:] - self.rho[0] - x
        else:
            x = tt.add(*[self.rho[i]*value[self.p-(i+1):-(i+1)] for i in range(self.p)])
            eps = value[self.p:] - x

        innov_like = Normal.dist(mu=0.0, tau=self.tau).logp(eps)
        init_like = self.init.logp(value[:self.p])

        return tt.sum(innov_like) + tt.sum(init_like)

FYI: I submitted a PR so this would be easier to implement: https://github.com/pymc-devs/pymc3/pull/3070

Very, cool. Tried both version. Both work fine. I don’t see a huge performance difference between the version with, or without scan. Thank you very much!

1 Like