# Speeding up bayesian multiple regression

Hi

I am playing around with Bayesian modeling for a multiple regression task and I think I’m going wrong somewhere in modeling or parametrization. On a simple toy dataset (4 features, 100 observations), ADVI takes ~1m to fit and sample 10,000 posterior points, and NUTS takes 0.5h. By contrast, a 10,000x bootstrap takes 0.8s. (I checked the marginal posteriors against the bootstrapped marginals to ensure that ADVI converged well enough, and tuned the iterations down as far as I could). Another issue that suggests I have gone wrong somewhere is that there are divergences and low effective samples and tree depth, even with target_accept cranked up to 0.99:

``````There were 9 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
The number of effective samples is smaller than 25% for some parameters.
``````

Ultimately I’d like the model to be more complex than a linear, mean effects model; and the actual data I have is ~8,000 observations and ~1,000 features, so tricks for speeding and/or scaling up this model would be quite helpful. It may also be helpful for the community more generally, as currently the only result for `"bayesian multiple regression" python` is a blog post that uses pystan (!).

Thank you

Toy model code below:

``````import matplotlib
%matplotlib inline
import scipy as sp
import pandas as pd
import numpy as np
import theano.tensor as tt
import pymc3 as pm
import time
from matplotlib import pyplot as plt

N_COVARS = 4
N_SAMPLES = 100
COVAR_COV = np.array([[1.0, 0.4, 0.2, 0.],
[0.4, 1., 0.3, 0.8],
[0.2, 0.3, 1., 0.6],
[0., 0.8, 0.6, 1.]])

def make_data(n_covs, n_samples, n_response, r2=0.8, covar_cov=None):
if covar_cov is not None:
assert covar_cov.shape[0] == covar_cov.shape[1]
assert covar_cov.shape[0] == n_covs
betas = 1 + 0.25 * np.random.normal(0., 1., size=(n_covs, n_response))
betas *= (1 - 2*np.random.binomial(1, 0.5, size=(n_covs, n_response)))  # 50% + 50% -
offset = np.random.normal(0., 4., size=(n_response,))
covar_values = np.random.normal(0., 1., size=(n_samples, n_covs))
if covar_cov is not None:
covar_L = np.linalg.cholesky(covar_cov)
covar_values = np.dot(covar_values, covar_L.T)
latent_values = np.dot(covar_values, betas) + offset
latent_var = np.var(latent_values)
noise_var = ((1 - r2) * latent_var)/4.
obs_var = 0.1
while latent_var/obs_var > r2:  # tune
noise_var *= 1.0005
noise = np.random.normal(0., noise_var, size=(n_response, n_samples))
observed = latent_values.T + noise
obs_var = np.var(observed)
return observed, covar_values, betas, offset

data, covars, beta_true, offset_true = make_data(N_COVARS, N_SAMPLES, 1, covar_cov=COVAR_COV)
data = data.reshape(N_SAMPLES,)

def fit_boot(dat, cov, n_boot=10000):
cov = np.hstack([cov, np.ones((cov.shape[0],1))])
betas = np.zeros((cov.shape[1], n_boot), dtype=np.float32)
for b in xrange(n_boot):
idx = np.random.choice(cov.shape[0], cov.shape[0], replace=True)
C = cov[idx, :]
betas[:, b] = np.dot(np.linalg.inv(np.dot(C.T, C)), np.dot(C.T, dat[idx]))
return betas

def cov2cor(C):
M = np.sqrt(np.outer(np.diag(C), np.diag(C)))
return C/M

def fit_mcmc(dat, cov, n_sample=10000):
with pm.Model() as model:
hp_beta_sd = pm.HalfNormal('hyper_beta_sd', 10.)
hp_intercept_mu = pm.Normal('hyper_int_mu', 0., 100.)
hp_intercept_sd = pm.HalfNormal('hyper_int_sd', 5.)
hp_err_sd = pm.HalfNormal('hyper_err_sd', 5.)
inter_offset = pm.Normal('intercept_offset', 0., 1.)
intercept = pm.Deterministic('intercept', inter_offset * hp_intercept_sd + hp_intercept_mu)
#beta_vec_offset = pm.Normal('beta_offset', 0., 1., shape=(cov.shape[1],))
#beta_vec = pm.Deterministic('beta', beta_vec_offset * hp_beta_sd)
beta_vec = pm.Normal('beta', 0., hp_beta_sd, shape=(cov.shape[1],))
lat_expr = pm.Deterministic('lat_expr', intercept + tt.dot(cov, beta_vec))
obs_expr = pm.Normal('expr', mu=lat_expr, sd=hp_err_sd, observed=dat)

return fit_res

def fit_advi(dat, cov, n_sample=10000):
with pm.Model() as model:
hp_beta_sd = pm.HalfNormal('hyper_beta_sd', 10.)
hp_intercept_mu = pm.Normal('hyper_int_mu', 0., 100.)
hp_intercept_sd = pm.HalfNormal('hyper_int_sd', 5.)
hp_err_sd = pm.HalfNormal('hyper_err_sd', 5.)
inter_offset = pm.Normal('intercept_offset', 0., 1.)
intercept = pm.Deterministic('intercept', inter_offset * hp_intercept_sd + hp_intercept_mu)
#beta_vec_offset = pm.Normal('beta_offset', 0., 1., shape=(cov.shape[1],))
#beta_vec = pm.Deterministic('beta', beta_vec_offset * hp_beta_sd)
beta_vec = pm.Normal('beta', 0., hp_beta_sd, shape=(cov.shape[1],))
lat_expr = pm.Deterministic('lat_expr', intercept + tt.dot(cov, beta_vec))
obs_expr = pm.Normal('expr', mu=lat_expr, sd=hp_err_sd, observed=dat)

fit = pm.fit(25000, method='fullrank_advi')

fit_res = fit.sample(draws=n_sample)

return fit_res

stime = time.time()
boot_res = fit_boot(data, covars)
beta_means = np.mean(boot_res, 1)
beta_cov = np.cov(boot_res)
etime = time.time()
elapse = etime - stime
print '{} seconds'.format(elapse)
plt.plot(boot_res[1,:], boot_res[3,:], 'ob')

stime = time.time()
etime = time.time()
elapse_advi = etime - stime

stime = time.time()
mcmc_res = fit_mcmc(data, covars)
etime = time.time()
elapse_mcmc = etime - stime
print '{} seconds'.format(elapse_mcmc)

print 'NUTS slowdown factor: {}'.format(elapse_mcmc/elapse)

``````

Outputs:

0.787487030029 seconds
78.5826408863 seconds
ADVI slowdown factor: 99.7891239979
1476.3067131 seconds
NUTS slowdown factor: 1874.70606729

You should use the pairplot to first check where in the parameter space is problematic. My guess is that your intercept is overparameterized - usually we just do `intercept = pm.Normal('intercept', 0., 100.)`.

There’s a little bit of a funnel in the intercept, but nothing looks too crazy to me:

But re-parameterizing the intercept made a huge difference:

``````135.169487 seconds
NUTS slowdown factor: 189.685194039
``````

effectively 10x faster. Increasing to 1000 observations shows a very nice scaling:

``````173.795094013 seconds
NUTS slowdown factor: 52.3025973161
``````

subsequently increasing to 40 covariates (1000 x 40 data matrix) looks even better

``````174.969336987 seconds
NUTS slowdown factor: 33.5733716568
``````

so it looks like it’s scaling really well. Thank you.

2 Likes

Just want to follow up and point out that there is actually a funnel for `hyper_int_mu` and `hyper_int_sd` around mu=0

1 Like