Hi folks,
glad to be back to working on some statistics after a long time
I encountered an issue with posterior predictive sampling which might qualify as a bug, but I wanted to check first.
goal: given a sampled model, I’d like to set a specific data point and sample values here. As opposed to the old way of messing with theano tensors directly, I was glad to find this: Model — PyMC3 3.11.2 documentation
(but note that the issue below also appears when using theano shared variables)
Note that the values set for prediction have a different shape than the originally observed data.
Finally, and that’s the crucial dead switch: I need multivariate posterior, with LKJ prior.
problem: this works smoothly when using an uncorrelated posterior/likelihood/observable (how to correctly say it these days?) structure. However, when switching to a MvNormal or MvStudent with a Cholesky matrix, the following error occurs:
ValueError: Cannot broadcast provided shapes (57, 2), (99, 2) given size: ()
[shapes may vary]
i.e. some tensor in the multivariate procedure was not updated.
Or maybe I forgot to make some part of the LKJ prior “shared”; or data is set to shared internally but how can I grab it?
non-minimal working example:
Note that I suspected a lot of different parts of my original, more complex model to cause the issue (e.g. saving/loading). Therefore, the example below is not exactly minimal. You can toggle the issue by setting line 149, under ## observable
to True
|False
.
Thanks a lot in advance!
Falk
#!/usr/bin/env python3
import os as os
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pymc3 as pm
import scipy.stats as stats
import patsy as pa
import theano as th
import theano.tensor as tt
import matplotlib.pyplot as plt
import pickle as pck
PrintShape = lambda label, tensor: th.printing.Print(label, attrs = [ 'shape' ])(tensor)
### simulated data
print ('#### Simulation ####')
n_observations = 99
n_groups = 3
n_predictors = 2
n_observables = 2
# actual intercept
actual_intercepts = [-0.8, 0.4] # observables differ in intercept
# slopes
actual_slopes = {} # groups differ in slopes
actual_slopes[0] = [-1.2, 1.5, 0.1]
actual_slopes[1] = [0., -0.1, 0.0]
# groups
group_bool = np.zeros((n_observations,3), dtype = bool)
group_bool[ : n_observations//3 , 0 ] = True
group_bool[ n_observations//3 : (2*n_observations)//3 , 1 ] = True
group_bool[ (2*n_observations)//3 : , 2 ] = True
# groupgroups in vector form
group_vector = np.array(group_bool, dtype = float)
group_vector[:, 0] *= 0
group_vector[:, 2] *= 2
group_vector = np.array(np.sum(group_vector, 1), dtype = int)
# predictor variables
predictors = {}
predictors[0] = np.random.normal(0., 2., size = n_observations)
predictors[1] = np.random.normal(0., 1.4, size = n_observations)
# noise
noise = np.random.normal(0., 0.5, size = n_observations)
# observable = outcome variable
observable0 = actual_intercepts[0] + noise
# add slopes
for grp in range(n_groups):
for prd in range(n_predictors):
observable0[group_bool[:, grp]] += predictors[prd][group_bool[:, grp]] * actual_slopes[prd][grp]
# obs1 has different intercept and shifted predictor1 slopes
observable1 = observable0 \
- actual_intercepts[0]+actual_intercepts[1] \
+ 0.4 * predictors[1] \
- noise + np.random.normal(0., 0.8, size = n_observations)
# NOTE: perfect correlation hinders sampling
print ('obs corr:', stats.pearsonr(observable0, observable1))
# NOTE: two observables and stacking samples equally fast as a single one
# combining the data
data = pd.DataFrame.from_dict({ \
'index': np.arange(n_observations) \
, 'group': pd.Categorical(group_vector) \
, 'predictor0': predictors[0] \
, 'predictor1': predictors[1] \
, 'observable0': observable0 \
, 'observable1': observable1 \
})
if True:
# inspect the data
for grp in range(n_groups):
plt.scatter( data.loc[group_bool[:, grp], 'predictor0'] \
, data.loc[group_bool[:, grp], 'observable0'] \
, s = 10 \
, marker = 'o' \
, label = f'grp {grp}' \
)
plt.axvline(0.)
plt.axhline(0.)
plt.legend(loc = 'best')
plt.show()
### statsmodels inference
formula = f'observable0 ~ 1 + (predictor0 * group) + (predictor1 * group)'
print (formula)
model = smf.glm( formula = formula \
, data = data \
#, groups = data['subject'] \
, family = sm.families.Gaussian()).fit()
# confidence = model.conf_int()
# print (species_confidence)
print(model.summary())
### Model Construction and sampling
print ('#### Modeling ####')
# shared theano objects
shared = {}
with pm.Model() as model:
# NOTE: using pymc3.model.set_data instead of theano magic! see example: https://docs.pymc.io/api/model.html#pymc3.model.set_data
shared['interecept'] = pm.Data(f'intercept_data', np.ones((n_observations, 1)))
shared['predictors'] = pm.Data(f'predictors_data', data.loc[:, ['predictor0', 'predictor1']].values)
shared['groups'] = pm.Data( f'groups_data' \
, pa.dmatrix( '0 + group' \
, data = data \
, return_type = 'matrix' \
) \
) # could also use "group_bool" directly
## intercept
intercept = pm.Normal( 'intercept' \
, mu = 0., sd = 1. \
, shape = (1, n_observables) \
)
interceptset = tt.dot(shared['interecept'], intercept)
PrintShape('\tintercept', interceptset)
estimator = interceptset
## slopes
obslwise = []
for obsl in range(n_observables):
slopes = pm.Normal( f'slopes_obs{obsl}' \
, mu = 0., sd = 1. \
, shape = (n_groups, n_predictors) \
)
groupwise_slopes = tt.dot(shared['groups'], slopes)
groupdat = tt.sum(tt.mul(shared['predictors'], groupwise_slopes), 1)
# NOTE: you can print the shape of a theano/pm tensor with th.printing.Print():
# PrintShape('\tgrpdat', groupdat)
obslwise.append(groupdat)
slopeset = tt.stack(obslwise, axis = 1)
PrintShape('\tslopes', slopeset)
estimator += slopeset
## observable
if False:
# individual observables
# residual
residual = pm.HalfCauchy('residual', 1., shape = (1,n_observables))
PrintShape('\tresidual', residual)
# NOTE: residual must have a shape
posterior = pm.Normal( 'posterior' \
, mu = estimator \
, sd = residual \
, observed = data.loc[:, ['observable0', 'observable1']].values \
)
# NOTE: the "observed" data must be transposed if single observable!
else:
# multivariate observables
# LKJ Prior
sd_dist = pm.HalfNormal.dist(1.)
packed_cholesky = pm.LKJCholeskyCov( 'packed_cholesky' \
, n = n_observables \
, eta = 1. \
, sd_dist = sd_dist \
)
# compute the observables covariance and correlation matrix
cholesky_matrix = pm.expand_packed_triangular(n_observables, packed_cholesky, lower = True)
posterior = pm.MvNormal( 'posterior' \
, mu = estimator \
, chol = cholesky_matrix \
, observed = data.loc[:, ['observable0', 'observable1']].values \
)
# inference
trace = pm.sample( draws = 2**14, tune = 2**14 \
, cores = 4, chains = 4 \
, return_inferencedata = True \
)
# show outcome
print (pm.summary(trace))
pm.plot_trace(trace)
plt.show()
# NOTE: slopes of the second predictor are still off slightly (should be zero)
### saving and loading
print ('#### Deserialization ####')
if True:
filename = 'test.mdl'
# save
with open(filename, 'wb') as store:
pck.dump({ \
'model': model \
, 'trace': trace \
}, store)
# reload
with open(filename, 'rb') as store:
loaded = pck.load(store)
model = loaded['model']
trace = loaded['trace']
os.system(f'rm {filename}')
print ('stored and reloaded!', filename)
### predictive sampling
print ('#### Prediction ####')
n_predictions = 57
# adjust intercept shape
probe_data = {}
probe_data['intercept_data'] = np.ones((n_predictions, 1))
# new prediction values
predictors_probevalues = np.zeros((n_predictions, n_predictors))
predictors_probevalues[:, 0] = -2. # first predictor is at -2
probe_data['predictors_data'] = predictors_probevalues
# ... and groups
groups_probevalues = np.zeros((n_predictions,n_groups))
groups_probevalues[:,0] = 1. # sample the first group
probe_data['groups_data'] = groups_probevalues
# how often to repeat the n_predictions draws
n_repeats = 10
with model:
pm.set_data(probe_data)
prediction = pm.sample_posterior_predictive(trace, samples = n_repeats)
print ('\n') # NOTE: prediction requires an extra linebreak
print (prediction['posterior'].shape)
# NOTE: prediction has shape of (n_repeats, n_predictions); you can freely design your probes and repeats
# print (prediction['posterior'])
# plot the distribution of the prediction
y = prediction['posterior'][:,:,0].ravel()
distribution = stats.kde.gaussian_kde(y)
x = np.linspace(np.min(y), np.max(y), 101, endpoint = True)
plt.plot(x, distribution(x))
plt.axvline(actual_intercepts[0]-2*actual_slopes[0][0])
plt.gca().set_xlim([np.min(x), np.max(x)])
plt.gca().set_ylim([0, plt.gca().get_ylim()[1]])
plt.tight_layout()
plt.show()