I wish to use the weighted average of multiple models to predict on unseen data.
I am unable to use sample_ppc_w when the models to average contain a Deterministic which depends on Theano shared variables.
Here is a simplified example (also in notebook format here)
Model the difference between a & b with the aim of predicting an HPD range for b when a new a becomes available.
Data
df = pd.DataFrame(dict(a=np.random.normal(0.1, 1.5, 1000),
b=np.random.normal(-0.05, 1.2, 1000)))
df['diff'] = df['a'] - df['b']
The averaging method I’m following is shown in the Model Averaging notebook in the docs.
I’m successfully able to average multiple models when using the calculated diff for observed values.
Model 1
with pm.Model() as model_t1:
mu = pm.Uniform('mu', lower=-5, upper=5)
sd = pm.Uniform('sd', lower=0, upper=10)
nu = pm.Uniform('nu', lower=0, upper=10)
y = pm.StudentT('y', mu=mu, nu=nu, sd=sd, observed=df['diff'])
trace_t1 = pm.sample(20000, tune=1000, njobs=2)
btrace_t1 = trace_t1[1000:]
Model 2
with pm.Model() as model_l1:
mu = pm.Uniform('mu', lower=-5, upper=5)
b = pm.Uniform('b', lower=0, upper=10)
y = pm.Laplace('y', mu=mu, b=b, observed=df['diff'])
trace_l1 = pm.sample(20000, tune=1000, njobs=2)
btrace_l1 = trace_l1[1000:]
Comparison
models_dict1 = {
model_t1: btrace_t1,
model_l1: btrace_l1,
}
comparison = pm.compare(models_dict1, method='stacking', ic='WAIC')
Sampling from the weighted average
traces1 = [btrace_t1, btrace_l1]
models1 = [model_t1, model_l1]
ppc_w1 = pm.sample_ppc_w(traces1, models=models1, weights=comparison1w.weight.sort_index(ascending=True))
Plot of Weighted sampling vs Observed df[‘diff’]
However I wish to predict on unseen data.
The prediction method I’m following is to use Theano shared variables for a & b and model diff using a Deterministic variable as shown in section 4.1 of the Quickstart docs.
I’m successfully able to use this method to predict on unseen data with a single model:
Predicting with Model 1
a_shared_t = tt.shared(df['a'].values)
b_shared_t = tt.shared(df['b'].values)
with pm.Model() as model_t2:
arv = pm.Normal('arv', mu=df['a'].mean(), sd=df['a'].std(), observed=a_shared_t)
brv = pm.Normal('brv', mu=df['b'].mean(), sd=df['b'].std(), observed=b_shared_t)
diff = pm.Deterministic('diff', arv - brv)
mu = pm.Uniform('mu', lower=-5, upper=5)
sd = pm.Uniform('sd', lower=0, upper=10)
nu = pm.Uniform('nu', lower=0, upper=10)
y = pm.StudentT('y', mu=mu, nu=nu, sd=sd, observed=diff)
trace_t2 = pm.sample(20000, tune=1000, njobs=2)
btrace_t2 = trace_t2[1000:]
observed_a = 1.8
a_shared_t.set_value(np.append(df['a'].values, observed_a))
b_shared_t.set_value(np.append(df['a'].values, 0.))
ppc_t2_updated = pm.sample_ppc(btrace_t2,
samples=10000,
model=model_t2,
size=100)
sample = ppc_t2_updated['y'][:, -1]
hpd_80_diff = pm.stats.hpd(sample, alpha=0.2)
predicted_b_range = observed_a - hpd_80_diff[1], observed_a - hpd_80_diff[0]
== (-0.7764686082453378, 3.890428414268656)
Predicting with Model 2
a_shared_l = tt.shared(df['a'].values)
b_shared_l = tt.shared(df['b'].values)
with pm.Model() as model_l2:
arv = pm.Normal('arv', mu=df['a'].mean(), sd=df['a'].std(), observed=a_shared_l)
brv = pm.Normal('brv', mu=df['b'].mean(), sd=df['b'].std(), observed=b_shared_l)
diff = pm.Deterministic('diff', arv - brv)
mu = pm.Uniform('mu', lower=-5, upper=5)
b = pm.Uniform('b', lower=0, upper=10)
y = pm.Laplace('y', mu=mu, b=b, observed=diff)
trace_l2 = pm.sample(20000, tune=1000, njobs=2)
btrace_l2 = trace_l2[1000:]
observed_a = 1.8
dummy = 0.
a_shared_l.set_value(np.append(df['a'].values, observed_a))
b_shared_l.set_value(np.append(df['a'].values, dummy))
ppc_l2_updated = pm.sample_ppc(btrace_l2,
samples=10000,
model=model_l2,
size=100)
sample = ppc_l2_updated['y'][:, -1]
hpd_80_diff = pm.stats.hpd(sample, alpha=0.2)
predicted_b_range = observed_a - hpd_80_diff[1], observed_a - hpd_80_diff[0]
== (-0.8231007149866174, 3.9518915820568434)
However when trying to sample from the averaged model using pm.sample_ppc_w:
traces2 = [btrace_t2, btrace_l2]
models2 = [model_t2, model_l2]
ppc_w2 = pm.sample_ppc_w(traces2, models=models2)
I get the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~/.pyenv/versions/3.6.4/envs/ganesha/lib/python3.6/site-packages/numpy/core/fromnumeric.py in _wrapfunc(obj, method, *args, **kwds)
51 try:
---> 52 return getattr(obj, method)(*args, **kwds)
53
AttributeError: 'list' object has no attribute 'repeat'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-197-2cc68ceb6ac2> in <module>()
1 traces2 = [btrace_t2, btrace_l2]
2 models2 = [model_t2, model_l2]
----> 3 ppc_w2 = pm.sample_ppc_w(traces2, models=models2)
~/.pyenv/versions/3.6.4/envs/ganesha/lib/python3.6/site-packages/pymc3/sampling.py in sample_ppc_w(traces, samples, models, weights, random_seed, progressbar)
1160
1161 obs = [x for m in models for x in m.observed_RVs]
-> 1162 variables = np.repeat(obs, n)
1163
1164 lengths = list(set([np.atleast_1d(observed).shape for observed in obs]))
~/.pyenv/versions/3.6.4/envs/ganesha/lib/python3.6/site-packages/numpy/core/fromnumeric.py in repeat(a, repeats, axis)
421
422 """
--> 423 return _wrapfunc(a, 'repeat', repeats, axis=axis)
424
425
~/.pyenv/versions/3.6.4/envs/ganesha/lib/python3.6/site-packages/numpy/core/fromnumeric.py in _wrapfunc(obj, method, *args, **kwds)
60 # a downstream library like 'pandas'.
61 except (AttributeError, TypeError):
---> 62 return _wrapit(obj, method, *args, **kwds)
63
64
~/.pyenv/versions/3.6.4/envs/ganesha/lib/python3.6/site-packages/numpy/core/fromnumeric.py in _wrapit(obj, method, *args, **kwds)
40 except AttributeError:
41 wrap = None
---> 42 result = getattr(asarray(obj), method)(*args, **kwds)
43 if wrap:
44 if not isinstance(result, mu.ndarray):
ValueError: operands could not be broadcast together with shape (6,) (2,)
It seems like it could be due to a mismatch between the number of tracked variables but for the life of me I can’t figure out how to resolve this.
Is it possible to use sample_ppc_w when averaging models containing a Deterministic which in turn depends on Theano shared variables?
Full disclosure - I am not a statistician and as awesome as I think this project is, my understanding of both Bayesian analysis as well as the proper use of PyMC3 is still pretty limited.
Hence all input will be appreciated, especially if I’m barking up the wrong tree!
Many thanks