Hello,
I have been working on parameter identification for a bimodal distribution, as follows:
Identification of multimodal distributions
The aim of this notebook is to review how pymc3 handles the identification of a mixture of Gaussians from generated data.
```python
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import theano
import theano.tensor as tt
```
First we define a generic function that generates linear data
$$y = a\,x + b + \epsilon$$
with $\epsilon$ following a Gaussian law with zero mean and standard deviation $\sigma$.
```python
def generate_data(size=100, true_intercept=1, true_slope=2, scale=.1):
    # generate noisy observations around y = true_intercept + true_slope * x
    x = np.random.uniform(0., 1., size=size)
    true_regression_line = true_intercept + true_slope * x
    # add Gaussian noise
    # np.random.seed(453)
    y = true_regression_line + np.random.normal(scale=scale, size=size)
    return x, y, true_regression_line
```
1- Constant values
In this part, the relation between y and x is constant (i.e. the slope $a = 0$).
We consider two components with respective means $\mu_1 = 0$ and $\mu_2 = 1$.
```python
def generate_mixed(size=100, data_plot=True):
    # data generation: two constant components at y = 0 and y = 1
    x_data_1, y_data_1, true_regression_line = generate_data(size=size, true_slope=0., true_intercept=0)
    x_data_2, y_data_2, true_regression_line_2 = generate_data(size=size, true_slope=0., true_intercept=1)
    if data_plot:
        # display data & true regression lines
        fig = plt.figure(figsize=(7, 7))
        ax = fig.add_subplot(111, xlabel='x', ylabel='y', title='Generated data and underlying model')
        ax.plot(x_data_1, y_data_1, 'x', label='sampled data 1')
        ax.plot(x_data_2, y_data_2, 'o', label='sampled data 2')
        ax.plot(x_data_1, true_regression_line, label='true regression line', lw=2.)
        ax.plot(x_data_2, true_regression_line_2, label='true regression line 2', lw=3.)
        plt.legend(loc=0)
        plt.show()
    x_data = np.concatenate((x_data_1, x_data_2))
    y_data = np.concatenate((y_data_1, y_data_2))
    print(' Overall y mean={0:.2f}'.format(y_data.mean()))
    print('Component y_1 mean={0:.2f}'.format(y_data_1.mean()))
    print('Component y_2 mean={0:.2f}'.format(y_data_2.mean()))
    data = {'x': x_data, 'y': y_data}
    return data
```
```python
data = generate_mixed(size=1000)
x_data = data['x']
y_data = data['y']
```

```
 Overall y mean=0.50
Component y_1 mean=0.00
Component y_2 mean=0.99
```
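As expected, the overall mean is the equal-weight average of the two component means: $0.5 \times 0 + 0.5 \times 1 = 0.5$.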
```python
sns.distplot(y_data)
```
The model assumes the data come from a mixture of two Gaussians.
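Concretely, `NormalMixture` marginalizes out the discrete component assignment, so the likelihood of each observation is
$$p(y \mid w, \mu, \sigma) = \sum_{k=1}^{2} w_k \, \mathcal{N}(y \mid \mu_k, \sigma_k^2).$$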
```python
# https://docs.pymc.io/notebooks/marginalized_gaussian_mixture_model.html
W = np.array([0., 0.])

with pm.Model() as model_mixed:
    w = pm.Dirichlet('w', np.ones_like(W))
    mu = pm.Normal('mu', 0., 10., shape=W.size)
    sigma = pm.Uniform('sigma', 0., 5., shape=W.size)
    # likelihood: marginalized mixture of Gaussians
    y_obs = pm.NormalMixture('y_obs', w, mu, sd=sigma, observed=y_data)
    trace_mixed = pm.sample(3000, cores=4, step=pm.SMC())

pm.traceplot(trace_mixed)
pm.summary(trace_mixed)
```
```
Sample initial stage: ...
Stage: 0 Beta: 0.000325 Steps: 25
100%|██████████| 3000/3000 [00:27<00:00, 110.90it/s]
...
Stage: 20 Beta: 1.000000 Steps: 4
100%|██████████| 3000/3000 [00:04<00:00, 602.59it/s]
```
|          |     mean |       sd | mc_error |   hpd_2.5 | hpd_97.5 |
|----------|---------:|---------:|---------:|----------:|---------:|
| mu__0    | 0.994766 | 0.003140 | 0.000060 |  0.988806 | 1.000994 |
| mu__1    | 0.004642 | 0.003169 | 0.000055 | -0.001040 | 0.011343 |
| w__0     | 0.500689 | 0.011411 | 0.000209 |  0.480149 | 0.523877 |
| w__1     | 0.499311 | 0.011411 | 0.000209 |  0.476123 | 0.519851 |
| sigma__0 | 0.097263 | 0.002185 | 0.000035 |  0.093006 | 0.101483 |
| sigma__1 | 0.100837 | 0.002264 | 0.000043 |  0.096461 | 0.105173 |
```python
ppc = pm.sample_posterior_predictive(trace_mixed, samples=500, model=model_mixed)
samples_from_posterior = np.asarray(ppc['y_obs'])
print(samples_from_posterior.shape)

_, ax = plt.subplots(figsize=(12, 6))
# one light KDE per posterior predictive draw
for y in ppc['y_obs']:
    sns.distplot(y, hist=False, ax=ax, kde_kws={"lw": 1, 'alpha': 0.3})
# KDE of the observed data on top
sns.distplot(y_data, ax=ax, hist=False,
             kde_kws={"color": "k", "lw": 3, "label": "Dataset"},
             hist_kws={"histtype": "step", "linewidth": 3,
                       "alpha": 1, "color": "g"})
```
```
100%|██████████| 500/500 [00:00<00:00, 570.65it/s]
(500, 2000)
```

The shape (500, 2000) corresponds to 500 posterior predictive draws, each replicating the 2000 observed points; if the model fits, the light KDEs should bracket the black KDE of the data.
As you can see, things are going pretty well (in my opinion): the posterior recovers the true means, standard deviations, and equal weights, up to the ordering of the component labels (here `mu__0` matches the second component).

I would now like to calculate the probability of $w$ given a new data point $y$, i.e. the probability that a new observation belongs to each component. How can I do that?
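To make the question concrete, here is a sketch of what I have in mind (the `membership_probability` helper and the scipy-based averaging are my own guess, not necessarily the recommended way): for each posterior draw, compute the responsibilities $p(k \mid y_\text{new}) \propto w_k \, \mathcal{N}(y_\text{new} \mid \mu_k, \sigma_k^2)$, then average them over the draws.

```python
import scipy.stats as st

def membership_probability(trace, y_new):
    # posterior draws, each array of shape (n_draws, 2)
    w, mu, sigma = trace['w'], trace['mu'], trace['sigma']
    # per-draw, per-component weighted likelihood of the new point
    lik = w * st.norm.pdf(y_new, loc=mu, scale=sigma)
    # normalize across components, then average over posterior draws
    resp = lik / lik.sum(axis=1, keepdims=True)
    return resp.mean(axis=0)

# e.g. a point close to the second component's mean
print(membership_probability(trace_mixed, y_new=0.8))
```

Is this the right approach, or is there a built-in way to do it in pymc3?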
Thank you in advance.