I’m new to PyMC3 and I’m reproducing the work in this blog to understand the underlying logic of Facebook’s Prophet algorithm. For reference, I’m using the code at the end of this post. I’ve noticed that the data used in the example is a time series of Wikipedia page views, which is essentially a count time series. They take the log of it and model that with a Normal distribution, as highlighted by this code:
pm.Normal('obs',
mu=y,
sd=sigma,
observed=df['y_scaled'])
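(As far as I can tell, the y column of example_wp_log_peyton_manning.csv already contains the log of the daily page views, so the observed variable is effectively the following; please correct me if I’m reading the data wrong.)
# my reading of the example data:
# df['y']        = log(daily page views)
# df['y_scaled'] = df['y'] / df['y'].max()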
So, I have three questions regarding this:

- Why, or when, is it better to use the log of a count time series in order to model it with a normal distribution?
- Given that we are working with counts, does it make sense to work with the negative binomial distribution instead? I actually went ahead and tried it, but I guess I’m doing something wrong because I get this error: ValueError: operands could not be broadcast together with shapes (500,2905) (500,). I’m using this to replace the corresponding lines:

def trend_model(m, t, n_changepoints=25, changepoints_prior_scale=0.1,
                growth_prior_scale=0.5, changepoint_range=0.8):
    """
    The piecewise linear trend with changepoint implementation in PyMC3.
    :param m: (pm.Model)
    :param t: (np.array) MinMax scaled time.
    :param n_changepoints: (int) The number of changepoints to model.
    :param changepoint_prior_scale: (flt/ None) The scale of the Laplace prior on the delta vector.
                                    If None, a hierarchical prior is set.
    :param growth_prior_scale: (flt) The standard deviation of the prior on the growth.
    :param changepoint_range: (flt) Proportion of history in which trend changepoints will be estimated.
    :return g, A, s: (tt.vector, np.array, tt.vector)
    """
    s = np.linspace(0, changepoint_range * np.max(t), n_changepoints + 1)[1:]
    # * 1 casts the boolean to integers
    A = (t[:, None] > s) * 1
    with m:
        # initial growth
        k = pm.Normal('k', 2, growth_prior_scale)
        if changepoints_prior_scale is None:
            changepoints_prior_scale = pm.Exponential('tau', 1.5)
        # rate of change
        delta = pm.Laplace('delta', 0, changepoints_prior_scale, shape=n_changepoints)
        # offset
        m = pm.Normal('m', 100, 50)
        gamma = -s * delta
        g = (m + det_dot(A, gamma)) * (k + det_dot(A, delta)) ** t
    return g, A, s

# Generate a PyMC3 Model context
m = pm.Model()
with m:
    y, A, s = trend_model(m, df['t'])
    # sigma = pm.HalfCauchy('sigma', 0.5, testval=1)
    alpha = pm.HalfCauchy('sigma', 0.5, testval=1)
    pm.NegativeBinomial('obs',
                        mu=y,
                        alpha=alpha,
                        observed=df['y_original'])
- In case I have a bunch of time series of page views from different sites, what is the most convenient way to modify this into a hierarchical model that takes advantage of the information in all the series, instead of creating a separate model for each one? A rough sketch of the kind of structure I have in mind is shown after this list.
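To make the last question a bit more concrete, this is a rough, untested sketch of what I mean: the changepoint deltas of all sites share a common prior scale tau, while each site gets its own growth rate, offset and delta vector. The number of sites and the observed data below are just placeholders.

import numpy as np
import pymc3 as pm

n_sites = 5                    # placeholder: number of page-view series
n_obs = 200                    # placeholder: observations per series
n_changepoints = 25
t = np.linspace(0, 1, n_obs)   # MinMax scaled time, shared by all series
s = np.linspace(0, 0.8 * t.max(), n_changepoints + 1)[1:]
A = (t[:, None] > s) * 1
fake_y = np.random.randn(n_obs, n_sites)   # placeholder for the scaled series (one column per site)

with pm.Model() as hierarchical_model:
    # population-level scale shared by the changepoint deltas of all sites
    tau = pm.Exponential('tau', 1.5)
    # one growth rate, offset and changepoint vector per site
    k = pm.Normal('k', 0, 5, shape=n_sites)
    m = pm.Normal('m', 0, 5, shape=n_sites)
    delta = pm.Laplace('delta', 0, tau, shape=(n_sites, n_changepoints))
    gamma = -delta * s                              # (n_sites, n_changepoints)
    # piecewise linear trend for every site at once, shape (n_obs, n_sites)
    g = (k + pm.math.dot(A, delta.T)) * t[:, None] + (m + pm.math.dot(A, gamma.T))
    sigma = pm.HalfCauchy('sigma', 0.5, shape=n_sites)
    pm.Normal('obs', mu=g, sd=sigma, observed=fake_y)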
Any help / suggestion / comment on these questions is highly appreciated!
Full code below:
import numpy as np
import pymc3 as pm
import matplotlib.pyplot as plt
import pandas as pd
np.random.seed(25)
n_changepoints = 10
t = np.arange(1000)
s = np.sort(np.random.choice(t, n_changepoints, replace=False))
A = (t[:, None] > s) * 1
delta = np.random.normal(size=n_changepoints)
k = 1
m = 5
growth = (k + A @ delta) * t
gamma = -s * delta
offset = m + A @ gamma
trend = growth + offset
plt.figure(figsize=(16, 3 * 3))
n = 310
i = 0
for t, f in zip(['Linear Trend with Changepoints', 'Growth rate', 'Growth offset'],
[trend, growth, offset]):
i += 1
plt.subplot(n + i)
plt.title(t)
plt.yticks([])
plt.vlines(s, min(f), max(f), lw=0.8, linestyles='--')
plt.plot(f)
plt.show()
def det_dot(a, b):
"""
    The theano dot product and the NUTS sampler don't seem to work with large matrices,
    so compute the dot product via broadcasting instead.
:param a: (np matrix)
:param b: (theano vector)
"""
return (a * b[None, :]).sum(axis=-1)
df = pd.read_csv('https://raw.githubusercontent.com/facebook/prophet/master/examples/example_wp_log_peyton_manning.csv')
# Make sure we work with datetime types
df['ds'] = pd.to_datetime(df['ds'])
# Scale the data
df['y_scaled'] = df['y'] / df['y'].max()
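# MinMax-scale time so that t runs from 0 to 1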
df['t'] = (df['ds'] - df['ds'].min()) / (df['ds'].max() - df['ds'].min())
df.plot(x='ds', y='y', figsize=(16, 6), title="Wikipedia pageviews for 'Peyton Manning'")
plt.show()
def trend_model(m, t, n_changepoints=25, changepoints_prior_scale=0.1,
growth_prior_scale=5, changepoint_range=0.8):
"""
The piecewise linear trend with changepoint implementation in PyMC3.
:param m: (pm.Model)
:param t: (np.array) MinMax scaled time.
:param n_changepoints: (int) The number of changepoints to model.
:param changepoint_prior_scale: (flt/ None) The scale of the Laplace prior on the delta vector.
If None, a hierarchical prior is set.
:param growth_prior_scale: (flt) The standard deviation of the prior on the growth.
:param changepoint_range: (flt) Proportion of history in which trend changepoints will be estimated.
:return g, A, s: (tt.vector, np.array, tt.vector)
"""
s = np.linspace(0, changepoint_range * np.max(t), n_changepoints + 1)[1:]
# * 1 casts the boolean to integers
A = (t[:, None] > s) * 1
with m:
# initial growth
k = pm.Normal('k', 0, growth_prior_scale)
if changepoints_prior_scale is None:
changepoints_prior_scale = pm.Exponential('tau', 1.5)
# rate of change
delta = pm.Laplace('delta', 0, changepoints_prior_scale, shape=n_changepoints)
# offset
m = pm.Normal('m', 0, 5)
gamma = -s * delta
g = (k + det_dot(A, delta)) * t + (m + det_dot(A, gamma))
return g, A, s
# Generate a PyMC3 Model context
m = pm.Model()
with m:
y, A, s = trend_model(m, df['t'])
sigma = pm.HalfCauchy('sigma', 0.5, testval=1)
pm.Normal('obs',
mu=y,
sd=sigma,
observed=df['y_scaled'])
def sanity_check(m, df):
"""
:param m: (pm.Model)
:param df: (pd.DataFrame)
"""
# Sample from the prior and check if the model is well defined.
y = pm.sample_prior_predictive(model=m, vars=['obs'])['obs']
plt.figure(figsize=(16, 6))
plt.plot(y.mean(0), label='mean prior')
plt.fill_between(np.arange(y.shape[1]), -y.std(0), y.std(0), alpha=0.25, label='standard deviation')
plt.plot(df['y_scaled'], label='true value')
plt.legend()
plt.show()
# And run the sanity check
sanity_check(m, df)
# Find a point estimate of the models parameters
with m:
aprox = pm.find_MAP()
# Determine g, based on the parameters
def det_trend(k, m, delta, t, s, A):
return (k + np.dot(A, delta)) * t + (m + np.dot(A, (-s * delta)))
# run function and rescale to original scale
g = det_trend(aprox['k'], aprox['m'], aprox['delta'], df['t'], s, A) * df['y'].max()
plt.figure(figsize=(16, 6))
plt.title('$g(t)$')
plt.plot(g)
plt.scatter(np.arange(df.shape[0]), df.y, s=0.5, color='black')
plt.show()
np.random.seed(6)
def fourier_series(t, p=365.25, n=10):
# 2 pi n / p
x = 2 * np.pi * np.arange(1, n + 1) / p
# 2 pi n / p * t
x = x * t[:, None]
x = np.concatenate((np.cos(x), np.sin(x)), axis=1)
return x
def seasonality_model(m, df, period='yearly', seasonality_prior_scale=10):
if period == 'yearly':
n = 10
# rescale the period, as t is also scaled
p = 365.25 / (df['ds'].max() - df['ds'].min()).days
else: # weekly
n = 3
# rescale the period, as t is also scaled
p = 7 / (df['ds'].max() - df['ds'].min()).days
x = fourier_series(df['t'], p, n)
with m:
beta = pm.Normal(f'beta_{period}', mu=0, sd=seasonality_prior_scale, shape=2 * n)
return x, beta
m = pm.Model()
with m:
# changepoints_prior_scale is None, so the exponential distribution
# will be used as prior on \tau.
y, A, s = trend_model(m, df['t'], changepoints_prior_scale=None)
x_yearly, beta_yearly = seasonality_model(m, df, 'yearly')
x_weekly, beta_weekly = seasonality_model(m, df, 'weekly')
y += det_dot(x_yearly, beta_yearly) + det_dot(x_weekly, beta_weekly)
sigma = pm.HalfCauchy('sigma', 0.5, testval=1)
obs = pm.Normal('obs',
mu=y,
sd=sigma,
observed=df['y_scaled'])
with m:
trace = pm.sample(500)
pm.traceplot(trace)
def det_seasonality_posterior(beta, x):
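    # project every posterior sample of beta onto the Fourier features -> shape (n_obs, n_samples)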
return np.dot(x, beta.T)
p = 0.025
# vector distributions
beta_yearly = trace['beta_yearly']
beta_weekly = trace['beta_weekly']
delta = trace['delta']
# scalar distributions
k = trace['k']
m = trace['m']
# determine the posterior by evaluating all the values in the trace.
trend_posterior = ((k + np.dot(A, delta.T)) * df['t'][:, None] + m + np.dot(A, (-s * delta).T)) * df['y'].max()
yearly_posterior = det_seasonality_posterior(beta_yearly, x_yearly) * df['y'].max()
weekly_posterior = det_seasonality_posterior(beta_weekly, x_weekly) * df['y'].max()
date = df['ds'].dt.to_pydatetime()
sunday = np.argmax(df['ds'].dt.dayofweek)
weekdays = ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday']
idx_year = np.argmax(df['ds'].dt.dayofyear)
plt.figure(figsize=(16, 3*6))
b = 411
plt.subplot(b)
plt.title('total')
plt.plot(date,
(trend_posterior + yearly_posterior + weekly_posterior).mean(1), lw=0.5)
plt.scatter(date, df['y'], s=0.5, color='black')
plt.subplot(b + 1)
plt.title('trend')
plt.plot(date, trend_posterior.mean(1))
quant = np.quantile(trend_posterior, [p, 1 - p], axis=1)
plt.fill_between(date, quant[0, :], quant[1, :], alpha=0.25)
plt.subplot(b + 2)
plt.title('yearly')
plt.plot(date[idx_year: idx_year + 365], yearly_posterior.mean(1)[idx_year: idx_year + 365])
quant = np.quantile(yearly_posterior, [p, 1 - p], axis=1)
plt.fill_between(date[idx_year: idx_year + 365],
quant[0, idx_year: idx_year + 365], quant[1, idx_year: idx_year + 365], alpha=0.25)
plt.subplot(b + 3)
plt.title('weekly')
plt.plot(weekdays, weekly_posterior.mean(1)[sunday: sunday + 7])
quant = np.quantile(weekly_posterior, [p, 1 - p], axis=1)
plt.fill_between(weekdays, quant[0, sunday: sunday + 7],
quant[1, sunday: sunday + 7], alpha=0.25)