Hold on, I might have misinterpreted your example in https://discourse.pymc.io/t/interpretation-of-posterior-predictive-checks-for-a-gaussian-process/12415/9 .
I think the example below will really showcase what I want to get at (thanks for your patience).
Consider the code below:
import numpy as np
import pymc as pm
import matplotlib.pyplot as plt
import pandas as pd
import pytensor.tensor as tt
def generate_input_data(timerange, low=0.5, high=0.7):
    """Return a (len(timerange), 1) column array of uniform random draws.

    Parameters
    ----------
    timerange : sized iterable
        Only its length is used; one draw is produced per element.
    low, high : float, optional
        Bounds of the uniform distribution. Defaults preserve the original
        hard-coded 0.5-0.7 behaviour.
    """
    import random
    return np.array([random.uniform(low, high) for _ in range(len(timerange))]).reshape(-1, 1)
def get_coefficient_val_month(date, coefficientknots):
    """Look up the monthly coefficient for *date*.

    Keys of *coefficientknots* are "lb-ub" month ranges (inclusive on both
    ends); the value of the first range containing date.month is returned,
    or 0 if no range matches.
    """
    for knot_range, knot_value in coefficientknots.items():
        lower, upper = (int(bound) for bound in knot_range.split("-"))
        if lower <= date.month <= upper:
            return knot_value
    return 0
def get_coefficient_val(date, coefficientknots):
    """Look up the day-of-month coefficient for *date*.

    Keys of *coefficientknots* are "lb-ub" day ranges (inclusive on both
    ends); the value of the first range containing date.day is returned,
    or 0 if no range matches.
    """
    for knot_range, knot_value in coefficientknots.items():
        lower, upper = (int(bound) for bound in knot_range.split("-"))
        if lower <= date.day <= upper:
            return knot_value
    return 0
def get_exponent_val(date, exponentknots):
    """Look up the day-of-month exponent for *date*.

    Keys of *exponentknots* are "lb-ub" day ranges (inclusive on both ends);
    the value of the first range containing date.day is returned, or the
    1.0 default if no range matches.
    """
    for knot_range, knot_value in exponentknots.items():
        lower, upper = (int(bound) for bound in knot_range.split("-"))
        if lower <= date.day <= upper:
            return knot_value
    return 1.0
def compute_true_values(timerange, coefficientknots, coefficientknotsmonths, exponentknots):
    """Build the synthetic target series coef_month * coef_day * x ** exponent.

    Parameters
    ----------
    timerange : iterable of dates
        One target value is produced per date.
    coefficientknots, coefficientknotsmonths, exponentknots : dict
        "lb-ub" range -> value maps consumed by the lookup helpers.

    Returns
    -------
    np.ndarray of the generated targets (one entry per date).

    Raises
    ------
    ValueError
        If a date matches no day-coefficient knot (lookup returned 0) or no
        exponent knot (lookup left the 1.0 default) -- both indicate the
        knot maps do not cover the date range.
    """
    true_values = []
    # NOTE: the original used enumerate() here but never read the index.
    for date in timerange:
        coefficientval = get_coefficient_val(date, coefficientknots)
        coefficientvalmonth = get_coefficient_val_month(date, coefficientknotsmonths)
        exponentval = get_exponent_val(date, exponentknots)
        if coefficientval == 0 or exponentval == 1.0:
            raise ValueError("Invalid coefficient or exponent value")
        # One fresh random draw per date, same generator as the input data.
        val = generate_input_data([date])[0]
        entry = coefficientvalmonth * coefficientval * val ** exponentval
        true_values.append(entry)
    return np.array(true_values)
def main():
    """End-to-end demo: simulate knot-based data, fit a PyMC model on a
    training window, then sample posterior-predictive quantities on the
    held-out test window."""
    timerange = pd.date_range(start="2023-01-01", end="2023-07-24", freq='D')
    coefficientknots = {'0-15': 2.0, '15-32': 3.0}
    exponentknots = {'0-15': 0.2, '15-32': 0.3}
    coefficientknotsmonths = {'0-6': 4.0, '6-13': 7.0}
    data = generate_input_data(timerange)
    true_values = compute_true_values(timerange, coefficientknots, coefficientknotsmonths, exponentknots)
    # Gather stuff into a dataframe for organization
    df = pd.DataFrame(np.c_[data, true_values], index=timerange, columns=['data', 'targets'])
    df['day'] = df.index.day
    df['month'] = df.index.month
    # Create the map by assigning a bin index to each datapoint based on which day it falls into
    knot_cutpoints = [0, 15, 32]
    df['knot_idx'] = pd.cut(df['day'], knot_cutpoints, labels=False)
    knot_cutpoints_month = [0, 6, 13]
    df['knot_idx_month'] = pd.cut(df['month'], knot_cutpoints_month, labels=False)
    # Train-test split includes the knot mapping
    df_train = df.iloc[:140].copy()
    df_test = df.iloc[140:].copy()
    # These names will be used to make the model output look nicer
    knot_names = [f'{a}-{b}' for a, b in zip(knot_cutpoints[:-1], knot_cutpoints[1:])]
    # BUG FIX: the upper bounds must come from the *month* cutpoints; the
    # original zipped against knot_cutpoints[1:] (the day cutpoints), which
    # produced mislabeled month coords '0-15', '6-32'.
    knot_names_month = [f'{a}-{b}' for a, b in zip(knot_cutpoints_month[:-1], knot_cutpoints_month[1:])]
    # Coords let us refer to dimensions by their name, rather than having to remember their shapes
    coords = {'knot': knot_names,
              'knot_names_month': knot_names_month}
    with pm.Model(coords=coords) as model:
        # MutableData containers so the inputs can be swapped for test data later
        data_pt = pm.MutableData('input', df_train.data)
        targets_pt = pm.MutableData('convresponse', df_train.targets)
        knot_idx_pt = pm.MutableData('knot_idx', df_train.knot_idx)
        knot_idx_pt_month = pm.MutableData('knot_idx_month', df_train.knot_idx_month)
        # One coefficient/exponent per day-knot, one coefficient per month-knot
        knot_coef = pm.Normal('knot_coef', 0, 1, dims=['knot'])
        knot_exponent = pm.Gamma('knot_exponent', 3, 3, dims=['knot'])
        exp = pm.Deterministic('knot_exponent_deterministic', var=knot_exponent[knot_idx_pt])
        knot_coef_month = pm.Normal('knot_coef_month', 0, 1, dims=['knot_names_month'])
        coefficients = pm.Deterministic(name="coefficients",
                                        var=knot_coef[knot_idx_pt] * knot_coef_month[knot_idx_pt_month])
        modelledresponse = tt.power(coefficients * data_pt,
                                    exp)
        sigma = pm.HalfCauchy(name="sigma", beta=10)
        outcome = pm.Normal(name="outcome",
                            mu=modelledresponse,
                            sigma=sigma,
                            observed=targets_pt,
                            shape=data_pt.shape[0],  # Important!
                            initval={'knot_exponent': [1, 1]})  # I found sometimes the exponents blow up
        import pymc.sampling_jax as jax
        trace = jax.sample_blackjax_nuts(1000,
                                         tune=1000,
                                         target_accept=0.97,
                                         cores=2,
                                         random_seed=0,
                                         idata_kwargs={"log_likelihood": True},
                                         return_inferencedata=True)
    with model:
        # idata = pm.sample_posterior_predictive(trace, extend_inferencedata=False)
        # Swap in the test window before posterior-predictive sampling
        pm.set_data({'input': df_test.data, 'knot_idx': df_test.knot_idx, 'knot_idx_month': df_test.knot_idx_month})
        idata = pm.sample_posterior_predictive(trace, random_seed=0, var_names=["outcome",
                                                                               "knot_coef", "knot_exponent",
                                                                               "knot_exponent_deterministic",
                                                                               "coefficients"],
                                               extend_inferencedata=False, predictions=True)
        exponents = idata.predictions["knot_exponent_deterministic"]
        coefficients = idata.predictions["coefficients"]
        idata_new = pm.sample_posterior_predictive(trace, random_seed=0, extend_inferencedata=False,
                                                   var_names=["knot_exponent_deterministic", "coefficients"],
                                                   predictions=True)
        print("-")


if __name__ == '__main__':
    main()
To my understanding, the variables exponents and coefficients here below will correspond to PRIOR samples and not the posterior samples used to compute the posterior predictive distribution:
idata = pm.sample_posterior_predictive(trace, random_seed=0, var_names=["outcome","knot_coef", "knot_exponent","knot_exponent_deterministic","coefficients"],
extend_inferencedata=False, predictions=True)
exponents = idata.predictions["knot_exponent_deterministic"]
coefficients = idata.predictions["coefficients"]
and the same applies here:
idata_new = pm.sample_posterior_predictive(trace, random_seed=0, extend_inferencedata=False,
var_names=["knot_exponent_deterministic", "coefficients"],
predictions=True)
But I might have misunderstood your example: if var_names includes the outcome, then the other var_names yield the posterior samples from which the posterior predictive is drawn.
If var_names does not include the outcome, then var_names will correspond to priors?
Thus including “outcome” or “y” or whatever your target is in var_names will automatically generate the posterior samples for the variables the target DEPENDS on. If it is not included it will generate the priors?
If that is the case we are set, and to my understanding exponents and coefficients above then correspond to what I want, namely posteriors mapped over the test data.
If that is not the case — if exponents = idata.predictions["knot_exponent_deterministic"] above corresponds to PRIORS — I am stuck.