I am currently engaged in a spline regression task and have been grappling with a particular issue for some time, yet a satisfactory solution eludes me. Recently, I encountered an example here that I’ve been experimenting with in an attempt to address the problem effectively. However, despite my efforts, I have not been able to devise a solution. Please, take a look at the code snippet below, which I extracted from the following thread: : https://discourse.pymc.io/t/posterior-predictive-distributions-of-parameter-values-differs-a-lot-from-posterior/13304
import numpy as np
import pymc as pm
import matplotlib.pyplot as plt
import pandas as pd
import pytensor.tensor as tt
def generate_input_data(timerange):
import random
return np.array([random.uniform(0.5, 0.7) for x in range(len(timerange))]).reshape(-1, 1)
def get_coefficient_val_month(date, coefficientknots):
coefficientval = 0
for knot, val in coefficientknots.items():
lb, ub = map(int, knot.split("-"))
if lb <= date.month <= ub:
coefficientval = val
break
return coefficientval
def get_coefficient_val(date, coefficientknots):
coefficientval = 0
for knot, val in coefficientknots.items():
lb, ub = map(int, knot.split("-"))
if lb <= date.day <= ub:
coefficientval = val
break
return coefficientval
def get_exponent_val(date, exponentknots):
exponentval = 1.0
for exponentknot, val in exponentknots.items():
lb, ub = map(int, exponentknot.split("-"))
if lb <= date.day <= ub:
exponentval = val
break
return exponentval
def compute_true_values(timerange, coefficientknots, coefficientknotsmonths, exponentknots):
true_values = []
for idx, date in enumerate(timerange):
coefficientval = get_coefficient_val(date, coefficientknots)
coefficientvalmonth = get_coefficient_val_month(date, coefficientknotsmonths)
exponentval = get_exponent_val(date, exponentknots)
if coefficientval == 0 or exponentval == 1.0:
raise ValueError("Invalid coefficient or exponent value")
val = generate_input_data([date])[0]
entry = coefficientvalmonth * coefficientval * val ** exponentval
true_values.append(entry)
return np.array(true_values)
def main():
timerange = pd.date_range(start="2023-01-01", end="2023-07-24", freq='D')
coefficientknots = {'0-15': 2.0, '15-32': 3.0}
exponentknots = {'0-15': 0.2, '15-32': 0.3}
coefficientknotsmonths = {'0-13': 4.0} #, '10-13': 7.0}
data = generate_input_data(timerange)
true_values = compute_true_values(timerange, coefficientknots, coefficientknotsmonths, exponentknots)
df = pd.DataFrame(np.c_[data, true_values], index=timerange, columns=['data', 'targets'])
df['day'] = df.index.day
df['month'] = df.index.month
knot_cutpoints = [0, 15, 32]
df['knot_idx'] = pd.cut(df['day'], knot_cutpoints, labels=False)
knot_cutpoints_month = [0, 6, 13]
df['knot_idx_month'] = pd.cut(df['month'], knot_cutpoints_month, labels=False)
df_train = df.iloc[:140].copy()
df_test = df.iloc[140:].copy()
knot_names = [f'{a}-{b}' for a, b in zip(knot_cutpoints[:-1], knot_cutpoints[1:])]
knot_names_month = [f'{a}-{b}' for a, b in zip(knot_cutpoints_month[:-1], knot_cutpoints[1:])]
coords = {'knot': knot_names,
'knot_names_month': knot_names_month}
with pm.Model(coords=coords) as model:
data_pt = pm.MutableData('input', df_train.data)
targets_pt = pm.MutableData(f'convresponse', df_train.targets)
knot_idx_pt = pm.MutableData('knot_idx', df_train.knot_idx)
knot_idx_pt_month = pm.MutableData('knot_idx_month', df_train.knot_idx_month)
knot_coef = pm.Normal(f'knot_coef', 0, 1, dims=['knot'])
knot_exponent = pm.Gamma(f'knot_exponent', 3, 3, dims=['knot'])
exp = pm.Deterministic('knot_exponent_deterministic', var=knot_exponent[knot_idx_pt])
knot_coef_month = pm.Normal(f'knot_coef_month', 0, 1, dims=['knot_names_month'])
coefficients = pm.Deterministic(name="coefficients_deterministic",
var=knot_coef[knot_idx_pt] * knot_coef_month[knot_idx_pt_month])
modelledresponse = tt.power(coefficients * data_pt,
exp)
sigma = pm.HalfCauchy(name="sigma", beta=10)
outcome = pm.Normal(name="outcome",
mu=modelledresponse,
sigma=sigma,
observed=targets_pt,
shape=data_pt.shape[0], # Important!
initval={'knot_exponent': [1, 1]}) # I found sometimes the exponents blow up
import pymc.sampling_jax as jax
trace = jax.sample_blackjax_nuts(1000,
tune=1000,
target_accept=0.97,
cores=2,
random_seed=0,
idata_kwargs={"log_likelihood": True},
return_inferencedata=True)
with model:
# idata = pm.sample_posterior_predictive(trace, extend_inferencedata=False)
pm.set_data({'input': df_test.data, 'knot_idx': df_test.knot_idx, 'knot_idx_month': df_test.knot_idx_month})
idata_new = pm.sample_posterior_predictive(trace, random_seed=0, extend_inferencedata=False,
var_names=["knot_exponent_deterministic",
"coefficients_deterministic"],
predictions=True)
coeff_deterministic = idata_new.predictions.coefficients_deterministic
knot_exponent_deterministic = idata_new.predictions.knot_exponent_deterministic
print("-")
if __name__ == '__main__':
main()
One problem in this code is the occurrence of the so-called dummy variable trap, as both the day-cutpoints and the monthly cutpoints encompass all categories or intervals. The desired approach is to exclude one interval, considering that the other one already covers the whole dataset.
Instead of defining knot_cutpoints_month as [0, 6, 13], the preference is to have a single parameter, knot_cutpoints_month = [0, 6], in order to circumvent the dummy variable trap.
However, the challenge lies in the indexing. Altering knot_cutpoints_month to [0, 6] results in an error, specifically “index must be integers or a boolean mask,” because the entire dataset has not been mapped with indexes, and NaN values exist where this “zero” value is. I would like to find a clever solution to this issue but i cannot come up with anything that does not involve manual handling/multiplying with zeros.