Is it feasible to define the model by utilizing both training and test data when partitioning the data, and subsequently sample posteriors based solely on the training data, followed by running a posterior predictive on the testing data? Alternatively, are we compelled to explicitly retrieve the posteriors and re-map them for predictions?
By referring to a complex mapping, I mean the mapping of certain random variables (RVs) to days, certain RVs to the day of the month, months, etc. If new data is introduced, the modelwill not have the correct information to perform this mapping.
In this scenario, I sense that i am unable to determine how to utilize set_data, as I require the correct parameters to be mapped to the new data.
Consider e.g the following simple example with a datebased mapping:
import numpy as np
import pymc as pm
import matplotlib.pyplot as plt
import pandas as pd
import pytensor.tensor as tt
def generate_input_data(timerange):
import random
return np.array([random.uniform(0.5, 0.7) for x in range(len(timerange))]).reshape(-1, 1)
def get_coefficient_val(date, coefficientknots):
coefficientval = 0
for knot, val in coefficientknots.items():
lb, ub = map(int, knot.split("-"))
if lb <= date.day <= ub:
coefficientval = val
break
return coefficientval
def get_exponent_val(date, exponentknots):
exponentval = 1.0
for exponentknot, val in exponentknots.items():
lb, ub = map(int, exponentknot.split("-"))
if lb <= date.day <= ub:
exponentval = val
break
return exponentval
def compute_true_values(timerange, coefficientknots, exponentknots):
true_values = []
for idx, date in enumerate(timerange):
coefficientval = get_coefficient_val(date, coefficientknots)
exponentval = get_exponent_val(date, exponentknots)
if coefficientval == 0 or exponentval == 1.0:
raise ValueError("Invalid coefficient or exponent value")
val = generate_input_data([date])[0]
entry = coefficientval * val ** exponentval
true_values.append(entry)
return np.array(true_values)
def run_model(timerange, coefficientknots, exponentknots, train_input, train_labels, test_input, test_labels):
with pm.Model() as model:
train_labels = pm.Data(f'convresponse', train_labels)
train_input = pm.Data('input', train_input)
coefficientknots_ = {}
for interval in coefficientknots:
knotcoefficient = pm.Normal(f'{interval}-knot', 0, 1)
coefficientknots_[interval] = knotcoefficient
exponentknots_ = {}
for interval in exponentknots:
knotexponent = pm.Gamma(f'{interval}-exponentknot', 3, 3)
exponentknots_[interval] = knotexponent
coefficients = []
exponents = []
for date in timerange:
coefficientknot = next(val for key, val in coefficientknots_.items()
if int(key.split("-")[0]) <= date.day <= int(key.split("-")[1]))
exponentknot = next(val for key, val in exponentknots_.items()
if int(key.split("-")[0]) <= date.day <= int(key.split("-")[1]))
coefficients.append(coefficientknot)
exponents.append(exponentknot)
coefficients = tt.as_tensor_variable(coefficients)
exponents = tt.as_tensor_variable(exponents)
modelledresponse = np.power(coefficients * train_input, exponents)
sigma = pm.HalfCauchy(name="sigma", beta=10)
_ = pm.Normal(name="outcome",
mu=modelledresponse,
sigma=sigma,
observed=train_labels)
trace = pm.sample()
def main():
timerange = pd.date_range(start="2023-01-01", end="2023-03-24", freq='D')
coefficientknots = {'0-15': 2.0, '15-32': 3.0}
exponentknots = {'0-15': 0.9, '15-32': 0.3}
input = generate_input_data(timerange)
true_values = compute_true_values(timerange, coefficientknots, exponentknots)
train_input = input[:140].squeeze()
train_labels = true_values[:140].squeeze()
test_input = input[140:].squeeze()
test_labels = input[140:].squeeze()
run_model(timerange, coefficientknots, exponentknots, train_input, train_labels, test_input, test_labels)
if __name__ == '__main__':
main()
I need to map the test_inputs to the correct dates over the horizon which i wish to predict over.