Hold on, I might have misinterpreted your example in https://discourse.pymc.io/t/interpretation-of-posterior-predictive-checks-for-a-gaussian-process/12415/9 .
I think the example below will really showcase what I want to get at (thanks for your patience).
Consider the code below:
import numpy as np
import pymc as pm
import matplotlib.pyplot as plt
import pandas as pd
import pytensor.tensor as tt
def generate_input_data(timerange, low=0.5, high=0.7):
    """Return a (len(timerange), 1) column array of uniform random draws.

    Parameters
    ----------
    timerange : sized iterable
        Only its length is used; one draw is produced per element.
    low, high : float, optional
        Bounds of the uniform distribution. Defaults preserve the original
        hard-coded 0.5-0.7 behaviour.
    """
    import random
    return np.array([random.uniform(low, high) for _ in range(len(timerange))]).reshape(-1, 1)
def get_coefficient_val_month(date, coefficientknots):
    """Look up the monthly coefficient for *date*.

    Keys of *coefficientknots* are "lb-ub" month ranges (inclusive on both
    ends); the value of the first range containing date.month is returned,
    or 0 if no range matches.
    """
    for knot_range, knot_value in coefficientknots.items():
        lower, upper = (int(bound) for bound in knot_range.split("-"))
        if lower <= date.month <= upper:
            return knot_value
    return 0
def get_coefficient_val(date, coefficientknots):
    """Look up the day-of-month coefficient for *date*.

    Keys of *coefficientknots* are "lb-ub" day ranges (inclusive on both
    ends); the value of the first range containing date.day is returned,
    or 0 if no range matches.
    """
    for knot_range, knot_value in coefficientknots.items():
        lower, upper = (int(bound) for bound in knot_range.split("-"))
        if lower <= date.day <= upper:
            return knot_value
    return 0
def get_exponent_val(date, exponentknots):
    """Look up the day-of-month exponent for *date*.

    Keys of *exponentknots* are "lb-ub" day ranges (inclusive on both ends);
    the value of the first range containing date.day is returned, or the
    1.0 default if no range matches.
    """
    for knot_range, knot_value in exponentknots.items():
        lower, upper = (int(bound) for bound in knot_range.split("-"))
        if lower <= date.day <= upper:
            return knot_value
    return 1.0
def compute_true_values(timerange, coefficientknots, coefficientknotsmonths, exponentknots):
    """Build the synthetic target series coef_month * coef_day * x ** exponent.

    Parameters
    ----------
    timerange : iterable of dates
        One target value is produced per date.
    coefficientknots, coefficientknotsmonths, exponentknots : dict
        "lb-ub" range -> value maps consumed by the lookup helpers.

    Returns
    -------
    np.ndarray of the generated targets (one entry per date).

    Raises
    ------
    ValueError
        If a date matches no day-coefficient knot (lookup returned 0) or no
        exponent knot (lookup left the 1.0 default) -- both indicate the
        knot maps do not cover the date range.
    """
    true_values = []
    # NOTE: the original used enumerate() here but never read the index.
    for date in timerange:
        coefficientval = get_coefficient_val(date, coefficientknots)
        coefficientvalmonth = get_coefficient_val_month(date, coefficientknotsmonths)
        exponentval = get_exponent_val(date, exponentknots)
        if coefficientval == 0 or exponentval == 1.0:
            raise ValueError("Invalid coefficient or exponent value")
        # One fresh random draw per date, same generator as the input data.
        val = generate_input_data([date])[0]
        entry = coefficientvalmonth * coefficientval * val ** exponentval
        true_values.append(entry)
    return np.array(true_values)
def main():
    """End-to-end demo: simulate knot-based data, fit a PyMC model on a
    training window, then sample posterior-predictive quantities on the
    held-out test window."""
    timerange = pd.date_range(start="2023-01-01", end="2023-07-24", freq='D')
    coefficientknots = {'0-15': 2.0, '15-32': 3.0}
    exponentknots = {'0-15': 0.2, '15-32': 0.3}
    coefficientknotsmonths = {'0-6': 4.0, '6-13': 7.0}
    data = generate_input_data(timerange)
    true_values = compute_true_values(timerange, coefficientknots, coefficientknotsmonths, exponentknots)
    # Gather stuff into a dataframe for organization
    df = pd.DataFrame(np.c_[data, true_values], index=timerange, columns=['data', 'targets'])
    df['day'] = df.index.day
    df['month'] = df.index.month
    # Create the map by assigning a bin index to each datapoint based on which day it falls into
    knot_cutpoints = [0, 15, 32]
    df['knot_idx'] = pd.cut(df['day'], knot_cutpoints, labels=False)
    knot_cutpoints_month = [0, 6, 13]
    df['knot_idx_month'] = pd.cut(df['month'], knot_cutpoints_month, labels=False)
    # Train-test split includes the knot mapping
    df_train = df.iloc[:140].copy()
    df_test = df.iloc[140:].copy()
    # These names will be used to make the model output look nicer
    knot_names = [f'{a}-{b}' for a, b in zip(knot_cutpoints[:-1], knot_cutpoints[1:])]
    # BUG FIX: the upper bounds must come from the *month* cutpoints; the
    # original zipped against knot_cutpoints[1:] (the day cutpoints), which
    # produced mislabeled month coords '0-15', '6-32'.
    knot_names_month = [f'{a}-{b}' for a, b in zip(knot_cutpoints_month[:-1], knot_cutpoints_month[1:])]
    # Coords let us refer to dimensions by their name, rather than having to remember their shapes
    coords = {'knot': knot_names,
              'knot_names_month': knot_names_month}
    with pm.Model(coords=coords) as model:
        # MutableData containers so the inputs can be swapped for test data later
        data_pt = pm.MutableData('input', df_train.data)
        targets_pt = pm.MutableData('convresponse', df_train.targets)
        knot_idx_pt = pm.MutableData('knot_idx', df_train.knot_idx)
        knot_idx_pt_month = pm.MutableData('knot_idx_month', df_train.knot_idx_month)
        # One coefficient/exponent per day-knot, one coefficient per month-knot
        knot_coef = pm.Normal('knot_coef', 0, 1, dims=['knot'])
        knot_exponent = pm.Gamma('knot_exponent', 3, 3, dims=['knot'])
        exp = pm.Deterministic('knot_exponent_deterministic', var=knot_exponent[knot_idx_pt])
        knot_coef_month = pm.Normal('knot_coef_month', 0, 1, dims=['knot_names_month'])
        coefficients = pm.Deterministic(name="coefficients",
                                        var=knot_coef[knot_idx_pt] * knot_coef_month[knot_idx_pt_month])
        modelledresponse = tt.power(coefficients * data_pt,
                                    exp)
        sigma = pm.HalfCauchy(name="sigma", beta=10)
        outcome = pm.Normal(name="outcome",
                            mu=modelledresponse,
                            sigma=sigma,
                            observed=targets_pt,
                            shape=data_pt.shape[0],  # Important!
                            initval={'knot_exponent': [1, 1]})  # I found sometimes the exponents blow up
        import pymc.sampling_jax as jax
        trace = jax.sample_blackjax_nuts(1000,
                                         tune=1000,
                                         target_accept=0.97,
                                         cores=2,
                                         random_seed=0,
                                         idata_kwargs={"log_likelihood": True},
                                         return_inferencedata=True)
    with model:
        # idata = pm.sample_posterior_predictive(trace, extend_inferencedata=False)
        # Swap in the test window before posterior-predictive sampling
        pm.set_data({'input': df_test.data, 'knot_idx': df_test.knot_idx, 'knot_idx_month': df_test.knot_idx_month})
        idata = pm.sample_posterior_predictive(trace, random_seed=0, var_names=["outcome",
                                                                               "knot_coef", "knot_exponent",
                                                                               "knot_exponent_deterministic",
                                                                               "coefficients"],
                                               extend_inferencedata=False, predictions=True)
        exponents = idata.predictions["knot_exponent_deterministic"]
        coefficients = idata.predictions["coefficients"]
        idata_new = pm.sample_posterior_predictive(trace, random_seed=0, extend_inferencedata=False,
                                                   var_names=["knot_exponent_deterministic", "coefficients"],
                                                   predictions=True)
        print("-")


if __name__ == '__main__':
    main()
To my understanding, the variables exponents and coefficients here below will correspond to PRIOR samples and not the posterior samples used to compute the posterior predictive distribution:
idata = pm.sample_posterior_predictive(trace, random_seed=0, var_names=["outcome","knot_coef", "knot_exponent","knot_exponent_deterministic","coefficients"],
extend_inferencedata=False, predictions=True)
exponents = idata.predictions["knot_exponent_deterministic"]
coefficients = idata.predictions["coefficients"]
and the same applies here:
idata_new = pm.sample_posterior_predictive(trace, random_seed=0, extend_inferencedata=False,
var_names=["knot_exponent_deterministic", "coefficients"],
predictions=True)
But I might have misunderstood your example: if var_names includes the outcome, then the other var_names yield the posterior samples from which the posterior predictive is drawn.
If var_names does not include the outcome, then var_names will correspond to priors?
Thus including “outcome” or “y” or whatever your target is in var_names will automatically generate the posterior samples for the variables the target DEPENDS on. If it is not included it will generate the priors?
If that is the case we are set, and to my understanding exponents and coefficients above then correspond to what I want, namely posteriors mapped over the test data.
If that is not the case — if exponents = idata.predictions["knot_exponent_deterministic"] above corresponds to PRIORS — I am stuck.