Using dims and MutableData to index non-existing/zero-value variables in splines

I am working on a spline regression task and have been stuck on a particular issue for some time without finding a satisfactory solution. Recently I came across an example that I have been experimenting with to address the problem, but I still have not been able to come up with a fix. Please take a look at the code snippet below, which I extracted from the following thread: https://discourse.pymc.io/t/posterior-predictive-distributions-of-parameter-values-differs-a-lot-from-posterior/13304

import numpy as np
import pymc as pm
import matplotlib.pyplot as plt
import pandas as pd
import pytensor.tensor as tt

def generate_input_data(timerange):
    import random
    return np.array([random.uniform(0.5, 0.7) for x in range(len(timerange))]).reshape(-1, 1)

def get_coefficient_val_month(date, coefficientknots):
    coefficientval = 0
    for knot, val in coefficientknots.items():
        lb, ub = map(int, knot.split("-"))
        if lb <= date.month <= ub:
            coefficientval = val
            break
    return coefficientval

def get_coefficient_val(date, coefficientknots):
    coefficientval = 0
    for knot, val in coefficientknots.items():
        lb, ub = map(int, knot.split("-"))
        if lb <= date.day <= ub:
            coefficientval = val
            break
    return coefficientval

def get_exponent_val(date, exponentknots):
    exponentval = 1.0
    for exponentknot, val in exponentknots.items():
        lb, ub = map(int, exponentknot.split("-"))
        if lb <= date.day <= ub:
            exponentval = val
            break
    return exponentval

def compute_true_values(timerange, coefficientknots, coefficientknotsmonths, exponentknots):
    true_values = []
    for idx, date in enumerate(timerange):
        coefficientval = get_coefficient_val(date, coefficientknots)
        coefficientvalmonth = get_coefficient_val_month(date, coefficientknotsmonths)
        exponentval = get_exponent_val(date, exponentknots)
        if coefficientval == 0 or exponentval == 1.0:
            raise ValueError("Invalid coefficient or exponent value")
        val = generate_input_data([date])[0]
        entry = coefficientvalmonth * coefficientval * val ** exponentval
        true_values.append(entry)
    return np.array(true_values)


def main():

    timerange = pd.date_range(start="2023-01-01", end="2023-07-24", freq='D')
    coefficientknots = {'0-15': 2.0, '15-32': 3.0}
    exponentknots = {'0-15': 0.2, '15-32': 0.3}
    coefficientknotsmonths = {'0-13': 4.0} #, '10-13': 7.0}
    data = generate_input_data(timerange)
    true_values = compute_true_values(timerange, coefficientknots, coefficientknotsmonths, exponentknots)

    df = pd.DataFrame(np.c_[data, true_values], index=timerange, columns=['data', 'targets'])
    df['day'] = df.index.day

    df['month'] = df.index.month

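    # Map each day (and each month) to the index of the knot interval it falls into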
    knot_cutpoints = [0, 15, 32]
    df['knot_idx'] = pd.cut(df['day'], knot_cutpoints, labels=False)

    knot_cutpoints_month = [0, 6, 13]
    df['knot_idx_month'] = pd.cut(df['month'], knot_cutpoints_month, labels=False)

    df_train = df.iloc[:140].copy()
    df_test = df.iloc[140:].copy()

    knot_names = [f'{a}-{b}' for a, b in zip(knot_cutpoints[:-1], knot_cutpoints[1:])]

    knot_names_month = [f'{a}-{b}' for a, b in zip(knot_cutpoints_month[:-1], knot_cutpoints_month[1:])]

    coords = {'knot': knot_names,
              'knot_names_month': knot_names_month}

    with pm.Model(coords=coords) as model:
        data_pt = pm.MutableData('input', df_train.data)
        targets_pt = pm.MutableData('convresponse', df_train.targets)

        knot_idx_pt = pm.MutableData('knot_idx', df_train.knot_idx)

        knot_idx_pt_month = pm.MutableData('knot_idx_month', df_train.knot_idx_month)

        knot_coef = pm.Normal('knot_coef', 0, 1, dims=['knot'])
        knot_exponent = pm.Gamma('knot_exponent', 3, 3, dims=['knot'])

        exp = pm.Deterministic('knot_exponent_deterministic', var=knot_exponent[knot_idx_pt])

        knot_coef_month = pm.Normal('knot_coef_month', 0, 1, dims=['knot_names_month'])

        coefficients = pm.Deterministic(name="coefficients_deterministic",
                                        var=knot_coef[knot_idx_pt] * knot_coef_month[knot_idx_pt_month])

        modelledresponse = tt.power(coefficients * data_pt,
                                    exp)

        sigma = pm.HalfCauchy(name="sigma", beta=10)
        outcome = pm.Normal(name="outcome",
                            mu=modelledresponse,
                            sigma=sigma,
                            observed=targets_pt,
                            shape=data_pt.shape[0])  # Important! lets the likelihood follow the size of the MutableData

        import pymc.sampling_jax as jax
        trace = jax.sample_blackjax_nuts(1000,
                                         tune=1000,
                                         target_accept=0.97,
                                         chains=2,
                                         random_seed=0,
                                         # I found sometimes the exponents blow up without these initial values
                                         initvals={'knot_exponent': [1, 1]},
                                         idata_kwargs={"log_likelihood": True})

    with model:
        # idata = pm.sample_posterior_predictive(trace, extend_inferencedata=False)
        pm.set_data({'input': df_test.data, 'knot_idx': df_test.knot_idx, 'knot_idx_month': df_test.knot_idx_month})
        idata_new = pm.sample_posterior_predictive(trace, random_seed=0, extend_inferencedata=False,
                                                   var_names=["knot_exponent_deterministic",
                                                              "coefficients_deterministic"],
                                                   predictions=True)
        coeff_deterministic = idata_new.predictions.coefficients_deterministic
        knot_exponent_deterministic = idata_new.predictions.knot_exponent_deterministic
        print("-")


if __name__ == '__main__':
    main()

One problem in this code is the so-called dummy variable trap: both the day cutpoints and the month cutpoints cover all categories/intervals. The desired approach is to drop one interval, since the remaining intervals already cover the whole dataset.

Instead of defining knot_cutpoints_month as [0, 6, 13], I would prefer a single interval, knot_cutpoints_month = [0, 6], in order to avoid the dummy variable trap.

The challenge, however, lies in the indexing. Changing knot_cutpoints_month to [0, 6] results in the error "index must be integers or a boolean mask", because not every row of the dataset gets mapped to an interval index: NaN values appear wherever this dropped "zero" category would be. I would like a clever solution to this, but I cannot come up with anything that does not involve manual handling/multiplying by zeros.
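To make the failure concrete, here is a minimal sketch (with hypothetical month values, not the dataframe above) of where the NaNs come from: with only one interval, pd.cut leaves every month outside (0, 6] unassigned, the column is promoted to float, and it can no longer be used as an integer index.

import pandas as pd

months = pd.Series([1, 5, 7, 12])
print(pd.cut(months, [0, 6], labels=False))
# 0    0.0
# 1    0.0
# 2    NaN
# 3    NaN
# dtype: float64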

The dummy variable trap arises from perfect collinearity, which is itself only a problem because OLS needs to compute a matrix inverse when solving $\beta = (X^T X)^{-1} X^T y$.

This doesn’t really exist in the Bayesian context. We don’t invert any matrices when doing linear regression. Furthermore, the use of priors can “pin down” each variable, solving the underlying identification issue. I will demonstrate this by way of examples. First, some code to generate data:

import numpy as np
import pymc as pm
import arviz as az

# Generate Data

n_groups = 10
n_obs_per_group = 50
intercept = np.random.normal(scale=3)
group_means = np.random.normal(loc=0, scale=3, size=n_groups)
group_idxs = np.arange(n_groups).repeat(n_obs_per_group)

y = intercept + group_means[group_idxs] + np.random.normal(scale=10, size=n_groups * n_obs_per_group)

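# Design matrix: one dummy column per group plus a leading column of ones for the
# intercept; the dummy columns sum to the intercept column, so X is rank-deficient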
X = (group_idxs[:, None] == np.arange(n_groups)[None]) * 1
X = np.c_[np.ones(n_groups * n_obs_per_group), X]
X.shape

If you try to compute the OLS solution, you will get a singular matrix error. This is the dummy variable trap:

try:
    beta = np.linalg.inv(X.T @ X) @ X.T @ y
except Exception as e:
    print(e)

>>> Out: Singular matrix

Here’s a Bayesian model that makes no effort to respect the dummy variable trap:

with pm.Model() as m1:
    mu_shared = pm.Normal('intercept', sigma=5)
    mu_group = pm.Normal('mu_group', sigma=5, shape=(n_groups))
    mu = mu_shared + mu_group[group_idxs]
    
    sigma = pm.Exponential('sigma', 1)
    y_hat = pm.Normal('y_hat', mu=mu, sigma=sigma, observed=y)
    idata = pm.sample(idata_kwargs={'log_likelihood':True})

Here are the estimates:
(plot of the posterior estimates)
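The same estimates can also be inspected numerically, e.g. with az.summary (a sketch using the m1 trace from above):

az.summary(idata, var_names=['intercept', 'mu_group', 'sigma'])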

Nevertheless, you can take the dummies more seriously by using a pm.ZeroSumNormal prior. This will incorporate the information that there are only n-1 degrees of freedom between the n priors. The change is just the one prior:

with pm.Model() as m2:
    mu_shared = pm.Normal('intercept', sigma=5)
    mu_group = pm.ZeroSumNormal('mu_group', sigma=5, shape=(n_groups))
    mu = mu_shared + mu_group[group_idxs]
    
    sigma = pm.Exponential('sigma', 1)
    y_hat = pm.Normal('y_hat', mu=mu, sigma=sigma, observed=y)
    idata_zerosum = pm.sample(idata_kwargs={'log_likelihood':True})

You can see that the intercept is much better identified with the zero-sum normal, but otherwise it doesn’t buy you too much. PSIS-LOO prefers the zero-sum normal, but only slightly:

          rank     elpd_loo      p_loo  elpd_diff        weight         se       dse  warning scale
zero_sum     0 -1869.676587  10.360403    0.00000  1.000000e+00  16.680586  0.000000    False   log
normal       1 -1869.824117  10.518350    0.14753  2.220446e-16  16.715410  0.069137    False   log
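For reference, a comparison table like this one can be produced with az.compare, roughly as follows (a sketch using the two traces from above, both of which store the log likelihood):

az.compare({'normal': idata, 'zero_sum': idata_zerosum}, ic='loo')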

Wow, I didn't have a clue, thanks. Is it possible to do this for arbitrary priors, not just the normal?

If by “this” you mean just fit a model with group-level intercepts, yes. There is never any dummy variable trap in Bayes. Instead, we think about identification, which you can read all about here.

If by “this” you mean use a zero-sum prior, no. There is only a ZeroSumNormal currently implemented.