I am a little confused about index variables, specifically when producing out-of-sample (OOS) predictions. For example, it seems that PyMC will generate OOS predictions even when an index value in the new data was not present in the in-sample fit. Below is a runnable example that should clarify what I mean.
from typing import Tuple
import pymc as pm
import polars as pl
import numpy as np
def factorize(series: pl.Series) -> Tuple[np.ndarray, np.ndarray]:
    """
    Factorize a polars series into integer codes and its sorted unique values.

    Nulls are replaced with the literal string "<NA>" before encoding, so they
    form a category of their own.

    Parameters
    ----------
    series
        The column to encode.

    Returns
    -------
    codes
        One integer per row of ``series``, in the original row order, giving
        the position of that row's value in ``uniques``.
    uniques
        The distinct values of ``series`` in sorted order.

    Notes
    -----
    The codes are only meaningful relative to the ``uniques`` returned by the
    same call: factorizing two different series independently yields two
    unrelated encodings.
    """
    name = series.name
    index_col = f"{name}_index"
    row_col = f"{name}_row"

    df = series.to_frame().fill_null("<NA>")
    df_ranked = df.unique().sort(name).with_row_index(name=index_col)
    uniques = df_ranked[name].to_numpy()

    # A join does not promise to keep the left frame's row order, so tag each
    # row with its position first and restore that order after the join.
    codes = (
        df.with_row_index(name=row_col)
        .join(df_ranked, how="left", on=name)
        .sort(row_col)[index_col]
        .to_numpy()
    )
    return codes, uniques
# Simulate simple data: three groups of 20 observations, each group with its
# own success probability
X = np.array(["a"] * 20 + ["b"] * 20 + ["c"] * 20)
y_prob = {"a": 0.2, "b": 0.8, "c": 0.5}
y = [np.random.binomial(n=1, p=y_prob[p]) for p in X]
# Split into train + test making sure group c is only in test
X_train = pl.DataFrame(X[:40], schema=["group"])
y_train = y[:40]
X_test = pl.DataFrame(X[40:], schema=["group"])
# The test outcomes come from y (the original sliced X here, which yields the
# group labels instead of the simulated responses)
y_test = y[40:]
# Integer-encode the training groups and index the training rows
train_group_idx, train_group = factorize(X_train["group"])
train_n_obs = np.arange(X_train.height)

# Coordinates: one label per training row and one per training group
coords = dict(n_obs=train_n_obs, group=train_group)

# One intercept per group on the logit scale, Bernoulli likelihood.
# Sampling runs inside the same model context that defines the variables.
with pm.Model(coords=coords) as model:
    group_data = pm.Data("group_data", train_group_idx, dims="n_obs")
    beta_group = pm.Normal("beta_group", 0, 1, dims="group")
    logit_p = pm.Deterministic("logit_p", beta_group[group_data], dims="n_obs")
    pm.Bernoulli("likelihood", logit_p=logit_p, observed=y_train, dims="n_obs")
    idata = pm.sample()
# Factorize test data group variable.
# NOTE: this encoding is built from the test data alone, so the codes refer to
# positions among the test categories, not to the training "group" coordinate.
test_group_idx, test_group = factorize(X_test["group"])
test_n_obs = np.arange(X_test.height)

# HERE IS THE ISSUE. Group "c" was not in the training data
# NOTE(review): the test split holds a single category, so every test code is
# presumably 0 and beta_group[group_data] would pick the first training
# group's coefficient -- confirm against the PyMC docs.
oos_data = {"group_data": test_group_idx}
oos_coords = {"n_obs": test_n_obs, "group": test_group}

# Compute predictions
with model:
    pm.set_data(new_data=oos_data, coords=oos_coords)
    predictions = pm.sample_posterior_predictive(idata, predictions=True)
Shouldn’t this example raise an error when computing OOS predictions, given that group "c" never appeared in the training data? Am I missing something here? I tried looking for documentation on how to handle index variables in OOS predictions, but I couldn’t find anything.