How to include coords in out-of-sample prediction?

This question is related to this one, as I am trying to get my head around how to include everything I want in out-of-sample predictions; it uses the same example:

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm

from scipy.special import expit as inverse_logit

RANDOM_SEED = 8927
rng = np.random.default_rng(RANDOM_SEED)
az.style.use("arviz-darkgrid")

# Number of data points
n = 250
# Create features
x1 = rng.normal(loc=0.0, scale=2.0, size=n)
x2 = rng.normal(loc=0.0, scale=2.0, size=n)
# Define target variable
intercept = -0.5
beta_x1 = 1
beta_x2 = -1
beta_interaction = 2
z = intercept + beta_x1 * x1 + beta_x2 * x2 + beta_interaction * x1 * x2
p = inverse_logit(z)
# note binomial with n=1 is equal to a Bernoulli
y = rng.binomial(n=1, p=p, size=n)
df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))
df.head()

labels = ["Intercept", "x1", "x2", "x1:x2"]
df["Intercept"] = np.ones(len(df))
df["x1:x2"] = df["x1"] * df["x2"]
# reorder columns to be in the same order as labels
df = df[labels]
x = df.to_numpy()

indices = rng.permutation(x.shape[0])
train_prop = 0.7
train_size = int(train_prop * x.shape[0])
training_idx, test_idx = indices[:train_size], indices[train_size:]
x_train, x_test = x[training_idx, :], x[test_idx, :]
y_train, y_test = y[training_idx], y[test_idx]


coords = {"coeffs": labels}

with pm.Model(coords=coords) as model:
    # data containers
    X = pm.MutableData("X", x_train)
    y = pm.MutableData("y", y_train)
    # priors
    b = pm.Normal("b", mu=0, sigma=1, dims="coeffs")
    # linear model
    mu = pm.math.dot(X, b)
    # link function
    p = pm.Deterministic("p", pm.math.invlogit(mu))
    # likelihood
    pm.Bernoulli("obs", p=p, observed=y)


with model:
    idata = pm.sample()

with model:
    pm.set_data({"X": x_test, "y": y_test})
    predictions = pm.sample_posterior_predictive(trace=idata, predictions=True).predictions

predictions.to_netcdf(__file__.replace('.py', '.nc'))
predictions.to_dataframe().to_csv(__file__.replace('.py', '.csv'))
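
For what it's worth, the predictions saved above only carry a default integer index along the observation axis (the exact dim name depends on the PyMC/ArviZ version), which is easy to check and is exactly what I want to improve on:

print(predictions["obs"].dims)    # e.g. ('chain', 'draw', <default obs dim>)
print(predictions["obs"].coords)  # only default integer coordinate values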

What I would like as a final result is a table in which each label (i.e. each level of the elements of coords) is paired with the sampled outcome. I am not sure the example above (which comes from the PyMC documentation) uses the same notion of coordinates that I have in mind: the coords I would most often use are different subgroups of the data.
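
For concreteness, here is a purely hypothetical mock-up of the kind of table I am after (the group labels are made up; in practice they would be whatever subgroups the coords encode):

# Illustrative mock-up only: each posterior predictive draw of the outcome
# paired with the (made-up) subgroup label of the observation it belongs to.
desired = pd.DataFrame(
    {
        "chain": [0, 0, 1, 1],
        "draw":  [0, 0, 0, 0],
        "group": ["treated", "control", "treated", "control"],
        "obs":   [1, 0, 1, 1],
    }
)
print(desired)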

Are you looking for something like this?


coords = {"coeffs": labels,
          "ydim":range(y_train.size)}

with pm.Model(coords=coords) as model:
    # data containers
    X = pm.MutableData("X", x_train)
    y = pm.MutableData("y", y_train)
    # priors
    b = pm.Normal("b", mu=0, sigma=1, dims="coeffs")
    # linear model
    mu = pm.math.dot(X, b)
    # link function
    p = pm.Deterministic("p", pm.math.invlogit(mu))
    # likelihood
    pm.Bernoulli("obs", p=p, observed=y, dims="ydim")


with model:
    idata = pm.sample()

with model:
    # Resize the "ydim" dimension (and attach new coord values) before swapping
    # in the test data, so the dimension length and the data stay consistent.
    model.set_dim("ydim", y_test.size, coord_values=range(y_test.size))
    pm.set_data({"X": x_test, "y": y_test})
    predictions = pm.sample_posterior_predictive(trace=idata, predictions=True).predictions
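
Because obs now has dims="ydim", the coordinate values set via coord_values travel with the predictions, so the labelled table from the question is essentially one to_dataframe() call away. As a sketch (assuming the blocks above ran):

# predictions["obs"] is an xarray DataArray with dims (chain, draw, ydim),
# so converting it to a long-format DataFrame keeps the ydim labels.
pred_table = predictions["obs"].to_dataframe().reset_index()
print(pred_table.head())  # columns: chain, draw, ydim, obs

If the coords you care about are subgroups rather than row positions, pass those labels (e.g. a list of group names of length y_test.size, or df.index[test_idx]) as coord_values instead of range(y_test.size), and they will show up in the ydim column of that table.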