This question is related to this one
as I am trying to get my head around how to include everything I want in out-of-sample predictions. This current question uses the same example:
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
from scipy.special import expit as inverse_logit
RANDOM_SEED = 8927
rng = np.random.default_rng(RANDOM_SEED)
az.style.use("arviz-darkgrid")
# Number of data points
n = 250
# Create features
x1 = rng.normal(loc=0.0, scale=2.0, size=n)
x2 = rng.normal(loc=0.0, scale=2.0, size=n)
# Define target variable
intercept = -0.5
beta_x1 = 1
beta_x2 = -1
beta_interaction = 2
z = intercept + beta_x1 * x1 + beta_x2 * x2 + beta_interaction * x1 * x2
p = inverse_logit(z)
# note binomial with n=1 is equal to a Bernoulli
y = rng.binomial(n=1, p=p, size=n)
df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))
df.head()
labels = ["Intercept", "x1", "x2", "x1:x2"]
df["Intercept"] = np.ones(len(df))
df["x1:x2"] = df["x1"] * df["x2"]
# reorder columns to be in the same order as labels
df = df[labels]
x = df.to_numpy()
indices = rng.permutation(x.shape[0])
train_prop = 0.7
train_size = int(train_prop * x.shape[0])
training_idx, test_idx = indices[:train_size], indices[train_size:]
x_train, x_test = x[training_idx, :], x[test_idx, :]
y_train, y_test = y[training_idx], y[test_idx]
coords = {"coeffs": labels}
with pm.Model(coords=coords) as model:
# data containers
X = pm.MutableData("X", x_train)
y = pm.MutableData("y", y_train)
# priors
b = pm.Normal("b", mu=0, sigma=1, dims="coeffs")
# linear model
mu = pm.math.dot(X, b)
# link function
p = pm.Deterministic("p", pm.math.invlogit(mu))
# likelihood
pm.Bernoulli("obs", p=p, observed=y)
with model:
idata = pm.sample()
with model:
pm.set_data({"X": x_test, "y": y_test})
predictions = pm.sample_posterior_predictive(trace=idata, predictions=True).predictions
predictions.to_netcdf(__file__.replace('.py', '.nc'))
predictions.to_dataframe().to_csv(__file__.replace('.py', '.csv'))
What I would like to have as a final result is a table with different labels (i.e. different levels of the elements of coords
) paired to the sample outcome. I’m not sure the example from above (which comes from the PyMC documentation) is aiming for the same notion of coordinates as I want. The coords
I would most often use would be different subgroups of the data.