Categorical BART with Out of Sample Predictions

I’ve been looking at building a categorical BART model that allows out-of-sample predictions on a test set larger than the training dataset.

I’ve written up a toy example that draws on the BART Categorical Hawks and Bayesian Non-parametric Causal Inference posts, as well as two blog posts (linked here and here), which discuss how to update the coords and why a dummy output variable is needed when making predictions.

import numpy as np
import pymc as pm
import pymc_bart as pmb
import arviz as az
from sklearn.model_selection import train_test_split
np.random.seed(42)

# Define classes
classes = ['A', 'B', 'C', 'D']
n_classes = len(classes)

# Generate data
n_total = 300
n_features = 4  # We'll create 4 features

# Draw random class labels, then build each feature as a deterministic
# function of the label plus Gaussian noise, so the features carry signal
# about the class.
y = np.random.randint(0, n_classes, n_total)
X = np.zeros((n_total, n_features))

for i in range(n_total):
    class_value = y[i]
    X[i, 0] = class_value / (n_classes - 1) + np.random.normal(0, 0.1)
    X[i, 1] = (class_value / (n_classes - 1))**2 + np.random.normal(0, 0.1)
    X[i, 2] = np.sin(class_value * np.pi / (n_classes - 1)) + np.random.normal(0, 0.1)
    # BUG FIX: the original `1 if cond else 0 + np.random.normal(...)` parsed as
    # `1 if cond else (0 + noise)` because the ternary binds looser than `+`,
    # so the noise was added only on the else branch (the "1" case was exactly 1.0).
    # Parenthesize the indicator so both branches get noise.
    X[i, 3] = (1 if class_value > (n_classes - 1) / 2 else 0) + np.random.normal(0, 0.05)

# Hold out 66% of the rows as the test set — intentionally larger than the
# training set, to exercise out-of-sample prediction on a bigger dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.66, random_state=42
)

for label, value in [
    ("Number of features", n_features),
    ("Training data shape", X_train.shape),
    ("Test data shape", X_test.shape),
    ("Number of classes", n_classes),
]:
    print(f"{label}: {value}")

print("\nSample data (first 5 rows):")
for row in range(5):
    print(f"y: {y[row]}, X: {X[row]}")


# Define coordinates
# "n_obs" indexes training rows; it is swapped out later for the test-set
# index when predicting out of sample. "classes" labels the categorical dims.
coords = {"n_obs": np.arange(len(X_train)), "classes": classes}

# Create the PyMC model
with pm.Model(coords=coords) as model_classes:
    # Mutable data containers so X/y can be replaced with test data later
    # via pm.set_data without rebuilding the model.
    Xdt = pm.Data("Xdt", X_train)
    ydt = pm.Data("ydt", y_train)

    # One latent BART output per (class, observation).
    # NOTE(review): the raw numpy y_train is passed here (not the ydt
    # container) — presumably because pmb.BART needs a concrete array to
    # initialize the trees; confirm against the pymc-bart docs.
    μ = pmb.BART("μ", Xdt, y_train, m=50, dims=["classes", "n_obs"])
    # Softmax over the class axis (axis=0) turns latent scores into
    # per-observation class probabilities.
    θ = pm.Deterministic("θ", pm.math.softmax(μ, axis=0))
    # NOTE(review): this rebinds the module-level numpy array `y` to the
    # Categorical RV — harmless here since the array isn't used again,
    # but consider a different variable name.
    y = pm.Categorical("y", p=θ.T, observed=ydt)

    # Sample from the model
    idata = pm.sample(chains=4, draws=1000, random_seed=123)
    pp_train = pm.sample_posterior_predictive(idata)

# Out-of-sample prediction
with model_classes:
    # Swap in the test features and a dummy (all-zeros) target of the test
    # set's length — a placeholder is required so the observed variable's
    # shape matches the new data; its values are not used for prediction.
    # The "n_obs" coord is resized to the (larger) test set at the same time.
    pm.set_data(new_data={"Xdt": X_test, "ydt": np.zeros(len(y_test), dtype=int)},
                coords={"n_obs": np.arange(len(X_test)), "classes": classes})
    pp_test = pm.sample_posterior_predictive(idata, return_inferencedata=True)

# Print a summary of the data dimensions and the posterior-predictive shapes.
summary = [
    ("Number of features", n_features),
    ("Training data shape", X_train.shape),
    ("Test data shape", X_test.shape),
    ("Number of classes", n_classes),
    ("Posterior predictive samples (train)", pp_train.posterior_predictive['y'].shape),
    ("Posterior predictive samples (test)", pp_test.posterior_predictive['y'].shape),
]
for label, value in summary:
    print(f"{label}: {value}")

2 Likes

Really cool, thanks for sharing!

1 Like