I’ve been looking at building a categorical BART model which will allow for out-of-sample predictions on a test set larger than the training dataset.

I’ve written up a toy example to do this that draws on the posts of BART Categorical Hawks, Bayesian Non-parametric Causal Inference and the blog posts here and here that discuss how to update the coords and the need to generate a dummy output variable when doing predictions.

```
import numpy as np
import pymc as pm
import pymc_bart as pmb
import arviz as az
from sklearn.model_selection import train_test_split
np.random.seed(42)

# Define classes
classes = ['A', 'B', 'C', 'D']
n_classes = len(classes)

# Generate data: each feature is a noisy deterministic function of the class
# label, so the classes are learnable from X.
n_total = 300
n_features = 4  # We'll create 4 features
y = np.random.randint(0, n_classes, n_total)
X = np.zeros((n_total, n_features))
for i in range(n_total):
    class_value = y[i]
    X[i, 0] = class_value / (n_classes - 1) + np.random.normal(0, 0.1)
    X[i, 1] = (class_value / (n_classes - 1)) ** 2 + np.random.normal(0, 0.1)
    X[i, 2] = np.sin(class_value * np.pi / (n_classes - 1)) + np.random.normal(0, 0.1)
    # Bug fix: the conditional must be parenthesized. The original
    # `1 if cond else 0 + noise` parses as `1 if cond else (0 + noise)`,
    # so the noise was only added on the else branch and the indicator for
    # the upper classes was exactly 1.0.
    X[i, 3] = (1 if class_value > (n_classes - 1) / 2 else 0) + np.random.normal(0, 0.05)
# Split into train and test sets — the test fraction is deliberately large so
# the test set ends up bigger than the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.66, random_state=42
)

print(f"Number of features: {n_features}")
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Number of classes: {n_classes}")
print("\nSample data (first 5 rows):")
for row in range(5):
    print(f"y: {y[row]}, X: {X[row]}")
# Define coordinates: "n_obs" is the dimension that gets resized at
# prediction time via pm.set_data's `coords` argument.
coords = {"n_obs": np.arange(len(X_train)), "classes": classes}

# Create the PyMC model
with pm.Model(coords=coords) as model_classes:
    # pm.Data containers so the predictors and the observed response can be
    # swapped out for out-of-sample prediction later.
    Xdt = pm.Data("Xdt", X_train)
    ydt = pm.Data("ydt", y_train)
    # BART prior over the per-class latent scores. NOTE(review): the raw
    # y_train array is passed here (not the symbolic `ydt`), matching the
    # pymc-bart examples — confirm this is required by the BART API.
    μ = pmb.BART("μ", Xdt, y_train, m=50, dims=["classes", "n_obs"])
    # Softmax over axis 0 (the class axis) turns scores into probabilities.
    θ = pm.Deterministic("θ", pm.math.softmax(μ, axis=0))
    # Bug fix: renamed the Python variable from `y` to `y_obs` so it no longer
    # shadows the NumPy label array `y` defined earlier in the script. The RV
    # name "y" is unchanged, so posterior-predictive lookups still work.
    y_obs = pm.Categorical("y", p=θ.T, observed=ydt)
    # Sample from the model
    idata = pm.sample(chains=4, draws=1000, random_seed=123)
    pp_train = pm.sample_posterior_predictive(idata)
# Out-of-sample prediction: swap in the test predictors and a dummy response.
# The observed container only needs the right length — its values are not
# used when drawing posterior-predictive samples.
dummy_y = np.zeros(len(y_test), dtype=int)
with model_classes:
    pm.set_data(
        new_data={"Xdt": X_test, "ydt": dummy_y},
        coords={"n_obs": np.arange(len(X_test)), "classes": classes},
    )
    pp_test = pm.sample_posterior_predictive(idata, return_inferencedata=True)

print(f"Number of features: {n_features}")
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Number of classes: {n_classes}")
print(f"Posterior predictive samples (train): {pp_train.posterior_predictive['y'].shape}")
print(f"Posterior predictive samples (test): {pp_test.posterior_predictive['y'].shape}")
```