OK, I reproduced it in a fairly minimal example:
```python
import arviz as az
import numpy as np
import pandas as pd
import pymc as pm

RANDOM_SEED = 8927
rng = np.random.default_rng(RANDOM_SEED)

# Based on https://www.pymc.io/projects/examples/en/latest/generalized_linear_models/GLM-robust.html

# Toy data
# --------
size = 100
true_intercept = 1
true_slope = 2

x = np.linspace(0, 1, size)
# y = a + b*x
true_regression_line = true_intercept + true_slope * x
# add noise
y = true_regression_line + rng.normal(scale=0.5, size=size)

# Add outliers
x_out = np.append(x, [0.1, 0.15, 0.2])
y_out = np.append(y, [8, 6, 9])

data = pd.DataFrame(dict(x=x_out, y=y_out))

# Use a subset for training
x_train = x_out[0:-1:10]
y_train = y_out[0:-1:10]

# PyMC model
# ----------
def build_model(x, y):
    with pm.Model() as model:
        xdata = pm.Data("x", x, dims="obs_id", mutable=False)

        # define priors
        intercept = pm.Normal("intercept", mu=0, sigma=1)
        slope = pm.Normal("slope", mu=0, sigma=1)
        sigma = pm.HalfCauchy("sigma", beta=10)

        mu = intercept + slope * xdata

        # define likelihood
        likelihood = pm.Normal("y", mu=mu, sigma=sigma, observed=y, dims="obs_id")
    return model

refit = True
if refit:
    with build_model(x_train, y_train):
        # inference
        trace = pm.sample(tune=2000)
    print("trace:")
    print(trace)
    print("trace.posterior:")
    print(trace.posterior)
    # Save the training trace
    trace.to_netcdf("training_trace.nc")

# Reload the training trace for forward predictions
training_trace = az.InferenceData.from_netcdf("training_trace.nc")
print("training_trace:")
print(training_trace)

nan_y = y_out * np.nan
zero_y = y_out * 0.0

with build_model(x_out, nan_y):
    pp_trace = pm.sample_posterior_predictive(training_trace)
print("pp_trace:")
print(pp_trace)
```
Building the model with `nan_y`, the output is:
```
training_trace:
Inference data with groups:
	> posterior
	> sample_stats
	> observed_data
	> constant_data

pp_trace:
Inference data with groups:
	>
```
But building it with `zero_y`, the output is:
```
training_trace:
Inference data with groups:
	> posterior
	> sample_stats
	> observed_data
	> constant_data

Sampling: [y]
 |█████████████████████████████████████████████████████| 100.00% [4000/4000 00:00<00:00]
pp_trace:
Inference data with groups:
	> posterior_predictive
	> observed_data
	> constant_data
```
In other words, it silently does nothing if the "y" values are all NaN. I find that confusing, since they are just placeholders: in my understanding, their values shouldn't have any impact on how posterior predictions work. I guess the actual values indeed don't matter, but NaN seems to be treated differently, presumably because it triggers PyMC's automatic imputation logic?
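Here is a minimal sketch of how I would check that hypothesis (this is my assumption about the mechanism, not something I've confirmed in the docs): when `observed` contains NaN, PyMC's automatic imputation seems to split the variable, so there is no longer a plain observed "y" for `sample_posterior_predictive` to resample by default.

```python
import numpy as np
import pymc as pm

# Minimal check of the imputation hypothesis: NaNs in `observed`
# should trigger PyMC's automatic imputation machinery.
with pm.Model() as m:
    mu = pm.Normal("mu")
    pm.Normal("y", mu=mu, sigma=1.0, observed=np.array([1.0, np.nan]))

# On my version this prints variable names like
# ['mu', 'y_observed', 'y_unobserved', 'y'] (exact names may vary
# between PyMC versions), i.e. "y" has been split and is no longer
# a plain observed variable.
print(list(m.named_vars))
print(m.observed_RVs)
```

If that is what's going on, the workaround on my side is simple: pass any finite placeholder when rebuilding the model for prediction, e.g. `build_model(x_out, np.zeros_like(x_out))`, since `sample_posterior_predictive` only needs the shape of the observed variable, not its values.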