Censored linear regression, relatively good ppc, horrible loo pit

@OriolAbril I think the issue with loo pit failing for censored data is indeed a problem with the borders. For instance for y=upper and \hat{y} mostly = upper, we have a situation where loo pit is very high (close to 1) because \hat{y} \leq y despite the fact that most of the \hat{y} s could still potentially be higher than y. I think the same loss of precision also happens at the lower border since for censored y, we really don’t know what its true value is (although to some extent \hat{y} \leq y is less biased in this case).

I think the most sensible option for now is to create a new Inference object where censored observations and their respective samples are removed. I tested removing both upper and lower and each individually. It only seems to get fully fixed when both are removed however the biggest bias seems to come from the upper boundary as would be expected.

nothing removed:
nothing_removed

both removed:
all_removed

upper censored removed:
upper_removed

lower censored removed:
lower_removed

Note that I am not doing as above where I replace lower and upper during fitting. I am using the correct lower and upper for fitting but only removing the censored values for loo pit. I don’t know how to do it in the case when the observation dimension is multidimensional so I had to do a version where it is flattened:


import pymc as pm
import numpy as np
import matplotlib.pyplot as plt
import pytensor.tensor as pt
import arviz as az

from arviz.stats import loo_pit
from arviz import InferenceData
from xarray import DataArray, Dataset

def get_model(x, noise_mu, noise_sd, a_mu, a_sd, b_mu, b_sd, nlines,
              lower, upper, observed=None, centre_data=False, rng=None):
  """Build a censored linear-regression model, or draw fake data from it.

  Parameters
  ----------
  x : 1-D array of shared x positions (one per point on each line).
  noise_mu, noise_sd : (mu, sigma) of the InverseGamma noise prior.
  a_mu, a_sd : prior of the shared slope.
  b_mu, b_sd : prior of the per-line intercepts.
  nlines : number of lines; flat data has shape (nlines * len(x),).
  lower, upper : flattened per-observation censoring bounds.
  observed : flattened observations, or None to sample fake data instead.
  centre_data : if True, subtract each line's observed mean from that
      line's observed/lower/upper values, and subtract x.mean() from x,
      before building the model.
  rng : numpy Generator; defaults to default_rng(0).

  Returns
  -------
  When ``observed`` is None, a flat numpy array holding one randomly
  chosen prior-predictive draw of "obs"; otherwise the (unsampled)
  PyMC model.
  """
  if rng is None:
    rng = np.random.default_rng(0)

  if centre_data and observed is not None:
    # Work on copies so the caller's arrays are not mutated in place.
    observed = observed.copy()
    lower = lower.copy()
    upper = upper.copy()
    x = x.copy()

    # Points per line; observed is flat with nlines * s1 entries.
    # BUG FIX: was `s0 = observed.shape` (a tuple), which made the
    # division below raise TypeError.
    s0 = observed.size
    s1 = s0 / nlines

    assert s1 == int(s1)
    s1 = int(s1)

    observed = np.reshape(observed, (nlines, s1))
    lower = np.reshape(lower, (nlines, s1))
    upper = np.reshape(upper, (nlines, s1))

    # Centre each line (and its censoring bounds) on that line's mean.
    lower -= observed.mean(axis=-1)[:, None]
    upper -= observed.mean(axis=-1)[:, None]
    observed -= observed.mean(axis=-1)[:, None]
    x -= x.mean()

    lower = lower.flatten()
    upper = upper.flatten()  # BUG FIX: was the typo `upper.flaten()`
    observed = observed.flatten()

  with pm.Model() as model:

    x = pm.Data("x", x)

    noise = pm.InverseGamma("noise", mu=noise_mu, sigma=noise_sd)
    a = pm.Normal("a", a_mu, a_sd)
    b = pm.Normal("b", b_mu, b_sd, size=nlines)

    # One line per intercept: shape (nlines, len(x)), flattened so it
    # aligns with the flat observed/lower/upper arrays.
    line_mean = (a*x)[None,:] + b[:,None]
    line_mean = pt.flatten(line_mean)

    dist = pm.Normal.dist(line_mean, noise)
    pm.Censored("obs", dist, lower=lower, upper=upper,
                observed=observed)

  if observed is None:

    # No data supplied: return a single random prior-predictive draw
    # to use as fake observations.
    nsamples = 1000
    with model:
      idata=pm.sample_prior_predictive(nsamples, random_seed=rng)

    obs = idata["prior"]["obs"].to_numpy()[0, rng.integers(0, nsamples),:]

    return obs

  return model


# Reproducible setup: one shared RNG, 100 lines, 10 x-positions per line.
seed = 0
rng = np.random.default_rng(seed)
nlines = 100

# Per-line censoring bounds (lower drawn first, then upper, so the RNG
# stream matches the original script).
lower = rng.integers(-1, 3, size=nlines).astype(float)
upper = rng.integers(9, 12, size=nlines).astype(float)

# Repeat each line's bound once per x-position so the bounds align with
# the flattened (nlines * 10,) observation vector.
upper = np.repeat(upper, 10)
lower = np.repeat(lower, 10)
x = np.linspace(0, 10, num=10)

# Draw one prior-predictive realisation to use as the "observed" data
# (observed=None makes get_model return fake data instead of the model).
fake_data = get_model(
  x, noise_mu=1, noise_sd=0.1, a_mu=-1, a_sd=0.1,
  b_mu=10, b_sd=2, nlines=nlines, lower=lower, upper=upper)


# Fit the model to the fake data, using deliberately wider priors.
model = get_model(
  x, noise_mu=1, noise_sd=0.5, a_mu=-1, a_sd=0.5,
  b_mu=10, b_sd=5, nlines=nlines, lower=lower, upper=upper,
  observed=fake_data, centre_data=False)

with model:
  # Posterior, posterior predictive, and pointwise log-likelihood are all
  # needed for the loo-pit plot further down.
  idata = pm.sample(draws=1000, tune=1000, chains=4, cores=4,
                    random_seed=rng)
  pm.sample_posterior_predictive(idata, extend_inferencedata=True,
                                 random_seed=rng)
  pm.compute_log_likelihood(idata, extend_inferencedata=True)


# Copy out the pieces we need so the original InferenceData is untouched.
pp_samples = idata.posterior_predictive.obs.copy()
obs_samples = idata.observed_data.obs.copy()
log_likelihood = idata.log_likelihood["obs"].copy()

# Flat indices of the NON-censored observations (strictly inside the
# bounds); censored points are what breaks loo-pit at the borders.
# (np.flatnonzero is the idiomatic form of argwhere(...).flatten();
# the unused `pos = idata.posterior` assignment was dropped.)
I = np.flatnonzero((fake_data != lower) & (fake_data != upper))

# NOTE(review): the posterior-predictive observation dimension is named
# "obs_dim_2" while observed_data/log_likelihood use "obs_dim_0" — this
# matches what PyMC emitted for this model; re-check if the model changes.
pp_subset = Dataset({"obs":DataArray(pp_samples.isel({"obs_dim_2":I}))})
obs_subset = Dataset({"obs":DataArray(obs_samples.isel({"obs_dim_0":I}))})
log_subset = Dataset({"obs":DataArray(log_likelihood.isel({"obs_dim_0":I}))})

# Assemble a fresh InferenceData holding only the non-censored points
# (posterior itself is kept whole) and plot its loo-pit.
idata_subset = InferenceData()
idata_subset.add_groups({"posterior_predictive":pp_subset})
idata_subset.add_groups({"observed_data":obs_subset})
idata_subset.add_groups({"log_likelihood":log_subset})
idata_subset.add_groups({"posterior":idata.posterior.copy()})
ax = az.plot_loo_pit(idata_subset, y="obs")

For eyeballing whether the amount of censoring is reasonable, I guess the best option is to look at the ends of the ppc to see whether there is a similar accumulation of samples at the borders as for the observables (harder to interpret when there are multiple lower or upper boundaries though…)

2 Likes