Hi folks, below is a model where there is missing data in a MVN-modelled predictor matrix, and while I love how pymc automatically does missing value imputation, I notice that to achieve this a metropolis step is added to the sampling scheme, which prevents me from using things like nutpie and gpu acceleration. Any suggestions?
import numpy as np
import pymc as pm
import pytensor.tensor as pt
def create_array_with_random_nans(shape, nan_fraction=0.1, seed=None):
"""
Create a 1D or 2D NumPy array with randomly located NaN values.
Parameters
----------
shape : int or tuple of int
Shape of the array. Can be an int (for 1D) or tuple (for 2D).
nan_fraction : float, optional
Fraction of total elements to set as NaN (default is 0.1).
seed : int, optional
Random seed for reproducibility.
Returns
-------
arr : ndarray
NumPy array with NaNs randomly placed.
"""
if seed is not None:
np.random.seed(seed)
# Normalize shape to a tuple
if isinstance(shape, int):
shape = (shape,)
n_total = np.prod(shape)
n_nan = int(n_total * nan_fraction)
# Create the array with random values
arr = np.random.rand(n_total)
# Select indices to set as NaN
nan_indices = np.random.choice(n_total, n_nan, replace=False)
arr[nan_indices] = np.nan
# Reshape back to original shape
# return np.ma.masked_invalid(arr.reshape(shape))
return arr.reshape(shape)
n_predictors = 3
n_cases = 100
x_np = create_array_with_random_nans((n_cases,n_predictors), nan_fraction=0.1)
y_np = create_array_with_random_nans(n_cases, nan_fraction=0.1)
with pm.Model() as model:
mu = pm.Normal("mu", mu=0, sigma=1, shape=n_predictors)
chol, corr, sds = pm.LKJCholeskyCov(name = "chol", n = n_predictors, eta = 1, sd_dist = pm.Weibull.dist(2,1))
x = pm.MvNormal("x", mu = mu, chol = chol, observed = x_np)
x_Q, x_R = pt.nlinalg.qr(x)
beta = pm.Normal("beta", mu=0, sigma=1, shape=n_predictors)
sigma = pm.HalfNormal("sigma", sigma=1)
y = pm.Normal(
"y"
, mu = pm.math.dot(x_Q, beta) # NOTE: using x_Q causes Metropolis; using x does not
, sigma = sigma
, observed = y_np
)
model.debug()
with model:
trace = pm.sample()