Dear all,
I created a simple multivariate Normal model (MWE below). The implementation seems to be correct and I can recover parameter quite fast, but just sampling from the posterior predictive distribution takes around 4 times as long as obtaining the posterior estimates themselves.
Is there any specific reason for that? In all other pymc models that I used so far, obtaining posterior predictive results was very fast.
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import pymc as pm, arviz as az
print(f"Running on PyMC v{pm.__version__}") #Running on PyMC v5.17.0
# Parameters
n = 100 # Number of time steps
mean = [3, 2, 1] # Mean of each feature
corr = [[1, 0.8, 0.3], # Covariance matrix for correlation
[0.8, 1, -0.2],
[0.3, -0.2, 1]]
var = [2.0, 2.5, 4.0] # Variances for each feature
# Compute the covariance matrix
std_dev = np.sqrt(var) # Standard deviations from variances
cov = np.outer(std_dev, std_dev) * corr # Element-wise multiplication ~ outer product of the std_dev vector with itself.
# Create a DataFrame with the correlated features
np.random.seed(42) # For reproducibility
df_features = pd.DataFrame(
np.random.multivariate_normal(mean, cov, size=n),
index=pd.date_range(start="2023-01-01", periods=n, freq="D"),
columns=["feature_1", "feature_2", "feature_3"]
)
coords = {
"T_features": df_features.index, "N_features": df_features.columns,
"features_i": df_features.columns, "features_j": df_features.columns
}
with pm.Model(coords=coords) as MvNormal:
feature_array = pm.Data("feature", df_features.values, dims = ("T_features", "N_features") )
# Assign distributions
chol, ρ, stds = pm.LKJCholeskyCov(
"chol", n = len(coords["N_features"]),
eta = 1.0, sd_dist = pm.HalfNormal.dist(10.0)
)
Σ = pm.Deterministic("Σ", chol.dot(chol.T), dims = ("features_i", "features_j") )
μ = pm.Normal("μ", mu=0.0, sigma=10.0, dims = "N_features" )
# Observations
observed = pm.MvNormal("observed", mu = μ, chol = chol, observed = feature_array, dims = ("T_features", "N_features") )
# Sample and Predict
with MvNormal:
trace = pm.sample(1000, tune=1000, cores=4,
target_accept=0.95, random_seed=np.random.seed(123),
idata_kwargs={"dims": {"chol_stds": ["features_i"], "chol_corr": ["features_i", "features_j"]}},
)
pm.sample_posterior_predictive(trace, extend_inferencedata=True)
# pm.sample took ~ 70s, pm.sample_posterior_predictive took ~ 240s