Hi all, I am new to PyMC and am trying to cluster synthetic data. I generate the data similar to the method described in Bishop’s Pattern Recognition and Machine Learning textbook with the following priors.
p(\boldsymbol{\pi}) = \text{Dir}(\boldsymbol{\pi} \mid \alpha)
p(\boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) = \mathcal{N}(\boldsymbol{\mu}_k \mid \boldsymbol{m}, \beta^{-1} \boldsymbol{\Sigma}_k)\; \mathcal{W}^{-1}(\boldsymbol{\Sigma}_k \mid \boldsymbol{W}^{-1}, \nu).
For the PyMC models, I create distributions for these variables in similar fashion as Bishop so they match the priors reasonably well. Furthermore, I used ADVI. My goal is to use model selection to estimate the number of components K of the synthetic data. To do this, I fit the model for a range of candidate K values across 5 runs (initializations) and compute the ELBO for each run (the average of the last 5% of ELBO values). I then select the value of K that maximizes this quantity. Bishop also applies a ln(K!) penalty for K clusters (PRML, p. 484), which I currently do not do, as the maximum number of clusters I test is 10, which makes the penalty quite small.
However, when I apply this method, I notice that the ELBO decreases monotonically as the number of clusters grows, so the selection procedure always prefers fewer clusters. I have attached the code below showing my implementation, where I generate synthetic data with 6 clusters (n_components = 6) and then plot each candidate cluster number against its associated ELBO.
from typing import List, Tuple, Dict
import numpy as np
from scipy.stats import dirichlet, multivariate_normal, invwishart
import random
from sklearn.mixture import BayesianGaussianMixture
import math
import matplotlib.pyplot as plt
import pymc as pm
import arviz as az
import numpy as np
def fit_gmm_advi(X: np.ndarray, K: int, n_advi: int = 20000, n_posterior_samples: int = 2000,
                 m0: np.ndarray = None, beta: float = 0.1, dirichlet_alpha: float = 1.0,
                 lkj_eta: float = 1.0, sd_prior_scale: float = 1.0,
                 random_seed: int = 42, verbose: bool = False) -> Tuple[pm.Model, az.InferenceData, dict]:
    """Fit a K-component Bayesian Gaussian mixture to X with full-rank ADVI.

    Parameters
    ----------
    X : (N, D) array of observations.
    K : number of mixture components.
    n_advi : number of ADVI optimization steps.
    n_posterior_samples : draws taken from the fitted approximation.
    m0 : prior mean for the component means; defaults to the zero vector.
    beta : Bishop-style precision scaling — mu_k ~ N(m0, Sigma_k / beta).
    dirichlet_alpha : symmetric Dirichlet concentration for the weights.
    lkj_eta : LKJ shape parameter for the correlation prior.
    sd_prior_scale : HalfNormal scale for the per-dimension std-dev prior.
    random_seed : seed passed to pm.fit for reproducibility.
    verbose : when True, show the ADVI progress bar.

    Returns
    -------
    (model, idata, diagnostics) where diagnostics["advi_history"] holds the
    per-step ADVI loss (the NEGATIVE ELBO, as recorded by pm.fit).
    """
    X = np.asarray(X)
    N, D = X.shape
    # BUG FIX: the original unconditionally overwrote m0 with zeros, silently
    # discarding any caller-supplied prior mean. Only default when m0 is None.
    if m0 is None:
        m0 = np.zeros(D)
    else:
        m0 = np.asarray(m0)
    with pm.Model() as model:
        # Mixture weights: pi ~ Dirichlet(alpha * 1_K)
        alpha_vec = np.ones(K) * dirichlet_alpha
        pi = pm.Dirichlet("pi", a=alpha_vec)
        # Per-component covariance via LKJCholeskyCov, plus a Bishop-style
        # conditional mean prior mu_k ~ N(m0, Sigma_k / beta).
        chol_list = []
        mu_list = []
        for k in range(K):
            # LKJCholeskyCov (compute_corr=True) returns the Cholesky factor,
            # the correlation matrix, and the std-dev vector.
            chol, corr, sd = pm.LKJCholeskyCov(
                f"chol_{k}",
                n=D,
                eta=lkj_eta,
                sd_dist=pm.HalfNormal.dist(sd_prior_scale),
                compute_corr=True,
            )
            cov_k = pm.Deterministic(f"cov_{k}", chol @ chol.T)
            mu_k = pm.MvNormal(
                f"mu_{k}",
                mu=m0,
                cov=cov_k / beta,
                shape=(D,),
            )
            chol_list.append(chol)
            mu_list.append(mu_k)
        # Component distributions must be unnamed .dist objects for pm.Mixture.
        comp_dists = [
            pm.MvNormal.dist(mu=mu_list[k], chol=chol_list[k]) for k in range(K)
        ]
        # PyMC handles missing entries in `observed` automatically
        # (it creates missing-value RVs).
        pm.Mixture("obs", w=pi, comp_dists=comp_dists, observed=X.copy())
        # FIX: `verbose` was previously unused; it now controls the progress bar.
        approx = pm.fit(n=n_advi, method="fullrank_advi",
                        progressbar=verbose, random_seed=random_seed)
        idata = approx.sample(draws=n_posterior_samples)
    diagnostics = {
        "n_advi": n_advi,
        "n_posterior_samples": n_posterior_samples,
        # approx.hist is the per-step loss trace (negative ELBO).
        "advi_history": getattr(approx, "hist", None),
    }
    return model, idata, diagnostics
def select_k_by_elbo_multi_run(X: np.ndarray,
                               K_list: List[int],
                               advi_config: dict = None,
                               runs: int = 3,
                               random_seeds: List[int] = None,
                               verbose: bool = True
                               ) -> Dict:
    """Fit the ADVI GMM for each K over several seeds; keep the best run per K.

    Parameters
    ----------
    X : (N, D) data array passed straight to fit_gmm_advi.
    K_list : candidate component counts to evaluate.
    advi_config : extra keyword arguments forwarded to fit_gmm_advi.
    runs : number of independent initializations per K.
    random_seeds : seeds for the runs; defaults to 42, 43, ...
    verbose : when True, print progress per K.

    Returns
    -------
    dict mapping K -> {"model", "idata", "diagnostics", "elbo"} for the run
    with the highest tail-averaged ELBO.  (FIX: the annotation previously
    claimed Tuple[int, Dict], but a dict keyed by K is what is returned.)
    """
    advi_config = advi_config or {}
    results = {}
    if random_seeds is None:
        random_seeds = [42 + i for i in range(runs)]
    for K in K_list:
        if verbose:  # FIX: `verbose` was previously ignored.
            print("\n" + "-" * 60)
            print(f"Fitting model for K = {K} with {runs} runs elbo")
        elbo_runs = []
        run_results = []
        for seed in random_seeds[:runs]:
            model, idata, diagnostics = fit_gmm_advi(X, K, random_seed=seed, **advi_config)
            hist = np.asarray(diagnostics["advi_history"])
            # Average the last 5% of the loss trace.  BUG FIX: for short
            # histories int(0.05 * len) is 0 and hist[-0:] silently averages
            # the ENTIRE trace; clamp to at least one step.
            n_tail = max(1, int(0.05 * len(hist)))
            # pm.fit records the loss (negative ELBO), so negate the mean.
            elbo = -np.mean(hist[-n_tail:])
            elbo_runs.append(elbo)
            run_results.append({
                "model": model,
                "idata": idata,
                "diagnostics": diagnostics,
                "elbo": elbo,
            })
        # Keep only the best (highest-ELBO) initialization for this K.
        best_idx = int(np.argmax(elbo_runs))
        results[K] = run_results[best_idx]
    return results
def generate_data(weight_concentration_prior=1, mean_precision_prior=0.01, features=5,
                  mean_prior=0.0, degrees_of_freedom_prior=5, covariance_prior=None,
                  n_components=6, seed=42):
    """Draw a synthetic GMM dataset from Bishop-style Normal-inverse-Wishart priors.

    Parameters
    ----------
    weight_concentration_prior : symmetric Dirichlet concentration for weights.
    mean_precision_prior : beta — component means ~ N(m0, Sigma / beta).
    features : data dimensionality D.
    mean_prior : scalar prior mean replicated across all D dimensions.
    degrees_of_freedom_prior : inverse-Wishart degrees of freedom.
    covariance_prior : inverse-Wishart scale matrix; defaults to eye(features).
        (FIX: the old default np.eye(5) was evaluated once at definition time
        and broke silently whenever `features` != 5.)
    n_components : number of clusters (GENERALIZED: was hard-coded to 6).
    seed : RNG seed (GENERALIZED: was hard-coded to 42).

    Returns
    -------
    (X, y, n_components) — data of shape (N, features), integer labels in
    [0, n_components), and the true component count.
    """
    np.random.seed(seed)
    random.seed(seed)
    if covariance_prior is None:
        covariance_prior = np.eye(features)
    # Total sample size and per-component counts.
    seq_len = np.random.randint(100, 500)
    weights = np.random.dirichlet(np.ones(n_components) * weight_concentration_prior)
    counts = np.random.multinomial(seq_len, weights)
    # Draw each component's covariance then its mean, per the NIW hierarchy.
    means = []
    covariances = []
    for _ in range(n_components):
        Sigma = invwishart.rvs(df=degrees_of_freedom_prior, scale=covariance_prior)
        mu = np.random.multivariate_normal(np.full(features, mean_prior),
                                           Sigma / mean_precision_prior)
        means.append(mu)
        covariances.append(Sigma)
    # Sample points from the resulting GMM, labeled by component.
    X = []
    y = []
    for k in range(n_components):
        n_k = counts[k]
        X_k = np.random.multivariate_normal(means[k], covariances[k], size=n_k)
        X.extend(X_k)
        y.extend(np.full(n_k, k))
    return np.array(X), np.array(y), n_components
# ------------------------------------------------------------------
# Experiment driver: generate 6-cluster synthetic data, sweep the
# candidate K values, and plot the best-run ELBO against each K.
# ------------------------------------------------------------------
weight_concentration_prior = 1
mean_precision_prior = 0.1
features = 5
mean_prior = 0.0
degrees_of_freedom_prior = 5
covariance_prior = np.eye(5)

X, y, true_components = generate_data(
    weight_concentration_prior,
    mean_precision_prior,
    features,
    mean_prior,
    degrees_of_freedom_prior,
    covariance_prior,
)

K_list = np.arange(2, 11)
runs = 5
config = {
    "n_advi": 40000,
    "n_posterior_samples": 2000,
    "m0": None,
    "beta": 0.1,
    "dirichlet_alpha": 1.0,
    "lkj_eta": 1.0,
    "sd_prior_scale": 1.0,
}
results = select_k_by_elbo_multi_run(X=X, K_list=K_list, runs=runs, advi_config=config)

# Best-run ELBO for each candidate K, in K_list order.
arr = [results[K]["elbo"] for K in K_list]
plt.scatter(K_list, arr)
plt.ylabel("ELBO")
plt.xlabel("cluster numbers")
plt.show()
Similar behaviour is seen if I change number of clusters (n_components) to other numbers as well. Furthermore, when I log the ELBO convergence with ADVI steps it does indeed converge.
My questions are:
- Is the ELBO in PyMC computed differently from the standard formulation, such that some values might be excluded, potentially causing this decreasing relationship?
- From my understanding, WAIC is generally a better metric for model selection. However, since I am using ADVI—and plan to work with missing data in the future—I would prefer to use the ELBO, as it is automatically computed even when missing data are present. Given this, should I still consider switching to WAIC?

