Here is a SciPy wrapper class that implements a truncated multivariate normal distribution via rejection sampling. It provides a sampling method and a log-likelihood method. The only problem is that the log-likelihood is accurate only up to an additive constant, as I have not normalized the original PDF with respect to the constraints.
- Is this sufficient to function as a prior for MCMC posterior sampling in PyMC, or do I need to implement the exact log-likelihood (by explicitly integrating the PDF outside of the domain)?
- If it is sufficient, how can I convert it into a distribution that can be used directly in PyMC? The closest I have come up with is the Potential-based sketch below.
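For context, here is roughly where I got on the PyMC side. This is only a sketch assuming PyMC v5 with pytensor; the positivity constraint is a hypothetical stand-in for f_constr (which would need to be re-expressed in pytensor ops), and I suspect the hard -inf cutoff can cause trouble for gradient-based samplers such as NUTS:

import numpy as np
import pymc as pm
import pytensor.tensor as pt

mu = np.zeros(2)
cov = np.eye(2)

with pm.Model() as model:
    # Untruncated multivariate normal prior
    theta = pm.MvNormal("theta", mu=mu, cov=cov)
    # Forbid the region outside the constraints by adding -inf to the model
    # log-probability there. The resulting density is unnormalized, exactly
    # like log_prob in the class below.
    pm.Potential("constraint", pt.switch(pt.all(theta > 0), 0.0, -np.inf))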
import numpy as np
from scipy.stats import multivariate_normal
class DistrMVGaussianTruncated:
def __init__(self, mu: np.ndarray, cov: np.ndarray, f_constr, sampleFactor=10):
"""
        :param mu: Mean of the underlying (non-truncated) multivariate normal
        :param cov: Covariance of the underlying (non-truncated) multivariate normal
        :param f_constr: A function float[nSamples, nDim] -> bool[nSamples]. For each sample,
            checks whether it satisfies the truncation constraints
        :param sampleFactor: Since some points will be rejected, it makes sense to sample more points
            than requested, to avoid repeated sampling rounds. Excess points are stored for future requests
"""
self.mu = mu
self.cov = cov
self.f_constr = f_constr
        self.x = np.empty((0, len(mu)))  # Buffer of accepted samples, shape [nStored, nDim]
self.sampleFactor = sampleFactor
def ndim(self) -> int:
return len(self.mu)
def _sample_increment(self, nsample: int) -> None:
x = np.random.multivariate_normal(self.mu, self.cov, nsample)
x = x[self.f_constr(x)]
self.x = np.concatenate([self.x, x])
def sample(self, nsample: int) -> np.ndarray:
while len(self.x) < nsample:
self._sample_increment(self.sampleFactor * nsample)
x = self.x[:nsample] # Report first n stored distribution samples
self.x = self.x[nsample:] # Keep the rest of the samples for future use
return x
    def mle(self, data: np.ndarray) -> None:
        """
        Fit the distribution to the data.
        :param data: Data samples [nData, nDim].
        NOTE: This uses the non-truncated MLE as an approximation. The truncated log-likelihood
        differs from the non-truncated one by -nData * log(Z), where Z is the probability mass of
        the non-truncated PDF inside the constraint region. Z is constant in the data but depends
        on (mu, cov), so strictly speaking the truncated MLE differs from the non-truncated one;
        the approximation is good when the truncation removes little probability mass.
        """
assert data.ndim == 2
assert data.shape[0] >= self.ndim(), f"Must have at least {self.ndim()} samples to be able to do inference"
assert data.shape[1] == self.ndim()
assert np.all(self.f_constr(data)), "Data must respect the constraints of the distribution"
        self.mu, self.cov = multivariate_normal.fit(data)  # multivariate_normal.fit requires SciPy >= 1.12
    # TODO: This quantity is only accurate up to an additive constant, as the original PDF is not
    # normalized with respect to the truncated part. If the exact quantity is required, the excess
    # mass can be estimated by integrating the PDF outside the domain.
    def log_prob(self, data: np.ndarray) -> np.ndarray:
        rez = multivariate_normal.logpdf(data, self.mu, self.cov)
        rez[~self.f_constr(data)] = -np.inf  # Zero density outside the constraint region
        return rez
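For reference, a minimal usage example (a standard 2-D normal truncated to the positive quadrant; the Monte-Carlo estimate at the end shows how the normalizing constant from the TODO could be computed if the exact log-likelihood turns out to be necessary):

# Example: standard 2-D normal truncated to the positive quadrant
mu = np.zeros(2)
cov = np.eye(2)
f_constr = lambda x: np.all(x > 0, axis=1)

distr = DistrMVGaussianTruncated(mu, cov, f_constr)
samples = distr.sample(1000)   # All returned samples satisfy the constraints
lp = distr.log_prob(samples)   # Unnormalized log-density

# Monte-Carlo estimate of log(Z), the mass of the non-truncated PDF inside
# the constraint region; subtracting it from log_prob gives the exactly
# normalized truncated log-density
x_mc = np.random.multivariate_normal(mu, cov, 100_000)
log_z = np.log(np.mean(f_constr(x_mc)))
lp_exact = lp - log_z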