I have been working with the following link,
Fitting empirical distribution to theoretical ones with Scipy
I have been using my data and found out that the common distribution is the Hypsecant distribution. I couldn’t find the distribution and decided to have a look with scipy to understand how the distribution is formed. I created a custom distribution and I have few questions:
- I would like to know if my approach to creating the distribution is right?
- How can I implement the custom distribution into models?
- Regarding the prior distribution, do I use same steps in normal distribution priors (mu and sigma)?
- How can I create functions for the random variable and logo?
My custom distribution:
import numpy as np
import theano.tensor as tt
from scipy import stats
from scipy.special import hyp1f1, nctdtr
import math
import warnings
from pymc3.theanof import floatX
from pymc3.distributions.dist_math import bound, gammaln
from pymc3.distributions.continuous import assert_negative_support, get_tau_sigma
from pymc3.distributions.distribution import Continuous, draw_values, generate_samples
class Hypsecant(Continuous):
"""
Parameters
----------
mu: float
Location parameter.
sigma: float
Scale parameter (sigma > 0). Converges to the standard deviation as nu increases. (only required if lam is not specified)
lam: float
Scale parameter (lam > 0). Converges to the precision as nu increases. (only required if sigma is not specified)
"""
def __init__(self, mu=0, sigma=1, lam=None, sd=None, *args, **kwargs):
super().__init__(*args, **kwargs)
super(Hypsecant, self).__init__(*args, **kwargs)
if sd is not None:
sigma = sd
warnings.warn("sd is deprecated, use sigma instead", DeprecationWarning)
lam, sigma = get_tau_sigma(tau=lam, sigma=sigma)
self.lam = lam = tt.as_tensor_variable(lam)
self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma)
self.mean = self.median = self.mode = self.mu = mu = tt.as_tensor_variable(mu)
self.variance = tt.as_tensor_variable(1)
assert_negative_support(lam, 'lam (sigma)', 'Hypsecant')
def random(self, point=None, size=None):
"""
Draw random values from Hypsecant distribution.
Parameters
----------
point: dict, optional
Dict of variable values on which random values are to be
conditioned (uses default point if not specified).
size: int, optional
Desired size of random sample (returns one sample if not
specified).
Returns
-------
array
"""
mu, lam = draw_values([self.mu, self.lam], point=point, size=size)
return generate_samples(stats.hypsecant.rvs, loc=mu, scale=lam, dist_shape=self.shape, size=size)
def logp(self, value):
"""
Calculate log-probability of Hypsecant distribution at specified value.
Parameters
----------
value: numeric
Value(s) for which log-probability is calculated. If the log probabilities for multiple
values are desired the values must be provided in a numpy array or theano tensor
Returns
-------
TensorVariable
"""
mu = self.mu
lam = self.lam
return bound()
def logcdf(self, value):
"""
Compute the log of the cumulative distribution function for Hypsecant distribution
at the specified value.
Parameters
----------
value: numeric
Value(s) for which log CDF is calculated. If the log CDF for multiple
values are desired the values must be provided in a numpy array or theano tensor.
Returns
-------
TensorVariable
"""
mu = self.mu
lam = self.lam
return 2.0 / math.pi * tt.math.atan(tf.math.exp(value))
My Custom model:
with pm.Model() as model:
# Prior Distributions for unknown model parameters:
mu = pm.Normal('mu', mu=0, sigma=1)
sd = pm.HalfNormal('sd', sigma=1)
# Observed data is from a Likelihood distributions (Likelihood (sampling distribution) of observations):
observed_data = Hypsecant('observed_data', mu=mu, sd=sd, observed=data_list)
# draw 5000 posterior samples
trace_S = pm.sample(draws=5000, tune=2000, chains=3, cores=1)
# Obtaining Posterior Predictive Sampling:
post_pred_S = pm.sample_posterior_predictive(trace_S, samples=3000)
print(post_pred_S['observed_data'].shape)
print('\nSummary: ')
print(pm.stats.summary(data=trace_S))
print(pm.stats.summary(data=post_pred_S))
Edit 1:
I edited the logp part and custom model. Please find the code below:
class Hypsecant(Continuous):
"""
Parameters
----------
mu: float
Location parameter.
sigma: float
Scale parameter (sigma > 0). Converges to the standard deviation as nu increases. (only required if lam is not specified)
lam: float
Scale parameter (lam > 0). Converges to the precision as nu increases. (only required if sigma is not specified)
"""
def __init__(self, mu=0, sigma=1, lam=None, sd=None, *args, **kwargs):
super().__init__(*args, **kwargs)
super(Hypsecant, self).__init__(*args, **kwargs)
if sd is not None:
sigma = sd
warnings.warn("sd is deprecated, use sigma instead", DeprecationWarning)
lam, sigma = get_tau_sigma(tau=lam, sigma=sigma)
self.lam = lam = tt.as_tensor_variable(lam)
self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma)
self.mean = self.median = self.mode = self.mu = mu = tt.as_tensor_variable(mu)
self.variance = tt.as_tensor_variable(1)
assert_negative_support(lam, 'lam (sigma)', 'Hypsecant')
def random(self, point=None, size=None):
"""
Draw random values from Hypsecant distribution.
Parameters
----------
point: dict, optional
Dict of variable values on which random values are to be
conditioned (uses default point if not specified).
size: int, optional
Desired size of random sample (returns one sample if not
specified).
Returns
-------
array
"""
mu, lam = draw_values([self.mu, self.lam], point=point, size=size)
return generate_samples(stats.hypsecant.rvs, loc=mu, scale=lam, dist_shape=self.shape, size=size)
def logp(self, value):
"""
Calculate log-probability of Hypsecant distribution at specified value.
Parameters
----------
value: numeric
Value(s) for which log-probability is calculated. If the log probabilities for multiple
values are desired the values must be provided in a numpy array or theano tensor
Returns
-------
TensorVariable
"""
mu = self.mu
lam = self.lam
Px = 1.0/(math.pi * tt.math.cosh(value))
return bound(Px, mu, lam)
def logcdf(self, value):
"""
Compute the log of the cumulative distribution function for Hypsecant distribution
at the specified value.
Parameters
----------
value: numeric
Value(s) for which log CDF is calculated. If the log CDF for multiple
values are desired the values must be provided in a numpy array or theano tensor.
Returns
-------
TensorVariable
"""
mu = self.mu
lam = self.lam
return 2.0 / math.pi * tt.math.atan(tt.math.exp(value))
Model:
with pm.Model() as model:
# Prior Distributions for unknown model parameters:
mu = pm.Normal('mu', mu=0, sigma=1)
sd = pm.Normal('sd', mu=0, sigma=1)
# Custom distribution:
# observed_data = pm.DensityDist('observed_data', NonCentralStudentT, observed=data_list)
# Observed data is from a Likelihood distributions (Likelihood (sampling distribution) of observations):
observed_data = Hypsecant('observed_data', mu=mu, sd=sd, observed=data)
# draw 5000 posterior samples
trace = pm.sample(draws=5000, tune=3000, chains=2, cores=1)
# Obtaining Posterior Predictive Sampling:
post_pred = pm.sample_posterior_predictive(trace, samples=3000)
print(post_pred['observed_data'].shape)
print('\nSummary: ')
print(pm.stats.summary(data=trace))
print(pm.stats.summary(data=post_pred))