# How to calibrate the sigma of the likelihood within the black-box method?

Hello,

I don’t manage to calibrate the sigma of the likelihood with the black-box method, I tried to include the variable sigma in the tensor vector theta but it’s not working (see the code below).
Does someone know how to fix this problem ?

import pymc3 as pm
import numpy as np
import theano.tensor as tt
import time as time
import seaborn as sns
import scipy.stats as st

ndraws = 1000
nburn = 200
chains=2
njobs=1
cores=2

x=np.arange(0,24,1)
data=x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))

sigma=5
prio_var1=8
prio_var2=5
sd_var1=3
sd_var2=3

def my_model(theta,x):

var1,var2,sigma0= theta
prediction=x*var1+var2

return prediction

def my_loglike(theta,x,data, sigma):

model = my_model(theta, x)
sigma0=theta[2]

data0=data

return -(0.5/sigma0**2)*np.sum((data0 - model)**2)

class LogLike(tt.Op):

"""
Specify what type of object will be passed and returned to the Op when it is
called. In our case we will be passing it a vector of values (the parameters
that define our model) and returning a single "scalar" value (the
log-likelihood)
"""
itypes = [tt.dvector] # expects a vector of parameter values when called
otypes = [tt.dscalar] # outputs a single scalar value (the log likelihood)

def __init__(self, loglike, data, x, sigma):
"""
Initialise the Op with various things that our log-likelihood function
requires. Below are the things that are needed in this particular
example.

Parameters
----------
loglike:
The log-likelihood (or whatever) function we've defined
data:
The "observed" data that our log-likelihood function takes in
x:
The dependent variable (aka 'x') that our model requires
sigma:
The noise standard deviation that our function requires.
"""

# add inputs as class attributes
self.likelihood = loglike
self.data = data
self.x = x
self.sigma = sigma

def perform(self, node, inputs, outputs):
# the method that is used when calling the Op
theta, = inputs  # this will contain my variables

# call the log-likelihood function
logl = self.likelihood(theta, self.x, self.data, self.sigma)

outputs[0][0] = np.array(logl) # output the log-likelihood

# create our Op
logl = LogLike(my_loglike, data, x, sigma)

def my_mu(v):
return logl(v)

# use PyMC3 to sampler from log-likelihood
if __name__ == "__main__":
with pm.Model() as model1:

var1 = pm.Normal('var1', mu=prio_var1, sd=sd_var1)
var2 = pm.Normal('var2', mu=prio_var2, sd=sd_var2)
sigma0 = pm.Uniform('sigma0', lower=0.1, upper=10)
#sigma0 = pm.HalfNormal('sigma0', sd=4)

theta = tt.as_tensor_variable([var1, var2,sigma0])

pm.DensityDist('likelihood',my_mu , observed={'v': theta})#

step = pm.Slice()

trace = pm.sample(ndraws, tune=nburn, discard_tuned_samples=True, chains=chains, step=step,cores=cores)

tracedf=pm.trace_to_dataframe(trace)
tim_end=time.process_time()
pm.traceplot(trace,varnames=['var1','var2','sigma0'])#,varnames=[var1,var2]
sns.pairplot(tracedf[['var1','var2','sigma0']])
pm.plot_posterior(trace,varnames=['var1','var2','sigma0'])
pm.autocorrplot(trace,varnames=['var1','var2','sigma0']);
pm.traceplot(trace,combined=True,priors=[var1.distribution, var2.distribution, sigma0.distribution])

You need to pass the free parameter (sigma in this case) in the observed kwarg dict. See for example: How to set up a custom likelihood function for two variables

I added an observed argument ‘sigma’ in the DensityDist, and then changed the theano Op (see the code below). However, I get the same result than when I pass it in the input vector theta: the sigma diverges while it should converge toward 3.

import pymc3 as pm
import numpy as np
import theano.tensor as tt
import time as time
import seaborn as sns
import scipy.stats as st

ndraws = 1000
nburn = 200
chains=2
njobs=1
cores=2

x=np.arange(0,24,1)
data=x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))
sigma=5
prio_var1=8
prio_var2=5
sd_var1=3
sd_var2=3

def my_model(theta,x):

var1,var2= theta
prediction=x*var1+var2

return prediction

def my_loglike(theta,sigma0,x,data, sigma):

model = my_model(theta,x)
#sigma0=theta[2]

data0=data

return -(0.5/sigma0**2)*np.sum((data0 - model)**2)#

class LogLike(tt.Op):

"""
Specify what type of object will be passed and returned to the Op when it is
called. In our case we will be passing it a vector of values (the parameters
that define our model) and returning a single "scalar" value (the
log-likelihood)
"""
itypes = [tt.dvector,tt.dscalar] # expects a vector of parameter values when called
otypes = [tt.dscalar] # outputs a single scalar value (the log likelihood)

def __init__(self, loglike, data, x, sigma):
"""
Initialise the Op with various things that our log-likelihood function
requires. Below are the things that are needed in this particular
example.

Parameters
----------
loglike:
The log-likelihood (or whatever) function we've defined
data:
The "observed" data that our log-likelihood function takes in
x:
The dependent variable (aka 'x') that our model requires
sigma:
The noise standard deviation that our function requires.
"""

# add inputs as class attributes
self.likelihood = loglike
self.data = data
self.x = x
self.sigma = sigma

def perform(self, node, inputs, outputs):
# the method that is used when calling the Op
theta,sigma0 = inputs  # this will contain my variables

# call the log-likelihood function
logl = self.likelihood(theta,sigma0, self.x, self.data, self.sigma)

outputs[0][0] = np.array(logl) # output the log-likelihood

# create our Op
logl = LogLike(my_loglike, data, x, sigma)

def my_mu(v,sigma0):
return logl(v,sigma0)

# use PyMC3 to sampler from log-likelihood
if __name__ == "__main__":
with pm.Model() as model1:

var1 = pm.Normal('var1', mu=prio_var1, sd=sd_var1)
var2 = pm.Normal('var2', mu=prio_var2, sd=sd_var2)
sigma0 = pm.Uniform('sigma0', lower=0.1, upper=10)
#sigma0 = pm.HalfNormal('sigma0', sd=4)

theta = tt.as_tensor_variable([var1, var2])

pm.DensityDist('likelihood',my_mu , observed={'v': theta, 'sigma0':sigma0 })#

step = pm.Slice()

trace = pm.sample(ndraws, tune=nburn, discard_tuned_samples=True, chains=chains, step=step,cores=cores)

Try with the default sampler, not sure how well Slice sampler performs and I would trust NUTS sampler much more.

I got the same result with the default sampler and even with the NUTS (see the code below). I think the problem coms from black-box method.

import pymc3 as pm
import numpy as np
import theano.tensor as tt
import time as time
import seaborn as sns
import scipy.stats as st

ndraws = 1000
nburn = 200
chains=2
njobs=1
cores=2

x=np.arange(0,24,1)
data=x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))
sigma=5
prio_var1=8
prio_var2=5
sd_var1=3
sd_var2=3

def my_model(theta,x):

var1,var2,sigma0 = theta
prediction=x*var1+var2

return prediction

def my_loglike(theta,x,data, sigma):

model = my_model(theta,x)
sigma0=theta[2]

data0=data

return -(0.5/sigma0**2)*np.sum((data0 - model)**2)#

def gradients(vals, func, releps=1e-3, abseps=None, mineps=1e-9, reltol=1e-3,
epsscale=0.5):
"""
Calculate the partial derivatives of a function at a set of values. The
derivatives are calculated using the central difference, using an iterative
method to check that the values converge as step size decreases.

Parameters
----------
vals: array_like
A set of values, that are passed to a function, at which to calculate
func:
A function that takes in an array of values.
releps: float, array_like, 1e-3
The initial relative step size for calculating the derivative.
abseps: float, array_like, None
The initial absolute step size for calculating the derivative.
This overrides releps if set.
releps is set then that is used.
mineps: float, 1e-9
The minimum relative step size at which to stop iterations if no
convergence is achieved.
epsscale: float, 0.5
The factor by which releps if scaled in each iteration.

Returns
-------
An array of gradients for each non-fixed value.
"""

# maximum number of times the gradient can change sign
flipflopmax = 10.

# set steps
if abseps is None:
if isinstance(releps, float):
eps = np.abs(vals)*releps
eps[eps == 0.] = releps  # if any values are zero set eps to releps
teps = releps*np.ones(len(vals))
elif isinstance(releps, (list, np.ndarray)):
if len(releps) != len(vals):
raise ValueError("Problem with input relative step sizes")
eps = np.multiply(np.abs(vals), releps)
eps[eps == 0.] = np.array(releps)[eps == 0.]
teps = releps
else:
raise RuntimeError("Relative step sizes are not a recognised type!")
else:
if isinstance(abseps, float):
eps = abseps*np.ones(len(vals))
elif isinstance(abseps, (list, np.ndarray)):
if len(abseps) != len(vals):
raise ValueError("Problem with input absolute step sizes")
eps = np.array(abseps)
else:
raise RuntimeError("Absolute step sizes are not a recognised type!")
teps = eps

# for each value in vals calculate the gradient
count = 0
for i in range(len(vals)):
# initial parameter diffs
leps = eps[i]
cureps = teps[i]

flipflop = 0

# get central finite difference
fvals = np.copy(vals)
bvals = np.copy(vals)

# central difference
fvals[i] += 0.5*leps  # change forwards distance to half eps
bvals[i] -= 0.5*leps  # change backwards distance to half eps
cdiff = (func(fvals)-func(bvals))/leps

while 1:
fvals[i] -= 0.5*leps  # remove old step
bvals[i] += 0.5*leps

# change the difference by a factor of two
cureps *= epsscale
if cureps < mineps or flipflop > flipflopmax:
# if no convergence set flat derivative (TODO: check if there is a better thing to do instead)
warnings.warn("Derivative calculation did not converge: setting flat derivative.")
break
leps *= epsscale

# central difference
fvals[i] += 0.5*leps  # change forwards distance to half eps
bvals[i] -= 0.5*leps  # change backwards distance to half eps
cdiffnew = (func(fvals)-func(bvals))/leps

if cdiffnew == cdiff:
break

# check whether previous diff and current diff are the same within reltol
rat = (cdiff/cdiffnew)
if np.isfinite(rat) and rat > 0.:
# gradient has not changed sign
if np.abs(1.-rat) < reltol:
break
else:
cdiff = cdiffnew
continue
else:
cdiff = cdiffnew
flipflop += 1
continue

count += 1

# define a theano Op for our likelihood function

itypes = [tt.dvector] # expects a vector of parameter values when called
otypes = [tt.dscalar] # outputs a single scalar value (the log likelihood)

def __init__(self, loglike, data, x, sigma):
"""
Initialise with various things that the function requires. Below
are the things that are needed in this particular example.

Parameters
----------
loglike:
The log-likelihood (or whatever) function we've defined
data:
The "observed" data that our log-likelihood function takes in
x:
The dependent variable (aka 'x') that our model requires
sigma:
The noise standard deviation that out function requires.
"""

# add inputs as class attributes
self.likelihood = loglike
self.data = data
self.x = x
self.sigma = sigma

# initialise the gradient Op (below)

def perform(self, node, inputs, outputs):
# the method that is used when calling the Op
theta, = inputs  # this will contain my variables

# call the log-likelihood function
logl = self.likelihood(theta, self.x, self.data, self.sigma)

outputs[0][0] = np.array(logl) # output the log-likelihood

# the method that calculates the gradients - it actually returns the
# vector-Jacobian product - g[0] is a vector of parameter values
theta, = inputs  # our parameters

"""
This Op will be called with a vector of values and also return a vector of
values - the gradients in each dimension.
"""
itypes = [tt.dvector]
otypes = [tt.dvector]

def __init__(self, loglike, data, x, sigma):
"""
Initialise with various things that the function requires. Below
are the things that are needed in this particular example.

Parameters
----------
loglike:
The log-likelihood (or whatever) function we've defined
data:
The "observed" data that our log-likelihood function takes in
x:
The dependent variable (aka 'x') that our model requires
sigma:
The noise standard deviation that out function requires.
"""

# add inputs as class attributes
self.likelihood = loglike
self.data = data
self.x = x
self.sigma = sigma

def perform(self, node, inputs, outputs):
theta, = inputs

# define version of likelihood function to pass to derivative function
def lnlike(values):
return self.likelihood(values, self.x, self.data, self.sigma)

# create our Op
logl = LogLikeWithGrad(my_loglike, data, x, sigma)

def my_mu(v):
return logl(v)

# use PyMC3 to sampler from log-likelihood
if __name__ == "__main__":
with pm.Model() as model1:

var1 = pm.Normal('var1', mu=prio_var1, sd=sd_var1)
var2 = pm.Normal('var2', mu=prio_var2, sd=sd_var2)
sigma0 = pm.Uniform('sigma0', lower=0.1, upper=10)
#sigma0 = pm.HalfNormal('sigma0', sd=4)

theta = tt.as_tensor_variable([var1, var2,sigma0])

pm.DensityDist('likelihood',my_mu , observed={'v': theta})# 'sigma0':sigma0

#step = pm.Slice()

trace = pm.sample(ndraws, tune=nburn, discard_tuned_samples=True, chains=chains,cores=cores)

tracedf=pm.trace_to_dataframe(trace)
tim_end=time.process_time()
pm.traceplot(trace,varnames=['var1','var2','sigma0'])#,varnames=[var1,var2]
sns.pairplot(tracedf[['var1','var2','sigma0']])
pm.plot_posterior(trace,varnames=['var1','var2','sigma0'])
pm.autocorrplot(trace,varnames=['var1','var2','sigma0']);
pm.traceplot(trace,combined=True,priors=[var1.distribution, var2.distribution, sigma0.distribution])

With this likelihood I’m not sure that you would expect sigma0 -> 3. This:

x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))

suggests that you’re sampling from a normal distribution; but

\mathcal{L}(x;\mu, \sigma) = -(x-\mu)^2/(2\sigma^2)

is missing the normalization factor -\frac{1}{2}\log [2 \pi \sigma^2] \propto - \log \sigma

Try adding it and see what happens.

Thank you for your help, I tried to use the following likelihood :
-(0.5/(sigma0)**2)*np.sum((data0 - model)**2)-np.log(sigma0)
However, I still get the same result.

Also shouldn’t these all be tensor operations? tt.sum and tt.log?

not according to this tutorial :
https://docs.pymc.io/notebooks/blackbox_external_likelihood.html

The sigma is not used in your my_loglike function:

def my_loglike(theta,x,data, sigma):

model = my_model(theta,x)
sigma0=theta[2]

data0=data

return -(0.5/sigma0**2)*np.sum((data0 - model)**2)#


There are a lot of code to unpack so I didnt check everything. In general, it will be much easier to get help (and debug) if you share minimal reproducible code that shows error with generated data.

OK, I’ll try to make it more clear :

In my real case, I have some difficulties to use the classic approach cause my model includes a large time loop, which take forever to compile in theano. So, I’m using the black-box method described here :https://docs.pymc.io/notebooks/blackbox_external_likelihood.html. But I noticed that the noise of my model diverges.
So to figure out the problem I tested the black-box method on a simple case with three variables of a linear function with noise: var1 var2 and sigma0.

This is working well with the classic approach :

import pymc3 as pm
import numpy as np
import scipy.stats as st

ndraws = 2000
nburn = 500

#create data
x=np.arange(0,24,1)
data=x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))

if __name__ == "__main__":

with pm.Model() as model1:

# DECLARE PRIOR
var1 = pm.Normal('var1', mu=8, sd=3)
var2 = pm.Normal('var2', mu=5, sd=3)
sigma0 = pm.Uniform('sigma0', lower=0.1, upper=10)

# DECLARE OBJECTIVE FUNCTION
mu = pm.Deterministic('mu',(var1*x+var2))
y_obs = pm.Normal('y_obs', mu=mu, sd=sigma0, observed=data)
trace = pm.sample(ndraws, tune=nburn, chains=2,cores=2)

#plot
pm.traceplot(trace,varnames=['var1','var2','sigma0'])


But I don’t manage to get the same result with the black-box method. In the tutorial sigma is a fixed parameters, so I don’t use it, and I infer on the created variable sigma0. There is two ways to pass sigma0: or I pass it in the vector of parameters ‘theta’ or I pass it in the DensityDist, separately from vector theta ( this case implies to modify a bit the class LogLike). However, in both cases sigma0 diverges. Here the case where sigma0 is passed separately from the vector of variable theta:

import pymc3 as pm
import numpy as np
import scipy.stats as st
import theano.tensor as tt

ndraws = 2000
nburn = 500

x=np.arange(0,24,1)
data=x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))
sigma=1# fixed parameter used in the tutorial but not used here

def my_model(theta,x):
var1,var2= theta
prediction=x*var1+var2
return prediction

def my_loglike(theta,sigma0,x,data, sigma):
model = my_model(theta,x)
data0=data
return -(0.5/(sigma0)**2)*np.sum((data0 - model)**2)

class LogLike(tt.Op):

"""
Specify what type of object will be passed and returned to the Op when it is
called. In our case we will be passing it a vector of values (the parameters
that define our model) and returning a single "scalar" value (the
log-likelihood)
"""
itypes = [tt.dvector,tt.dscalar] # expects a vector of parameter values when called
otypes = [tt.dscalar] # outputs a single scalar value (the log likelihood)

def __init__(self, loglike, data, x, sigma):
"""
Initialise the Op with various things that our log-likelihood function
requires. Below are the things that are needed in this particular
example.

Parameters
----------
loglike:
The log-likelihood (or whatever) function we've defined
data:
The "observed" data that our log-likelihood function takes in
x:
The dependent variable (aka 'x') that our model requires
sigma:
The noise standard deviation that our function requires.
"""

# add inputs as class attributes
self.likelihood = loglike
self.data = data
self.x = x
self.sigma = sigma

def perform(self, node, inputs, outputs):
# the method that is used when calling the Op
theta,sigma0 = inputs  # this will contain my variables

# call the log-likelihood function
logl = self.likelihood(theta,sigma0, self.x, self.data, self.sigma)

outputs[0][0] = np.array(logl) # output the log-likelihood

# create our Op
logl = LogLike(my_loglike, data, x, sigma)

def my_mu(v,sigma0):
return logl(v,sigma0)

# use PyMC3 to sampler from log-likelihood
if __name__ == "__main__":
with pm.Model() as model1:

var1 = pm.Normal('var1', mu=8, sd=3)
var2 = pm.Normal('var2', mu=5, sd=3)
sigma0 = pm.Uniform('sigma0', lower=0.1, upper=10)
theta = tt.as_tensor_variable([var1, var2])
pm.DensityDist('likelihood',my_mu , observed={'v': theta, 'sigma0':sigma0 })#
trace = pm.sample(ndraws, tune=nburn, discard_tuned_samples=True, chains=2,cores=2)

#plot
pm.traceplot(trace,varnames=['var1','var2','sigma0'])


I can also run the NUTS step by using the gradient function and the class LogLikeWithGrad, available in the tutorial, but don’t think the problem comes from the sampling method.

I see - but the logp function is different in the two, for example a normal log likelihood is

def my_loglike(theta, sigma0, x, data, sigma):
model = my_model(theta, x)
data0 = data
return -np.sum(0.5 * (np.log(2 * np.pi * sigma0 ** 2) + ((data0 - model) / sigma0) ** 2))

1 Like

yes indeed, sorry that I did not figure out that, thankyou :).
Besides do you know what are the drawbacks of using the black-box method ?

In general it would be a bit slower as there are more overhead on each call. But if you have some component that are highly optimized in the custom Ops it would be worth it.