How to calibrate the sigma of the likelihood within the black-box method?

Hello,

I don’t manage to calibrate the sigma of the likelihood with the black-box method, I tried to include the variable sigma in the tensor vector theta but it’s not working (see the code below).
Does someone know how to fix this problem ?

import pymc3 as pm
import numpy as np
import theano.tensor as tt
import time as time
import seaborn as sns
import scipy.stats as st

ndraws = 1000
nburn = 200
chains=2
njobs=1
cores=2

x=np.arange(0,24,1)
data=x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))

sigma=5
prio_var1=8
prio_var2=5
sd_var1=3
sd_var2=3


def my_model(theta,x):

    var1,var2,sigma0= theta 
    prediction=x*var1+var2

    return prediction



def my_loglike(theta,x,data, sigma):


    model = my_model(theta, x)
    sigma0=theta[2]

    data0=data

    return -(0.5/sigma0**2)*np.sum((data0 - model)**2)




class LogLike(tt.Op):

    """
    Specify what type of object will be passed and returned to the Op when it is
    called. In our case we will be passing it a vector of values (the parameters
    that define our model) and returning a single "scalar" value (the
    log-likelihood)
    """
    itypes = [tt.dvector] # expects a vector of parameter values when called
    otypes = [tt.dscalar] # outputs a single scalar value (the log likelihood)

    def __init__(self, loglike, data, x, sigma):
        """
        Initialise the Op with various things that our log-likelihood function
        requires. Below are the things that are needed in this particular
        example.

        Parameters
        ----------
        loglike:
            The log-likelihood (or whatever) function we've defined
        data:
            The "observed" data that our log-likelihood function takes in
        x:
            The dependent variable (aka 'x') that our model requires
        sigma:
            The noise standard deviation that our function requires.
        """

        # add inputs as class attributes
        self.likelihood = loglike
        self.data = data
        self.x = x
        self.sigma = sigma

    def perform(self, node, inputs, outputs):
        # the method that is used when calling the Op
        theta, = inputs  # this will contain my variables

        # call the log-likelihood function
        logl = self.likelihood(theta, self.x, self.data, self.sigma)

        outputs[0][0] = np.array(logl) # output the log-likelihood



# create our Op
logl = LogLike(my_loglike, data, x, sigma)


def my_mu(v):
    return logl(v)



# use PyMC3 to sampler from log-likelihood
if __name__ == "__main__":
    with pm.Model() as model1:

        var1 = pm.Normal('var1', mu=prio_var1, sd=sd_var1)
        var2 = pm.Normal('var2', mu=prio_var2, sd=sd_var2)
        sigma0 = pm.Uniform('sigma0', lower=0.1, upper=10)
        #sigma0 = pm.HalfNormal('sigma0', sd=4)

        theta = tt.as_tensor_variable([var1, var2,sigma0])
    
        pm.DensityDist('likelihood',my_mu , observed={'v': theta})# 

        step = pm.Slice()
        
        trace = pm.sample(ndraws, tune=nburn, discard_tuned_samples=True, chains=chains, step=step,cores=cores)

    tracedf=pm.trace_to_dataframe(trace)
    tim_end=time.process_time()
    pm.traceplot(trace,varnames=['var1','var2','sigma0'])#,varnames=[var1,var2]
    sns.pairplot(tracedf[['var1','var2','sigma0']])
    pm.plot_posterior(trace,varnames=['var1','var2','sigma0'])
    pm.autocorrplot(trace,varnames=['var1','var2','sigma0']);
    pm.traceplot(trace,combined=True,priors=[var1.distribution, var2.distribution, sigma0.distribution])

You need to pass the free parameter (sigma in this case) in the observed kwarg dict. See for example: How to set up a custom likelihood function for two variables

I added an observed argument ‘sigma’ in the DensityDist, and then changed the theano Op (see the code below). However, I get the same result than when I pass it in the input vector theta: the sigma diverges while it should converge toward 3.

import pymc3 as pm
import numpy as np
import theano.tensor as tt
import time as time
import seaborn as sns
import scipy.stats as st

ndraws = 1000
nburn = 200
chains=2
njobs=1
cores=2

x=np.arange(0,24,1)
data=x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))
sigma=5
prio_var1=8
prio_var2=5
sd_var1=3
sd_var2=3


def my_model(theta,x):

    var1,var2= theta 
    prediction=x*var1+var2

    return prediction



def my_loglike(theta,sigma0,x,data, sigma):


    model = my_model(theta,x)
    #sigma0=theta[2]

    data0=data

    return -(0.5/sigma0**2)*np.sum((data0 - model)**2)#




class LogLike(tt.Op):

    """
    Specify what type of object will be passed and returned to the Op when it is
    called. In our case we will be passing it a vector of values (the parameters
    that define our model) and returning a single "scalar" value (the
    log-likelihood)
    """
    itypes = [tt.dvector,tt.dscalar] # expects a vector of parameter values when called
    otypes = [tt.dscalar] # outputs a single scalar value (the log likelihood)

    def __init__(self, loglike, data, x, sigma):
        """
        Initialise the Op with various things that our log-likelihood function
        requires. Below are the things that are needed in this particular
        example.

        Parameters
        ----------
        loglike:
            The log-likelihood (or whatever) function we've defined
        data:
            The "observed" data that our log-likelihood function takes in
        x:
            The dependent variable (aka 'x') that our model requires
        sigma:
            The noise standard deviation that our function requires.
        """

        # add inputs as class attributes
        self.likelihood = loglike
        self.data = data
        self.x = x
        self.sigma = sigma

    def perform(self, node, inputs, outputs):
        # the method that is used when calling the Op
        theta,sigma0 = inputs  # this will contain my variables

        # call the log-likelihood function
        logl = self.likelihood(theta,sigma0, self.x, self.data, self.sigma)

        outputs[0][0] = np.array(logl) # output the log-likelihood



# create our Op
logl = LogLike(my_loglike, data, x, sigma)


def my_mu(v,sigma0):
    return logl(v,sigma0)



# use PyMC3 to sampler from log-likelihood
if __name__ == "__main__":
    with pm.Model() as model1:

        var1 = pm.Normal('var1', mu=prio_var1, sd=sd_var1)
        var2 = pm.Normal('var2', mu=prio_var2, sd=sd_var2)
        sigma0 = pm.Uniform('sigma0', lower=0.1, upper=10)
        #sigma0 = pm.HalfNormal('sigma0', sd=4)

        theta = tt.as_tensor_variable([var1, var2])
    
        pm.DensityDist('likelihood',my_mu , observed={'v': theta, 'sigma0':sigma0 })# 

        step = pm.Slice()
        
        trace = pm.sample(ndraws, tune=nburn, discard_tuned_samples=True, chains=chains, step=step,cores=cores)

Try with the default sampler, not sure how well Slice sampler performs and I would trust NUTS sampler much more.

I got the same result with the default sampler and even with the NUTS (see the code below). I think the problem coms from black-box method.

import pymc3 as pm
import numpy as np
import theano.tensor as tt
import time as time
import seaborn as sns
import scipy.stats as st

ndraws = 1000
nburn = 200
chains=2
njobs=1
cores=2

x=np.arange(0,24,1)
data=x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))
sigma=5
prio_var1=8
prio_var2=5
sd_var1=3
sd_var2=3


def my_model(theta,x):

    var1,var2,sigma0 = theta 
    prediction=x*var1+var2

    return prediction



def my_loglike(theta,x,data, sigma):


    model = my_model(theta,x)
    sigma0=theta[2]

    data0=data

    return -(0.5/sigma0**2)*np.sum((data0 - model)**2)#




def gradients(vals, func, releps=1e-3, abseps=None, mineps=1e-9, reltol=1e-3,
              epsscale=0.5):
    """
    Calculate the partial derivatives of a function at a set of values. The
    derivatives are calculated using the central difference, using an iterative
    method to check that the values converge as step size decreases.

    Parameters
    ----------
    vals: array_like
        A set of values, that are passed to a function, at which to calculate
        the gradient of that function
    func:
        A function that takes in an array of values.
    releps: float, array_like, 1e-3
        The initial relative step size for calculating the derivative.
    abseps: float, array_like, None
        The initial absolute step size for calculating the derivative.
        This overrides `releps` if set.
        `releps` is set then that is used.
    mineps: float, 1e-9
        The minimum relative step size at which to stop iterations if no
        convergence is achieved.
    epsscale: float, 0.5
        The factor by which releps if scaled in each iteration.

    Returns
    -------
    grads: array_like
        An array of gradients for each non-fixed value.
    """

    grads = np.zeros(len(vals))

    # maximum number of times the gradient can change sign
    flipflopmax = 10.

    # set steps
    if abseps is None:
        if isinstance(releps, float):
            eps = np.abs(vals)*releps
            eps[eps == 0.] = releps  # if any values are zero set eps to releps
            teps = releps*np.ones(len(vals))
        elif isinstance(releps, (list, np.ndarray)):
            if len(releps) != len(vals):
                raise ValueError("Problem with input relative step sizes")
            eps = np.multiply(np.abs(vals), releps)
            eps[eps == 0.] = np.array(releps)[eps == 0.]
            teps = releps
        else:
            raise RuntimeError("Relative step sizes are not a recognised type!")
    else:
        if isinstance(abseps, float):
            eps = abseps*np.ones(len(vals))
        elif isinstance(abseps, (list, np.ndarray)):
            if len(abseps) != len(vals):
                raise ValueError("Problem with input absolute step sizes")
            eps = np.array(abseps)
        else:
            raise RuntimeError("Absolute step sizes are not a recognised type!")
        teps = eps

    # for each value in vals calculate the gradient
    count = 0
    for i in range(len(vals)):
        # initial parameter diffs
        leps = eps[i]
        cureps = teps[i]

        flipflop = 0

        # get central finite difference
        fvals = np.copy(vals)
        bvals = np.copy(vals)

        # central difference
        fvals[i] += 0.5*leps  # change forwards distance to half eps
        bvals[i] -= 0.5*leps  # change backwards distance to half eps
        cdiff = (func(fvals)-func(bvals))/leps

        while 1:
            fvals[i] -= 0.5*leps  # remove old step
            bvals[i] += 0.5*leps

            # change the difference by a factor of two
            cureps *= epsscale
            if cureps < mineps or flipflop > flipflopmax:
                # if no convergence set flat derivative (TODO: check if there is a better thing to do instead)
                warnings.warn("Derivative calculation did not converge: setting flat derivative.")
                grads[count] = 0.
                break
            leps *= epsscale

            # central difference
            fvals[i] += 0.5*leps  # change forwards distance to half eps
            bvals[i] -= 0.5*leps  # change backwards distance to half eps
            cdiffnew = (func(fvals)-func(bvals))/leps

            if cdiffnew == cdiff:
                grads[count] = cdiff
                break

            # check whether previous diff and current diff are the same within reltol
            rat = (cdiff/cdiffnew)
            if np.isfinite(rat) and rat > 0.:
                # gradient has not changed sign
                if np.abs(1.-rat) < reltol:
                    grads[count] = cdiffnew
                    break
                else:
                    cdiff = cdiffnew
                    continue
            else:
                cdiff = cdiffnew
                flipflop += 1
                continue

        count += 1

    return grads


# define a theano Op for our likelihood function
class LogLikeWithGrad(tt.Op):

    itypes = [tt.dvector] # expects a vector of parameter values when called
    otypes = [tt.dscalar] # outputs a single scalar value (the log likelihood)

    def __init__(self, loglike, data, x, sigma):
        """
        Initialise with various things that the function requires. Below
        are the things that are needed in this particular example.

        Parameters
        ----------
        loglike:
            The log-likelihood (or whatever) function we've defined
        data:
            The "observed" data that our log-likelihood function takes in
        x:
            The dependent variable (aka 'x') that our model requires
        sigma:
            The noise standard deviation that out function requires.
        """

        # add inputs as class attributes
        self.likelihood = loglike
        self.data = data
        self.x = x
        self.sigma = sigma

        # initialise the gradient Op (below)
        self.logpgrad = LogLikeGrad(self.likelihood, self.data, self.x, self.sigma)

    def perform(self, node, inputs, outputs):
        # the method that is used when calling the Op
        theta, = inputs  # this will contain my variables

        # call the log-likelihood function
        logl = self.likelihood(theta, self.x, self.data, self.sigma)

        outputs[0][0] = np.array(logl) # output the log-likelihood

    def grad(self, inputs, g):
        # the method that calculates the gradients - it actually returns the
        # vector-Jacobian product - g[0] is a vector of parameter values
        theta, = inputs  # our parameters
        return [g[0]*self.logpgrad(theta)]


class LogLikeGrad(tt.Op):

    """
    This Op will be called with a vector of values and also return a vector of
    values - the gradients in each dimension.
    """
    itypes = [tt.dvector]
    otypes = [tt.dvector]

    def __init__(self, loglike, data, x, sigma):
        """
        Initialise with various things that the function requires. Below
        are the things that are needed in this particular example.

        Parameters
        ----------
        loglike:
            The log-likelihood (or whatever) function we've defined
        data:
            The "observed" data that our log-likelihood function takes in
        x:
            The dependent variable (aka 'x') that our model requires
        sigma:
            The noise standard deviation that out function requires.
        """

        # add inputs as class attributes
        self.likelihood = loglike
        self.data = data
        self.x = x
        self.sigma = sigma

    def perform(self, node, inputs, outputs):
        theta, = inputs

        # define version of likelihood function to pass to derivative function
        def lnlike(values):
            return self.likelihood(values, self.x, self.data, self.sigma)

        # calculate gradients
        grads = gradients(theta, lnlike)

        outputs[0][0] = grads
        


# create our Op
logl = LogLikeWithGrad(my_loglike, data, x, sigma)


def my_mu(v):
    return logl(v)




# use PyMC3 to sampler from log-likelihood
if __name__ == "__main__":
    with pm.Model() as model1:

        var1 = pm.Normal('var1', mu=prio_var1, sd=sd_var1)
        var2 = pm.Normal('var2', mu=prio_var2, sd=sd_var2)
        sigma0 = pm.Uniform('sigma0', lower=0.1, upper=10)
        #sigma0 = pm.HalfNormal('sigma0', sd=4)

        theta = tt.as_tensor_variable([var1, var2,sigma0])
    
        pm.DensityDist('likelihood',my_mu , observed={'v': theta})# 'sigma0':sigma0 

        #step = pm.Slice()
        
        trace = pm.sample(ndraws, tune=nburn, discard_tuned_samples=True, chains=chains,cores=cores)

    tracedf=pm.trace_to_dataframe(trace)
    tim_end=time.process_time()
    pm.traceplot(trace,varnames=['var1','var2','sigma0'])#,varnames=[var1,var2]
    sns.pairplot(tracedf[['var1','var2','sigma0']])
    pm.plot_posterior(trace,varnames=['var1','var2','sigma0'])
    pm.autocorrplot(trace,varnames=['var1','var2','sigma0']);
    pm.traceplot(trace,combined=True,priors=[var1.distribution, var2.distribution, sigma0.distribution])

With this likelihood I’m not sure that you would expect sigma0 -> 3. This:

x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))

suggests that you’re sampling from a normal distribution; but

\mathcal{L}(x;\mu, \sigma) = -(x-\mu)^2/(2\sigma^2)

is missing the normalization factor -\frac{1}{2}\log [2 \pi \sigma^2] \propto - \log \sigma

Try adding it and see what happens.

Thank you for your help, I tried to use the following likelihood :
-(0.5/(sigma0)**2)*np.sum((data0 - model)**2)-np.log(sigma0)
However, I still get the same result. :frowning:

Also shouldn’t these all be tensor operations? tt.sum and tt.log?

not according to this tutorial :
https://docs.pymc.io/notebooks/blackbox_external_likelihood.html

The sigma is not used in your my_loglike function:

def my_loglike(theta,x,data, sigma):


    model = my_model(theta,x)
    sigma0=theta[2]

    data0=data

    return -(0.5/sigma0**2)*np.sum((data0 - model)**2)#

There are a lot of code to unpack so I didnt check everything. In general, it will be much easier to get help (and debug) if you share minimal reproducible code that shows error with generated data.

OK, I’ll try to make it more clear :

In my real case, I have some difficulties to use the classic approach cause my model includes a large time loop, which take forever to compile in theano. So, I’m using the black-box method described here :https://docs.pymc.io/notebooks/blackbox_external_likelihood.html. But I noticed that the noise of my model diverges.
So to figure out the problem I tested the black-box method on a simple case with three variables of a linear function with noise: var1 var2 and sigma0.

This is working well with the classic approach :

import pymc3 as pm
import numpy as np
import scipy.stats as st

ndraws = 2000
nburn = 500

#create data
x=np.arange(0,24,1)
data=x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))


if __name__ == "__main__":

    with pm.Model() as model1:
    
        # DECLARE PRIOR
        var1 = pm.Normal('var1', mu=8, sd=3)
        var2 = pm.Normal('var2', mu=5, sd=3)
        sigma0 = pm.Uniform('sigma0', lower=0.1, upper=10)

        # DECLARE OBJECTIVE FUNCTION
        mu = pm.Deterministic('mu',(var1*x+var2))
        y_obs = pm.Normal('y_obs', mu=mu, sd=sigma0, observed=data)
        trace = pm.sample(ndraws, tune=nburn, chains=2,cores=2)
  
    #plot      
    pm.traceplot(trace,varnames=['var1','var2','sigma0'])

But I don’t manage to get the same result with the black-box method. In the tutorial sigma is a fixed parameters, so I don’t use it, and I infer on the created variable sigma0. There is two ways to pass sigma0: or I pass it in the vector of parameters ‘theta’ or I pass it in the DensityDist, separately from vector theta ( this case implies to modify a bit the class LogLike). However, in both cases sigma0 diverges. Here the case where sigma0 is passed separately from the vector of variable theta:

import pymc3 as pm
import numpy as np
import scipy.stats as st
import theano.tensor as tt

ndraws = 2000
nburn = 500

x=np.arange(0,24,1)
data=x*10+2+st.norm.rvs(loc=0, scale=3, size=len(x))   
sigma=1# fixed parameter used in the tutorial but not used here  


def my_model(theta,x):
    var1,var2= theta 
    prediction=x*var1+var2
    return prediction


def my_loglike(theta,sigma0,x,data, sigma):
    model = my_model(theta,x)
    data0=data
    return -(0.5/(sigma0)**2)*np.sum((data0 - model)**2)


class LogLike(tt.Op):

    """
    Specify what type of object will be passed and returned to the Op when it is
    called. In our case we will be passing it a vector of values (the parameters
    that define our model) and returning a single "scalar" value (the
    log-likelihood)
    """
    itypes = [tt.dvector,tt.dscalar] # expects a vector of parameter values when called
    otypes = [tt.dscalar] # outputs a single scalar value (the log likelihood)

    def __init__(self, loglike, data, x, sigma):
        """
        Initialise the Op with various things that our log-likelihood function
        requires. Below are the things that are needed in this particular
        example.

        Parameters
        ----------
        loglike:
            The log-likelihood (or whatever) function we've defined
        data:
            The "observed" data that our log-likelihood function takes in
        x:
            The dependent variable (aka 'x') that our model requires
        sigma:
            The noise standard deviation that our function requires.
        """

        # add inputs as class attributes
        self.likelihood = loglike
        self.data = data
        self.x = x
        self.sigma = sigma

    def perform(self, node, inputs, outputs):
        # the method that is used when calling the Op
        theta,sigma0 = inputs  # this will contain my variables

        # call the log-likelihood function
        logl = self.likelihood(theta,sigma0, self.x, self.data, self.sigma)

        outputs[0][0] = np.array(logl) # output the log-likelihood



# create our Op
logl = LogLike(my_loglike, data, x, sigma)


def my_mu(v,sigma0):
    return logl(v,sigma0)



# use PyMC3 to sampler from log-likelihood
if __name__ == "__main__":
    with pm.Model() as model1:

        var1 = pm.Normal('var1', mu=8, sd=3)
        var2 = pm.Normal('var2', mu=5, sd=3)
        sigma0 = pm.Uniform('sigma0', lower=0.1, upper=10)
        theta = tt.as_tensor_variable([var1, var2])    
        pm.DensityDist('likelihood',my_mu , observed={'v': theta, 'sigma0':sigma0 })#         
        trace = pm.sample(ndraws, tune=nburn, discard_tuned_samples=True, chains=2,cores=2)
        
    #plot
    pm.traceplot(trace,varnames=['var1','var2','sigma0'])

I can also run the NUTS step by using the gradient function and the class LogLikeWithGrad, available in the tutorial, but don’t think the problem comes from the sampling method.

I see - but the logp function is different in the two, for example a normal log likelihood is

def my_loglike(theta, sigma0, x, data, sigma):
    model = my_model(theta, x)
    data0 = data
    return -np.sum(0.5 * (np.log(2 * np.pi * sigma0 ** 2) + ((data0 - model) / sigma0) ** 2))

yes indeed, sorry that I did not figure out that, thankyou :).
Besides do you know what are the drawbacks of using the black-box method ?

In general it would be a bit slower as there are more overhead on each call. But if you have some component that are highly optimized in the custom Ops it would be worth it.