Pause and resume training a model that can change during training

To resume training of the model, the doc says I can specify the trace keyword of pm.sample.

Every time when I resume training of the model, the model seems to reinitialize the parameters.

Setting start=old_trace[-1] somehow make the sampling worse. While start=old_trace[-1] is the default, NUTS would override this default.

1st run:

Combined trace of all runs of the sampler

I guess the spikes in the two subplots at the right means my code reinitialize the variables whenever it tries to resume training …

import theano.tensor as tt
import numpy as np
import pymc3 as pm
import matplotlib.pyplot as plt

import warnings


def gradients(vals, func, releps=1e-3, abseps=None, mineps=1e-9, reltol=1e-3,
              epsscale=0.5):
    """
    Calculate the partial derivatives of a function at a set of values. The
    derivatives are calculated using the central difference, using an
    iterative method to check that the values converge as step size decreases.

    Parameters
    ----------
    vals: array_like
        A set of values, that are passed to a function, at which to calculate
        the gradient of that function
    func:
        A function that takes in an array of values.
    releps: float, array_like, 1e-3
        The initial relative step size for calculating the derivative.
    abseps: float, array_like, None
        The initial absolute step size for calculating the derivative.
        This overrides `releps` if set.
        `releps` is set then that is used.
    mineps: float, 1e-9
        The minimum relative step size at which to stop iterations if no
        convergence is achieved.
    epsscale: float, 0.5
        The factor by which releps if scaled in each iteration.

    Returns
    -------
    grads: array_like
        An array of gradients for each non-fixed value.
    """

    grads = np.zeros(len(vals))

    # maximum number of times the gradient can change sign
    flipflopmax = 10.

    # set steps
    if abseps is None:
        if isinstance(releps, float):
            eps = np.abs(vals) * releps
            eps[eps == 0.] = releps  # if any values are zero set eps to releps
            teps = releps * np.ones(len(vals))
        elif isinstance(releps, (list, np.ndarray)):
            if len(releps) != len(vals):
                raise ValueError("Problem with input relative step sizes")
            eps = np.multiply(np.abs(vals), releps)
            eps[eps == 0.] = np.array(releps)[eps == 0.]
            teps = releps
        else:
            raise RuntimeError(
                "Relative step sizes are not a recognised type!")
    else:
        if isinstance(abseps, float):
            eps = abseps * np.ones(len(vals))
        elif isinstance(abseps, (list, np.ndarray)):
            if len(abseps) != len(vals):
                raise ValueError("Problem with input absolute step sizes")
            eps = np.array(abseps)
        else:
            raise RuntimeError(
                "Absolute step sizes are not a recognised type!")
        teps = eps

    # for each value in vals calculate the gradient
    count = 0
    for i in range(len(vals)):
        # initial parameter diffs
        leps = eps[i]
        cureps = teps[i]

        flipflop = 0

        # get central finite difference
        fvals = np.copy(vals)
        bvals = np.copy(vals)

        # central difference
        fvals[i] += 0.5 * leps  # change forwards distance to half eps
        bvals[i] -= 0.5 * leps  # change backwards distance to half eps
        cdiff = (func(fvals) - func(bvals)) / leps

        while 1:
            fvals[i] -= 0.5 * leps  # remove old step
            bvals[i] += 0.5 * leps

            # change the difference by a factor of two
            cureps *= epsscale
            if cureps < mineps or flipflop > flipflopmax:
                # if no convergence set flat derivative
                # (TODO: check if there is a better thing to do instead)
                warnings.warn(
                    "Derivative calculation did not converge: setting "
                    "flat derivative.")
                grads[count] = 0.
                break
            leps *= epsscale

            # central difference
            fvals[i] += 0.5 * leps  # change forwards distance to half eps
            bvals[i] -= 0.5 * leps  # change backwards distance to half eps
            cdiffnew = (func(fvals) - func(bvals)) / leps

            if cdiffnew == cdiff:
                grads[count] = cdiff
                break

            # check whether previous diff and current diff are the same within
            # reltol
            rat = (cdiff / cdiffnew)
            if np.isfinite(rat) and rat > 0.:
                # gradient has not changed sign
                if np.abs(1. - rat) < reltol:
                    grads[count] = cdiffnew
                    break
                else:
                    cdiff = cdiffnew
                    continue
            else:
                cdiff = cdiffnew
                flipflop += 1
                continue

        count += 1

    return grads


# define a theano Op for our likelihood function
class LogLikeWithGrad(tt.Op):
    itypes = [tt.dvector]  # expects a vector of parameter values when called
    otypes = [tt.dscalar]  # outputs a single scalar value (the log likelihood)

    def __init__(self, loglike, data, x, sigma):
        """
        Initialise with various things that the function requires. Below
        are the things that are needed in this particular example.

        Parameters
        ----------
        loglike:
            The log-likelihood (or whatever) function we've defined
        data:
            The "observed" data that our log-likelihood function takes in
        x:
            The dependent variable (aka 'x') that our model requires
        sigma:
            The noise standard deviation that out function requires.
        """

        # add inputs as class attributes
        self.likelihood = loglike
        self.data = data
        self.x = x
        self.sigma = sigma

        # initialise the gradient Op (below)
        self.logpgrad = LogLikeGrad(self.likelihood, self.data, self.x,
                                    self.sigma)

    def perform(self, node, inputs, outputs):
        # the method that is used when calling the Op
        theta, = inputs  # this will contain my variables

        # call the log-likelihood function
        logl = self.likelihood(theta, self.x, self.data, self.sigma)

        outputs[0][0] = np.array(logl)  # output the log-likelihood

    def grad(self, inputs, g):
        # the method that calculates the gradients - it actually returns the
        # vector-Jacobian product - g[0] is a vector of parameter values
        theta, = inputs  # our parameters
        return [g[0] * self.logpgrad(theta)]


class LogLikeGrad(tt.Op):
    """
    This Op will be called with a vector of values and also return a vector of
    values - the gradients in each dimension.
    """
    itypes = [tt.dvector]
    otypes = [tt.dvector]

    def __init__(self, loglike, data, x, sigma):
        """
        Initialise with various things that the function requires. Below
        are the things that are needed in this particular example.

        Parameters
        ----------
        loglike:
            The log-likelihood (or whatever) function we've defined
        data:
            The "observed" data that our log-likelihood function takes in
        x:
            The dependent variable (aka 'x') that our model requires
        sigma:
            The noise standard deviation that out function requires.
        """

        # add inputs as class attributes
        self.likelihood = loglike
        self.data = data
        self.x = x
        self.sigma = sigma

    def perform(self, node, inputs, outputs):
        theta, = inputs

        # define version of likelihood function to pass to derivative function
        def lnlike(values):
            return self.likelihood(values, self.x, self.data, self.sigma)

        # calculate gradients
        grads = gradients(theta, lnlike)
        outputs[0][0] = grads


def my_model(thetha, x):
    m, c = thetha
    return m * x + c


def my_loglike(theta, x, data, sigma):
    """
    A Gaussian log-likelihood function for a model with parameters given in theta
    """
    model = my_model(theta, x)
    return -(0.5 / sigma ** 2) * np.sum((data - model) ** 2)


def main():
    # set up our data
    N = 10  # number of data points
    sigma = 1.  # standard deviation of noise
    x = np.linspace(0., 9., N)

    mtrue = 0.4  # true gradient
    ctrue = 3.  # true y-intercept

    truemodel = my_model([mtrue, ctrue], x)

    # make data
    np.random.seed(
        716742)  # set random seed, so the data is reproducible each time
    data = sigma * np.random.randn(N) + truemodel

    ndraws = 100  # number of draws from the distribution
    nburn = 10  # number of "burn-in points" (which we'll discard)

    # create our Op
    logl = LogLikeWithGrad(my_loglike, data, x, sigma)

    # use PyMC3 to sampler from log-likelihood
    with pm.Model() as opmodel:
        m = pm.Uniform('m', lower=-10., upper=10.)
        c = pm.Uniform('c', lower=-10., upper=10.)
        theta = tt.as_tensor_variable([m, c])
        pm.DensityDist('likelihood', lambda v: logl(v),
                       observed={'v': theta})
        trace = pm.sample(ndraws, tune=nburn,
                          discard_tuned_samples=True)
        _ = pm.traceplot(trace, lines={'m': mtrue, 'c': ctrue})
        plt.show()
        for i in range(5):
            trace = pm.sample(ndraws, tune=nburn,
                              discard_tuned_samples=True, trace=trace,
                              start=None)
    print("trace_len", len(trace))  # trace is the combined trace from all 5 runs.
    # plot the traces
    _ = pm.traceplot(trace, lines={'m': mtrue, 'c': ctrue})

    # put the chains in an array (for later!)
    samples_pymc3_2 = np.vstack((trace['m'], trace['c'])).T
    plt.show()


main()