Hi,
I am trying to apply the method from this notebook, specifically its first section, "Bayesian inference of the parameters of an ODE".
My model is written

$$\frac{dX}{dt}(t) = f(X(t), t; \boldsymbol{\theta}),$$

with $X(t) = (\mu_1(t), \cdots, \mu_{12}(t))$, $\boldsymbol{\theta} = (\alpha_1, \beta_1, \gamma_1, \delta_1, \cdots, \alpha_{12}, \beta_{12}, \gamma_{12}, \delta_{12}, \eta_0, \cdots, \eta_9)$ and

$$\frac{d\mu_i}{dt}(t) = \alpha_i + \beta_i \frac{\eta(t)}{\gamma_i + \eta(t)} - \delta_i \mu_i(t), \qquad i = 1, \dots, 12.$$

The function $f$ is explicitly time-dependent through $\eta$, which is piecewise constant:
$\eta(t) = \eta_k$ iff $t \in [t_k, t_{k+1})$ for a given time vector $(t_0, \dots, t_{10})$.
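Concretely, I look up the active $\eta_k$ with np.searchsorted; here is a minimal self-contained sketch of what I mean (the breakpoints and values below are placeholders, the real ones appear in the script further down):

import numpy as np

t_breaks = np.array([0., 0.5, 1., 3.5])  # placeholder breakpoints t_k
eta_vals = np.array([2.0, 1.5, 0.7])     # one eta_k per interval [t_k, t_{k+1})

def eta_at(t):
    # index k such that t_k <= t < t_{k+1}, clamped into [0, len(eta_vals)-1]
    k = np.searchsorted(t_breaks, t, side='right') - 1
    return eta_vals[min(max(k, 0), len(eta_vals) - 1)]

assert eta_at(0.0) == 2.0 and eta_at(0.75) == 1.5 and eta_at(3.5) == 0.7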
I closely followed the notebook and came up with the following:
# Taken from https://docs.pymc.io/notebooks/ODE_with_manual_gradients.html
# See also https://github.com/pymc-devs/pymc3/blob/master/docs/source/notebooks/ODE_with_manual_gradients.ipynb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import theano
from io import BytesIO
from scipy.integrate import odeint
theano.config.optimizer = 'fast_compile'  # a bare THEANO_FLAGS variable has no effect after import; set it via theano.config
theano.config.exception_verbosity = 'high'
theano.config.floatX = 'float64'
n_conditions = 2
n_genes = 12
n_replicats = 4
times = np.array([0., 0.5, 1., 3.5, 6.5, 9.25, 12.5, 18.5, 22.5, 26.5, 30.5])
n_timeintervals = times.size-1 # = 10
n_timesteps = times.size # = 11
# The file uses decimal commas and 'NA' for missing values
Binary = open('data.txt').read().replace(',', '.').encode()
expr = np.genfromtxt(BytesIO(Binary), missing_values='NA', filling_values=np.nan, skip_header=1)
exprcrtg = expr[: , 3 : ].reshape(n_conditions, n_replicats, n_timesteps, n_genes)
exprtg = expr[: , 3 : ].reshape(n_conditions*n_replicats, n_timesteps, n_genes)
mu0 = np.nanmean(exprcrtg[:, :, 0, :], axis = (0, 1))
n_odeparams = 4*n_genes + n_timeintervals
n_states = n_genes
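# Sanity check (my addition, not in the notebook): the raw file contains 'NA'
# entries, and a Lognormal likelihood needs strictly positive observations
print('exprtg shape:', exprtg.shape)  # expected (n_conditions*n_replicats, n_timesteps, n_genes)
print('NaN observations:', np.isnan(exprtg).sum())
print('non-positive observations:', (exprtg <= 0.).sum())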
# 1st class: the ODE forward model, with optional parameter sensitivities
class ExpressionModel(object):
    def __init__(self, n_states, n_odeparams, y0=None):
        self._n_states = n_states
        self._n_odeparams = n_odeparams
        self._y0 = y0

    def simulate(self, parameters, times):
        return self._simulate(parameters, times, False)

    def simulate_with_sensitivities(self, parameters, times):
        return self._simulate(parameters, times, True)

    def _simulate(self, parameters, times, sensitivities):
        # Unpack theta = (alpha_1, beta_1, gamma_1, delta_1, ..., eta_0, ..., eta_9)
        alpha = np.zeros(n_genes)
        beta = np.zeros(n_genes)
        gamma = np.zeros(n_genes)
        delta = np.zeros(n_genes)
        for i in range(n_genes):
            alpha[i], beta[i], gamma[i], delta[i] = parameters[4*i : 4*(i+1)]
        eta = parameters[4*n_genes : ]

        def r(mu, t, p):
            # Piecewise-constant input: find k such that t_k <= t < t_{k+1},
            # clamped into the first/last interval (lsoda may evaluate the
            # RHS slightly outside the requested time span)
            current_index = np.searchsorted(times, t, side='right') - 1
            current_index = min(max(current_index, 0), n_timeintervals - 1)
            current_eta = eta[current_index]
            dmu_dt = alpha + beta * current_eta / (gamma + current_eta) - delta * mu
            return dmu_dt
        if sensitivities:
            def jac(mu):
                # df/dmu is diagonal and independent of mu and t
                return np.diag(-delta)

            def dfdp(mu, t):
                current_index = np.searchsorted(times, t, side='right') - 1
                current_index = min(max(current_index, 0), n_timeintervals - 1)
                current_eta = eta[current_index]
                ret = np.zeros((self._n_states, self._n_odeparams))  # zero except the following entries
                for j in range(n_genes):
                    ret[j, 4*j] = 1.
                    ret[j, 4*j+1] = current_eta / (gamma[j] + current_eta)
                    ret[j, 4*j+2] = -beta[j] * current_eta / (gamma[j] + current_eta)**2
                    ret[j, 4*j+3] = -mu[j]
                # derivative with respect to the currently active eta_k
                ret[:, 4*n_genes + current_index] = beta * gamma / (gamma + current_eta)**2
                return ret
            def rhs(y_and_dydp, t, p):
                # First n_states entries are the states, the rest the sensitivities
                y = y_and_dydp[0:self._n_states]
                dydp = y_and_dydp[self._n_states:].reshape((self._n_states,
                                                            self._n_odeparams))
                dydt = r(y, t, p)
                # Forward sensitivity equations: d/dt (dy/dp) = J . dy/dp + df/dp
                d_dydp_dt = np.matmul(jac(y), dydp) + dfdp(y, t)
                return np.concatenate((dydt, d_dydp_dt.reshape(-1)))

            y0 = np.zeros(self._n_states * self._n_odeparams + self._n_states)
            y0[0:self._n_states] = mu0
            result = odeint(rhs, y0, times, (parameters,), rtol=1e-6, atol=1e-5)
            values = result[:, 0:self._n_states]
            dvalues_dp = result[:, self._n_states:].reshape((n_timesteps,
                                                             self._n_states,
                                                             self._n_odeparams))
            return values, dvalues_dp
        else:
            values = odeint(r, mu0, times, (parameters,), rtol=1e-6, atol=1e-5)
            return values
ode_model = ExpressionModel(n_states, n_odeparams)
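# Smoke test (my addition): run the forward solve once with arbitrary
# placeholder parameters and check that both simulation paths agree
test_params = np.ones(n_odeparams)
test_values = ode_model.simulate(test_params, times)
test_values_s, test_sens = ode_model.simulate_with_sensitivities(test_params, times)
assert test_values.shape == (n_timesteps, n_genes)
assert test_sens.shape == (n_timesteps, n_states, n_odeparams)
np.testing.assert_allclose(test_values, test_values_s, rtol=1e-3)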
# 2nd class: gradient Op wrapping the numerical vector-Jacobian product
class ODEGradop(theano.Op):
    def __init__(self, numpy_vsp):
        self._numpy_vsp = numpy_vsp

    def make_node(self, x, g):
        x = theano.tensor.as_tensor_variable(x)
        g = theano.tensor.as_tensor_variable(g)
        node = theano.Apply(self, [x, g], [g.type()])
        return node

    def perform(self, node, inputs_storage, output_storage):
        x = inputs_storage[0]
        g = inputs_storage[1]
        out = output_storage[0]
        out[0] = self._numpy_vsp(x, g)  # get the numerical VSP
# 3rd class: Op returning the ODE solution, with a custom gradient
class ODEop(theano.Op):
    def __init__(self, state, numpy_vsp):
        self._state = state
        self._numpy_vsp = numpy_vsp

    def make_node(self, x):
        x = theano.tensor.as_tensor_variable(x)
        return theano.Apply(self, [x], [x.type()])

    def perform(self, node, inputs_storage, output_storage):
        x = inputs_storage[0]
        out = output_storage[0]
        out[0] = self._state(x)  # get the numerical solution of the ODE states

    def grad(self, inputs, output_grads):
        x = inputs[0]
        g = output_grads[0]
        grad_op = ODEGradop(self._numpy_vsp)  # pass the VSP when asked for the gradient
        grad_op_apply = grad_op(x, g)
        return [grad_op_apply]
# 4th class: cache the last solve so state() and numpy_vsp() don't integrate twice
class solveCached(object):
    def __init__(self, times, n_params, n_outputs):
        self._times = times
        self._n_params = n_params
        self._n_outputs = n_outputs
        self._cachedParam = np.zeros(n_params)
        self._cachedSens = np.zeros((n_timesteps, n_outputs, n_params))
        self._cachedState = np.zeros((n_timesteps, n_outputs))

    def __call__(self, x):
        if np.all(x == self._cachedParam):
            state, sens = self._cachedState, self._cachedSens
        else:
            state, sens = ode_model.simulate_with_sensitivities(x, times)
        return state, sens
cached_solver = solveCached(times, n_odeparams, n_states)

def state(x):
    State, Sens = cached_solver(np.array(x, dtype=np.float64))
    cached_solver._cachedState, cached_solver._cachedSens, cached_solver._cachedParam = State, Sens, x
    return State.reshape((n_states * len(State),))

def numpy_vsp(x, g):
    numpy_sens = cached_solver(np.array(x, dtype=np.float64))[1].reshape((n_states * n_timesteps, len(x)))
    return numpy_sens.T.dot(g)
# Now instantiate the theano custom ODE op
my_ODEop = ODEop(state, numpy_vsp)
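# Optional check (my addition): compare the custom gradient against finite
# differences; theano.gradient.verify_grad perturbs the input numerically.
# The test point np.ones(n_odeparams) is an arbitrary placeholder.
theano.gradient.verify_grad(my_ODEop, [np.ones(n_odeparams)],
                            rng=np.random.RandomState(42))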
# The probabilistic model
with pm.Model() as expression_model:  # renamed so it does not shadow the ExpressionModel class
    # Priors for unknown model parameters (pm.Lognormal's mu/sd act on the log scale)
    alpha = pm.Lognormal('alpha', mu=1., sd=0.5, shape=n_genes)
    beta = pm.Lognormal('beta', mu=0.05, sd=0.05, shape=n_genes)
    gamma = pm.Lognormal('gamma', mu=1., sd=0.5, shape=n_genes)
    delta = pm.Lognormal('delta', mu=0.05, sd=0.05, shape=n_genes)
    eta = pm.Lognormal('eta', mu=1., sd=0.05, shape=n_timeintervals)
    sigma = pm.Lognormal('sigma', mu=1., sd=1., shape=n_genes)

    # Forward model: interleave (alpha_i, beta_i, gamma_i, delta_i), then append eta
    all_params = pm.math.stack((alpha, beta, gamma, delta), axis=-1)
    all_params = pm.math.flatten(all_params)
    all_params = pm.math.concatenate((all_params, eta))
    ode_sol = my_ODEop(all_params)
    # forward = ode_sol.reshape(exprtg.shape)
    forward = ode_sol.reshape((n_timesteps, n_genes))

    # Likelihood: (n_timesteps, n_genes) broadcasts against
    # exprtg of shape (n_conditions*n_replicats, n_timesteps, n_genes)
    Y_obs = pm.Lognormal('Y_obs', mu=pm.math.log(forward), sd=sigma, observed=exprtg)

    trace = pm.sample(1500, tune=1000, init='adapt_diag')

print('divergences:', trace['diverging'].sum())
with expression_model:
    pm.traceplot(trace);

summary = pm.summary(trace)
summary
The data file is pasted here.
However, when run, I get the following error:
ParallelSamplingError: Bad initial energy
More precisely, I get the very informative message:
Bad initial energy, check any log probabilities that are inf or -inf, nan or very small:
Y_obs NaN
I searched the forum and the FAQ for this error. Debugging let me fix a trivial mistake, but did not resolve the issue.
I checked that Y_obs.tag.test_value[Y_obs.tag.test_value <= 0.]
returns an empty array, so now I don't know where to look any more.
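I suppose the next step is to inspect each variable's log-probability at the test point, along these lines:

# Per-variable log-probability at the initial point; a nan or -inf row
# should point to the offending term
print(expression_model.check_test_point())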
Any help would be very much appreciated.