ParallelSamplingError: Bad initial energy

Hi,

I am trying to apply a method seen in this notebook. I am interested in the first section: Bayesian inference of the parameters of an ODE.

My model is written as

\frac{d X(t)}{dt}=\boldsymbol{f}\big(X(t),\boldsymbol{\theta}\big)

with X(t) = (\mu_1(t), \cdots, \mu_{12}(t)), \boldsymbol{\theta} = (\alpha_1, \beta_1, \gamma_1, \delta_1, \cdots, \alpha_{12}, \beta_{12}, \gamma_{12}, \delta_{12}, \eta_0, \cdots, \eta_9) and

\boldsymbol{f}\big(X(t),\boldsymbol{\theta}\big) = \begin{pmatrix} \alpha_1 + \frac{\beta_1 \eta(t)}{\gamma_1+\eta(t)} - \delta_1 \mu_1(t) \\ \vdots \\ \alpha_{12} + \frac{\beta_{12} \eta(t)}{\gamma_{12}+\eta(t)} - \delta_{12} \mu_{12}(t) \end{pmatrix}

The function f is actually explicitly time-dependent through \eta, which is piecewise constant:
\eta(t) = \eta_k if t \in [t_k, t_{k+1}[ for a given time vector.
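To make this concrete, here is a minimal standalone sketch of that lookup (with made-up values; the same np.searchsorted idiom appears in my code below):

import numpy as np

times = np.array([0., 0.5, 1., 3.5])  # t_0 .. t_3 (made-up grid)
eta = np.array([1.0, 0.8, 0.3])       # eta_k on [t_k, t_{k+1}[ (made-up values)

def eta_of_t(t):
    k = np.searchsorted(times, t, side='right') - 1  # interval [t_k, t_{k+1}[ containing t
    k = min(k, eta.size - 1)                         # clamp at the final time
    return eta[k]

print(eta_of_t(0.25))  # 1.0  (t in [t_0, t_1[)
print(eta_of_t(2.0))   # 0.3  (t in [t_2, t_3[)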

I closely followed the notebook and came up with the following:

# Taken from https://docs.pymc.io/notebooks/ODE_with_manual_gradients.html
# See also https://github.com/pymc-devs/pymc3/blob/master/docs/source/notebooks/ODE_with_manual_gradients.ipynb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import theano
from io import BytesIO
from scipy.integrate import odeint

theano.config.optimizer = 'fast_compile'
theano.config.exception_verbosity = 'high'
theano.config.floatX = 'float64'

n_conditions = 2
n_genes = 12
n_replicats = 4
times = np.array([0., 0.5, 1., 3.5, 6.5, 9.25, 12.5, 18.5, 22.5, 26.5, 30.5])
n_timeintervals = times.size-1 # = 10
n_timesteps = times.size # = 11

Binary = open('data.txt').read().replace(',', '.').encode()  # the file uses decimal commas and 'NA' for missing values
expr = np.genfromtxt(BytesIO(Binary), missing_values = 'NA', filling_values = np.nan, skip_header = 1)
exprcrtg = expr[: , 3 : ].reshape(n_conditions, n_replicats, n_timesteps, n_genes)
exprtg = expr[: , 3 : ].reshape(n_conditions*n_replicats, n_timesteps, n_genes)
mu0  = np.nanmean(exprcrtg[:, :, 0, :], axis = (0, 1))

n_odeparams = 4*n_genes + n_timeintervals # = 4*12 + 10 = 58
n_states = n_genes

# 1st class
class ExpressionModel(object):
    def __init__(self, n_states, n_odeparams, y0=None):
        self._n_states = n_states
        self._n_odeparams = n_odeparams
        self._y0 = y0
    def simulate(self, parameters, times):
        return self._simulate(parameters, times, False)
    def simulate_with_sensitivities(self, parameters, times):
        return self._simulate(parameters, times, True)
    def _simulate(self, parameters, times, sensitivities):
        alpha = np.zeros(n_genes)
        beta  = np.zeros(n_genes)
        gamma = np.zeros(n_genes)
        delta = np.zeros(n_genes)
        for i in range(n_genes):
            alpha[i], beta[i], gamma[i], delta[i] = parameters[4*i : 4*(i+1)]
        eta = parameters[4*n_genes : ]
        def r(mu, t, p):
            current_index = np.searchsorted(times, t, side='right')-1
            if current_index >= n_timeintervals:
                current_index -= 1
            current_eta = eta[current_index]
            dmu_dt = alpha + beta * current_eta / (gamma + current_eta) - delta * mu
            return dmu_dt # return statement of r

        if sensitivities:
            def jac(mu):
                return np.diag(-delta) # return statement of jac

            def dfdp(mu, t):
                current_index = np.searchsorted(times, t, side='right')-1
                if current_index >= n_timeintervals:
                    current_index -= 1
                current_eta = eta[current_index]
                ret = np.zeros((self._n_states, self._n_odeparams)) # all zeros except the entries filled below
                for j in range(n_genes):
                    ret[j , 4*j]   = 1.
                    ret[j , 4*j+1] = current_eta/(gamma[j]+current_eta)
                    ret[j , 4*j+2] = - beta[j] * current_eta/(gamma[j]+current_eta)**2
                    ret[j , 4*j+3] = -mu[j]
                ret[:, 4*n_genes+current_index] = beta * gamma / (gamma+current_eta)**2
                return ret # return statement of dfdp

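            # rhs integrates the augmented system: the states y stacked with the
            # forward sensitivities dy/dp, using d/dt (dy/dp) = (df/dy)(dy/dp) + df/dp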
            def rhs(y_and_dydp, t, p):
                y = y_and_dydp[0:self._n_states]
                dydp = y_and_dydp[self._n_states:].reshape((self._n_states,
                                                            self._n_odeparams))
                dydt = r(y, t, p)
                d_dydp_dt = np.matmul(jac(y), dydp) + dfdp(y, t)
                return np.concatenate((dydt, d_dydp_dt.reshape(-1))) # return statement of rhs

            y0 = np.zeros(n_states*n_odeparams + n_states)
            y0[0:n_states] = mu0
            result = odeint(rhs, y0, times, (parameters,), rtol=1e-6, atol=1e-5)
            values = result[:, 0:self._n_states]
            dvalues_dp = result[:, self._n_states:].reshape((n_timesteps,
                                                             self._n_states,
                                                             self._n_odeparams))
            return values, dvalues_dp # first return statement of _simulate
        else:
            values = odeint(r, mu0, times, (parameters,), rtol=1e-6, atol=1e-5)
            return values # second return statement of _simulate

ode_model = ExpressionModel(n_states, n_odeparams)

# 2nd class
class ODEGradop(theano.Op):
    def __init__(self, numpy_vsp):
        self._numpy_vsp = numpy_vsp

    def make_node(self, x, g):
        x = theano.tensor.as_tensor_variable(x)
        g = theano.tensor.as_tensor_variable(g)
        node = theano.Apply(self, [x, g], [g.type()])
        return node

    def perform(self, node, inputs_storage, output_storage):
        x = inputs_storage[0]

        g = inputs_storage[1]
        out = output_storage[0]
        out[0] = self._numpy_vsp(x, g)       # get the numerical VSP

# 3rd class
class ODEop(theano.Op):

    def __init__(self, state, numpy_vsp):
        self._state = state
        self._numpy_vsp = numpy_vsp

    def make_node(self, x):
        x = theano.tensor.as_tensor_variable(x)

        return theano.Apply(self, [x], [x.type()])

    def perform(self, node, inputs_storage, output_storage):
        x = inputs_storage[0]
        out = output_storage[0]

        out[0] = self._state(x)               # get the numerical solution of ODE states

    def grad(self, inputs, output_grads):
        x = inputs[0]
        g = output_grads[0]

        grad_op = ODEGradop(self._numpy_vsp)  # pass the VSP when asked for gradient
        grad_op_apply = grad_op(x, g)

        return [grad_op_apply]

# 4th class
class solveCached(object):
    def __init__(self, times, n_params, n_outputs):

        self._times = times
        self._n_params = n_params
        self._n_outputs = n_outputs
        self._cachedParam = np.zeros(n_params)
        self._cachedSens = np.zeros((n_timesteps, n_outputs, n_params))
        self._cachedState = np.zeros((n_timesteps,n_outputs))

    def __call__(self, x):

        if np.all(x==self._cachedParam):
            state, sens = self._cachedState, self._cachedSens

        else:
            state, sens = ode_model.simulate_with_sensitivities(x, times)

        return state, sens

cached_solver=solveCached(times, n_odeparams, n_states)
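
# state() refreshes the cache, so the numpy_vsp call that follows for the same
# parameter vector reuses the stored sensitivities instead of re-solving the ODE;
# numpy_vsp returns the vector-Jacobian ("vector-sensitivity") product g.T @ (d state / d params)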

def state(x):
    State, Sens = cached_solver(np.array(x, dtype=np.float64))
    cached_solver._cachedState, cached_solver._cachedSens, cached_solver._cachedParam = State, Sens, x
    return State.reshape((n_states*len(State),))

def numpy_vsp(x, g):
    numpy_sens = cached_solver(np.array(x, dtype=np.float64))[1].reshape((n_states*n_timesteps, len(x)))
    return numpy_sens.T.dot(g)

# Now instantiate the theano custom ODE op
my_ODEop = ODEop(state, numpy_vsp)

# The probabilistic model
with pm.Model() as ExpressionModel:

    # Priors for unknown model parameters
    alpha = pm.Lognormal('alpha', mu=1.,   sd=0.5,  shape=n_genes)
    beta  = pm.Lognormal('beta',  mu=0.05, sd=0.05, shape=n_genes)
    gamma = pm.Lognormal('gamma', mu=1.,   sd=0.5,  shape=n_genes)
    delta = pm.Lognormal('delta', mu=0.05, sd=0.05, shape=n_genes)

    eta   = pm.Lognormal('eta',   mu=1.,   sd=0.05, shape=n_timeintervals)

    sigma = pm.Lognormal('sigma', mu=1.,   sd=1.,   shape=n_genes)

    # Forward model
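    # stack + flatten interleaves the parameters as
    # (alpha_1, beta_1, gamma_1, delta_1, alpha_2, ...), matching the
    # parameters[4*i : 4*(i+1)] unpacking in ExpressionModel._simulate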
    all_params = pm.math.stack((alpha, beta, gamma, delta), axis=-1)
    all_params = pm.math.flatten(all_params)
    all_params = pm.math.concatenate((all_params, eta))
    ode_sol = my_ODEop(all_params)
#    forward = ode_sol.reshape(exprtg.shape)
    forward = ode_sol.reshape((n_timesteps, n_genes))

    # Likelihood
    Y_obs = pm.Lognormal('Y_obs', mu=pm.math.log(forward), sd=sigma, observed=exprtg)

    trace = pm.sample(1500, tune=1000, init='adapt_diag')

trace['diverging'].sum()

with ExpressionModel:
    pm.traceplot(trace);

summary = pm.summary(trace)
summary

The data file is pasted here.

However, when run, I get the following error:

ParallelSamplingError: Bad initial energy

More precisely, I get the very informative message:

Bad initial energy, check any log probabilities that are inf or -inf, nan or very small:
Y_obs   NaN

I searched the forum and the FAQ for this error. Debugging let me fix a trivial mistake, but it did not resolve the issue.

I checked that Y_obs.tag.test_value[Y_obs.tag.test_value <= 0.] returns an empty array, so now I don't know where to look anymore.
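
For reference, the standard FAQ snippet for this kind of check is:

# evaluate each variable's log-probability at the model's test point
for RV in ExpressionModel.basic_RVs:
    print(RV.name, RV.logp(ExpressionModel.test_point))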

Any help would be very much appreciated.

Try pm.sample(..., init='adapt_diag'), as the random jitter in the initial condition might create bad energy when you have a model that is sensitive to the initial condition.

Isn't that what I am already doing here?

Ah sorry, I didn't notice that :sweat_smile:
The problem is that you have a row of data containing NaNs:
1 3 5 NA NA NA NA NA NA NA NA NA NA NA NA

Try excluding this row and rerun your model
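
Something along these lines should do it (a sketch, assuming expr is loaded as in your script):

# keep only rows that have at least one non-NaN measurement
keep = ~np.isnan(expr[:, 3:]).all(axis=1)
expr = expr[keep]

and adjust the reshape counts accordingly.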

Thank you Junpeng! Now the model is running.

I am encountering other problems with the interpretation of the results. I'll come back here if I cannot get better results.


As a follow-up to the previous posts, I am seeking help to understand why the sampling results are not good.

I made a slight correction to the model in the OP: \eta is now normalized with

\eta(t) = 1 \quad \mbox{if} \quad t \in [t_0, t_1[

The model is not rocket science, and it is a good way for me to get introduced to probabilistic programming.

# Taken from https://docs.pymc.io/notebooks/ODE_with_manual_gradients.html
# See also https://github.com/pymc-devs/pymc3/blob/master/docs/source/notebooks/ODE_with_manual_gradients.ipynb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import theano
from io import BytesIO
from scipy.integrate import odeint
from scipy.optimize import curve_fit

theano.config.optimizer = 'fast_compile'
theano.config.exception_verbosity = 'high'
theano.config.floatX = 'float64'

n_conditions = 2
n_genes = 12
n_replicats = 4
times = np.array([0., 0.5, 1., 3.5, 6.5, 9.25, 12.5, 18.5, 22.5, 26.5, 30.5])
n_timeintervals = times.size-1 # = 10
n_timesteps = times.size # = 11

Binary = open('data.txt').read().replace(',', '.').encode()
expr = np.genfromtxt(BytesIO(Binary), missing_values = 'NA', filling_values = np.nan, skip_header = 1)
#exprcrtg = expr[: , 3 : ].reshape(n_conditions, n_replicats, n_timesteps, n_genes)
exprtg = expr[: , 3 : ].reshape(7, n_timesteps, n_genes)  # 7 series remain after excluding the all-NaN row
exprtgTF = expr[: , 7 ].reshape(7, n_timesteps)           # the TF time course, used below for the eta prior
mu0  = np.nanmean(exprtg[:, 0, :], axis = 0)              # per-gene mean at t_0

n_odeparams = 4*n_genes + n_timeintervals-1 # eta is normalized with eta=1. on the first time interval
n_states = n_genes

# 1st class
class ExpressionModel(object):
    def __init__(self, n_states, n_odeparams, y0=None):
        self._n_states = n_states
        self._n_odeparams = n_odeparams
        self._y0 = y0
    def simulate(self, parameters, times):
        return self._simulate(parameters, times, False)
    def simulate_with_sensitivities(self, parameters, times):
        return self._simulate(parameters, times, True)
    def _simulate(self, parameters, times, sensitivities):
        alpha = np.zeros(n_genes)
        beta  = np.zeros(n_genes)
        gamma = np.zeros(n_genes)
        delta = np.zeros(n_genes)
        for i in range(n_genes):
            alpha[i], beta[i], gamma[i], delta[i] = parameters[4*i : 4*(i+1)]
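        # eta is normalized: eta_0 = 1 on the first interval, so only
        # eta_1 .. eta_9 are free parameters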
        eta = np.concatenate(([1.], parameters[4*n_genes : ]))
#        eta = parameters[4*n_genes : ]
        def r(mu, t, p):
            current_index = np.searchsorted(times, t, side='right')-1
            if current_index >= n_timeintervals:
                current_index -= 1
            current_eta = eta[current_index]
            dmu_dt = alpha + beta * current_eta / (gamma + current_eta) - delta * mu
            return dmu_dt # return statement of r

        if sensitivities:
            def jac(mu):
                return np.diag(-delta) # return statement of jac

            def dfdp(mu, t):
                current_index = np.searchsorted(times, t, side='right')-1
                if current_index >= n_timeintervals:
                    current_index -= 1
                current_eta = eta[current_index]
                ret = np.zeros((self._n_states, self._n_odeparams)) # except the following entries
                for j in range(n_genes):
                    ret[j , 4*j]   = 1.
                    ret[j , 4*j+1] = current_eta/(gamma[j]+current_eta)
                    ret[j , 4*j+2] = - beta[j] * current_eta/(gamma[j]+current_eta)**2
                    ret[j , 4*j+3] = -mu[j]
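                # with eta_0 fixed to 1, the eta sensitivity columns shift by one:
                # the derivative w.r.t. eta_k sits in column 4*n_genes + k - 1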
                if current_index >= 1:
                    ret[:, 4*n_genes+current_index-1] = beta * gamma / (gamma+current_eta)**2
#                ret[:, 4*n_genes+current_index] = beta * gamma / (gamma+current_eta)**2
                return ret # return statement of dfdp

            def rhs(y_and_dydp, t, p):
                y = y_and_dydp[:self._n_states]
                dydp = y_and_dydp[self._n_states:].reshape((self._n_states,
                                                            self._n_odeparams))
                dydt = r(y, t, p)
                d_dydp_dt = np.matmul(jac(y), dydp) + dfdp(y, t)
                return np.concatenate((dydt, d_dydp_dt.reshape(-1))) # return statement of rhs

            y0 = np.zeros( n_states*n_odeparams + n_states )
            y0[0:n_states] = mu0
            result = odeint(rhs, y0, times, (parameters,), rtol=1e-6, atol=1e-5)
            values = result[:, :self._n_states]
            dvalues_dp = result[:, self._n_states:].reshape((n_timesteps,
                                                             self._n_states,
                                                             self._n_odeparams))
            return values, dvalues_dp # first return statement of _simulate
        else:
            values = odeint(r, mu0, times, (parameters,), rtol=1e-6, atol=1e-5)
            return values # second return statement of _simulate

ode_model = ExpressionModel(n_states, n_odeparams)

# 2nd class
class ODEGradop(theano.Op):
    def __init__(self, numpy_vsp):
        self._numpy_vsp = numpy_vsp

    def make_node(self, x, g):
        x = theano.tensor.as_tensor_variable(x)
        g = theano.tensor.as_tensor_variable(g)
        node = theano.Apply(self, [x, g], [g.type()])
        return node

    def perform(self, node, inputs_storage, output_storage):
        x = inputs_storage[0]

        g = inputs_storage[1]
        out = output_storage[0]
        out[0] = self._numpy_vsp(x, g)       # get the numerical VSP

# 3rd class
class ODEop(theano.Op):

    def __init__(self, state, numpy_vsp):
        self._state = state
        self._numpy_vsp = numpy_vsp

    def make_node(self, x):
        x = theano.tensor.as_tensor_variable(x)

        return theano.Apply(self, [x], [x.type()])

    def perform(self, node, inputs_storage, output_storage):
        x = inputs_storage[0]
        out = output_storage[0]

        out[0] = self._state(x)               # get the numerical solution of ODE states

    def grad(self, inputs, output_grads):
        x = inputs[0]
        g = output_grads[0]

        grad_op = ODEGradop(self._numpy_vsp)  # pass the VSP when asked for gradient
        grad_op_apply = grad_op(x, g)

        return [grad_op_apply]

# 4th class
class solveCached(object):
    def __init__(self, times, n_params, n_outputs):

        self._times = times
        self._n_params = n_params
        self._n_outputs = n_outputs
        self._cachedParam = np.zeros(n_params)
        self._cachedSens = np.zeros((n_timesteps, n_outputs, n_params))
        self._cachedState = np.zeros((n_timesteps, n_outputs))

    def __call__(self, x):

        if np.all(x==self._cachedParam):
            state, sens = self._cachedState, self._cachedSens

        else:
            state, sens = ode_model.simulate_with_sensitivities(x, times)

        return state, sens

cached_solver=solveCached(times, n_odeparams, n_states)

def state(x):
    State, Sens = cached_solver(np.array(x, dtype=np.float64))
    cached_solver._cachedState, cached_solver._cachedSens, cached_solver._cachedParam = State, Sens, x
    return State.reshape((n_states*len(State),))

def numpy_vsp(x, g):
    numpy_sens = cached_solver(np.array(x, dtype=np.float64))[1].reshape((n_states*n_timesteps, len(x)))
    return numpy_sens.T.dot(g)

# Now instantiate the theano custom ODE op
my_ODEop = ODEop(state, numpy_vsp)

# prior for alphas (note: the empirical estimates alpha0, beta0, delta0, eta0, gamma0
# computed in this section are not plugged into the pm.Model block below, which
# hard-codes its mu and sd values)
alpha0 = 0.2*np.ones(n_genes)

# prior for betas
beta0 = np.nanmean(exprtg[: , 1, :], axis = (0, ))

# prior for deltas
def expo(t, delta):
    # g3 (the expression level at t_3 for the current series) is read from module
    # scope; it is set in the loop below before each fit
    return g3 * np.exp(-delta * t)

delta0 = np.zeros(n_genes)
for gene_index in range(n_genes): # gene_index goes from 0 to n_genes-1
    xdata = times[3:8] - times[3]
    ydata = exprtg[: , 3:8, gene_index]
    popt = np.zeros(7)
    for k in range(7):
        g3 = exprtg[k, 3, gene_index]
        popt[k] = curve_fit(expo, xdata, ydata[k])[0][0]
    delta0[gene_index] = np.nanmean(popt)

# prior for etas
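# eta_k is approximated by the TF level at t_k (columns 1 to 9), normalized by its level at t_0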
exprtgTF_normalized = exprtgTF[:, 1:-1] / exprtgTF[:, 0, np.newaxis]
#exprtgTF_normalized = exprtgTF[:, :-1] / exprtgTF[:, 0, np.newaxis]
eta0 = np.nanmean(exprtgTF_normalized, axis = (0, ))

# prior for gammas
gamma0 = eta0.max() / 2 * np.ones(n_genes)


# The probabilistic model
with pm.Model() as ExpressionModel:

    # Priors for unknown model parameters
    alpha = pm.Lognormal('alpha', mu=1., sd=0.1, shape=n_genes)
    beta  = pm.Lognormal('beta',  mu=0.05,  sd=10., shape=n_genes)
    gamma = pm.Lognormal('gamma', mu=1., sd=1.,  shape=n_genes)
    delta = pm.Lognormal('delta', mu=0.05, sd=0.1, shape=n_genes)
    eta   = pm.Lognormal('eta',   mu=1.,   sd=10., shape=n_timeintervals-1)
#    eta   = pm.Lognormal('eta',   mu=1.,   sd=10., shape=n_timeintervals)

    sigma = pm.Lognormal('sigma', mu=1.,   sd=1.,   shape=n_genes)

    # Forward model
    all_params = pm.math.stack((alpha, beta, gamma, delta), axis=-1)
    all_params = pm.math.flatten(all_params)
    all_params = pm.math.concatenate((all_params, eta))
    ode_sol = my_ODEop(all_params)
#    forward = ode_sol.reshape(exprtg.shape)
    forward = ode_sol.reshape((n_timesteps, n_genes))

    # Likelihood
    Y_obs = pm.Lognormal('Y_obs', mu=pm.math.log(forward), sd=sigma, observed=exprtg)

#    trace = pm.sample(1500, tune=1000, init='adapt_diag')
    trace = pm.sample(300, tune=300, init='adapt_diag')

#trace['diverging'].sum()

#with ExpressionModel:
#    pm.traceplot(trace);

#summary = pm.summary(trace)
#summary

Trying to explore the posterior, I looked at the output of pm.summary(trace):

           mean          sd            hpd_3%  hpd_97%     mcse_mean     ...  ess_mean  ess_sd  ess_bulk  ess_tail  r_hat
eta[0]     3.255860e+16  7.973735e+17   0.000  942411.813  3.233352e+16  ...     608.0   608.0     294.0     365.0   1.00
eta[1]     1.820000e-01  5.320000e-01   0.000       0.648  2.200000e-02  ...     577.0   577.0     212.0     230.0   1.02
eta[2]     4.600000e-02  5.590000e-01   0.000       0.092  2.300000e-02  ...     614.0   608.0     253.0     287.0   1.00
eta[3]     8.000000e-03  2.800000e-02   0.000       0.035  1.000000e-03  ...     677.0   620.0     318.0     208.0   1.00
eta[4]     4.000000e-03  1.000000e-02   0.000       0.023  0.000000e+00  ...     663.0   618.0     206.0     187.0   1.01
eta[5]     5.000000e-03  1.400000e-02   0.000       0.026  1.000000e-03  ...     694.0   627.0     380.0     262.0   1.00
eta[6]     3.000000e-03  1.000000e-02   0.000       0.014  0.000000e+00  ...     547.0   547.0     549.0     311.0   1.00
eta[7]     3.000000e-03  1.000000e-02   0.000       0.012  0.000000e+00  ...     611.0   611.0     490.0     399.0   1.00
eta[8]     9.000000e-03  3.200000e-02   0.000       0.045  1.000000e-03  ...     670.0   618.0     470.0     412.0   1.00

which shows an obvious problem with \eta_1 (the eta[0] row).

Any help or hint in that direction would be greatly appreciated! Thank you all for your valuable input.