Fitting a complex model with a large dataset: chain contains only diverging samples

I’m rather new to PyMC3 and trying my hand at using it to analyse some data we went over in class. Here’s the link to the paper which includes the model I am trying to analyse the data with. Due to the complexity of the model specified in the paper, and the number of data points, I cannot seem to get any inference to work, so am thinking I may be doing something fundamentally wrong.

First I extract the data

err = raw_data[:, 10]  # 1237 data points
obs = raw_data[:, 9]

I then define all of my priors. To be safe I have set them all as normal distributions, with the mean set to the fit parameters as found in the original paper, with a placeholder deviation.

with pm.Model() as model:

    sig_L_NR_0 = pm.Normal('sig_L_NR_0', mu=86.7, sigma=1, testval=86.7)
    a_L = pm.Normal('a_L', mu=0.0, sigma=1, testval=0.0)
    b_L = pm.Normal('b_L', mu=4.0294, sigma=1, testval=4.0294)
    c_L = pm.Normal('c_L', mu=3.1285, sigma=1, testval=3.1285)
    d_L = pm.Normal('d_L', mu=0.3340, sigma=1, testval=0.3340)
    e_L = pm.Normal('e_L', mu=4.9623, sigma=1, testval=4.9623)

    sig_T_NR_00 = pm.Normal('sig_L_NR_00', mu=246.1,
                            sigma=1, testval=246.1)
    sig_T_NR_01 = pm.Normal('sig_L_NR_01', mu=-89.4,
                            sigma=1, testval=-89.4)
    sig_T_NR_0 = tt.stacklists([sig_T_NR_00, sig_T_NR_01])

    a_T0 = pm.Normal('a_T0', mu=0.0675, sigma=1, testval=0.0675)
    a_T1 = pm.Normal('a_T1', mu=0.2098, sigma=1, testval=0.2098)
    a_T = tt.stacklists([a_T0, a_T1])

    b_T0 = pm.Normal('b_T0', mu=1.3501, sigma=1, testval=1.3501)
    b_T1 = pm.Normal('b_T1', mu=1.5715, sigma=1, testval=1.5715)
    b_T = tt.stacklists([b_T0, b_T1])

    c_T0 = pm.Normal('c_T0', mu=0.1205, sigma=1, testval=0.1205)
    c_T1 = pm.Normal('c_T1', mu=0.0907, sigma=1, testval=0.0907)
    c_T = tt.stacklists([c_T0, c_T1])

    d_T0 = pm.Normal('d_T0', mu=-0.003849, sigma=1, testval=-0.003849)
    d_T1 = pm.Normal('d_T1', mu=0.0010362, sigma=1, testval=0.0010362)
    d_T = tt.stacklists([d_T0, d_T1])

    A_T_i_00 = pm.Normal('A_T_i_00', mu=7.78, sigma=1, testval=7.78)
    A_T_i_01 = pm.Normal('A_T_i_01', mu=6.335, sigma=1, testval=6.335)
    A_T_i_02 = pm.Normal('A_T_i_02', mu=0.603, sigma=1, testval=0.603)
    A_T_i_03 = pm.Normal('A_T_i_03', mu=2.330, sigma=1, testval=2.330)
    A_T_i_04 = pm.Normal('A_T_i_04', mu=1.979, sigma=1, testval=1.979)
    A_T_i_05 = pm.Normal('A_T_i_05', mu=0.0225, sigma=1, testval=0.0225)
    A_T_i_06 = pm.Normal('A_T_i_06', mu=3.419, sigma=1, testval=3.419)

    A_T_i_0 = tt.stacklists(
        [A_T_i_00, A_T_i_01, A_T_i_02, A_T_i_03, A_T_i_04, A_T_i_05, A_T_i_06])

    A_L_i_00 = pm.Normal('A_L_i_00', mu=29.4140, sigma=1, testval=29.4140)
    A_L_i_01 = pm.Normal('A_L_i_01', mu=0.0, sigma=1, testval=0.0)
    A_L_i_02 = pm.Normal('A_L_i_02', mu=157.92, sigma=1, testval=157.92)
    A_L_i_03 = pm.Normal('A_L_i_03', mu=4.2160, sigma=1, testval=4.2160)
    A_L_i_04 = pm.Normal('A_L_i_04', mu=13.764, sigma=1, testval=13.764)
    A_L_i_05 = pm.Normal('A_L_i_05', mu=5.5124, sigma=1, testval=5.5124)
    A_L_i_06 = pm.Normal('A_TL_i_06', mu=11.0, sigma=1, testval=11.0)

    A_L_i_0 = tt.stacklists(
        [A_L_i_00, A_L_i_01, A_L_i_02, A_L_i_03, A_L_i_04, A_L_i_05, A_L_i_06])

    a_i0 = pm.Normal('a_i0', mu=4.229, sigma=1, testval=4.229)
    a_i1 = pm.Normal('a_i1', mu=6823.2, sigma=1, testval=6823.2)
    a_i2 = pm.Normal('a_i2', mu=21.24, sigma=1, testval=21.24)
    a_i3 = pm.Normal('a_i3', mu=-0.288, sigma=1, testval=-0.288)
    a_i4 = pm.Normal('a_i4', mu=-0.562, sigma=1, testval=-0.562)
    a_i5 = pm.Normal('a_i5', mu=462.13, sigma=1, testval=462.13)
    a_i6 = pm.Normal('a_i6', mu=0.0, sigma=1, testval=0.0)

    a_i = tt.stacklists(
        [a_i0, a_i1, a_i2, a_i3, a_i4, a_i5, a_i6])

    b_i0 = pm.Normal('b_i0', mu=1.260, sigma=1, testval=1.260)
    b_i1 = pm.Normal('b_i1', mu=33521.0, sigma=1, testval=33521.0)
    b_i2 = pm.Normal('b_i2', mu=0.056, sigma=1, testval=0.056)
    b_i3 = pm.Normal('b_i3', mu=0.186, sigma=1, testval=0.186)
    b_i4 = pm.Normal('b_i4', mu=0.390, sigma=1, testval=0.390)
    b_i5 = pm.Normal('b_i5', mu=0.192, sigma=1, testval=0.192)
    b_i6 = pm.Normal('b_i6', mu=0.0, sigma=1, testval=0.0)

    b_i = tt.stacklists(
        [b_i0, b_i1, b_i2, b_i3, b_i4, b_i5, b_i6])

    c_i0 = pm.Normal('c_i0', mu=2.124, sigma=1, testval=2.124)
    c_i1 = pm.Normal('c_i1', mu=2.569, sigma=1, testval=2.569)
    c_i2 = pm.Normal('c_i2', mu=2.489, sigma=1, testval=2.489)
    c_i3 = pm.Normal('c_i3', mu=0.064, sigma=1, testval=0.064)
    c_i4 = pm.Normal('c_i4', mu=0.549, sigma=1, testval=0.549)
    c_i5 = pm.Normal('c_i5', mu=1.914, sigma=1, testval=1.914)
    c_i6 = pm.Normal('c_i6', mu=1.0, sigma=1, testval=1.0)

    c_i = tt.stacklists(
        [c_i0, c_i1, c_i2, c_i3, c_i4, c_i5, c_i6])

    d_i0 = pm.Normal('d_i0', mu=19.910, sigma=1, testval=19.910)
    d_i1 = pm.Normal('d_i1', mu=3856.5, sigma=1, testval=3856.5)
    d_i2 = pm.Normal('d_i2', mu=97.046, sigma=1, testval=97.046)
    d_i3 = pm.Normal('d_i3', mu=0.038, sigma=1, testval=0.038)
    d_i4 = pm.Normal('d_i4', mu=0.314, sigma=1, testval=0.314)
    d_i5 = pm.Normal('d_i5', mu=0.054, sigma=1, testval=0.054)
    d_i6 = pm.Normal('d_i6', mu=1.895, sigma=1, testval=1.895)

    d_i = tt.stacklists(
        [d_i0, d_i1, d_i2, d_i3, d_i4, d_i5, d_i6])

    e_i0 = pm.Normal('e_i0', mu=0.226, sigma=1, testval=0.226)
    e_i1 = pm.Normal('e_i1', mu=0.657, sigma=1, testval=0.657)
    e_i2 = pm.Normal('e_i2', mu=0.310, sigma=1, testval=0.310)
    e_i3 = pm.Normal('e_i3', mu=1.218, sigma=1, testval=1.218)
    e_i4 = pm.Normal('e_i4', mu=3.0, sigma=1, testval=3.0)
    e_i5 = pm.Normal('e_i5', mu=1.309, sigma=1, testval=1.309)
    e_i6 = pm.Normal('e_i6', mu=0.514, sigma=1, testval=0.514)

    e_i = tt.stacklists(
        [e_i0, e_i1, e_i2, e_i3, e_i4, e_i5, e_i6])

    Gamma_i0 = pm.Normal('Gamma_i0', mu=0.136, sigma=1, testval=0.136)
    Gamma_i1 = pm.Normal('Gamma_i1', mu=0.22, sigma=1, testval=0.22)
    Gamma_i2 = pm.Normal('Gamma_i2', mu=0.083, sigma=1, testval=0.083)
    Gamma_i3 = pm.Normal('Gamma_i3', mu=0.096, sigma=1, testval=0.096)
    Gamma_i4 = pm.Normal('Gamma_i4', mu=0.109, sigma=1, testval=0.109)
    Gamma_i5 = pm.Normal('Gamma_i5', mu=0.379, sigma=1, testval=0.379)
    Gamma_i6 = pm.Normal('Gamma_i6', mu=0.38, sigma=1, testval=0.38)

    Gamma_i = tt.stacklists(
        [Gamma_i0, Gamma_i1, Gamma_i2, Gamma_i3, Gamma_i4, Gamma_i5, Gamma_i6])

    M_i0 = pm.Normal('M_i0', mu=1.232, sigma=1, testval=1.232)
    M_i1 = pm.Normal('M_i1', mu=1.535, sigma=1, testval=1.535)
    M_i2 = pm.Normal('M_i2', mu=1.520, sigma=1, testval=1.520)
    M_i3 = pm.Normal('M_i3', mu=1.680, sigma=1, testval=1.680)
    M_i4 = pm.Normal('M_i4', mu=1.650, sigma=1, testval=1.650)
    M_i5 = pm.Normal('M_i5', mu=1.440, sigma=1, testval=1.440)
    M_i6 = pm.Normal('M_i6', mu=1.9, sigma=1, testval=1.9)

    M_i = tt.stacklists(
        [M_i0, M_i1, M_i2, M_i3, M_i4, M_i5, M_i6])

I then extract the kinematic factors from the data and throw all the variables into the model equations:

W_2 = np.array(raw_data[:, 4]).reshape((len(raw_data), 1))
Q_2 = np.array(raw_data[:, 3]).reshape((len(raw_data), 1))
e = np.array(raw_data[:, 6])
G = np.array(raw_data[:, 7])

W = np.sqrt(W_2)
Q_4 = Q_2**2
delta_W = W - (M_p + m_p)
Q_2_mod = 1 + (Q_2 / 0.91)

K = (W_2 - M_p_2) / (2 * M_p)
K_CM = (W_2 - M_p_2) / (2 * W)

x_ = Q_2 / (2 * M_p * ((W_2 - M_p**2 + Q_2) / (2 * M_p)))
x0 = 1 / (1 + ((W_2 - ((M_p + m_p)**2)) / (Q_2 + 0.05)))
x1 = 1 / (1 + ((W_2 - ((M_p + m_p)**2)) / (Q_2 + 0.125)))
t = np.log(np.log((Q_2 + 4.2802) / (0.33**2)) /
           (np.log((4.2802) / (0.33**2))))

e_p_cm = (W_2 + m_p**2 - M_p_2) / (2 * W)
p_p_cm = np.sqrt(np.maximum(0, e_p_cm**2 - m_p_2))
e_p_2_cm = (W_2 + (2. * m_p)**2 - M_p_2) / (2 * W)
p_p_2_cm = np.sqrt(np.maximum(0, e_p_2_cm**2 - (2. * m_p)**2))
e_eta_cm = (W_2 + (m_eta * m_eta) - M_p_2) / (2 * W)
p_eta_cm = np.sqrt(np.maximum(0, e_eta_cm**2 - m_eta**2))

K_i = (np.power(M_i, 2) - M_p_2) / (2 * M_p)
K_CM_i = np.divide(np.power(M_i, 2) - M_p_2, 2 * M_i)

e_p_cm_i = np.divide(np.power(M_i, 2) + m_p_2 - M_p_2, 2 * M_i)
e_p_2_cm_i = np.divide(np.power(M_i, 2) + 4 * m_p_2 - M_p_2, 2 * M_i)
e_eta_cm_i = np.divide(np.power(M_i, 2) + (m_eta**2) - M_p_2, 2 * M_i)

p_p_cm_i = tt.sqrt(tt.maximum(0, e_p_cm_i**2 - m_p_2))
p_p_2_cm_i = tt.sqrt(tt.maximum(0, e_p_2_cm_i**2 - (2 * m_p)**2))
p_eta_cm_i = tt.sqrt(tt.maximum(0, e_eta_cm_i**2 - m_eta**2))

p_eta_cm_i = tt.set_subtensor(p_eta_cm_i[0], 1)
p_eta_cm_i = tt.set_subtensor(p_eta_cm_i[5], 1)

g_pion = Gamma_i * ((p_p_cm / p_p_cm_i)**(1. + (2. * mom))) * \
    (((p_p_cm_i**2 + X0**2.) / (p_p_cm**2. + X0**2.))**mom)

g_eta = Gamma_i * ((p_eta_cm / p_eta_cm_i)**(1. + (2. * mom))) * \
    (((p_eta_cm_i**2 + X0**2.) / (p_eta_cm**2. + X0**2.))**mom)

g_2_pion = (W / M_i) * Gamma_i * \
    ((p_p_2_cm / p_p_2_cm_i)**(4. + (2. * mom))) * \
    (((p_p_2_cm_i**2 + X0**2.) /
      (p_p_2_cm**2. + X0**2.))**(2 + mom))

Gamma_photon_i = ((K_CM_i**2 + X0**2) / (K_CM**2 + X0**2)) * \
    ((Gamma_i) * (K_CM / K_CM_i)**2)

sig_L_NR = np.squeeze(sig_L_NR_0 *
                      (((1 - x1)**(a_L * t + b_L)) / (1 - x_)) *
                      ((Q_2**c_L) / ((Q_2 + 0.125)**(1 + c_L))) *
                      (x1**(d_L + e_L * t)))

sig_T_NR = tt.sum(
    x0 *
    ((delta_W**(eye)) *
     ((sig_T_NR_0) /
      ((Q_2 + a_T)**(b_T + (Q_2 * c_T) + (Q_4 * d_T))))), axis=1)

A_T_i = ((A_T_i_0) / (Q_2_mod**c_i)) * \
    (1 + ((Q_2 * a_i) / (1 + Q_2 * b_i)))

A_L_i = A_L_i_0 * \
    (Q_2 / (1 + Q_2 * d_i)) * (tt.exp(-1 * Q_2 * e_i))

g_tot = branch_1 * g_pion + \
    branch_2 * g_2_pion + \
    branch_3 * g_eta

BW_i = ((K_i * K_CM_i) / (K * K_CM)) * \
    (((g_tot) * (Gamma_photon_i)) /
     ((Gamma_i) * ((W_2 - (M_i**2))**2 + (((M_i) * (g_tot))**2))))

sig_T_R_i = W * (BW_i) * (A_T_i**2)
sig_T_R = tt.sum(sig_T_R_i, axis=1)
sig_L_R_i = W * (BW_i) * (A_L_i**2)
sig_L_R = tt.sum(sig_L_R_i, axis=1)

sig_T = sig_T_R + sig_T_NR
sig_L = sig_L_R + sig_L_NR
sig = (1000.0 * G * sig_T) + (1000.0 * G * e * sig_L)

Finally I run it all

y = pm.Normal('y', mu=sig, sigma=err, observed=obs)

trace = pm.sample(2000, init='adapt_diag')
burnt_trace = trace[1000:]

The output of this is:

ValueError: Mass matrix contains zeros on the diagonal. 
The derivative of RV `M_i6`.ravel()[0] is zero.
.... and so on

Something I noticed, though I don’t understand it well enough to explain, is that if I set tune=100 it will run, but every draw diverges, with the error being:

/usr/local/lib/python3.7/site-packages/xarray/core/nputils.py:223: RuntimeWarning: All-NaN slice 
encountered
result = getattr(npmodule, name)(values, axis=axis, **kwargs)
The chain contains only diverging samples. The model is probably misspecified.

Now I’ve run through all of the check_test_point() output, and it seems fine, and I know the model with those parameters works quite well, so I’m unsure of what’s going so terribly wrong. Any help would be appreciated, as I’m rather lost.

Since you are setting the parameters to be centered on the true values (or at least on a local maximum), the gradient around the maximum is usually 0, which is why you are getting this error. Try instead just setting them to 0.
also, you can do

L = pm.Normal('L', mu=np.asarray([0., 4.0294, 3.1285, 0.3340, 4.9623]), sigma=1.)

instead of

    a_L = pm.Normal('a_L', mu=0.0, sigma=1, testval=0.0)
    b_L = pm.Normal('b_L', mu=4.0294, sigma=1, testval=4.0294)
    c_L = pm.Normal('c_L', mu=3.1285, sigma=1, testval=3.1285)
    d_L = pm.Normal('d_L', mu=0.3340, sigma=1, testval=0.3340)
    e_L = pm.Normal('e_L', mu=4.9623, sigma=1, testval=4.9623)
1 Like

Is there no need for the shape=5 kwarg here, Junpeng?

Ah yeah that would definitely explain why the gradients are all zeroing out if I’m starting all the variables at their extrema. I’ll try this out and get back.

Also is this method of determining variables ‘correct’, it feels very inefficient and somewhat of an abuse of PyMC3 just to throw all of these equations into the model?

It should be fine, as the model code is only executed once to build the graph. However, to improve readability you might consider moving the transformations on the data outside the model, and writing functions so that the code is more compact.

Thank you @junpenglao for your help, much appreciated. However, after changing the means I got the same error:

ValueError: Mass matrix contains zeros on the diagonal. 
The derivative of RV `M_i6`.ravel()[0] is zero.
.... 

After a bit of testing I realised the gradient of p_eta_cm_i threw up some nan values which were breaking everything.

I was doing the following to remove some zeros:

p_eta_cm_i = tt.sqrt(tt.maximum(0, e_eta_cm_i**2 - m_eta**2))

# Remove any zeros so we don't divide by them
p_eta_cm_i = tt.set_subtensor(p_eta_cm_i[0], 1)
p_eta_cm_i = tt.set_subtensor(p_eta_cm_i[5], 1) 

which I assumed would work as I thought gradients worked with subtensors. However, by completely rewriting p_eta_cm_i as

p_eta_cm_i = tt.zeros(7)
p_eta_cm_i = tt.inc_subtensor(p_eta_cm_i[[0, 5]], 1)
p_eta_cm_i = tt.inc_subtensor(p_eta_cm_i[[1, 2, 3, 4, 6]], tt.sqrt(
    tt.maximum(0, e_eta_cm_i[[1, 2, 3, 4, 6]]**2 - m_eta**2)))

The gradient is well defined and it infers smoothly (albeit slowly). Not sure why the second subtensor implementation worked while the other did not though. Thank you again for your help though :smile:

1 Like