Defining an array to store RV before they are introduced in the likelihood

I have written a model (thanks to all the help from the pymc3 developpers) in which I compare observations with a parametrised model (T_low, ne, S2_abund and S3_abund)

Each point has a different uncertainty (because of how they are observed) so I define a normal likelihood for each data point:

from os import environ
environ["MKL_THREADING_LAYER"] = "GNU"
import pymc3 as pm
import numpy as np
import theano.tensor as tt
import matplotlib.pyplot as plt


def S2_emis(temp_range, den_range, a, b, c, d, e):
    return a + b / temp_range + c * np.log10(temp_range) + np.log10(
        1 + e * den_range)


def S3_emis(temp_range, den_range, a, b, c):
    return a + b / temp_range + c * np.log10(temp_range)


def S2_emis_tt(temp_range, den_range, a, b, c, d, e):
    return a + b / temp_range + c * tt.log10(temp_range) + tt.log10(
        1 + e * den_range)


def S3_emis_tt(temp_range, den_range, a, b, c):
    return a + b / temp_range + c * tt.log10(temp_range)

# Lines list
labels_array = np.array(
    ['S2_6717A', 'S2_6731A', 'S3_6312A', 'S3_9069A', 'S3_9531A'])
ions_array = np.array(['S2', 'S2', 'S3', 'S3', 'S3'])
range_lines = np.arange(labels_array.size)
abund = {'S2': 5.48, 'S3': 6.36}
func = {'S2': S2_emis, 'S3': S3_emis}
funcPm3 = {'S2': S2_emis_tt, 'S3': S3_emis_tt}

# True data
Te_true, ne_true = 12500.0, 115.0
te_err, ne_err = 500, 25
abund_true = {'S2': 5.48, 'S3': 6.36}

coeffs_dict = {
    'S2_6717A':
    np.array([4.600e+00, -9.185e+03, 4.219e-01, 1.000e+00, -4.815e-04]),
    'S2_6731A':
    np.array([4.494e+00, -9.018e+03, 4.091e-01, 1.000e+00, 1.166e-04]),
    'S3_6312A':
    np.array([3.012e+00, -1.629e+04, 5.648e-01]),
    'S3_9069A':
    np.array([3.077e+00, -6.182e+03, 5.895e-01]),
    'S3_9531A':
    np.array([3.470e+00, -6.182e+03, 5.895e-01])
}

# Generate synthetic observational values
flux_array = np.empty(labels_array.size)
for i in np.arange(labels_array.size):
    # Get line label
    line_label = labels_array[i]

    # Ion emissivity function
    emisfunc = func[ions_array[i]]

    # Line flux
    flux_array[i] = abund_true[ions_array[i]] + emisfunc(
        Te_true, ne_true, *coeffs_dict[line_label]) - 12.0
# Artificial uncertainty on the measurements
err_array = np.abs(flux_array * 0.05)

# Run pymc3 model
with pm.Model() as model:
    # Physical conditions priors
    T_low = pm.Normal('T_low', mu=10000.0, sd=1000.0)
    n_e = pm.Normal('n_e', mu=80, sd=50)

    # Composition priors
    abund_dict = {
        'S2': pm.Uniform('S2_abund', lower=0, upper=10),
        'S3': pm.Uniform('S3_abund', lower=0, upper=10)
    }

    for i in range_lines:
        # Line data
        line_label = labels_array[i]
        line_ion = ions_array[i]
        line_abund = abund_dict[line_ion]
        line_coeffs = coeffs_dict[line_label]
        line_func = funcPm3[line_ion]

        # Line parameters
        line_flux = line_abund + line_func(T_low, n_e, *line_coeffs) - 12.0

        # Individual Likelihood for each object
        chiSq = pm.Normal(
            line_label + '_Y',
            mu=line_flux,
            sd=err_array[i],
            observed=flux_array[i])
    for RV in model.basic_RVs:
        print(RV.name, RV.logp(model.test_point))

    # Launch model
    trace = pm.sample(10000, tune=2000)
    
print pm.summary(trace)
pm.traceplot(trace)
plt.show()

I would like to test the impact on the result if I define a global multivariable normal distribution as my likelihood with all the data points.

At this point however, I am not sure which is right container to store the line_fluxes defined at each step: Should it be an empty symbolic tensor flow array created at each step? A shared variable array which I filled up at each iteration?

Thanks for any advice.

(I edited your post using markdown code format so it is easier to copy and paste)
FYI you can do model.check_test_point() instead of

for RV in model.basic_RVs:
        print(RV.name, RV.logp(model.test_point))

You need to create a theano tensor/pymc3 RV line_flux that has the same first dimension as the observed, couple ways to do so:

    line_flux = []
    for i in ...:
        line_flux.append(...)
    line_flux = tt.stack(line_flux)
    line_flux = tt.zeros(flux_array.shape)
    for i in ...:
        line_flux_i = line_abund + line_func(T_low, n_e, *line_coeffs) - 12.0
        line_flux = tt.inc_subtensor(line_flux[i], line_flux_i)
1 Like

Thank you very much!! That worked great:

from os import environ
environ["MKL_THREADING_LAYER"] = "GNU"
import pymc3 as pm
import numpy as np
import theano
import theano.tensor as tt
import matplotlib.pyplot as plt
theano.config.compute_test_value = "ignore"

def S2_emis(temp_range, den_range, a, b, c, d, e):
    return a + b / temp_range + c * np.log10(temp_range) + np.log10(1 + e * den_range)


def S3_emis(temp_range, den_range, a, b, c):
    return a + b / temp_range + c * np.log10(temp_range)


def S2_emis_tt(temp_range, den_range, a, b, c, d, e):
    return a + b / temp_range + c * tt.log10(temp_range) + tt.log10(1 + e * den_range)


def S3_emis_tt(temp_range, den_range, a, b, c):
    return a + b / temp_range + c * tt.log10(temp_range)


# Lines list
labels_array = np.array(['S2_6717A', 'S2_6731A', 'S3_6312A', 'S3_9069A', 'S3_9531A'])
ions_array = np.array(['S2', 'S2', 'S3', 'S3', 'S3'])
range_lines = np.arange(labels_array.size)
abund = {'S2': 5.48, 'S3': 6.36}
func = {'S2': S2_emis, 'S3': S3_emis}
funcPm3 = {'S2': S2_emis_tt, 'S3': S3_emis_tt}

# True data
Te_true, ne_true = 12500.0, 115.0
te_err, ne_err = 500, 25
abund_true = {'S2': 5.48, 'S3': 6.36}

coeffs_dict = {'S2_6717A': np.array([4.600e+00, -9.185e+03, 4.219e-01, 1.000e+00, -4.815e-04]),
               'S2_6731A': np.array([4.494e+00, -9.018e+03, 4.091e-01, 1.000e+00, 1.166e-04]),
               'S3_6312A': np.array([3.012e+00, -1.629e+04, 5.648e-01]),
               'S3_9069A': np.array([3.077e+00, -6.182e+03, 5.895e-01]),
               'S3_9531A': np.array([3.470e+00, -6.182e+03, 5.895e-01])}

# Generate synthetic observational values
flux_array = np.empty(labels_array.size)
for i in np.arange(labels_array.size):
    # Get line label
    line_label = labels_array[i]

    # Ion emissivity function
    emisfunc = func[ions_array[i]]

    # Line flux
    flux_array[i] = abund_true[ions_array[i]] + emisfunc(Te_true, ne_true, *coeffs_dict[line_label]) - 12.0

# Artificial uncertainty on the measurements
err_array = np.abs(flux_array * 0.05)
cov_array = err_array**2 * np.eye(flux_array.size)

# Run pymc3 model
line_flux_tt = tt.zeros(flux_array.size)
with pm.Model() as model:

    # Physical conditions priors
    T_low = pm.Normal('T_low', mu=10000.0, sd=1000.0)
    n_e = pm.Normal('n_e', mu=80, sd=50)

    # Composition priors
    abund_dict = {'S2': pm.Uniform('S2_abund', lower=0, upper=10),
                  'S3': pm.Uniform('S3_abund', lower=0, upper=10)}

    for i in range_lines:

        # Line data
        line_label = labels_array[i]
        line_ion = ions_array[i]
        line_abund = abund_dict[line_ion]
        line_coeffs = coeffs_dict[line_label]
        line_func = funcPm3[line_ion]

        # Line parameters
        line_flux_i = line_abund + line_func(T_low, n_e, *line_coeffs) - 12.0
        line_flux_tt = tt.inc_subtensor(line_flux_tt[i], line_flux_i)

        #Likelihood individual line
        #chiSq = pm.Normal(line_label + '_Y', mu=line_flux, sd=err_list[i], observed=fluxes_list[i])

    # Global normal Likelihood for all lines
    #Y = pm.Normal('Y', mu=line_flux_tt, sd=err_array, observed=flux_array)

    # Global multivariable likelihood for all lines
    Y = pm.MvNormal('Y', mu=line_flux_tt, cov=cov_array, observed=flux_array)

    # model.check_test_point()

    # Launch model
    trace = pm.sample(10000, tune=2000)

print pm.summary(trace)
pm.traceplot(trace)
plt.show()
  1. I tried the second approach (somehow it looked more natural since I know the number of data points). I got this error however from theano:

    ValueError: Cannot compute test value: input 0 (Alloc.0) of Op Subtensor{int64}(Alloc.0, Constant{0}) missing default value.
    Backtrace when that variable is created:
    flux_array = np.empty(labels_array.size)

Which I solved adding this line to the code:

theano.config.compute_test_value = "ignore"

  1. The “.check test_point()” attribute is in the 3.5 version right? I will be happy to try it, it is neat.

  2. I run some tests defining several normal likelihoods, a single normal with all the points from my tensor array and single multivariable normal distribution. I am getting the same results which I think was the expected behaviour given the uncorrelated data points. Is there however, any advantage from the simulation point of view to define only one likelihood instead of several?

Thanks again!!

1 Like

Yes the .check_test_point is in the newest release.

You can simulated correlated data and check how different the estimation is - you might find MvNormal model quite a bit slower when you have lots of data, but in this case it should not matter too much.

2 Likes

that’s a great idea :slight_smile:

Thanks again