Hello everyone,
I am trying to fit the posterior trace regression line onto my data which shows how good the fit of the model is but I am running into a few problems.
When using the trace sd values it returns a flat line because the values are nowhere near the true values due to the model returning very large values for amp and cen when I try to get the trace sd means close to 1, greatly affecting the corner plot too.
Therefore, to just focus on the cen and amp trace values, I used the true sd values for the line fit which worked for another model using randomly generated data but not for this model.
Here is the code, please skip to the model:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import pymc as pm
import corner
import arviz as az
from scipy.optimize import curve_fit
from scipy.signal import find_peaks
# Importing data
o_data = pd.read_csv('PMMA_Oxygen_Data.csv', names=['x', 'y'], header=None)
x = o_data['x']
y = o_data['y']
# Data into array
x = np.array(x)
y = np.array(y)
# Extracting true values of peaks using find peaks
peaks, _ = find_peaks(y, height=2397, distance = 1)
# Extracting the x & y values
xpeaks = x[peaks]
ypeaks = y[peaks]
# Finding true standard deviations peaks using Gaussian function
def gaussian(x, amp, cen, sd):
return (amp/np.sqrt(2.0*np.pi*sd**2)) * np.exp(-(x-cen)**2 / (2.0*sd**2))
# curve_fit fits Gaussian to each peak
peak_1, _ = curve_fit(gaussian, x, y, p0 = [ypeaks[0], xpeaks[0], 1])
peak_2, _ = curve_fit(gaussian, x, y, p0 = [ypeaks[1], xpeaks[1], 1])
# Assigning variables to true values
t_amp1 = ypeaks[1]
t_cen1 = xpeaks[1]
t_sd1 = peak_1[2]
t_amp2 = ypeaks[0]
t_delta = 2
t_sd2 = peak_2[2]
# MODEL
gaussian_model = pm.Model()
with pm.Model() as gaussian_model:
# Defines prior parameters of the counts & corrected binding energy (eV)
amp1 = pm.Normal('amp1', mu=2519, sigma=20, initval=2515)
cen1 = pm.Normal('cen1', mu=511, sigma=5, initval=509)
sd1 = pm.HalfNormal('sd1', sigma=1)
amp2 = pm.Normal('amp2', mu=2365.5, sigma=20, initval=2360)
delta = pm.LogNormal('delta', mu=0.7, sigma=0.1)
sd2 = pm.HalfNormal('sd2', sigma=1)
# Expected value of outcome from Gaussian equation
mu = (amp1/np.sqrt(2.0*np.pi*sd1**2)) * np.exp(-(x-cen1)**2 / (2.0*sd1**2)) + \
(amp2/np.sqrt(2.0*np.pi*sd2**2)) * np.exp(-(x-(cen1+delta))**2 / (2.0*sd2**2))
# Defines likelihood
peak_1 = pm.Normal('peak_1', mu=mu[0], sigma=sd1, observed=y)
peak_2 = pm.Normal('peak_2', mu=mu[1], sigma=sd2, observed=y)
# Sampling from posterior
trace = pm.sample(
draws=5500, tune=5500, chains=5, cores=5, return_inferencedata=True
)
# Estimating values
map_estimate = pm.sample(model=gaussian_model)
pm.summary(trace)
# Plotting data into scatter graph
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_xlabel('Centre', fontsize = 22.5)
ax.set_ylabel('Amp', fontsize = 22.5)
plt.xticks(fontsize = 18)
plt.yticks(fontsize = 18)
plt.grid(linewidth = 1)
plt.plot(x, y, linestyle='-', marker='o', c='b');
# Bimodal Gaussian function
def gaussian2(x, amp1, cen1, sd1, amp2, delta_cen, sd2):
return (amp1/np.sqrt(2.0*np.pi*sd1**2)) * np.exp(-(x-cen1)**2 / (2.0*sd1**2)) + \
(amp2/np.sqrt(2.0*np.pi*sd2**2)) * np.exp(-(x-(cen1+delta_cen))**2 / (2.0*sd2**2))
# Using posterior trace values to fit Gaussian line
trace_means = pm.summary(trace).loc[:,'mean']
plt.plot(x, gaussian2(x,
trace_means['amp1'], trace_means['cen1'], true_sd1,
trace_means['amp2'], trace_means['delta'], true_sd2),
'g--', linewidth = 2.5)
ax.set_xbound(lower=520, upper=528)
ax.set_ybound(lower=230, upper=2750)
To explain the model a bit, the pm.summary does return the true values and the delta parameter separates peaks so there is no explicit ordering of the centre value as the 2nd peak is positioned. LogNormal is used to give positive values (this method was suggested by my uni mentor).
This is the graph it returns:
This is the pm.summary:
This is what my mentor is trying to make me aim (better if possible):
And here is the data too.
Oxygen_Data.csv (6.8 KB)
This is what I achieved for a different model with randomly generated data: