Solving question 8M9 from Bayesian Modeling and Computation in Python

Tony_Jubran · March 9, 2024, 11:58am

the question is “In the GitHub repository you will find a data set of the distribution of citations of scientific papers. Use SMC-ABC to fit a g-and-k distribution to this dataset. Perform all the necessary steps to find asuitable value for “epsilon” and ensuring the model converge and results provides a
suitable fit.” the code i wrote is

%matplotlib inline
import arviz as az
import matplotlib.pyplot as plt
import pymc as pm
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from scipy.stats import norm, uniform
import scipy.stats as stats
from scipy.stats import norm


#reading data
data = pd.read_csv("sc.csv")
values = data[data.columns[0]].values

#define the G-and-K quantile class
class g_and_k_quantile:
    def __init__(self):
        self.quantile_normal = stats.norm(0, 1).ppf

    def ppf(self, x, a, b, g, k):
        z = self.quantile_normal(x)
        return a + b * (1 + 0.8 * np.tanh(g * z / 2)) * ((1 + z**2)**k) * z

#step 2: estimate G-and-K parameters
def objective(params, data):
    a, b, g, k = params
    gk = g_and_k_quantile()
    quantiles = np.linspace(0.01, 0.99, len(data))  #avoiding 0 and 1
    theoretical = gk.ppf(quantiles, a, b, g, k)
    empirical = np.quantile(data, quantiles)
    return np.sum((theoretical - empirical) ** 2)

initial_params = [np.median(values), np.std(values), 0.1, 0.1]
result = optimize.minimize(objective, initial_params, args=(values,), method='Nelder-Mead')
a, b, g, k = result.x

#step 3
gk = g_and_k_quantile()
simulated_data = gk.ppf(np.random.uniform(0.01, 0.99, 1000), a, b, g, k)

# Plotting the original data
plt.figure(figsize=(10, 5))
plt.hist(values, bins=50, alpha=0.6, color='blue', label='Actual Data')
plt.xlabel('Value')
plt.ylabel('Count')
plt.title('Histogram of Actual Data')
plt.legend()
plt.show()

# Plotting the G-and-K distribution data
plt.figure(figsize=(10, 5))
plt.hist(simulated_data, bins=50, density=True, alpha=0.6, color='red', label='Simulated G-and-K')
plt.xlabel('Simulated Value')
plt.ylabel('Density')
plt.title('Histogram of Simulated G-and-K Distribution')
plt.legend()
plt.show()
def fit_gk_distribution(values, epsilon):
    def objective(params, data):
        a, b, g, k = params
        gk = g_and_k_quantile()
        quantiles = np.linspace(0.01, 0.99, len(data))  # Avoid 0 and 1
        theoretical = gk.ppf(quantiles, a, b, g, k)
        empirical = np.quantile(data, quantiles)
        return np.sum((theoretical - empirical) ** 2)

    result = optimize.minimize(objective, initial_params, args=(values,), method='Nelder-Mead', options={'xatol': epsilon, 'fatol': epsilon})
    return result.x

# Experiment with different values of epsilon
epsilons = [1e-6, 1e-7, 1e-8]
results = []

for eps in epsilons:
    params = fit_gk_distribution(values, eps)
    results.append((eps, params))

# Evaluate each result (this step is more conceptual, depending on your evaluation criteria)
for eps, params in results:
    print(f"Epsilon: {eps}, Parameters: {params}")
    # Here, include your model evaluation code, which might involve
    # calculating goodness-of-fit, plotting, or other statistical measures

# Choose the epsilon that results in the best model fit according to your criteria

def distance(simulated_data, actual_data):
    # Compute quantiles for both datasets
    quantiles_to_compare = np.linspace(0.01, 0.99, 100)  # Avoid 0 and 1 to prevent infinite values
    simulated_quantiles = np.quantile(simulated_data, quantiles_to_compare)
    actual_quantiles = np.quantile(actual_data, quantiles_to_compare)
    
    # Calculate the sum of squared differences
    ssd = np.sum((simulated_quantiles - actual_quantiles) ** 2)
    
    return ssd


def smc_abc(data, prior_func, simulate_func, distance_func, tolerance_decay, max_iterations):
    population = prior_func(size=100)  # Initial population
    tolerances = np.linspace(start=tolerance_decay, stop=0, num=max_iterations)
    populations = [population]
    
    for tolerance in tolerances:
        new_population = []
        while len(new_population) < len(population):
            candidate = prior_func(size=1)
            simulated_data = simulate_func(candidate)
            if distance_func(simulated_data, data) < tolerance:
                new_population.append(candidate)
        populations.append(new_population)
    
    return populations

def gk_prior(size=1):
    # Replace with the appropriate prior distributions for your context
    a_samples = norm.rvs(loc=0, scale=10, size=size)
    b_samples = np.abs(norm.rvs(loc=0, scale=2, size=size))  # b should be positive
    g_samples = norm.rvs(loc=0, scale=1, size=size)
    k_samples = norm.rvs(loc=0, scale=1, size=size)
    
    # Each parameter set is a row in the 2D array
    return np.column_stack((a_samples, b_samples, g_samples, k_samples))

# Define a function to simulate data from the g-and-k distribution
def simulate_from_gk(params):
    # Use the g-and-k quantile function to simulate data
    gk_quantile = g_and_k_quantile()
    uniform_samples = np.random.uniform(0, 1, 1000)  # Number of data points to simulate
    
    # Simulate data for each parameter set
    simulated_datasets = [gk_quantile.ppf(uniform_samples, *param_set) for param_set in params]
    return simulated_datasets

    
# Read the data
data = pd.read_csv("sc.csv")
values = data[data.columns[0]].values

# Set up SMC-ABC parameters
tolerance_decay = 1.0  # Starting tolerance, this should be tuned
max_iterations = 10  # The number of iterations to run the SMC-ABC, this should be tuned

# Run the SMC-ABC algorithm
populations = smc_abc(
    data=values,
    prior_func=gk_prior,
    simulate_func=simulate_from_gk,
    distance_func=distance,
    tolerance_decay=tolerance_decay,
    max_iterations=max_iterations
)

but i dont think my code answers the question, can you help me write the right code or find the full solution of this problem please?

OriolAbril · September 6, 2024, 6:35pm

cc @aloctavodia @junpenglao

Topic		Replies	Views
Beginning user, problem set correctly (experimental data fit)? v5 modeling	2	328	January 16, 2023
ABC-SMC: recommendation how to choose epsilon Questions	2	704	December 22, 2019
SMC -ABC and further projects at PYMC3 Development	9	1206	March 4, 2019
Updating posteriors for scanning a chemical system Questions	15	882	May 2, 2019
How to plot scipy probability distribution from pymc3? Questions	0	415	May 25, 2020

Solving question 8M9 from Bayesian Modeling and Computation in Python

Related topics