the question is “In the GitHub repository you will find a data set of the distribution of citations of scientific papers. Use SMC-ABC to fit a g-and-k distribution to this dataset. Perform all the necessary steps to find asuitable value for “epsilon” and ensuring the model converge and results provides a
suitable fit.” the code i wrote is
%matplotlib inline
import arviz as az
import matplotlib.pyplot as plt
import pymc as pm
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from scipy.stats import norm, uniform
import scipy.stats as stats
from scipy.stats import norm
#reading data
data = pd.read_csv("sc.csv")
values = data[data.columns[0]].values
#define the G-and-K quantile class
class g_and_k_quantile:
def __init__(self):
self.quantile_normal = stats.norm(0, 1).ppf
def ppf(self, x, a, b, g, k):
z = self.quantile_normal(x)
return a + b * (1 + 0.8 * np.tanh(g * z / 2)) * ((1 + z**2)**k) * z
#step 2: estimate G-and-K parameters
def objective(params, data):
a, b, g, k = params
gk = g_and_k_quantile()
quantiles = np.linspace(0.01, 0.99, len(data)) #avoiding 0 and 1
theoretical = gk.ppf(quantiles, a, b, g, k)
empirical = np.quantile(data, quantiles)
return np.sum((theoretical - empirical) ** 2)
initial_params = [np.median(values), np.std(values), 0.1, 0.1]
result = optimize.minimize(objective, initial_params, args=(values,), method='Nelder-Mead')
a, b, g, k = result.x
#step 3
gk = g_and_k_quantile()
simulated_data = gk.ppf(np.random.uniform(0.01, 0.99, 1000), a, b, g, k)
# Plotting the original data
plt.figure(figsize=(10, 5))
plt.hist(values, bins=50, alpha=0.6, color='blue', label='Actual Data')
plt.xlabel('Value')
plt.ylabel('Count')
plt.title('Histogram of Actual Data')
plt.legend()
plt.show()
# Plotting the G-and-K distribution data
plt.figure(figsize=(10, 5))
plt.hist(simulated_data, bins=50, density=True, alpha=0.6, color='red', label='Simulated G-and-K')
plt.xlabel('Simulated Value')
plt.ylabel('Density')
plt.title('Histogram of Simulated G-and-K Distribution')
plt.legend()
plt.show()
def fit_gk_distribution(values, epsilon):
def objective(params, data):
a, b, g, k = params
gk = g_and_k_quantile()
quantiles = np.linspace(0.01, 0.99, len(data)) # Avoid 0 and 1
theoretical = gk.ppf(quantiles, a, b, g, k)
empirical = np.quantile(data, quantiles)
return np.sum((theoretical - empirical) ** 2)
result = optimize.minimize(objective, initial_params, args=(values,), method='Nelder-Mead', options={'xatol': epsilon, 'fatol': epsilon})
return result.x
# Experiment with different values of epsilon
epsilons = [1e-6, 1e-7, 1e-8]
results = []
for eps in epsilons:
params = fit_gk_distribution(values, eps)
results.append((eps, params))
# Evaluate each result (this step is more conceptual, depending on your evaluation criteria)
for eps, params in results:
print(f"Epsilon: {eps}, Parameters: {params}")
# Here, include your model evaluation code, which might involve
# calculating goodness-of-fit, plotting, or other statistical measures
# Choose the epsilon that results in the best model fit according to your criteria
def distance(simulated_data, actual_data):
# Compute quantiles for both datasets
quantiles_to_compare = np.linspace(0.01, 0.99, 100) # Avoid 0 and 1 to prevent infinite values
simulated_quantiles = np.quantile(simulated_data, quantiles_to_compare)
actual_quantiles = np.quantile(actual_data, quantiles_to_compare)
# Calculate the sum of squared differences
ssd = np.sum((simulated_quantiles - actual_quantiles) ** 2)
return ssd
def smc_abc(data, prior_func, simulate_func, distance_func, tolerance_decay, max_iterations):
population = prior_func(size=100) # Initial population
tolerances = np.linspace(start=tolerance_decay, stop=0, num=max_iterations)
populations = [population]
for tolerance in tolerances:
new_population = []
while len(new_population) < len(population):
candidate = prior_func(size=1)
simulated_data = simulate_func(candidate)
if distance_func(simulated_data, data) < tolerance:
new_population.append(candidate)
populations.append(new_population)
return populations
def gk_prior(size=1):
# Replace with the appropriate prior distributions for your context
a_samples = norm.rvs(loc=0, scale=10, size=size)
b_samples = np.abs(norm.rvs(loc=0, scale=2, size=size)) # b should be positive
g_samples = norm.rvs(loc=0, scale=1, size=size)
k_samples = norm.rvs(loc=0, scale=1, size=size)
# Each parameter set is a row in the 2D array
return np.column_stack((a_samples, b_samples, g_samples, k_samples))
# Define a function to simulate data from the g-and-k distribution
def simulate_from_gk(params):
# Use the g-and-k quantile function to simulate data
gk_quantile = g_and_k_quantile()
uniform_samples = np.random.uniform(0, 1, 1000) # Number of data points to simulate
# Simulate data for each parameter set
simulated_datasets = [gk_quantile.ppf(uniform_samples, *param_set) for param_set in params]
return simulated_datasets
# Read the data
data = pd.read_csv("sc.csv")
values = data[data.columns[0]].values
# Set up SMC-ABC parameters
tolerance_decay = 1.0 # Starting tolerance, this should be tuned
max_iterations = 10 # The number of iterations to run the SMC-ABC, this should be tuned
# Run the SMC-ABC algorithm
populations = smc_abc(
data=values,
prior_func=gk_prior,
simulate_func=simulate_from_gk,
distance_func=distance,
tolerance_decay=tolerance_decay,
max_iterations=max_iterations
)
but i dont think my code answers the question, can you help me write the right code or find the full solution of this problem please?