from pymc.math import where
import pandas as pd
import pymc as pm
from pymc.math import log,exp
from pytensor import tensor as pt
import numpy as np
import nutpie
from numpy import pi
from sklearn.preprocessing import StandardScaler
import pymc.sampling_jax
import numpyro
import jax
numpyro.set_host_device_count(4)
def standardize(x, mu, sigma):
return (x - mu) / sigma
def standardNormCdf(x):
return 0.5 + 0.5 * pm.math.erf(x / pm.math.sqrt(2))
def getContributionFromInterval(interval, mu, sigma):
logT = log(interval)
a = standardize(logT, mu, sigma)
return log(1 - standardNormCdf(a))
def computeFailureRate(sigma, mu, t, theta, data):
logT = log(t)
a1 = standardize(logT, mu, sigma)
a2 = 0.5 * log(2 * pi * pow(sigma, 2))
B = 0.5 * pow(a1, 2)
C = log(1 - standardNormCdf(a1))
D = (theta * data).sum(axis=-1).reshape((-1,1))
failureRate = ((-1) * logT) - a2 - B - C + D
return failureRate
def getSurvival(time_to_event, mu, sigma, theta, features):
contributionFromFinalInterval = (-1) * getContributionFromInterval(time_to_event, mu, sigma)
theta_features_vector = (theta * features).sum(axis=-1)
theta_features_vector = theta_features_vector.reshape((-1, 1))
survival = (contributionFromFinalInterval * exp(theta_features_vector))
# survival = survival.sum(axis=1).reshape((-1, 1))
return survival
def runModel():
bcphm_model = pm.Model()
with bcphm_model:
data = pd.read_csv('data/precovid_SF.csv')
data.sort_values(by=['event'], inplace=True)
data.loc[(data.FirstTimeHomeBuyer.isna()) & data.LoanPurpose.isin(
['Refinance', 'CashOutRefi']), 'FirstTimeHomeBuyer'] = 'Not Applicable'
data['PMI'].fillna(0, inplace=True)
data = data[data.time_to_event != 0]
data.dropna(inplace=True)
data['ClosingDt'] = pd.to_datetime(data['ClosingDt']).dt.year
data = data.groupby('event', group_keys=False).apply(lambda x: x.sample(frac=0.2))
events = np.where(data.event.values == "default", 0, np.where(data.event.values == "prepayment", 1, 2)).reshape(
(-1, 1))
time_to_event = data.time_to_event.values
time_to_event_shape = time_to_event.shape
time_to_event = time_to_event.reshape(time_to_event_shape[0], 1)
data['isSingleBorrower'] = data['isSingleBorrower'].map({0: 'No', 1: 'Yes'})
features = data.drop(
columns=['LoanNumber', 'time_to_default', 'time_to_prepayment', 'State', 'ClosingDt', 'event',
'time_to_event'], axis=1)
categorical = [col for col in features.columns if features[col].dtype == "O"]
quantitative = set(features.columns) - set(categorical)
# features = np.repeat(features[:, np.newaxis, :], lifetime.shape[1], axis=1)
categorical_dummies = pd.get_dummies(features.loc[:, list(categorical)], columns=categorical, drop_first=True)
sc = StandardScaler()
standardized_quantitative = pd.DataFrame(sc.fit_transform(features.loc[:, list(quantitative)]),
columns=list(quantitative))
model_input = pd.concat(
[categorical_dummies.reset_index(drop=True), standardized_quantitative.reset_index(drop=True)],
axis=1)
model_input.replace({False: 0, True: 1}, inplace=True)
features_shape = model_input.shape
features_num = model_input.values
features_num = pm.MutableData('features_num', features_num)
events = pm.MutableData('events', events)
theta_D = pm.Normal('theta_D', mu=0, sigma=100, shape=features_shape[1])
theta_P = pm.Normal('theta_P', mu=0, sigma=100, shape=features_shape[1])
mu_D = pm.Normal('mu_D', mu=0, sigma=10)
mu_P = pm.Normal('mu_P', mu=0, sigma=10)
sigma_D = pm.Exponential('sigma_D', .01)
sigma_P = pm.Exponential('sigma_P', .01)
def logp(time_to_event, mu_P, mu_D, sigma_P,
sigma_D, theta_D, theta_P, event,
features):
failureRate = where(
pt.eq(event, 0),
computeFailureRate(sigma_D, mu_D, time_to_event, theta_D, features),
where(pt.eq(event, 1),
computeFailureRate(sigma_P, mu_P, time_to_event, theta_P, features),
0)
)
defaultSurvival = getSurvival(time_to_event, mu_D, sigma_D,
theta_D, features)
prepaymentSurival = getSurvival(time_to_event, mu_P,
sigma_P, theta_P, features)
return (failureRate - defaultSurvival - prepaymentSurival).flatten()
likelihood = pm.CustomDist('LL', mu_P, mu_D,
sigma_P, sigma_D, theta_D, theta_P,
events, features_num,
logp=logp,
observed=time_to_event)
compiled_model = nutpie.compile_pymc_model(bcphm_model)
trace_pymc = nutpie.sample(compiled_model)
return trace
@ricardoV94 I am having trouble in getting this sampling to converge. Could you please take a look and see any obvious issues? Surprisingly it works on my local machine with vanilla sampler and with smaller fragment of dataset.
Same code with Numpyro on aws sagemaker gives me samples where the variables are stuck and every proposal is rejected.