Hi all. I’ve used arviz reloo function models containing with two likelihoods and 110 observations (simulated). However, I doubt one thing of my approach. Initially I put both likelihoods together, such as idata.log_likelihood[‘y’] = idata.log_likelihood[‘yh’]+idata.log_likelihood[‘yf’] (idata is an arviz inference data object). However, that is problematic for the reloo wrapper, as adding log-likelihoods in this way would add an extra dimension to the log-likelihood array:

```
idata = az.from_pymc3(trace, model=mod, **idata_kwargs)
idata.log_likelihood['y'] = idata.log_likelihood['yh']+idata.log_likelihood['yf']
idata.log_likelihood['y'].shape
Out[6]: (2, 1000, 110, 110)
```

The problem is that the wrapper would iterate 110 by 110 times (12100 total) when the log-likelihood has this shape. However, I’m only interested in iterating over the 110 points and resample if either ‘yf’ or ‘yh’ have a k_hat over 0.7. I could not find a way to get the wrapper to do that. So I decided to take the easy way and use the average log-likehood of ‘yh’ and ‘yf’ :

```
idata.log_likelihood['y'] = idata.log_likelihood['y'].mean(axis=3)
```

**Is this approach valid?**. The reloo results from each model are actually quite reasonable when I do this, and when I compare the ELPDs of the three models I’m sampling, results are very close to PSIS-LOO (even so PSIS gets most k_hats over 0.7 for each model, that’s why I used reloo). All details are ina repo [here](: SDT_Bayesian_workflow/model_comparison at main · SimonErnesto/SDT_Bayesian_workflow · GitHub). But here’s the full code for Model 1 (just in case):

```
# -*- coding: utf-8 -*-
import os
import pymc3 as pm
import arviz as az
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
np.random.seed(33)
####### This wrapper function is taken literally from Arviz just to make easier
####### checking up index and arrays dimensions
"""Stats functions that require refitting the model."""
import logging
import numpy as np
from arviz import loo
from arviz.stats.stats_utils import logsumexp as _logsumexp
__all__ = ["reloo"]
_log = logging.getLogger(__name__)
def reloo(wrapper, loo_orig=None, k_thresh=0.7, scale=None, verbose=True):
required_methods = ("sel_observations", "sample", "get_inference_data", "log_likelihood__i")
not_implemented = wrapper.check_implemented_methods(required_methods)
if not_implemented:
raise TypeError(
"Passed wrapper instance does not implement all methods required for reloo "
f"to work. Check the documentation of SamplingWrapper. {not_implemented} must be "
"implemented and were not found."
)
if loo_orig is None:
loo_orig = loo(wrapper.idata_orig, pointwise=True, scale=scale)
loo_refitted = loo_orig.copy()
khats = loo_refitted.pareto_k
print('khats: '+str(khats.shape))
loo_i = loo_refitted.loo_i
scale = loo_orig.loo_scale
if scale.lower() == "deviance":
scale_value = -2
elif scale.lower() == "log":
scale_value = 1
elif scale.lower() == "negative_log":
scale_value = -1
lppd_orig = loo_orig.p_loo + loo_orig.loo / scale_value
n_data_points = loo_orig.n_data_points
# if verbose:
# warnings.warn("reloo is an experimental and untested feature", UserWarning)
if np.any(khats > k_thresh):
#print('index: '+str(idx))
for idx in np.argwhere(khats.values > 0.7):
if verbose:
_log.info("Refitting model excluding observation %d", idx)
new_obs, excluded_obs = wrapper.sel_observations(idx)
fit = wrapper.sample(new_obs)
idata_idx = wrapper.get_inference_data(fit)
log_like_idx = wrapper.log_likelihood__i(excluded_obs, idata_idx).values.flatten()
loo_lppd_idx = scale_value * _logsumexp(log_like_idx, b_inv=len(log_like_idx))
khats[idx] = 0
loo_i[idx] = loo_lppd_idx
loo_refitted.loo = loo_i.values.sum()
loo_refitted.loo_se = (n_data_points * np.var(loo_i.values)) ** 0.5
loo_refitted.p_loo = lppd_orig - loo_refitted.loo / scale_value
return loo_refitted
else:
_log.info("No problematic observations")
return loo_orig
path = ""
os.chdir(path)
p = 55 #number of participants per group
g = 2 # number of groups: hint and no-hint
##### hint group, high accuracy
miss1 = (np.random.binomial(n=25, p=0.05, size=p)).astype(int) #misses percentage
hit1 = np.repeat(25, p)-miss1 #hits percentage
fa1 = (np.random.binomial(n=75, p=0.15, size=p)).astype(int) #false alarms percentage
cr1 = np.repeat(75, p)-fa1 #correct rejections percentage
##### no-hint group, low accuracy
miss2 = (np.random.binomial(n=25, p=0.15, size=p)).astype(int) #misses percentage
hit2 = np.repeat(25, p)-miss2 #hits percentage
fa2 = (np.random.binomial(n=75, p=0.5, size=p)).astype(int) #false alarms percentage
cr2 = np.repeat(75, p)-fa2 #correct rejections percentage
miss = np.array([miss1, miss2])
hit = np.array([hit1, hit2])
fa = np.array([fa1, fa2])
cr = np.array([cr1, cr2])
signal = miss + hit # signal
noise = cr + fa # noise
def cdf(x):
#Cumulative distribution function of standard Gaussian
return 0.5 + 0.5 * pm.math.erf(x / pm.math.sqrt(2))
############## Fixed model
def compile_mod(hit, signal, fa, noise):
with pm.Model() as model:
d = pm.Normal('d', 0.0, 0.5, shape=hit.shape) #discriminability d'
c = pm.Normal('c', 0.0, 2.0, shape=hit.shape) #bias c
h = pm.Deterministic('h', cdf(0.5*d - c)).flatten() # hit rate
f = pm.Deterministic('f', cdf(-0.5*d - c)).flatten() # false alarm rate
yh = pm.Binomial('yh', p=h, n=signal.flatten(), observed=hit.flatten()) # sampling for Hits, S is number of signal trials
yf = pm.Binomial('yf', p=f, n=noise.flatten(), observed=fa.flatten()) # sampling for FAs, N is number of noise trials
return model
### define sampling arguments
sample_kwargs = {"draws":1000, "tune":1000, "chains":2, "cores":1, "init":'advi'}
with compile_mod(hit,signal,fa,noise) as mod:
trace = pm.sample(**sample_kwargs)
dims = {"y": ["time"]}
idata_kwargs = {
"dims": dims,
}
idata = az.from_pymc3(trace, model=mod, **idata_kwargs)
idata.log_likelihood['y'] = idata.log_likelihood['yh']+idata.log_likelihood['yf']
idata.log_likelihood = idata.log_likelihood.drop(['yh', 'yf'])
idata.log_likelihood['y'] = idata.log_likelihood['y'].mean(axis=3)
loo_orig = az.loo(idata, pointwise=True)
loo_orig
class Wrapper(az.SamplingWrapper):
def __init__(self, hit, signal, fa, noise, **kwargs):
super(Wrapper, self).__init__(**kwargs)
self.hit = hit
self.signal = signal
self.fa = fa
self.noise = noise
def sample(self, modified_observed_data):
with self.model(**modified_observed_data) as mod:
idata = pm.sample(
**self.sample_kwargs,
return_inferencedata=True,
idata_kwargs={"log_likelihood": True} )
self.pymc3_model = mod
idata.log_likelihood['y'] = idata.log_likelihood['yh']+idata.log_likelihood['yf']
idata.log_likelihood = idata.log_likelihood.drop(['yh', 'yf'])
idata.log_likelihood['y'] = idata.log_likelihood['y'].mean(axis=3)
idata = idata.log_likelihood['y']
return idata
def get_inference_data(self, idata):
#idata = az.from_pymc3(trace, model=self.pymc3_model, **self.idata_kwargs)
#idata.pymc3_trace = trace
return idata
def log_likelihood__i(self, excluded_observed_data, idata__i):
log_lik__i = idata.log_likelihood['y']
#print(log_lik__i)
return log_lik__i
def sel_observations(self, idx):
print("index: "+str(idx))
mask = np.isin(np.arange(len(self.signal.flatten())), idx)
sigi = np.array([ self.signal.flatten()[~mask] ])
hi = np.array([ self.hit.flatten()[~mask] ])
noisi = np.array([ self.noise.flatten()[~mask] ])
fai = np.array([ self.fa.flatten()[~mask] ])
sige = np.array([ self.signal.flatten()[mask] ])
he = np.array([ self.hit.flatten()[mask] ])
noise = np.array([ self.noise.flatten()[~mask] ])
fae = np.array([ self.fa.flatten()[~mask] ])
data__i = {"signal":sigi, "hit":hi, 'fa':fai, 'noise':noisi}
data_ex = {"signal":sige, "hit":he, 'fa':fae, 'noise':noise}
return data__i, data_ex
wrapper = Wrapper(model=compile_mod, hit=hit,
signal=signal,
fa=fa,
noise=noise,
sample_kwargs=sample_kwargs,
idata_kwargs=idata_kwargs)
loo_relooed = reloo(wrapper, loo_orig=loo_orig)
loo_relooed
reloo_df = pd.DataFrame(loo_relooed)
reloo_df.to_csv("mod1_reloo.csv")
reloo_k = loo_relooed.pareto_k
k_df = pd.DataFrame(reloo_k)
k_df.to_csv("mod1_k_hats.csv")
az.plot_khat(loo_relooed)
plt.savefig('mod1_khat.png', dpi=300)
plt.close()
loo_i = loo_relooed.loo_i
np.save("mod1_loo_i.npy", loo_i)
file = open("mod1_reloo.obj","wb")
pickle.dump(loo_relooed,file)
file.close()
```

k_hats look very good after relooing:

I sampled 2 other models, both multilevel, one using varying priors and the other using multivarite varying priors with an LKJ prior for covariances (see [here](: SDT_Bayesian_workflow/model_comparison at main · SimonErnesto/SDT_Bayesian_workflow · GitHub)). These models showed fewer bad k_hats with PSIS. And the model comparison looks reasonable:

**PSIS-LOO without averaging log-likelihoods:**

**PSIS-LOO using average log-likelihoods:**

**re-LOO using average log-likelihoods:**

In short, the approach works and gives reasonable results. The model comparison using averaged log-likelihoods even seem more sensible to me (but maybe I’m wrong). But I’m still wondering about the validity/legitimacy of this approach (is it justified and how?).

Many thanks in advance for any help/advice with this.

PS: I got this working based on this thread: Model comparison issue: "Found several log likelihood arrays var_name cannot be None" · Issue #1404 · arviz-devs/arviz · GitHub