Hi. Many thanks again for taking the time to answer this. I’m not sure whether it is necessary to re-scale if using this file: “preprocessed_xyt.csv”. One problem with putting the time in hours rather than in days is that the time magnitudes become too large relative to the spatial distances in km. This breaks the model immediately, generating a nan initial value for y.
There are a couple of things I don’t really understand. Why would the masking method using numpy be different from doing something like timeD[timeD<0] = 0 ? or something like at.tril(at.ones(timeD.shape))? I’ve tried both and they perform faster, but divergences are even worse than with the numpy method. And the results do not match either:
# -*- coding: utf-8 -*-
import pandas as pd
import pymc as pm
import pytensor.tensor as at
import arviz as az
import numpy as np
from shapely import Polygon
from datetime import datetime
from scipy.spatial import distance_matrix
df = pd.read_csv("preprocessed_xyt.csv")
xyt = df
# Bandwidth constants for the Epanechnikov kernel:
bw_time = 14       # days
bw_space = 1.609   # 1.609 kilometers (one mile)
start_date = datetime(2010, 1, 1)
end_date = datetime(2010, 1, 31)
# Length of the observation window in days, inclusive of both endpoints.
time_window = (end_date - start_date).days + 1
xyt = xyt.sort_values(by="T")
# Hour-of-day label in 1..24 derived from the time column.
# NOTE(review): assumes `round(T) % 24` is a meaningful hour index for the
# units T is stored in — confirm against the preprocessing that built the CSV.
xyt["hour"] = np.round(xyt["T"].values) % 24 + 1
xyt["hour_idx"] = xyt.hour.values.astype("int").astype("str")
# Empirical probability of each observation's hour bin.  A single
# value_counts pass replaces the original per-row scan (O(n^2) -> O(n));
# the resulting list of per-row proportions is identical.
phat_hour = xyt["hour"].map(xyt["hour"].value_counts(normalize=True)).tolist()
xy = np.array([xyt['X'], xyt['Y']])
spaceD = distance_matrix(xy.T, xy.T)  # pairwise spatial distance matrix in km
# Area of the observation region: polygon through the observed points.
space_window = Polygon(xy.T).area  # alt: np.pi*(np.max(spaceD)*0.5)**2
T = xyt['T'].values  # - min(xyt['T'].values)
TT = np.array([T, T])
timeD = distance_matrix(TT.T, TT.T)  # pairwise |t_i - t_j| matrix
## Define Epanechikov kernel
def epa(x, ls):
    """Epanechnikov kernel with bandwidth ``ls``.

    Parameters
    ----------
    x : array_like or scalar
        Distance(s) at which to evaluate the kernel.
    ls : float
        Bandwidth; the kernel's support is ``|x| <= ls``.

    Returns
    -------
    ``0.75 * (1 - (x/ls)**2) / ls`` clipped at 0 outside the support.
    Using ``np.maximum`` instead of boolean-mask assignment is equivalent
    on arrays, avoids the intermediate in-place mutation, and additionally
    accepts scalar inputs (the original raised on scalars).
    """
    d = (0.75 * (1 - (x / ls) ** 2)) / ls
    return np.maximum(d, 0)
# Epanechnikov kernel density estimates: marginal spatial and temporal
# intensities obtained by summing each pairwise kernel matrix over rows.
d_space = epa(spaceD, bw_space).sum(axis=0)           # spatial kernel sums
d_time = epa(timeD, bw_time).sum(axis=0) / len(xyt)   # temporal kernel sums
# Background intensity estimate from the product of the two marginals.
muhat = d_space * d_time
#Gaussian kernel
def gk(x, ls):
    """Isotropic 2-D Gaussian kernel evaluated at distance ``x``.

    The normalizing constant of an isotropic bivariate Gaussian is
    ``1/(2*pi*ls**2)``.  The original coefficient ``(0.5*np.pi)``
    multiplied by pi/2 instead of dividing by 2*pi (``0.5/np.pi``),
    inflating the kernel by a factor of pi**2 — almost certainly a
    ``*`` vs ``/`` typo.
    """
    return (0.5 / np.pi) * at.exp((-0.5 * x**2) / ls**2) / ls**2
# Fixed (data-driven) marginal intensity estimates reused by the model below.
mu_xy = d_space
mu_t = d_time
# Shift times so the observation window starts at 0 — the compensator term
# in the likelihood integrates from the start of the window, so the origin
# must be the window start rather than an arbitrary calendar offset.
T = xyt['T'].values - min(xyt['T'].values)
# Signed pairwise time differences t_i - t_j: positive below the diagonal
# (earlier events exciting later ones), negative above it.
timeD = T[:, None] - T[None]
# Time remaining from each event to the end of the window.
timeD2 = time_window - T
n = len(xyt)
# Alternative lower-triangular mask precomputed in numpy (currently unused):
# addition_mask = np.zeros_like(timeD)
# addition_mask[np.tril_indices_from(addition_mask, k=-1)] = 1
# addition_mask = at.as_tensor_variable(addition_mask)
with pm.Model() as mod:
    # Half-normal-like priors (truncated at 0) for the kernel scales.
    lengthscaleS = pm.TruncatedNormal("lengthscaleS", 0, 10, lower=0)  # length scale, space
    lengthscaleT = pm.TruncatedNormal("lengthscaleT", 0, 10, lower=0)  # length scale, time
    a = pm.TruncatedNormal("a", 0, 10, lower=0)  # excitation parameter
    gs = gk(spaceD, lengthscaleS)  # Gaussian kernel over pairwise spatial distances
    mu0 = pm.TruncatedNormal("mu0", 0, 1, lower=0)
    mu = pm.TruncatedNormal("mu", 0, 1, lower=0)
    # Background intensity: baseline plus scaled Epanechnikov estimate.
    mu_xyt = mu0 + mu*muhat
    # Hawkes-style excitation term.
    # NOTE(review): timeD is signed (negative above the diagonal), so
    # at.exp(-timeD * lengthscaleT) is evaluated at large POSITIVE
    # arguments there and can overflow BEFORE the tril mask zeroes those
    # entries; inf * 0 -> nan in the value/gradient, which would explain
    # the nan initial value and the divergences reported.  Masking or
    # clipping timeD before the exp (e.g. with at.switch) avoids this —
    # TODO confirm.
    lam0 = (a * lengthscaleT * at.exp(-timeD * lengthscaleT)) * gs
    # Sum excitation from earlier events only (lower triangle), then add
    # the background intensity.
    lam = mu_xyt + (lam0 * at.tril(at.ones(timeD.shape))).sum(axis=-1)
    muSTintegral = len(xyt)
    def lp(name, lam, a, lengthscaleT, mu, mu0, time, integral):
        # Point-process log-likelihood: sum of log-intensities at the
        # observed events minus the integrated intensity (compensator)
        # over the space-time window.
        # NOTE(review): the compensator terms look simplified relative to
        # a full Hawkes likelihood — verify against the reference R code.
        l = [at.sum(at.log(lam)) - integral * mu - mu0 * space_window * time_window +
             a * at.sum(at.exp(-time*lengthscaleT)) - integral]
        return l[0]
    # Potential adds the custom log-likelihood to the model's logp.
    y = pm.Potential("y", lp("ylp", lam, a, lengthscaleT, mu, mu0, timeD2, muSTintegral))
with mod:
    idata = pm.sample(1000, tune=1000, chains=4, cores=12)#, target_accept=0.95)
# Stack chains and draws into one sample dimension for posterior inspection.
pos = idata.stack(sample = ['chain', 'draw']).posterior
# Summary table for the parameters of interest.
summ = az.summary(idata, var_names=["lengthscaleS","lengthscaleT","a", "mu"])
print(summ) ...
There were 3099 divergences after tuning. Increase `target_accept` or reparameterize.
mean sd hdi_3% ... ess_bulk ess_tail r_hat
lengthscaleS 0.000 0.000 0.000 ... 293.0 639.0 1.01
lengthscaleT 158.484 7.299 144.881 ... 607.0 718.0 1.01
a 158.590 6.585 147.135 ... 431.0 474.0 1.01
mu 0.004 0.004 0.000 ... 802.0 750.0 1.01
I’ve compared the muhat values I obtain with the Epanechnikov kernel and they match those in the R code generally well (same for the time scale and space scale). So I don’t see why this would generate a problem here. My impression is that what needs to be fixed is not really sampling speed; rather, the speed problems seem to be a consequence of something wrong with the model’s structure. (It’s quite telling that divergences start happening during tuning.) I’ve reproduced the model directly from the repo, but maybe I missed something.