Any suggestions on speeding up the model?

Hi,
I have a simple model, but it takes longer (about 3 hours) than I expected, and I cannot find the bottleneck in the script. The code and data are attached. Many thanks in advance for any suggestions.

Combo_Dmatrix.csv (11.0 KB)
COMBO_OTU.csv (15.5 KB)
truevalues.csv (1.0 KB)
ynew3.csv (18.8 KB)

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import pymc as pm
import pytensor.tensor as pt
import arviz as az
import pytensor
import pytensor.ifelse  # explicit import so pytensor.ifelse.ifelse below is always available
import time

D_phylo = pd.read_csv('Combo_Dmatrix.csv')
COMBO_OTU = pd.read_csv('COMBO_OTU.csv')
y_obs = pd.read_csv('ynew3.csv')
y_true = pd.read_csv('truevalues.csv')

D_phylo = D_phylo.to_numpy()
D_phylo = D_phylo[:,1:]
COMBO_OTU = COMBO_OTU.to_numpy()
COMBO_OTU = COMBO_OTU[:,1:]

y_obs = y_obs.to_numpy()
y_obs = y_obs[:,1:]
y_true = y_true.to_numpy()

K1_combo = int(62 * 0.85)
# rho = 0.25
# sigma = 1
cov =  1**2* np.exp(-2* 0.25* D_phylo)
samples = y_obs.shape[0]
specise = y_obs.shape[1]
phai = np.zeros([samples,1])

N = np.sum(COMBO_OTU[0:samples,:], axis=1).reshape(-1, 1)

tt = samples
chain_length = 2000
tune_length = 1000
spe = ['spe{}'.format(i) for i in range(specise)]
sam = ['sam{}'.format(i) for i in range(tt)]

coords = {"specie": spe, "sample": sam}

ind = pt.arange(specise)
cov = pt.as_tensor_variable(cov)

pp = pt.as_tensor_variable(np.zeros([tt,specise]))
true_abundance = pt.as_tensor_variable(np.zeros([tt,specise]))
obs_like = np.array(y_obs[:,:].tolist())
NN = np.sum(np.array(y_obs[:,:].tolist()),axis = 1)

zk = pt.arange(specise)
zp = pt.ones(specise)

start = time.time() 

def def_test2(ii, pll, U, ind_tem, expend_con):
    # For sample ii: map the latent row U[ii] onto the probability simplex with an
    # additive log-ratio (softmax-like) transform over the selected taxa; if no
    # taxon is selected (pll == 0), fall back to a uniform 1/62 vector.
    p2 = pll
    tem = pm.math.sum(pm.math.exp(U[ii, :p2-1]))
    concatenated_values = pm.math.concatenate((pm.math.exp(U[ii, :p2-1]) / (1 + tem), [1 / (1 + tem)]))
    updated_expend_con = pytensor.ifelse.ifelse(pt.eq(p2, 0),
                                                pt.set_subtensor(expend_con[ii, :], pt.zeros(62) + 1/62),
                                                pt.set_subtensor(expend_con[ii, ind_tem], concatenated_values))
    return updated_expend_con


def cholesky(A):
    # Hand-rolled Cholesky decomposition: builds L one lower-triangular entry at a
    # time with a scan over the (row, column) index pairs.
    n = A.shape[0]
    L = pt.zeros((n, n))
    
    def step(i, j, L, A):
        L = pytensor.ifelse.ifelse(pt.eq(i,j),
                                       pt.set_subtensor(L[i, j], pt.sqrt(A[i, i] - pt.sum(L[i, :i] ** 2))),
                                       pt.set_subtensor(L[i,j], (A[i, j] - pt.sum(L[i, :j] * L[j, :j])) / L[j, j] ))
        return L
    z = pt.tril_indices(n)
    
    L, _ = pytensor.scan(step,
                         sequences=[z[0], z[1]],
                         non_sequences=[A],
                         outputs_info = L)
    
    return L[-1]    

if __name__ == '__main__':
    with pm.Model(coords=coords) as Multi:        
        #########Prior########
        
        T = pm.Uniform('T', lower = 1, upper = 100)
        p = pm.Deterministic('p',1-pm.math.exp(-D_phylo[-1,:]/T)) # the last column is the reference taxon
        delta = pm.Bernoulli("delta",p = p, shape = specise)   #np.ones([specise])
        ind_tem = ind[pm.math.lt(delta[:],1)]
        pll = ind_tem.shape[0] 
                
        z_ind_tem = zk   
        ind_tem = pytensor.ifelse.ifelse(~pt.all(pll), z_ind_tem, ind_tem)
        zpll =    pytensor.ifelse.ifelse(~pt.all(pll), ind_tem.shape[0], pll)  
        
        theta = pm.Normal('theta', mu = 1 , sigma = 1, shape = specise)
        
        th_t = pt.zeros([specise])
        cov_t = pt.diag(pt.ones([specise]))
        th_t = pt.set_subtensor(th_t[0:zpll], theta[ind_tem])
        cov_t = pt.set_subtensor(cov_t[0:zpll,0:zpll], cov[ind_tem][:, ind_tem])
        
        U = pm.MvNormal("U", mu = th_t, cov = cov_t, shape = (tt,zpll)) 
       
                              
        ii = pt.arange((tt))
        expend_con = pt.zeros((tt,specise))+1e-10
        Results, _ = pytensor.scan(def_test2,
                                    sequences = [ii],
                                    non_sequences = [pll, U, ind_tem, expend_con])       
        expend_con = pt.set_subtensor(expend_con[ii,:],Results[ii,ii,:])      
        
        ########Likelihood########                         
        likelihood_y = pm.Multinomial('likelihood', n =NN[0:tt], p = expend_con, observed = obs_like[0:tt,:]) #N should be total counts of species of each sample
               
        ########sampling######## 
        trace = pm.sample(draws = 8000,  tune = 4000, chains = 4, cores = 4,  step = pm.NUTS()) #chain_length, tune = tune_length,        
        # az.plot_trace(trace)
        
        ########output########
        summary = az.summary(trace,hdi_prob=0.95,stat_focus = 'mean')
        print(summary)

I haven't run the code yet, but have you gotten any warnings alongside the output (which you should get if you have the latest version of PyMC)? Specifically, a maximum tree depth warning. You can get the tree depth by putting this at the end of the code:

print(trace.sample_stats['tree_depth'])

This returns an array, so if you see lots of 9s (for example), those trees have 2^9 branches. Put simply, the sampler is spending a lot longer calculating the Hamiltonian trajectory of each sampling step, so the sampling takes longer.
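For a quick summary of those values, here is a small sketch (using NumPy, which your script already imports) that tabulates how often each depth occurs across all chains and draws:

import numpy as np

# trace.sample_stats['tree_depth'] is a (chain, draw) array of integer depths
depths = trace.sample_stats['tree_depth'].values.ravel()
print(np.bincount(depths))  # index i holds how many steps reached depth i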

Just my preliminary thought before diving in, since that is something affecting my own model at the moment! Have you done any other profiling of this code that you can share? I think PyMC has a profiling function (though I have yet to use it) which could provide insight into where the computational expense is going.
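For what it's worth, a minimal profiling sketch along those lines (this assumes a recent PyMC where Model.profile is available); it times repeated evaluations of the compiled log-probability graph rather than the sampler itself:

# Profile 100 evaluations of the model's log-probability graph
profile = Multi.profile(Multi.logp(), n=100)
profile.summary()  # prints a per-Op timing breakdown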

Edit: I’m running the code now so I can look at the tree depth bit

Scans are slow, so that's a candidate for where to look. Is the cholesky function just doing a Cholesky decomposition? If so, you should be using pt.linalg.cholesky instead. That will be a lot faster, because the forward computation is farmed out to LAPACK and the backward computation is done using matrix adjoints, eliminating the iterative computation.
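For example, a rough sketch of that swap inside the model block (this assumes cov_t stays positive definite, which I have not checked for your delta-dependent indexing):

# Built-in Cholesky op: the forward pass is handled by LAPACK and the
# gradient by matrix adjoints, so no scan is needed
L_chol = pt.linalg.cholesky(cov_t)

# MvNormal can take the factor directly via `chol=` instead of the full covariance
U = pm.MvNormal("U", mu=th_t, chol=L_chol, shape=(tt, zpll))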

The result of the tree depth investigation (I dropped the draws to 100 and the tuning to 1000 to save time) is that the depths came out as 6.

That is still somewhat high, so it is a contributing factor, though again probably not your sole issue. As for what to do about tree depth: some tweaking can help, and reparameterisation is usually the big one. You already have quite a lot of tuning, that didn't fix my model, and given the long runtime you are seeing it probably won't help yours either. You can lower the maximum tree depth as an argument (see the sketch below), but it makes your exploration less efficient and I think can result in biased inference, so it's not ideal.
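If you did want to cap it anyway, here is a minimal sketch of passing the argument (PyMC's NUTS defaults to a maximum tree depth of 10):

with Multi:
    # Cap the tree depth at 8: cheaper per iteration, but exploration is less thorough
    step = pm.NUTS(max_treedepth=8)
    trace = pm.sample(draws=2000, tune=1000, chains=4, cores=4, step=step)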

Edit: I have just realised that a higher tree depth doesn't necessarily mean sampling will take longer overall. The sampler adapts to what it sees during the tuning phase, and exploring the full distribution during tuning can lead to more efficient sampling for your draws. So I'm actually not sure you need to edit the tree depth at all, and you are better off looking for improvements elsewhere.

Thank you for the reply. Does PyMC provide a function, like the profiler in MATLAB, to visualize the time spent on each line and help locate the bottleneck?

Thank you so much for the detailed explanation. I am trying to find a function that can help me figure out why the sampling is slow. In my understanding, the code above pm.sample() only takes ~0.5 s, so the whole program should not be that slow. Maybe the reality is more complicated than I thought. Many thanks again!

PyMC uses pytensor as a computational backend, which does lazy execution. Before you call pm.sample, nothing is computed; only a compute graph is constructed. This code will always execute quickly as a result – it’s totally independent of the complexity of the computation.
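Here is a tiny PyTensor example of what that lazy execution looks like; the ~0.5 s you measured above pm.sample only covers graph construction:

import pytensor
import pytensor.tensor as pt

x = pt.dvector("x")            # symbolic input
y = pt.exp(x).sum()            # builds a graph node; nothing is computed yet

f = pytensor.function([x], y)  # compilation happens here
print(f([0.0, 1.0, 2.0]))      # the numerical work happens only at this call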

See here for a discussion about profiling. I'm pretty sure the bottleneck is due to your scan, and I'm still curious why you're implementing a Cholesky decomposition by hand.

Haha, yes, I wrote the Cholesky decomposition by hand but in the end did not use it. At the beginning, I thought pm.MvNormal in 62 dimensions might cost a lot of time and wanted to simplify it with a Cholesky decomposition. Then I found the decomposition did not improve things much (maybe I am wrong; I will try the built-in decomposition as you suggest). I keep that function just because I wrote it myself and am reluctant to delete it, like a watermark for myself.

Your explanation makes it clear to me: scan uses a function I defined myself, which may drag down the sampling. Thank you for this information.

Thank you so much for inspiring me. Indeed, the scan was the main problem. I replaced the scan with vectorization, and the running time decreased from around 3.5 hours to 10 minutes.

Many thanks again!

I am pasting the revised code here for my own notes.

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import pymc as pm
import pytensor.tensor as pt
import arviz as az
import pytensor
import pytensor.ifelse  # explicit import so pytensor.ifelse.ifelse below is always available
import time

D_phylo = pd.read_csv('Combo_Dmatrix.csv')
COMBO_OTU = pd.read_csv('COMBO_OTU.csv')
y_obs = pd.read_csv('ynew3.csv')
y_true = pd.read_csv('truevalues.csv')

D_phylo = D_phylo.to_numpy()
D_phylo = D_phylo[:,1:]
COMBO_OTU = COMBO_OTU.to_numpy()
COMBO_OTU = COMBO_OTU[:,1:]

y_obs = y_obs.to_numpy()
y_obs = y_obs[:,1:]
y_true = y_true.to_numpy()

K1_combo = int(62 * 0.85)
# rho = 0.25
# sigma = 1
cov =  1**2* np.exp(-2* 0.25* D_phylo)
samples = y_obs.shape[0]
specise = y_obs.shape[1]
phai = np.zeros([samples,1])

N = np.sum(COMBO_OTU[0:samples,:], axis=1).reshape(-1, 1)

tt = samples
spe = ['spe{}'.format(i) for i in range(specise)]
sam = ['sam{}'.format(i) for i in range(tt)]
coords = {"specie": spe, "sample": sam}

ind = pt.arange(specise)
cov = pt.as_tensor_variable(cov)

pp = pt.as_tensor_variable(np.zeros([tt,specise]))
true_abundance = pt.as_tensor_variable(np.zeros([tt,specise]))
obs_like = np.array(y_obs[:,:].tolist())
NN = np.sum(np.array(y_obs[:,:].tolist()), axis = 1)
zk = pt.arange(specise)

start = time.time() 
print("RUNing...")
if __name__ == '__main__':
    with pm.Model(coords=coords) as Multi:        
        #########Prior########       
        T = pm.Uniform('T', lower = 1, upper = 100)
        p = pm.Deterministic('p',1-pm.math.exp(-D_phylo[-1,:]/T)) # the last column is the reference taxon
        delta = pm.Bernoulli("delta",p = p, shape = specise)   #np.ones([specise])
        ind_tem = ind[pm.math.lt(delta[:],1)]
        pll = ind_tem.shape[0] 
                
        ind_tem = pytensor.ifelse.ifelse(~pt.all(pll), zk, ind_tem)
        zpll =    pytensor.ifelse.ifelse(~pt.all(pll), ind_tem.shape[0], pll)  
        
        theta = pm.Normal('theta', mu = 1 , sigma = 1, shape = specise)
        
        th_t = pt.zeros([specise])
        cov_t = pt.diag(pt.ones([specise]))
        th_t = pt.set_subtensor(th_t[0:zpll], theta[ind_tem])
        cov_t = pt.set_subtensor(cov_t[0:zpll,0:zpll], cov[ind_tem][:, ind_tem])       
        U = pm.MvNormal("U", mu = th_t, cov = cov_t, shape = (tt,zpll))       
        
        # Vectorized replacement for the earlier scan: transform every sample's U row
        # onto the probability simplex at once.
        expend_con = pt.zeros((tt, specise)) + 1e-10
        tem = pm.math.sum(pm.math.exp(U[:, :pll-1]), axis = 1).reshape([tt, 1])
        concatenated_values = pm.math.concatenate((pm.math.exp(U[:, :pll-1]) / (1 + tem), 1 / (1 + tem)), axis = 1)
        expend_con = pytensor.ifelse.ifelse(pt.eq(pll, 0),
                                            pt.set_subtensor(expend_con[:, :], pt.zeros((tt, specise)) + 1/specise),
                                            pt.set_subtensor(expend_con[:, ind_tem], concatenated_values))
               
        ########Likelihood########                         
        likelihood_y = pm.Multinomial('likelihood', n =NN[0:tt], p = expend_con, observed = obs_like[0:tt,:])
               
        ########sampling######## 
        trace = pm.sample(draws = 2000,  tune = 1000, chains = 4, cores = 4,  step = pm.NUTS())   
        # az.plot_trace(trace)
        
        ########output########
        summary = az.summary(trace,hdi_prob=0.95,stat_focus = 'mean')
        print(summary)
        summary.to_excel('test2.xlsx')
        trace.to_netcdf("trace_information.nc")
        #trace = az.from_netcdf("trace_information.nc")