I’m trying my hand at multiclass classification with GPs. I already have a basic model that works, in which each GP is independent of the rest and I softmax their outputs to get the class probabilities. I’m now trying to build an ICM (intrinsic coregionalization model) version, but PyMC’s computation time has gone up to an unreasonable extent: the ETA for MCMC is around 500+ hours on a relatively good machine, SVGD doesn’t even initialize, and even ADVI takes dozens of hours. Is there any way to expedite the computations here? I don’t see why it’s so slow, save for the input shape required by the Coregion kernel: I’ve had to duplicate every row K times, for K categories, to massage the data into the expected format (a sketch of that reshaping is included between the two models below), which means the latent GP now works with an N·K × N·K covariance matrix instead of N × N. The following non-ICM version works acceptably well:
# MGP (one independent GP per class) without coregionalization
import numpy as np
import aesara.tensor as at
import pymc

with pymc.Model() as categorical_simple_model:
    factors = 5
    N, M = X_train.shape
    gps = []
    fs = []
    G = 1e7
    σ_w = pymc.Normal('σ_w', mu=10.0, sigma=3, shape=M)
    σ_b = pymc.Normal('σ_b', mu=0.0, sigma=0.1)
    for factor, loc in core_mappings[('origin', 'location', 'loc')].items():
        μ = pymc.gp.mean.Constant(0)
        # μ_g = pymc.Normal(f'μ_g_{loc}', mu=10.0, sigma=1.5)
        # σ_g = pymc.Normal(f'σ_g_{loc}', mu=2.5, sigma=0.5)
        μ_g = 10.0
        σ_g = 1.0
        η = pymc.Normal(f'η_{loc}', mu=μ_g, sigma=σ_g)
        κ = MultiLayerPerceptronKernel(M, variance=η**2, bias_variance=σ_b**2,
                                       weight_variance=σ_w**2 + 1e-4)
        gp = pymc.gp.Latent(mean_func=μ, cov_func=κ)
        gps.append(gp)
        f = gp.prior(f'f_{loc}', X=x_tensor)
        fs.append(f)
    # one latent function per class, stacked column-wise and pushed through a softmax
    f = pymc.Deterministic('f', at.stack(fs).T)
    p = pymc.Deterministic('p', at.nnet.softmax(f, axis=1))
    # y = pymc.Multinomial('y', p=p, observed=Y_train.values, n=1)
    y = pymc.Categorical('y', p=p, observed=y_tensor)
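For context, the row duplication I mentioned for the ICM model looks roughly like this (an illustrative sketch; the exact column order and helper names in my notebook may differ slightly):

# Build the expanded input consumed by the ICM model below (assumed layout):
# each row of X_train is repeated `factors` times and a task-index column
# (0 .. factors-1) is prepended, since Coregion(active_dims=[0]) reads the
# output/class index from column 0.
task_idx = np.tile(np.arange(factors), N)            # shape (N * factors,)
X_rep = np.repeat(X_train.values, factors, axis=0)   # shape (N * factors, M)
x = np.column_stack([task_idx, X_rep])               # shape (N * factors, M + 1)
x_tensor = at.as_tensor_variable(x)
# y_tensor stays one label per original row (length N), matching the (-1, factors) reshape later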
The ICM version takes hundreds of hours to run:
with pymc.Model() as categorical_ICM:
    LOWER_BOUND_NON_NEGATIVE = 1e-4
    # per-point noise scale (one per row of the expanded input x)
    σ_n = pymc.TruncatedNormal('σ_n', mu=10.0, sigma=2, lower=LOWER_BOUND_NON_NEGATIVE,
                               shape=x.shape[0], initval=np.ones(x.shape[0]))
    # coregionalization parameters: B = W Wᵀ + diag(k)
    k = pymc.TruncatedNormal('k', mu=10.0, sigma=2, lower=LOWER_BOUND_NON_NEGATIVE, shape=factors)
    W = (-1) * pymc.TruncatedNormal('W', mu=10.0, sigma=2.0, lower=LOWER_BOUND_NON_NEGATIVE,
                                    shape=(factors, 2))
    σ_b = pymc.TruncatedNormal('σ_b', mu=3.0, sigma=1.0, lower=LOWER_BOUND_NON_NEGATIVE)
    σ_w = pymc.TruncatedNormal('σ_w', mu=3.0, sigma=0.8, lower=LOWER_BOUND_NON_NEGATIVE, shape=x.shape[1])
    G = 1
    η = pymc.TruncatedNormal('η', mu=10, sigma=3.0, lower=LOWER_BOUND_NON_NEGATIVE)
    # data kernel over the feature columns; column 0 of x holds the task (class) index
    K = MultiLayerPerceptronKernel(x.shape[1], active_dims=list(range(1, x.shape[1])),
                                   variance=G * (η + 1.0), bias_variance=σ_b**2, weight_variance=σ_w**2)
    B = pymc.gp.cov.Coregion(x.shape[1], kappa=k, W=W, active_dims=[0])
    k_noise = pymc.gp.cov.WhiteNoise(σ_n)
    # ICM kernel: (data kernel + noise) multiplied by the coregionalization kernel
    κ = (K + k_noise) * B
    μ = pymc.gp.mean.Constant(c=0)
    gp = pymc.gp.Latent(mean_func=μ, cov_func=κ)
    _f = gp.prior('_f', X=x_tensor)
    # undo the K-fold row duplication: one column of latent values per class
    f = pymc.Deterministic('f', _f.reshape((-1, factors)))
    p = pymc.Deterministic('p', at.nnet.softmax(f, axis=1))
    y_obs = pymc.Categorical('y_obs', p=p, observed=y_tensor)
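For completeness, these are roughly the calls I’ve been timing (the draw/iteration counts here are illustrative, not the exact ones I used):

with categorical_ICM:
    idata = pymc.sample(1000, tune=1000)              # NUTS: estimated 500+ hours
    # approx = pymc.fit(n=50_000, method='advi')      # ADVI: runs, but takes dozens of hours
    # approx = pymc.fit(method='svgd')                 # SVGD: fails during initialization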