How to implement Zero-Inflated Multinomial?

OK, what do you think of this?

import numpy as np
import pymc3 as pm
import theano.tensor as tt

# Ncategories, Nclusters, cluster_id, predictor, available_cats, N and sim_R
# are defined by the data setup earlier (not shown).
with pm.Model() as m_slope:
    # one linear model per category, with the last category as pivot:
    mus_cats = []
    for p in range(Ncategories - 1):
        # Cholesky-factored covariance prior for the varying effects:
        sd_dist = pm.Exponential.dist(1)
        packed_chol = pm.LKJCholeskyCov(f"chol_cov_{p}", eta=4, n=2, sd_dist=sd_dist)
        chol = pm.expand_packed_triangular(2, packed_chol, lower=True)

        # extract rho and sds:
        cov = pm.math.dot(chol, chol.T)
        sigma_ab = pm.Deterministic(f"sigma_cluster_{p}", tt.sqrt(tt.diag(cov)))
        corr = tt.diag(sigma_ab ** -1).dot(cov.dot(tt.diag(sigma_ab ** -1)))
        r = pm.Deterministic(f"Rho_{p}", corr[np.triu_indices(2, k=1)])

        # priors on the means of the varying intercept and slope:
        ab = pm.Normal(
            f"ab_{p}", mu=np.array([-1.8, -0.5]), sd=np.array([0.1, 0.2]), shape=2
        )

        # varying effects:
        ab_cluster = pm.MvNormal(
            f"ab_cluster{p}", mu=ab, chol=chol, shape=(Nclusters, 2)
        )

        mus_cats.append(
            ab_cluster[cluster_id, 0]
            + ab_cluster[cluster_id, 1] * predictor
        )
    mus_cats = tt.as_tensor_variable(mus_cats).T  # shape (Nobs, Ncategories - 1)

    # append last category:
    vary_pivot = tt.as_tensor_variable(np.full(shape=(Nclusters, 1), fill_value=-2.2))
    mus_cats = tt.horizontal_stack(mus_cats, vary_pivot[cluster_id])

    # preferences in each cluster:
    lat_p = tt.nnet.softmax(mus_cats)

    # keep only preferences for available categories:
    cat_prob = available_cats * lat_p
    # normalize preferences:
    cat_prob = cat_prob / tt.sum(cat_prob, axis=1, keepdims=True)
    R = pm.Multinomial("R", n=N, p=cat_prob, observed=sim_R)
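
In case it helps, this is what the masking step at the end is meant to do, on made-up numbers (plain numpy, outside the model):

import numpy as np

# toy example: 3 observations, 4 categories
# 0/1 mask of which categories are available to each observation:
available_cats = np.array([
    [1, 1, 0, 1],
    [1, 0, 1, 1],
    [1, 1, 1, 1],
])
# latent softmax preferences (each row sums to 1):
lat_p = np.array([
    [0.4, 0.3, 0.2, 0.1],
    [0.25, 0.25, 0.25, 0.25],
    [0.1, 0.2, 0.3, 0.4],
])

cat_prob = available_cats * lat_p                          # zero out unavailable cats
cat_prob = cat_prob / cat_prob.sum(axis=1, keepdims=True)  # renormalize each row
print(cat_prob[0])  # [0.5, 0.375, 0., 0.125] -- category 2's mass is redistributed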

There are no divergences yet, but sampling is reeeeeaaaaaally slow, so I’m probably doing something wrong :thinking: Note that the intercept-only model samples very fast, so I think it’s the added covariance structure that slows things down.
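
One thing I haven’t tried yet, so take it as a sketch and not something I know fixes it: a non-centered parameterization of the varying effects, sampling standard Normals and shifting/scaling them with the Cholesky factor instead of handing chol to MvNormal directly:

# inside the per-category loop, in place of ab_cluster = pm.MvNormal(...):
z = pm.Normal(f"z_{p}", mu=0.0, sd=1.0, shape=(Nclusters, 2))
ab_cluster = pm.Deterministic(f"ab_cluster{p}", ab + tt.dot(chol, z.T).T)

If it’s the funnel geometry of the centered MvNormal that’s slowing NUTS down, this kind of reparameterization usually helps a lot.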
Do you see a way to optimize the above implementation?
Thank you very much in advance :champagne: