Implementing hierarchical priors encoded in tree structures

Based on your suggestion, below is my version, which at least runs. However, the resulting posterior does not give the expected results.

import pytensor
import numpy as np
import pytensor.tensor as pt
import pymc as pm

from pytensor.tensor.variable import TensorVariable
from pymc.pytensorf import collect_default_updates

# say we have 7 variables in a binary tree of 3 levels
n_levels = 3

num_samples = 500

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
rng = np.random.default_rng(RANDOM_SEED)

# the coefficients in the linear regression model
beta = np.arange(2**n_levels - 1)

X = np.random.random((num_samples, len(beta)))

# hide the variables corresponding to the leaf nodes, so the data carries
# no information about them
X[:, 3:] = 0.0

y = np.dot(X, beta) + rng.normal(scale=0.01, size=num_samples)
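
# layout note: grow_tree below stores the tree in breadth-first array order,
# so node i has children 2*i + 1 and 2*i + 2 (nodes 0-2 are internal, nodes
# 3-6 are the leaves hidden above); a quick check of that mapping:
for parent in range(2 ** (n_levels - 1) - 1):
    print(f"node {parent} -> children {2 * parent + 1}, {2 * parent + 2}")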

def hierarchical_prior_dist(init_vector, n_levels, size):
    def grow_tree(level: int, tree):
        # indices of the nodes at the parent level (level - 1)
        parent_start = 2 ** (level - 1) - 1
        parent_end = 2**level - 1
        parent_idx = pt.arange(parent_start, parent_end)

        # indices of the nodes at the current level (the children)
        child_start = 2**level - 1
        child_stop = 2 ** (level + 1) - 1
        child_idx = pt.arange(child_start, child_stop)

        # draw two children per parent, each centred on its parent's value
        children = pm.Normal.dist(
            mu=tree[parent_idx], sigma=1, size=(2, parent_idx.shape[0])
        )

        # write the children into their slots in the flat tree vector
        tree = pt.set_subtensor(tree[child_idx], children.T.ravel())

        # return the RNG updates so scan can thread them through
        return tree, collect_default_updates(outputs=[tree])

    # note: only the shape of init_vector is used here; its values are discarded
    init_tree = pm.Normal.dist(0, 1, shape=init_vector.shape)
    res, _ = pytensor.scan(
        grow_tree,
        sequences=[pt.arange(1, n_levels)],
        outputs_info=[init_tree],
    )
    return res[-1]
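
# optional sanity check (a sketch): draw a few trees straight from the forward
# graph; each child should scatter around its parent's value
prior_trees = pm.draw(
    hierarchical_prior_dist(pt.zeros(2**n_levels - 1), n_levels, None),
    draws=5,
    random_seed=RANDOM_SEED,
)
print(prior_trees.round(2))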


coords = {
    "predictor": np.arange(2**n_levels - 1),
    "sample": np.arange(X.shape[0]),
}


with pm.Model(coords=coords) as model:

    X_var = pm.MutableData(
        "X",
        X,
        dims=["sample", "predictor"],
    )

    init_beta = pt.zeros(2**n_levels - 1)
    init_beta = pt.set_subtensor(init_beta[0], pm.Normal.dist(0, 1))

    beta = pm.CustomDist(
        "beta",
        init_beta,
        n_levels,
        dist=hierarchical_prior_dist,
        logp=lambda value, *dist_params: -(
            (value[0] ** 2).sum()
            + (value[1:3] ** 2 / 2).sum()
            + (value[3:] ** 2 / 3).sum()
        ),
        dims=["predictor"],
    )

    sigma = pm.HalfCauchy("sigma", beta=5)

    likelihood = pm.Normal("y", mu=pm.math.dot(X_var, beta), sigma=sigma, observed=y)

    trace = pm.sample()

print(trace.posterior.beta.mean(("chain", "draw")).to_series())

And the output is something like:

predictor
0    0.000491
1    0.999587
2    1.999850
3    0.003031  # I would expect the value to be close to 1.0, since it's the child of variable 1
4    0.013212  # same as above
5    0.006006  # I would expect the value to be close to 2.0, since it's the child of variable 2
6    0.005180  # same as above

As can be seen, the posterior means of variables 3-6 are close to 0, rather than close to the posterior means of their parents.

What am I missing?

My hunch is that the logp function is defined incorrectly. To get the expected output, each child's mu would have to come from its parent's posterior.

Something like


    beta = pm.CustomDist(
        "beta",
        init_beta,
        n_levels,
        dist=hierarchical_prior_dist,
        logp=lambda value, *dist_params: -(
            (value[0] ** 2).sum()
            + ((value[1] - 0) ** 2).sum()
            + ((value[2] - 0) ** 2).sum()
            + ((value[3] - 1) ** 2).sum()
            + ((value[4] - 1) ** 2).sum()
            + ((value[5] - 2) ** 2).sum()
            + ((value[6] - 2) ** 2).sum()
        ),
        dims=["predictor"],
    )

However, I don’t see a way to access the posterior inside the logp function.
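
The closest workaround I can think of (a rough sketch, not something I have run) is to stop trying to read the posterior and instead write the logp purely in terms of value, conditioning each child entry on its parent entry, which is what the forward graph in hierarchical_prior_dist already does. With a hypothetical helper, say tree_logp:

def tree_logp(value, *dist_params):
    # sketch: root ~ Normal(0, 1); every other node i ~ Normal(value[(i - 1) // 2], 1),
    # i.e. each child is centred on its parent's entry of the same value vector
    node_idx = pt.arange(1, 2**n_levels - 1)
    parent_idx = (node_idx - 1) // 2
    root_logp = pm.logp(pm.Normal.dist(0, 1), value[0])
    child_logp = pm.logp(pm.Normal.dist(mu=value[parent_idx], sigma=1), value[node_idx])
    return root_logp + child_logp.sum()

# then pass logp=tree_logp to pm.CustomDist instead of the lambda above

Since the leaf columns of X are all zero, the likelihood says nothing about beta[3:], so with this logp I would expect the leaves' posteriors to be pulled toward their parents. Is this the right way to encode it, or is there a better approach?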