Implementing hierarchical priors encoded in tree structures

I would like to model a linear regression problem, in which the input random variables’ relations are encoded in a tree structure.

More specifically, each tree node corresponds to an input random variable. For each non-root node n, denote the corresponding input variable as X_n. Further, denote n's parent node as p(n) and the corresponding variable as X_{p(n)}. Then we have the following relations:

  • Each input random variable follows a normal distribution, whose mean follows another normal distribution defined by its parent.
  • The random variable corresponding to the tree root is normally distributed with a fixed mean.

In the case of a binary tree of 3 levels (with 7 variables), the model looks like this:
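(Numbering the 7 nodes 0–6 in breadth-first order, matching the parent array used in the MWE below:)

X_0 ~ Normal(mu_0, sigma)        # root, fixed mean (mu_0 = 0 in the MWE)
X_1, X_2 ~ Normal(X_0, sigma)    # children of the root
X_3, X_4 ~ Normal(X_1, sigma)    # children of X_1
X_5, X_6 ~ Normal(X_2, sigma)    # children of X_2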

Finally, for simplicity, all normal distributions in this example have the same \sigma.

Below is an MWE, in which a binary tree is used.

  • You can specify the number of levels by setting n_levels.
  • The tree structure is encoded in a “parent array”, where the i-th element contains the index of the parent of the i-th node (the root has parent -1).
import numpy as np
import pymc as pm

def build_parent_array(n_levels: int) -> np.ndarray:
    """
    utility function to build the parent array for a binary tree of n levels

    Examples:

    >>> build_parent_array(1)  # [-1]
    >>> build_parent_array(2)  # [-1, 0, 0]
    >>> build_parent_array(3)  # [-1, 0, 0, 1, 1, 2, 2]
    """
    assert n_levels >= 1
    parent_mapping = [-1]
    cur = 0
    for l in range(1, n_levels):
        for par in range(2 ** (l - 1)):
            parent_mapping += [cur, cur]
            cur += 1
    return np.array(parent_mapping, dtype=int)

n_levels = 7  # you may decrease this value to make the code run

parent_array = build_parent_array(n_levels)
print(f"number of variables: {len(parent_array)}")

SIGMA = 10
with pm.Model() as model:
    input_vars = []
    for var_id, parent in enumerate(parent_array):
        if parent == -1:
            input_vars.append(
                pm.Normal("var-" + str(var_id), mu=0, sigma=SIGMA)
            )
        else:
            input_vars.append(
                pm.Normal(
                    "var-" + str(var_id), mu=input_vars[parent], sigma=SIGMA
                )
            )
    inputs = pm.Deterministic("inputs", pm.math.stack(input_vars))
    pm.sample()

However, I observe that the code does not “scale” with the tree size.

For instance, setting n_levels beyond 6 gives the following error:

fatal error: bracket nesting level exceeded maximum of 256

My questions are:

  • If we stick to the above modeling approach, how can I circumvent the maximum nesting level limit?
  • Is there another modeling approach that achieves a similar effect, i.e., a variable's prior depends on its parent, without suffering from the above issue?

The symbolic structure for doing recursive computation is pytensor.scan; you can read about its usage here. (The error you are seeing most likely comes from the C code PyTensor generates for the deeply nested graph that the node-by-node Python loop builds; scan keeps the graph small regardless of tree depth.) You just have to change how you're thinking about the problem. First, you will need to store the whole population of nodes in a single flat vector. At each step of the scan, you compute the parent locations, perturb the parent values, and store the results in the children's slots. Here's a plain numpy binary tree to illustrate what I mean:

import numpy as np
n_levels = 7

# This will hold the whole population of nodes
binary_tree = np.zeros(2 ** n_levels - 1)

for level in range(1, n_levels):
    parent_start = 2 ** (level - 1) - 1
    parent_end = 2 ** level - 1
    parent_idx = np.arange(parent_start, parent_end)
    
    child_start = 2 ** level - 1
    child_stop = 2 ** (level + 1) - 1
    child_idx = np.arange(child_start, child_stop)
    
    # Here the mutation is just to add one to the left node and subtract one from the right node
    binary_tree[child_idx] = binary_tree[parent_idx].repeat(2) + np.tile(np.array([1, -1]), len(parent_idx))

Visualize the results:

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

geneology = np.r_[-1, np.arange(2 ** (n_levels-1) - 1).repeat(2)]
edges = pd.DataFrame(np.c_[geneology, np.arange(2 ** n_levels - 1)], columns=['parent', 'child'])
edges['weight'] = binary_tree

G = nx.from_pandas_edgelist(edges.iloc[1:], 'parent', 'child', edge_attr='weight', create_using=nx.DiGraph)
pos = nx.drawing.nx_pydot.graphviz_layout(G, prog='dot', root=0)

fig, ax = plt.subplots(figsize=(14,4))
nx.draw_networkx(G, pos, node_color=edges.weight, labels=edges.weight, ax=ax)

Looks right to me, so we can go ahead and implement this in pytensor using a scan and random variables:

import pytensor
import pytensor.tensor as pt
import pymc as pm

def grow_tree(level, tree, sigma):
    parent_start = 2 ** (level - 1) - 1
    parent_end = 2 ** level - 1
    parent_idx = pt.arange(parent_start, parent_end)
    
    child_start = 2 ** level - 1
    child_stop = 2 ** (level + 1) - 1
    child_idx = pt.arange(child_start, child_stop)
    
    # Shape of children is (2, n_parents) because each parent has 2 offspring
    children = pm.Normal.dist(mu=tree[parent_idx], 
                                sigma=sigma, 
                                size=(2, parent_idx.shape[0]))
    
    # Need to transpose children because ravel operates row-major. Children is currently shape
    # (child, parent), which will ravel to [child 1 of parent 1, child1 of parent 2, child 1 of parent 3, 
    # ...], but we want [child 1 of parent 1, child 2 of parent 1, child 1 of parent 2, ...]
    tree = pt.set_subtensor(tree[child_idx], children.T.ravel())
    
    return tree

n_levels = 7
init_tree = pt.zeros(2 ** n_levels - 1)
init_tree = pt.set_subtensor(init_tree[0], pm.Normal.dist(0, 1))

res, _ = pytensor.scan(grow_tree,
                       sequences=[pt.arange(1, n_levels)],
                       outputs_info=[init_tree],
                       non_sequences=[1]) 

final_tree = res[-1]
lowest_level = final_tree[2**(n_levels - 1) - 1:2 ** n_levels - 1]

You can do lowest_level.eval() to check what you get out. If you want to use this inside a PyMC model, you will have some additional challenges, because you are missing some boilerplate related to scan and CustomDist; see here for guidance. More importantly, I don’t think PyMC can infer the logp of models with slicing – @ricardoV94? Luckily, your model is just a really strange random walk with drift, so it should be possible to work out the logp yourself and implement it.
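To make that last point a bit more concrete: since the root is Normal(0, sigma) and every other node is Normal(its parent's value, sigma), the joint log-density of a flat value vector factorizes over the edges of the tree. A rough, untested sketch using the breadth-first parent array from the original post (not wired into a CustomDist here):

import pymc as pm

def tree_logp(value, parent_array, sigma):
    # root node: Normal(0, sigma)
    lp = pm.logp(pm.Normal.dist(0.0, sigma), value[0])
    # every non-root node: Normal(its parent's value, sigma)
    lp += pm.logp(pm.Normal.dist(value[parent_array[1:]], sigma), value[1:]).sum()
    return lp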

Hi, many thanks for your detailed reply! I’ve learned a lot (I’m neither fluent in probabilistic modeling nor in PyMC).

I tried your suggestion above and at least managed to make the code run. However, I was not able to achieve the result I’m looking for.

I will attach my partially-working solution in a separate reply.

My goal:

  • I would like to build a linear model to solve a regression problem, in which the input variables correspond to different “categories”
  • the categories can be organized into a tree structure, for instance, a general category (e.g., fruit) contains more specific ones (e.g., apple and orange) as children
  • Finally, I would like to incorporate such relations into the regression model, so that the distribution of a general category informs the distributions of its children.
    • this would be useful when very few observations of the child variables are available

Below is an example to illustrate the expected outcome.

(Note that this example does not use the vectorized approach suggested above and thus does not scale.)

Say I have 5 variables/categories, A to E, where A is the root and is the parent of B and C, and B is the parent of D and E.

from treelib import Tree

# say we have 5 variables, with the following relations
tree_dict = {"A": {"B": {"D": None, "E": None}, "C": None}}

# Create a new tree
tree = Tree()


# Function to add nodes recursively
def add_nodes(tree, parent, name_to_children):
    for name, children in name_to_children.items():
        tree.create_node(name, name, parent=parent)
        if children:
            add_nodes(tree, name, children)


# Add nodes to the tree
add_nodes(tree, None, tree_dict)

# Print the tree
print(tree)

which gives the following tree structure

A
├── B
│   ├── D
│   └── E
└── C

Then I build the model using recursion as follows

import numpy as np
import pymc as pm
from pymc import HalfCauchy, Model, Normal

RANDOM_SEED = 8927
rng = np.random.default_rng(RANDOM_SEED)


def get_model(tree, X_data, y_data):
    predictor_names = list(sorted(tree.nodes.keys()))
    coords = {
        "predictor": predictor_names,
        "sample": np.arange(X_data.shape[0]),
    }
    with Model(coords=coords) as model:
        SIGMA = 10

        def build_beta(tree, node_name, parent_predictor):
            """construct the input variables recursively, note that it does not scale for large trees"""
            ret = {}
            if parent_predictor is None:
                # root
                predictor = Normal(name=node_name, mu=0, sigma=SIGMA)
            else:
                predictor = Normal(
                    name=node_name, mu=parent_predictor, sigma=SIGMA
                )

            ret[node_name] = predictor

            for child in tree.children(node_name):
                ret |= build_beta(tree, child.identifier, predictor)
            return ret

        X = pm.MutableData(
            "X",
            X_data,
            dims=["sample", "predictor"],
        )

        # Define priors
        sigma = HalfCauchy("sigma", beta=5)
        beta_var_dict = build_beta(tree, tree.root, None)
        beta = pm.Deterministic(
            "beta",
            pm.math.stack(
                [
                    beta_var_dict[pred_name]
                    for pred_name in model.coords["predictor"]
                ]
            ),
            dims="predictor",
        )

        # Define likelihood
        likelihood = Normal(
            "y", mu=pm.math.dot(X, beta), sigma=sigma, observed=y_data
        )
    return model

To illustrate my point, I fit the model on some synthetic data in which certain variables are “masked out”. The expectation is that information about the masked-out variables can be inferred from their parents’ posteriors.

In the example below, I deliberately masked the values corresponding to D and E, therefore nothing can be learned about D and E from the data if we assume the variables are independent.

However, under the above model specification, D and E are children of B. Therefore, the means of their posteriors should be close to the mean of B’s posterior (which is 2).


# beta is the coefficients of the underlying model
#                A  B  C  D  E
beta = np.array([1, 2, 3, 4, 5])

num_samples = 500

X = np.random.random((num_samples, len(beta)))
# "hide" D and E from the input data by zeroing them out
X[:, -2:] = 0

y = np.dot(X, beta) + rng.normal(scale=0.001, size=num_samples)

with get_model(tree, X, y) as model:
    pm.set_data({"X": X})
    trace = pm.sample(tune=3000)
print(trace.posterior.beta.mean(axis=1).mean(axis=0).to_series())

The output is something like:

predictor
A    1.000126
B    1.999723
C    3.000123
D    2.016451 -> close to B's mean
E    1.975278 -> same as above

Note that D and E’s posterior mean “inherits” from B’s posterior mean, due to the tree structure incorporated in the model.

My questions are perhaps a bit open:

  • assuming that we can implement the above model using pytensor.scan (as suggested in the previous reply), is such an approach a reasonable way to incorporate structural dependencies among the variables?

I have also been thinking of using a multivariate Gaussian. The main idea is to use the covariance matrix to capture the inter-dependency among the variables; however, I am not sure how to construct that covariance matrix. A rough sketch of the idea is below.
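Assuming the breadth-first parent-array encoding from my first post: each variable is its parent’s value plus an independent Normal(0, sigma) increment, so Cov(X_i, X_j) should be sigma**2 times the number of root-to-node path nodes that i and j share. Something along these lines (tree_covariance is just a name I made up, and I have not verified it inside a full model):

import numpy as np

def tree_covariance(parent_array, sigma):
    """Covariance implied by the tree prior: Cov(i, j) = sigma**2 * (#path nodes shared by i and j)."""
    n = len(parent_array)

    def path(i):
        # set of nodes on the root-to-i path, inclusive
        nodes = set()
        while i != -1:
            nodes.add(int(i))
            i = parent_array[i]
        return nodes

    paths = [path(i) for i in range(n)]
    return np.array(
        [[sigma**2 * len(paths[i] & paths[j]) for j in range(n)] for i in range(n)]
    )

# e.g. for the 7-node binary tree:
# parent_array = np.array([-1, 0, 0, 1, 1, 2, 2])
# beta = pm.MvNormal("beta", mu=np.zeros(len(parent_array)), cov=tree_covariance(parent_array, SIGMA))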

Based on your suggestion, below is my version, which at least runs. However, the resulting posterior does not give the expected results.

import pytensor
import numpy as np
import pytensor.tensor as pt
import pymc as pm

from pytensor.tensor.variable import TensorVariable
from pymc.pytensorf import collect_default_updates

# say we have 7 variables in a binary tree of 3 levels
n_levels = 3

num_samples = 500

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
rng = np.random.default_rng(RANDOM_SEED)

# the coefficients in the linear regression model
beta = np.arange(2**n_levels - 1)

X = np.random.random((num_samples, len(beta)))

# we hide the variables corresponding to the leaf nodes
X[:, 3:] = 0.0

y = np.dot(X, beta) + rng.normal(scale=0.01, size=num_samples)

def hierarchical_prior_dist(init_vector, n_levels, size):
    def grow_tree(level: int, tree):
        parent_start = 2 ** (level - 1) - 1
        parent_end = 2**level - 1
        parent_idx = pt.arange(parent_start, parent_end)

        child_start = 2**level - 1
        child_stop = 2 ** (level + 1) - 1
        child_idx = pt.arange(child_start, child_stop)

        children = pm.Normal.dist(
            mu=tree[parent_idx], sigma=1, size=(2, parent_idx.shape[0])
        )

        tree = pt.set_subtensor(tree[child_idx], children.T.ravel())

        return tree, collect_default_updates(outputs=[tree])

    init_tree = pm.Normal.dist(0, 1, shape=init_vector.shape)
    res, _ = pytensor.scan(
        grow_tree,
        sequences=[pt.arange(1, n_levels)],
        outputs_info=[init_tree],
    )
    return res[-1]


coords = {
    "predictor": np.arange(2**n_levels - 1),
    "sample": np.arange(X.shape[0]),
}


with pm.Model(coords=coords) as model:

    X_var = pm.MutableData(
        "X",
        X,
        dims=["sample", "predictor"],
    )

    init_beta = pt.zeros(2**n_levels - 1)
    init_beta = pt.set_subtensor(init_beta[0], pm.Normal.dist(0, 1))

    beta = pm.CustomDist(
        "beta",
        init_beta,
        n_levels,
        dist=hierarchical_prior_dist,
        logp=lambda value, *dist_params:  -((value[0]**2).sum() + (value[1:3]**2 / 2).sum() + (value[3:]**2 / 3).sum()),
        dims=["predictor"],
    )

    sigma = pm.HalfCauchy("sigma", beta=5)

    likelihood = pm.Normal("y", mu=pm.math.dot(X_var, beta), sigma=sigma, observed=y)

    trace = pm.sample()

print(trace.posterior.beta.mean(axis=1).mean(axis=0).to_series())

And the output is something like:

predictor
0    0.000491
1    0.999587
2    1.999850
3    0.003031  # I would expect the value to be close to 1.0, since it's the child of variable 1
4    0.013212  # same as above
5    0.006006  # I would expect the value to be close to 2.0, since it's the child of variable 2
6    0.005180  # same as above

As can be seen, the posterior means of variables 3–6 are close to 0, instead of being close to the means of their parents’ posteriors.

What am I missing?

My hunch is that the logp function is defined incorrectly. To get the expected output, the mu of the posterior should be used.

Something like


    beta = pm.CustomDist(
        "beta",
        init_beta,
        n_levels,
        dist=hierarchical_prior_dist,
        logp=lambda value, *dist_params: -(
            (value[0] ** 2).sum()
            + ((value[1] - 0) ** 2).sum()
            + ((value[2] - 0) ** 2).sum()
            + ((value[3] - 1) ** 2).sum()
            + ((value[4] - 1) ** 2).sum()
            + ((value[5] - 2) ** 2).sum()
            + ((value[6] - 2) ** 2).sum()
        ),
        dims=["predictor"],
    )

However, I don’t see a way to access the posterior inside the logp function.

If you have a fixed tree structure with known depth and relationships, for example country → state → city, you can use a solution like this one (although it appears that code has some bugs!); the gist of the trick is sketched below. It seems like what you want is slightly different from the binary branching you presented in the first post? Maybe it would be best if you go back to the beginning and explain what your exact problem setup and model is, and we can go from there.
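The core of that “telescoping” technique, as far as I understand it, is to keep one coefficient vector per level of the hierarchy and to build each level’s prior by fancy-indexing the parent level’s coefficients. A minimal sketch with made-up sizes and a made-up state-to-country index array:

import numpy as np
import pymc as pm

# two countries; three states, the first two in country 0 and the third in country 1
country_of_state = np.array([0, 0, 1])

with pm.Model():
    beta_country = pm.Normal("beta_country", mu=0, sigma=1, shape=2)
    # each state's coefficient is centered on its country's coefficient
    beta_state = pm.Normal("beta_state", mu=beta_country[country_of_state], sigma=1, shape=3)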


Apologies for the confusion! Yes, I used a binary tree for the sake of exposition.

In my case, the input tree is a general tree (with arbitrary depth and branching factor, etc). For instance, country -> state -> city -> street -> building number -> ....

I will check the telescoping example and get back to you if needed!

The ‘telescoping’ technique works for me! Thanks for sharing the pointer.

Below is an MWE for my use case:

import pytensor
import pandas as pd
import pymc as pm
import numpy as np

# create a binary tree of 3 levels with 7 nodes
# the dataframe contains leaves only, so 4 rows
df = pd.DataFrame(
    [
        ["A", "B", "D"],
        ["A", "B", "E"],
        ["A", "C", "F"],
        ["A", "C", "G"],
    ],
    columns=["level_0", "level_1", "level_2"],
)


def make_var_at_level(name: str, var_of_parent_level, mapping: np.ndarray, dims=None):
    """create an RV on a certain level, based on the RV of its parent level"""

    return pm.Normal(name, mu=var_of_parent_level[mapping], sigma=5, dims=dims)


# factorize each level to get its unique labels (only the labels are used below)
level_0_idx, level_0_labels = pd.factorize(df["level_0"])  # labels: ["A"]
level_1_idx, level_1_labels = pd.factorize(df["level_1"])  # labels: ["B", "C"]

edges = df[["level_0", "level_1", "level_2"]].drop_duplicates()

# for each level-1 node (sorted: B, C), the index of its level-0 parent -> [0, 0]
level_0_idx_by_level_1 = (
    edges[["level_0", "level_1"]]
    .drop_duplicates()
    .set_index("level_1")["level_0"]
    .sort_index()
    .map({k: i for i, k in enumerate(level_0_labels)})
    .values
)
# for each level-2 node (sorted: D, E, F, G), the index of its level-1 parent -> [0, 0, 1, 1]
level_1_idx_by_level_2 = (
    edges[["level_1", "level_2"]]
    .drop_duplicates()
    .set_index("level_2")["level_1"]
    .sort_index()
    .map({k: i for i, k in enumerate(level_1_labels)})
    .values
)


# create the toy dataset for a linear regression problem
num_samples = 500

n_levels = 3
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
rng = np.random.default_rng(RANDOM_SEED)

# the coefficients in the linear regression model
# 0, 1, 2, 3, 4, 5, 6
# however, we 'mask out' the trailing 4 entries (corresponding to the leaves)
beta = np.arange(2**n_levels - 1)

X = np.random.random((num_samples, len(beta)))

# we hide the variables corresponding to the leaf nodes
X[:, 3:] = 0.0

y = np.dot(X, beta) + rng.normal(scale=0.01, size=num_samples)

coords = {
    "level_0": ["A"],
    "level_1": ["B", "C"],
    "level_2": ["D", "E", "F", "G"],
    "predictor": ["A", "B", "C", "D", "E", "F", "G"],
}
with pm.Model(coords=coords):
    X_var = pm.MutableData(
        "X",
        X,
        dims=["sample", "predictor"],
    )

    beta_level_0 = pm.Normal("beta_level_0", 0, 1, dims="level_0")
    beta_level_1 = make_var_at_level(
        "beta_level_1", beta_level_0, mapping=level_0_idx_by_level_1, dims="level_1"
    )
    beta_level_2 = make_var_at_level(
        "beta_level_2", beta_level_1, mapping=level_1_idx_by_level_2, dims="level_2"
    )

    beta = pm.Deterministic(
        "beta",
        pm.math.concatenate([beta_level_0, beta_level_1, beta_level_2]),
        dims="predictor",
    )

    sigma = pm.HalfCauchy("sigma", beta=5)

    likelihood = pm.Normal("y", mu=pm.math.dot(X_var, beta), sigma=sigma, observed=y)

    trace = pm.sample()

print(trace.posterior.beta.mean(axis=1).mean(axis=0))

The output is something like:

array([4.34063445e-04, 9.99617637e-01, 1.99989857e+00, 9.32247531e-01,
       1.04552970e+00, 2.06996202e+00, 2.01758882e+00])

Note that the trailing 4 entries indeed “inherit” values from their parents: D and E end up close to B’s value (≈ 1), and F and G close to C’s value (≈ 2).