Understanding dimensions and groups with dot product in hierarchical model

I’m trying to build a hierarchical model using a dot product for the logit calculation, as shown in the code below:

D = 1000
model_df = pd.DataFrame({
    'group_name':np.random.randint(0,5, size=D),
    'a':np.random.binomial(n=1, p=.65, size=D),
    'b':np.random.binomial(n=1, p=.35, size=D),
    'target':np.random.binomial(n=1, p=.5, size=D),
})

print(model_df.corr())
print(model_df.describe())

model_features = ['a', 'b']

group_idxs, groups = pd.factorize(model_df.group_name)
coords = {
    "group": groups,
    "group_id": np.arange(len(group_idxs)),
    "feature_id" : np.arange(len(model_features))
}
n_features = len(model_features)
with pm.Model(coords=coords) as model:

    group_idx = pm.Data("group_idx", group_idxs, dims="group_id")

    regressors = pm.ConstantData("regressors", model_df[model_features], dims=("group_id","feature_id"))

    target = pm.ConstantData("target", model_df.target)

    #define priors
    mu_intercept = pm.Normal('mu_intercept', mu=0, sigma=10)
    sigma_intercept = pm.HalfCauchy('sigma_intercept', 5)
    intercept = pm.Normal('intercept', mu=mu_intercept, sigma=sigma_intercept, dims="group_id")

    mu_pooled = pm.Normal('mu_pooled', mu=0, sigma=5)
    sigma_pooled = pm.HalfCauchy('sigma_pooled', 5)
    pooled_betas = pm.Normal('pooled_betas', mu=mu_pooled, sigma=sigma_pooled, dims=("feature_id", "group_id"))

    logit = intercept[group_idx] + regressors[group_idx].dot(pooled_betas[group_idx]).sum(axis=1)

    p_i = pm.Deterministic('p_i', pm.math.invlogit(logit))

    #define the bernoulli likelihood
    y_observed = pm.Bernoulli('y_obs', p=p_i, observed=target)
    trace = pm.sample(1000, tune=1000, target_accept=.9, cores=2)

The above returns the following error:

/pymc/data.py:671: UserWarning: The `mutable` kwarg was not specified. Before v4.1.0 it defaulted to `pm.Data(mutable=True)`, which is equivalent to using `pm.MutableData()`. In v4.1.0 the default changed to `pm.Data(mutable=False)`, equivalent to `pm.ConstantData`. Use `pm.ConstantData`/`pm.MutableData` or pass `pm.Data(..., mutable=False/True)` to avoid this warning.
  warnings.warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
File ~/.cache/pypoetry/virtualenvs/ml-XNmo5OXX-py3.8/lib/python3.8/site-packages/aesara/compile/function/types.py:975, in Function.__call__(self, *args, **kwargs)
    973 try:
    974     outputs = (
--> 975         self.vm()
    976         if output_subset is None
    977         else self.vm(output_subset=output_subset)
    978     )
    979 except Exception:

IndexError: index 2 is out of bounds for axis 0 with size 2

During handling of the above exception, another exception occurred:

IndexError                                Traceback (most recent call last)
Cell In[176], line 46
     44 #define the bernoulli likelihood
     45 y_observed = pm.Bernoulli('y_obs', p=p_i, observed=target)
---> 46 trace = pm.sample(1000, tune=1000, target_accept=.9, cores=2)

File ~/.cache/pypoetry/virtualenvs/ml-XNmo5OXX-py3.8/lib/python3.8/site-packages/pymc/sampling.py:533, in sample(draws, step, init, n_init, initvals, trace, chain_idx, chains, cores, tune, progressbar, model, random_seed, discard_tuned_samples, compute_convergence_checks, callback, jitter_max_retries, return_inferencedata, idata_kwargs, mp_ctx, **kwargs)
    531         [kwargs.setdefault(k, v) for k, v in nuts_kwargs.items()]
    532     _log.info("Auto-assigning NUTS sampler...")
--> 533     initial_points, step = init_nuts(
    534         init=init,
    535         chains=chains,
    536         n_init=n_init,
    537         model=model,
    538         random_seed=random_seed_list,
    539         progressbar=progressbar,
    540         jitter_max_retries=jitter_max_retries,
    541         tune=tune,
    542         initvals=initvals,
    543         **kwargs,
    544     )
    546 if initial_points is None:
    547     # Time to draw/evaluate numeric start points for each chain.
    548     ipfns = make_initial_point_fns_per_chain(
    549         model=model,
    550         overrides=initvals,
    551         jitter_rvs=filter_rvs_to_jitter(step),
    552         chains=chains,
    553     )

File ~/.cache/pypoetry/virtualenvs/ml-XNmo5OXX-py3.8/lib/python3.8/site-packages/pymc/sampling.py:2487, in init_nuts(init, chains, n_init, model, random_seed, progressbar, jitter_max_retries, tune, initvals, **kwargs)
   2480 _log.info(f"Initializing NUTS using {init}...")
   2482 cb = [
   2483     pm.callbacks.CheckParametersConvergence(tolerance=1e-2, diff="absolute"),
   2484     pm.callbacks.CheckParametersConvergence(tolerance=1e-2, diff="relative"),
   2485 ]
-> 2487 initial_points = _init_jitter(
   2488     model,
   2489     initvals,
   2490     seeds=random_seed_list,
   2491     jitter="jitter" in init,
   2492     jitter_max_retries=jitter_max_retries,
   2493 )
   2495 apoints = [DictToArrayBijection.map(point) for point in initial_points]
   2496 apoints_data = [apoint.data for apoint in apoints]

File ~/.cache/pypoetry/virtualenvs/ml-XNmo5OXX-py3.8/lib/python3.8/site-packages/pymc/sampling.py:2381, in _init_jitter(model, initvals, seeds, jitter, jitter_max_retries)
   2379 if i < jitter_max_retries:
   2380     try:
-> 2381         model.check_start_vals(point)
   2382     except SamplingError:
   2383         # Retry with a new seed
   2384         seed = rng.randint(2**30, dtype=np.int64)

File ~/.cache/pypoetry/virtualenvs/ml-XNmo5OXX-py3.8/lib/python3.8/site-packages/pymc/model.py:1791, in Model.check_start_vals(self, start)
   1785     valid_keys = ", ".join(self.named_vars.keys())
   1786     raise KeyError(
   1787         "Some start parameters do not appear in the model!\n"
   1788         f"Valid keys are: {valid_keys}, but {extra_keys} was supplied"
   1789     )
-> 1791 initial_eval = self.point_logps(point=elem)
   1793 if not all(np.isfinite(v) for v in initial_eval.values()):
   1794     raise SamplingError(
   1795         "Initial evaluation of model at starting point failed!\n"
   1796         f"Starting values:\n{elem}\n\n"
   1797         f"Initial evaluation results:\n{initial_eval}"
   1798     )

File ~/.cache/pypoetry/virtualenvs/ml-XNmo5OXX-py3.8/lib/python3.8/site-packages/pymc/model.py:1832, in Model.point_logps(self, point, round_vals)
   1826 factors = self.basic_RVs + self.potentials
   1827 factor_logps_fn = [at.sum(factor) for factor in self.logp(factors, sum=False)]
   1828 return {
   1829     factor.name: np.round(np.asarray(factor_logp), round_vals)
   1830     for factor, factor_logp in zip(
   1831         factors,
-> 1832         self.compile_fn(factor_logps_fn)(point),
   1833     )
   1834 }

File ~/.cache/pypoetry/virtualenvs/ml-XNmo5OXX-py3.8/lib/python3.8/site-packages/pymc/aesaraf.py:695, in PointFunc.__call__(self, state)
...
 - TensorConstant{(1,) of 0}, Shape: (1,), ElemSize: 1 Byte(s), TotalSize: 1 Byte(s)
 TotalSize: 64186.0 Byte(s) 0.000 GB
 TotalSize inputs: 56138.0 Byte(s) 0.000 GB

The following accomplishes what I want, but the code isn’t as clean.

D = 1000


model_df = pd.DataFrame({
    'group_name':np.random.randint(0,5, size=D),
    'a':np.random.binomial(n=1, p=.65, size=D),
    'b':np.random.binomial(n=1, p=.35, size=D),
    'target':np.random.binomial(n=1, p=.5, size=D),
})

print(model_df.corr())
print(model_df.describe())

model_features = ['a', 'b']

group_idxs, groups = pd.factorize(model_df.group_name)
coords = {
    "group": groups,
    "group_id": np.arange(len(group_idxs)),
    "feature_id" : np.arange(len(model_features))
}
n_features = len(model_features)
with pm.Model(coords=coords) as model:

    group_idx = pm.Data("group_idx", group_idxs, dims="group_id")

    regressors = {}
    for feat_name in model_features:
        regressors[feat_name] = pm.Data(feat_name, model_df[feat_name], dims='group_id')

    target = pm.ConstantData("target", model_df.target)

    #define priors
    mu_intercept = pm.Normal('mu_intercept', mu=0, sigma=10)
    sigma_intercept = pm.HalfCauchy('sigma_intercept', 5)
    intercept = pm.Normal('intercept', mu=mu_intercept, sigma=sigma_intercept, dims="group_id")

    mu_pooled = pm.Normal('mu_pooled', mu=0, sigma=5)
    sigma_pooled = pm.HalfCauchy('sigma_pooled', 5)
    #pooled_betas = pm.Normal('pooled_betas', mu=mu_pooled, sigma=sigma_pooled, dims=("group_id","feature_id"))

    pooled_betas = {}
    for feat_name in model_features:
        pooled_betas[feat_name] = pm.Normal(f'{feat_name}_beta', mu=mu_pooled, sigma=sigma_pooled, dims="group_id")

    logit = intercept[group_idx]
    for feat_name in model_features:
        logit += regressors[feat_name][group_idx] * pooled_betas[feat_name][group_idx]

    p_i = pm.Deterministic('p_i', pm.math.invlogit(logit))

    #define the bernoulli likelihood
    y_observed = pm.Bernoulli('y_obs', p=p_i, observed=target)
    trace = pm.sample(1000, tune=1000, target_accept=.9, cores=2)

I sense that I’m missing something simple. If so, any suggestions are much appreciated. Thanks!

The proximate cause of your error is that in the logit = line you are indexing the first dimension of pooled_betas with group_idx, but the groups are on its second dimension. That’s where the indexing error comes from (the feature_id dimension only has size 2).
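For intuition, here is a quick NumPy sketch (hypothetical arrays mirroring your shapes, not the model variables) of where that IndexError comes from:

import numpy as np

pooled_betas = np.zeros((2, 1000))          # dims ("feature_id", "group_id")
group_idx = np.random.randint(0, 5, 1000)   # group codes 0..4

# pooled_betas[group_idx] indexes the first axis (size 2) with codes up to 4:
# IndexError: index 2 is out of bounds for axis 0 with size 2

betas_per_row = pooled_betas[:, group_idx]  # indexing the group axis instead works, shape (2, 1000)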

There’s another glitch, though: you shouldn’t be using a dot product here at all. When you use indexes to access batched hierarchical parameters, you have to do the dot product “by hand”. As written, regressors (shape (1000, 2)) dotted with pooled_betas[group_idx] (shape (2, 1000)) gives you a (1000, 1000) matrix whose (i, j)-th entry is the feature vector $x_i$ associated with row i multiplied by the coefficients $\beta_j$ associated with row j.

Obviously you only want $x_i \beta_i$ (the diagonal of that (1000, 1000) matrix), so you can just do an element-wise product and then take the sum:

logit = intercept[group_idx] + (regressors * pooled_betas[:, group_idx].T).sum(axis=-1)

Now you have (1000, 2) * (1000, 2), which gives rows of $[x_{i,1} \cdot \beta_{i,1}, \quad x_{i,2} \cdot \beta_{i,2}]$. Summing away the last dimension gives you the final “dot product”.

Note that I had to transpose pooled_betas. This could be avoided if you swap the dims when you define it.
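Here is a small NumPy check (hypothetical shapes, assuming 5 groups) confirming that the element-wise product summed over the feature axis matches the diagonal of the full matrix product:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 2))                  # regressors: one row per observation
B = rng.normal(size=(2, 5))                     # pooled_betas: dims ("feature_id", "group")
group_idx = rng.integers(0, 5, size=1000)       # group code per observation

by_hand = (X * B[:, group_idx].T).sum(axis=-1)  # the dot product "by hand"
full = X @ B[:, group_idx]                      # (1000, 1000) matrix; only its diagonal is wanted
assert np.allclose(by_hand, np.diag(full))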


Thanks for the quick reply. You’re right about the shape mismatch and about using the element-wise product + sum rather than the dot product.

In implementing the amendments, I realized that my previous setup was creating separate coefficients for each sample instead of one set per group. I’ve now changed the dims to use “group” instead of “group_id”, so that there is only one set of coefficients per group. Unfortunately, I’m still getting a similar error.

D = 1000
model_df = pd.DataFrame({
    'group_name':np.random.randint(0,5, size=D),
    'a':np.random.binomial(n=1, p=.65, size=D),
    'b':np.random.binomial(n=1, p=.35, size=D),
    'target':np.random.binomial(n=1, p=.5, size=D),
}).astype(int)

print(model_df.corr())
print(model_df.describe())

model_features = ['a', 'b']

group_idxs, groups = pd.factorize(model_df.group_name.sort_values())
coords = {
    "group": groups,
    "group_id": np.arange(len(group_idxs)),
    "feature_id" : np.arange(len(model_features))
}
n_features = len(model_features)
with pm.Model(coords=coords) as model:

    group_idx = pm.ConstantData("group_idx", group_idxs, dims="group_id")

    regressors = pm.ConstantData("regressors", model_df[model_features], dims=("group", "feature_id"))

    target = pm.ConstantData("target", model_df.target)

    #define priors
    mu_intercept = pm.Normal('mu_intercept', mu=0, sigma=10)
    sigma_intercept = pm.HalfCauchy('sigma_intercept', 5)
    intercept = pm.Normal('intercept', mu=mu_intercept, sigma=sigma_intercept, dims="group")

    mu_pooled = pm.Normal('mu_pooled', mu=0, sigma=5)
    sigma_pooled = pm.HalfCauchy('sigma_pooled', 5)
    pooled_betas = pm.Normal('pooled_betas', mu=mu_pooled, sigma=sigma_pooled, dims=("feature_id", "group"))

    logit = intercept[group_idx] + (regressors[group_idx] * pooled_betas[group_idx]).sum(axis=1)

    p_i = pm.Deterministic('p_i', pm.math.invlogit(logit))

    #define the bernoulli likelihood
    y_observed = pm.Bernoulli('y_obs', p=p_i, observed=target)
    trace = pm.sample(1000, tune=1000, target_accept=.9, cores=2)

You are still indexing pooled_betas by group_idx on the feature dimension; it needs to index the group (second) dimension instead.
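Concretely, a sketch of the fix for your latest model (same transpose trick as above):

logit = intercept[group_idx] + (regressors * pooled_betas[:, group_idx].T).sum(axis=-1)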

You’re right, thanks.