How to properly use dims in a 3-level hierarchical model

Hello.

I have a sales forecasting model (the example below uses generated toy data) with three separate levels: store location informs overall business-line sales, which in turn informs item-level sales. To help construct the model, I've worked from this notebook: Jupyter Notebook Viewer (nbviewer.org)

That said, I'm not sure how to properly use dims. For the mean/std of the item-level parameters I use the business-line mean/std, and for the business-line parameters I use the location mean/std. Is that correct?

Ultimately, I want to predict item sales at each location.

I've tried the code below to start out with.

import random

import numpy as np
import pandas as pd
import scipy.stats as stats

import aesara
from aesara import tensor as at
import arviz as az
import pymc as pm
import pymc.sampling_jax

random.seed(439)
# generate toy data: 72 months x 5 locations x 20 products
data = stats.poisson(mu=[5, 2, 1, 7, 2]).rvs([1440, 5]).T.ravel()
dates = pd.date_range('2016-01-01', freq='M', periods=72)
products = random.sample(range(1, 100), 20)
business_line = ['nutrition', 'home_care', 'beauty', 'durables', 'personal_care'] * 4
items = dict(zip(products, business_line))  # product -> business line
locations = [f'location_{i}' for i in range(5)]
df = pd.DataFrame(data, index=pd.MultiIndex.from_product([dates, locations, items]), columns=['eaches'])
df.reset_index(inplace=True)
df.rename(columns={'level_0': 'date', 'level_1': 'location', 'level_2': 'product'}, inplace=True)
df['business_line'] = df['product'].map(items)
df.set_index(['date', 'location', 'product'], inplace=True)

df_train = df.loc[:'2020-12-31']
df_test = df.loc['2021-01-31':]  # hold out the final year


def create_fourier_features(t, n, p=365.25):
    x = 2 * np.pi * (np.arange(n)+1) * t[:, None] / p
    return np.concatenate((np.cos(x), np.sin(x)), axis = 1)
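# e.g. create_fourier_features(t, n=10, p=12) returns shape (len(t), 20):
# 10 cosine columns followed by 10 sine columns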

# integer index arrays plus unique labels for each hierarchy level
# (note: `items` is rebound here from the dict above to the unique product codes)
location_idxs, locations = pd.factorize(df_train.index.get_level_values(1))
time_idxs, times = pd.factorize(df_train.index.get_level_values(0))
item_idxs, items = pd.factorize(df_train.index.get_level_values(2))
bl_idxs, bl = pd.factorize(df_train['business_line'])
eaches = np.array(df_train['eaches'])

t = time_idxs / max(time_idxs)  # scale the time index to [0, 1]

# one-hot indicators for September and March, then the raw month numbers
month = pd.get_dummies(df_train.index.get_level_values(0).month)
sept = np.array(month[9])
march = np.array(month[3])
month = np.array(df_train.index.get_level_values(0).month)

n_yearly = 10
n_monthly = 5

# seasonal Fourier features (the toy data is monthly, not weekly)
yearly_fourier = create_fourier_features(t, n_yearly, 12)
monthly_fourier = create_fourier_features(t, n_monthly, 1)

n_changepoints = 8


s = np.linspace(0, np.max(t), n_changepoints + 2)[1:-1]  # interior changepoint locations on the scaled time axis


a = (t[:, None] > s)*1
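# each column of a flips from 0 to 1 once t passes the corresponding changepoint in s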

n_components = 10  # (not used below)


# coords map each dim name to its labels; PyMC infers variable shapes from these
coords_ = {
    "location": locations,
    "item": items,
    "business_line": bl,
    "yearly_components": [f'yearly_{f}_{i+1}' for f in ['cos', 'sin'] for i in range(n_yearly)],
    "monthly_components": [f'monthly_{f}_{i+1}' for f in ['cos', 'sin'] for i in range(n_monthly)],
    "changepoints": df_train.index.get_level_values(0)[np.argwhere(np.diff(a, axis=0) != 0)[:, 0]],
    "obs_id": [f'{loc}_{time.year}_month_{time.month}_item_{item}' for time, loc, item in df_train.index.values],
}

with pm.Model(coords=coords_) as partial_pooled_model:
    
    location_idxs_ = pm.MutableData('location_idx', location_idxs, dims='obs_id')
    item_idxs_ = pm.MutableData('item_idx', item_idxs, dims='obs_id')
    bl_idxs_ = pm.MutableData('bl_idx', bl_idxs, dims='obs_id')
    t_ = pm.MutableData('t', t)
    a_ = pm.MutableData('a', a)
    s_ = pm.MutableData('s', s)
    eaches_ = pm.MutableData('eaches', eaches)
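    # (the MutableData containers above can be swapped for test-set values
    #  with pm.set_data when predicting on new data)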
    # location
    mu_k_location = pm.Normal('mu_k_location', 0, 1, dims='location')
    sigma_k_location = pm.HalfCauchy('sigma_k_location', 5, dims='location')

    mu_m_location = pm.Normal('mu_m_location', 0, 1, dims='location')
    sigma_m_location = pm.HalfCauchy('sigma_m_location', 5, dims='location')

    # business line
    mu_k_bl = pm.Normal('mu_k_bl', mu_k_location, sigma_k_location, dims='business_line')
    sigma_k_bl = pm.HalfCauchy('sigma_k_bl', 5, dims='business_line')

    mu_m_bl = pm.Normal('mu_m_bl', mu_m_location, sigma_m_location, dims='business_line')
    sigma_m_bl = pm.HalfCauchy('sigma_m_bl', 5, dims='business_line')

    # items
    mu_k_item = pm.Normal('mu_k_item', mu_k_bl, sigma_k_bl)
    sigma_k_item = pm.HalfCauchy('sigma_k_item', 5)
    k = pm.Normal('k_item', mu_k_item, sigma_k_item, dims='item')

    mu_m_item = pm.Normal('mu_m_item', mu_m_bl, sigma_m_bl)
    sigma_m_item = pm.HalfCauchy('sigma_m_item', 5)
    m = pm.Normal('m_item', mu_m_item, sigma_m_item, dims='item')

    delta = pm.Laplace('delta', 0, 0.1, dims=('item', 'location', 'changepoints'))

    growth = pm.Deterministic('growth', k[item_idxs_] + (a_[time_idxs] * delta[location_idxs_, :]).sum(axis=1), dims='obs_id')
    offset = pm.Deterministic('offset', m[item_idxs_] + (a_[time_idxs] * -s_[None, :] * delta[location_idxs_, :]).sum(axis=1), dims='obs_id')
    trend = pm.Deterministic('trend', growth[item_idxs_] * t_[time_idxs] + offset[item_idxs_, location_idxs_], dims='obs_id')

    predictions = pm.Deterministic('predictions', np.exp(trend[item_idxs_, location_idxs_]))

    pm.Poisson('obs',
               predictions,
               observed=eaches_,
               dims='obs_id')

    trace = pymc.sampling_jax.sample_numpyro_nuts(tune=3000, draws=4000)

The model graph looks like the following:

[model graph image]

When I try to sample, I get the following error message.

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/aesara/compile/function/types.py in __call__(self, *args, **kwargs)
    975                 self.vm()
--> 976                 if output_subset is None
    977                 else self.vm(output_subset=output_subset)

IndexError: index 5 is out of bounds for axis 0 with size 5

During handling of the above exception, another exception occurred:

IndexError                                Traceback (most recent call last)
/tmp/ipykernel_30576/3383356212.py in <module>
     71 
     72     # trace = pm.sample(tune=3000, draws=4000)
---> 73     trace = pymc.sampling_jax.sample_numpyro_nuts(tune=3000, draws = 4000)
     74 # pm.model_to_graphviz( partial_pooled_model)

/opt/conda/lib/python3.7/site-packages/pymc/sampling_jax.py in sample_numpyro_nuts(draws, tune, chains, target_accept, random_seed, initvals, model, var_names, progress_bar, keep_untransformed, chain_method, postprocessing_backend, idata_kwargs, nuts_kwargs)
    478         chains=chains,
    479         initvals=initvals,
--> 480         random_seed=random_seed,
    481     )
    482 

/opt/conda/lib/python3.7/site-packages/pymc/sampling_jax.py in _get_batched_jittered_initial_points(model, chains, initvals, random_seed, jitter, jitter_max_retries)
    163         seeds=_get_seeds_per_chain(random_seed, chains),
    164         jitter=jitter,
--> 165         jitter_max_retries=jitter_max_retries,
    166     )
    167     initial_points = [list(initial_point.values()) for initial_point in initial_points]

/opt/conda/lib/python3.7/site-packages/pymc/sampling.py in _init_jitter(model, initvals, seeds, jitter, jitter_max_retries)
   2379             if i < jitter_max_retries:
   2380                 try:
-> 2381                     model.check_start_vals(point)
   2382                 except SamplingError:
   2383                     # Retry with a new seed

/opt/conda/lib/python3.7/site-packages/pymc/model.py in check_start_vals(self, start)
   1785                 )
   1786 
-> 1787             initial_eval = self.point_logps(point=elem)
   1788 
   1789             if not all(np.isfinite(v) for v in initial_eval.values()):

/opt/conda/lib/python3.7/site-packages/pymc/model.py in point_logps(self, point, round_vals)
   1826             for factor, factor_logp in zip(
   1827                 factors,
-> 1828                 self.compile_fn(factor_logps_fn)(point),
   1829             )
   1830         }

/opt/conda/lib/python3.7/site-packages/pymc/aesaraf.py in __call__(self, state)
    685 
    686     def __call__(self, state):
--> 687         return self.f(**state)
    688 
    689 

/opt/conda/lib/python3.7/site-packages/aesara/compile/function/types.py in __call__(self, *args, **kwargs)
    990                     node=self.vm.nodes[self.vm.position_of_error],
    991                     thunk=thunk,
--> 992                     storage_map=getattr(self.vm, "storage_map", None),
    993                 )
    994             else:

/opt/conda/lib/python3.7/site-packages/aesara/link/utils.py in raise_with_op(fgraph, node, thunk, exc_info, storage_map)
    532         # Some exception need extra parameter in inputs. So forget the
    533         # extra long error message in that case.
--> 534     raise exc_value.with_traceback(exc_trace)
    535 
    536 

/opt/conda/lib/python3.7/site-packages/aesara/compile/function/types.py in __call__(self, *args, **kwargs)
    974             outputs = (
    975                 self.vm()
--> 976                 if output_subset is None
    977                 else self.vm(output_subset=output_subset)
    978             )

IndexError: index 5 is out of bounds for axis 0 with size 5
Apply node that caused the error: AdvancedSubtensor1(k_item, item_idx)
Toposort index: 5
Inputs types: [TensorType(float64, (None,)), TensorType(int32, (None,))]
Inputs shapes: [(5,), (6000,)]
Inputs strides: [(8,), (4,)]
Inputs values: [array([-1.97750852,  1.219579  , -0.59117939,  0.59691309, -1.58638952]), 'not shown']
Outputs clients: [[InplaceDimShuffle{x,0}(AdvancedSubtensor1.0)]]

Backtrace when the node is created (use Aesara flag traceback__limit=N to make it longer):
  File "/opt/conda/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
    return super().run_cell(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2975, in run_cell
    raw_cell, store_history, silent, shell_futures, cell_id
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3029, in _run_cell
    return runner(coro)
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
    coro.send(None)
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3472, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3552, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_30576/3383356212.py", line 39, in <module>
    growth = pm.Deterministic('growth', k[item_idxs_] + (a_[time_idxs] * delta[location_idxs_, :]).sum(axis=1), dims='obs_id')

HINT: Use the Aesara flag `exception_verbosity=high` for a debug print-out and storage map footprint of this Apply node.

So here are some questions.

When I move down the hierarchy, should I add the previous level's dims to the next? So for business line I would have dims=('location', 'bl'), and for items dims=('location', 'bl', 'item') instead of dims='item'?

And if that’s the case, should I add all indexes to the deterministic variables?

When I move down the hierarchy, should I add the previous level's dims to the next?

Either way is fine: broadcast to higher-dimensional variables (e.g. from "location" to ("location", "time")), or use a long-form specification where the downstream variable remains 1-dimensional but becomes longer, relying heavily on subindexing, so from "location" to "location_time".
The long-form specification is often needed if there is missing data, for example if each location has a different number of time points.
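To make the two styles concrete, here is a minimal sketch with toy coords and variable names (my own example, not your model):

import numpy as np
import pymc as pm

coords = {"location": ["loc_0", "loc_1"], "item": ["a", "b", "c"]}

with pm.Model(coords=coords) as broadcast_model:
    # broadcast form: the location-level parent broadcasts across the item axis
    mu_loc = pm.Normal("mu_loc", 0.0, 1.0, dims="location")
    k = pm.Normal("k", mu_loc[:, None], 1.0, dims=("location", "item"))

# long form: one combined dim; parents are picked out by integer indexes
loc_idx = np.array([0, 0, 0, 1, 1, 1])  # location of each (location, item) pair
coords_long = {
    "location": ["loc_0", "loc_1"],
    "location_item": [f"loc_{l}_item_{i}" for l in range(2) for i in "abc"],
}
with pm.Model(coords=coords_long) as long_form_model:
    mu_loc = pm.Normal("mu_loc", 0.0, 1.0, dims="location")
    k = pm.Normal("k", mu_loc[loc_idx], 1.0, dims="location_item")

In the broadcast form k has one entry per (location, item) cell; in the long form it has one entry per row of long-format data, which is what makes ragged or missing combinations easier to handle.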

To debug shape errors such as the one you encountered, the most pragmatic way I'm aware of is to place something like assert some_variable.eval().shape == (1, 2, 3) after every few lines.
Recompiling the model for every .eval() is slow, but once you have found the line with the shape problem you can remove the asserts again.
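For example, a minimal sketch of that pattern (the asserted shapes are hypothetical; use whatever your coords imply):

with pm.Model(coords=coords_) as debug_model:
    mu_k_location = pm.Normal('mu_k_location', 0, 1, dims='location')
    assert mu_k_location.eval().shape == (5,)  # one entry per location

    mu_k_bl = pm.Normal('mu_k_bl', mu_k_location, 1, dims='business_line')
    assert mu_k_bl.eval().shape == (5,)  # one entry per business line

    # ...continue down the hierarchy; the first assert that fails marks where
    # the unexpected shape enters the graph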
