Creating hierarchical models using 3 dimensional xarray data

mattiasthalen · October 6, 2020, 1:34pm

Solution for 1(2?) hierarchical levels

Replaced the data with the samples from the real world:

import xarray as xr
import arviz as az
import pymc3 as pm
import numpy as np

# Create xarray with test data
coords = {'exercise': ['Squat', 'Overhead Press', 'Deadlift'],
          'date': ['2020-02-24', '2020-02-26', '2020-03-21', '2020-03-25', '2020-04-06', '2020-07-25', '2020-08-10'],
          'observation': [1, 2, 3, 4, 5, 6]}

dims = coords.keys()

velocities = [[[1.198, 1.042, 0.875, 0.735, 0.574, 0.552],
               [1.261, 0.993, 0.857, 0.828, 0.624, 0.599],
               [1.224, 1.028, 0.990, 0.742, 0.595, 0.427],
               [1.112, 0.999, 0.865, 0.751, 0.607, 0.404],
               [1.157, 1.010, 0.849, 0.716, 0.593, 0.536],
               [1.179, 1.060, 0.898, 0.783, 0.615, 0.501],
               [1.209, 1.192, 0.979, 1.025, 0.875, 0.887]],
              
              [[0.911, 0.933, 0.779, 0.629, 0.528, 0.409],
               [1.004, 0.863, 0.770, 0.611, 0.511, 0.376],
               [0.980, 0.828, 0.766, 0.611, 0.529, 0.349],
               [1.024, 0.933, 0.841, 0.734, 0.551, 0.287],
               [0.985, 0.852, 0.599, 0.522, 0.313, 0.234],
               [0.996, 0.763, 0.758, 0.542, 0.463, 0.378],
               [1.066, 0.915, 0.786, 0.686, 0.565, 0.429]],
              
              [[0.654, 1.004, 0.727, 0.483, 0.344, 0.317],
               [0.885, 0.738, 0.577, 0.495, 0.335, 0.291],
               [0.749, 0.695, 0.539, 0.461, 0.395, 0.279],
               [0.801, 0.548, 0.404, 0.424, 0.337, 0.338],
               [0.770, 0.642, 0.602, 0.493, 0.394, 0.290],
               [0.766, 0.545, 0.426, 0.431, 0.349, 0.329],
               [0.787, 0.640, 0.480, 0.401, 0.395, 0.304]]]

loads = [[[20.0,  40.0,  60.0,  80.0, 100.0, 110.0],
          [20.0,  40.0,  60.0,  80.0, 100.0, 110.0],
          [20.0,  40.0,  60.0,  80.0, 100.0, 117.5],
          [20.0,  40.0,  60.0,  80.0, 100.0, 115.0],
          [20.0,  40.0,  60.0,  80.0, 100.0, 110.0],
          [20.0,  40.0,  60.0,  80.0, 100.0, 110.0],
          [20.0,  30.0,  40.0,  50.0,  60.0,  70.0]],
         
         [[20.0,  25.0,  30.0,  35.0,  40.0,  45.0],
          [20.0,  25.0,  30.0,  35.0,  40.0,  45.0],
          [20.0,  25.0,  30.0,  35.0,  40.0,  45.0],
          [20.0,  25.0,  30.0,  35.0,  40.0,  45.0],
          [20.0,  30.0,  40.0,  45.0,  50.0,  52.5],
          [20.0,  30.0,  35.0,  40.0,  42.5,  45.0],
          [20.0,  25.0,  30.0,  35.0,  40.0,  45.0]],
         
         [[60.0,  80.0, 100.0, 120.0, 140.0, 145.0],
          [60.0,  80.0, 100.0, 120.0, 140.0, 145.0],
          [60.0,  80.0, 100.0, 120.0, 127.5, 140.0],
          [60.0, 100.0, 120.0, 125.0, 140.0, 145.0],
          [60.0,  80.0, 100.0, 120.0, 130.0, 140.0],
          [60.0, 100.0, 120.0, 125.0, 130.0, 132.5],
          [70.0,  90.0, 120.0, 135.0, 140.0, 150.0]]]

dataset = xr.Dataset({'velocity': (dims, velocities),
                      'load': (dims, loads)},
                     coords = coords)

dataset['velocity_std'] = (dataset['velocity'] - dataset['velocity'].mean())/dataset['velocity'].std()
dataset['load_std'] = (dataset['load'] - dataset['load'].mean())/dataset['load'].std()

dataset

New model:

# Create PyMC
n_exercises = dataset.dims['exercise']
n_dates = dataset.dims['date']
n_observations = dataset.dims['observation']

exercise_idx = [[[i]*n_observations for j in np.arange(n_dates)] for i in np.arange(n_exercises)]

velocity = dataset['velocity_std']
load = dataset['load_std']

with pm.Model(coords = coords) as model:
    # Inputs
    exercise_idx_shared = pm.Data('exercise_idx', exercise_idx, dims = dims)

    velocity_shared = pm.Data('velocity', velocity, dims = dims)
    load_shared = pm.Data('load', load, dims = dims)

    # Global parameters
    global_intercept = pm.Normal('global_intercept', mu = 0, sigma = 1)
    global_intercept_sd = pm.Exponential('global_intercept_sd', lam = 1)

    global_slope = pm.Normal('global_slope', mu = 0, sigma = 1)
    global_slope_sd = pm.Exponential('global_slope_sd', lam = 1)

    # Exercise parameters
    exercise_intercept = pm.Normal('exercise_intercept', mu = global_intercept, sigma = global_intercept_sd, dims = 'exercise')
    exercise_slope = pm.Normal('exercise_slope', mu = global_slope, sigma = global_slope_sd, dims = 'exercise')

    # Model error
    sigma = pm.Exponential('sigma', lam = 1)

    # Expected value
    mu = exercise_intercept[exercise_idx_shared] + exercise_slope[exercise_idx_shared]*velocity_shared

    # Likelihood of observed values
    likelihood = pm.Normal('likelihood',
                           mu = mu,
                           sigma = sigma,
                           observed = load_shared,
                           dims = dims)

As I understand, the trick was to get the exercise_idx down to the lowest level in the matrix:

exercise_idx = [[[i]*n_observations for j in np.arange(n_dates)] for i in np.arange(n_exercises)]

Solution for 2(3?) level hierarchy
The key was once again the indexing. In addition to exercise_idx, the equivivalent for date_idx was needed, and the intermediate indexing exercise_date_idx. The first ones are the full array with the values replaced by the index number of the corresponding coord. The intermediate index list brings the index from the upper (exercise) coord down to the current (date) coord.

# Create PyMC
n_exercises = dataset.dims['exercise']
n_dates = dataset.dims['date']
n_observations = dataset.dims['observation']

exercise_date_idx = [[i]*n_dates for i in np.arange(n_exercises)]
exercise_idx = [[[i]*n_observations for j in np.arange(n_dates)] for i in np.arange(n_exercises)]
date_idx = [[[j]*n_observations for j in np.arange(n_dates)] for i in np.arange(n_exercises)]

velocity = dataset['velocity_std']
load = dataset['load_std']

with pm.Model(coords = coords) as model:
    # Inputs
    exercise_date_idx_shared = pm.Data('exercise_date_idx', exercise_date_idx, dims = ['exercise', 'date'])
    exercise_idx_shared = pm.Data('exercise_idx', exercise_idx, dims = dims)
    date_idx_shared = pm.Data('date_idx', date_idx, dims = dims)

    velocity_shared = pm.Data('velocity', velocity, dims = dims)
    load_shared = pm.Data('load', load, dims = dims)

    # Global parameters
    global_intercept = pm.Normal('global_intercept', mu = 0, sigma = 1)
    global_intercept_sd = pm.Exponential('global_intercept_sd', lam = 1)

    global_slope = pm.Normal('global_slope', mu = 0, sigma = 1)
    global_slope_sd = pm.Exponential('global_slope_sd', lam = 1)

    # Exercise parameters
    exercise_intercept = pm.Normal('exercise_intercept', mu = global_intercept, sigma = global_intercept_sd, dims = 'exercise')
    exercise_intercept_sd = pm.Exponential('exercise_intercept_sd', lam = 1, dims = 'exercise')

    exercise_slope = pm.Normal('exercise_slope', mu = global_slope, sigma = global_slope_sd, dims = 'exercise')
    exercise_slope_sd = pm.Exponential('exercise_slope_sd', lam = 1, dims = 'exercise')
    
    # Date parameters
    date_intercept = pm.Normal('date_intercept', mu = exercise_intercept[exercise_date_idx_shared], sigma = exercise_intercept_sd[exercise_date_idx_shared], dims = ['exercise', 'date'])
    date_slope = pm.Normal('date_slope', mu = exercise_slope[exercise_date_idx_shared], sigma = exercise_slope_sd[exercise_date_idx_shared], dims = ['exercise', 'date'])

    # Model error
    sigma = pm.Exponential('sigma', lam = 1)

    # Expected value
    mu = date_intercept[exercise_idx_shared, date_idx_shared] + date_slope[exercise_idx_shared, date_idx_shared]*velocity_shared

    # Likelihood of observed values
    likelihood = pm.Normal('likelihood',
                           mu = mu,
                           sigma = sigma,
                           observed = load_shared,
                           dims = dims)

Topic		Replies	Views
Input dimension mismatch in hierarchical mixture model Questions	0	665	May 19, 2021
Impute results in mismatch dimensions in dims and data v5	6	1509	September 28, 2022
Dimension error for Hierarchical model version agnostic	11	376	April 13, 2023
Confused about building a compound distribution hierarchical Bayes model with continuous and categorical inputs in pymc3 Questions	1	778	June 8, 2019
Time-series regression - shape error / Input dimension mis-match Questions time_series	8	1145	August 6, 2022

Creating hierarchical models using 3 dimensional xarray data

Solution for 1(2?) hierarchical levels

Related topics