I’m trying to use a model for data with missing values.
The error I get is: ValueError: different number of dimensions on data and dims: 3 vs 5
It seems like a shape issue for the missing values.
Here’s the code to replicate:
import pymc as pm
import numpy as np
import xarray as xr
import pandas as pd
# Create xarray with test data
coords = {'exercise': ['Squat', 'Overhead Press', 'Deadlift'],
'date': ['2020-02-24', '2020-02-26', '2020-03-21', '2020-03-25', '2020-04-06', '2020-07-25', '2020-08-10'],
'observation': [1, 2, 3, 4, 5, 6]}
dims = list(coords.keys())
velocities = np.array([[[1.198, 1.042, 0.875, 0.735, 0.574, 0.552],
[1.261, 0.993, 0.857, 0.828, 0.624, 0.599],
[1.224, 1.028, 0.990, 0.742, 0.595, 0.427],
[1.112, 0.999, 0.865, 0.751, 0.607, 0.404],
[1.157, 1.010, 0.849, 0.716, 0.593, 0.536],
[1.179, 1.060, 0.898, 0.783, 0.615, 0.501],
[1.209, 1.192, 0.979, 1.025, 0.875, 0.887]],
[[0.911, 0.933, 0.779, 0.629, 0.528, 0.409],
[1.004, 0.863, 0.770, 0.611, 0.511, 0.376],
[0.980, 0.828, 0.766, 0.611, 0.529, 0.349],
[1.024, 0.933, 0.841, 0.734, 0.551, 0.287],
[0.985, 0.852, 0.599, 0.522, 0.313, 0.234],
[0.996, 0.763, 0.758, 0.542, 0.463, 0.378],
[1.066, 0.915, 0.786, 0.686, 0.565, 0.429]],
[[0.654, 1.004, 0.727, 0.483, 0.344, 0.317],
[0.885, 0.738, 0.577, 0.495, 0.335, 0.291],
[0.749, 0.695, 0.539, 0.461, 0.395, 0.279],
[0.801, 0.548, 0.404, 0.424, 0.337, 0.338],
[0.770, 0.642, 0.602, 0.493, 0.394, 0.290],
[0.766, 0.545, 0.426, 0.431, 0.349, 0.329],
[0.787, 0.640, 0.480, 0.401, 0.395, 0.304]]])
loads = np.array([[[20.0, 40.0, 60.0, 80.0, 100.0, 110.0],
[20.0, 40.0, 60.0, 80.0, 100.0, 110.0],
[20.0, 40.0, 60.0, 80.0, 100.0, 117.5],
[20.0, 40.0, 60.0, 80.0, 100.0, 115.0],
[20.0, 40.0, 60.0, 80.0, 100.0, 110.0],
[20.0, 40.0, 60.0, 80.0, 100.0, 110.0],
[20.0, 30.0, 40.0, 50.0, 60.0, 70.0]],
[[20.0, 25.0, 30.0, 35.0, 40.0, 45.0],
[20.0, 25.0, 30.0, 35.0, 40.0, 45.0],
[20.0, 25.0, 30.0, 35.0, 40.0, 45.0],
[20.0, 25.0, 30.0, 35.0, 40.0, 45.0],
[20.0, 30.0, 40.0, 45.0, 50.0, 52.5],
[20.0, 30.0, 35.0, 40.0, 42.5, 45.0],
[20.0, 25.0, 30.0, 35.0, 40.0, 45.0]],
[[60.0, 80.0, 100.0, 120.0, 140.0, 145.0],
[60.0, 80.0, 100.0, 120.0, 140.0, 145.0],
[60.0, 80.0, 100.0, 120.0, 127.5, 140.0],
[60.0, 100.0, 120.0, 125.0, 140.0, 145.0],
[60.0, 80.0, 100.0, 120.0, 130.0, 140.0],
[60.0, 100.0, 120.0, 125.0, 130.0, 132.5],
[70.0, 90.0, 120.0, 135.0, 140.0, 150.0]]])
mask = np.random.binomial(n = 1, p = 0.8, size = velocities.shape)
dataset = (xr.Dataset({'velocity': (dims, velocities),
'load': (dims, loads),
'mask': (dims, mask)},
coords = coords)
.set_coords(['mask']))
dataset['velocity_std'] = (dataset['velocity'] - dataset['velocity'].mean())/dataset['velocity'].std()
dataset['load_std'] = (dataset['load'] - dataset['load'].mean())/dataset['load'].std()
dataset['velocity_std_masked'] = dataset['velocity_std'].where(dataset['mask'])
dataset['load_std_masked'] = dataset['load_std'].where(dataset['mask'])
# Create PyMC
n_exercises = dataset.dims['exercise']
n_dates = dataset.dims['date']
n_observations = dataset.dims['observation']
exercise_date_idx = [[i]*n_dates for i in np.arange(n_exercises)]
exercise_idx = [[[i]*n_observations for j in np.arange(n_dates)] for i in np.arange(n_exercises)]
date_idx = [[[j]*n_observations for j in np.arange(n_dates)] for i in np.arange(n_exercises)]
velocity = dataset['velocity_std_masked']
load = dataset['load_std_masked']
with pm.Model(coords = coords) as model:
# Inputs
velocity = pm.Normal('velocity',
mu = 0.0,
sigma = 1.0,
observed = velocity,
dims = dims)
# Global parameters
global_intercept = pm.Normal('global_intercept', mu = 0, sigma = 1)
global_intercept_sd = pm.Exponential('global_intercept_sd', lam = 1)
global_slope = pm.Normal('global_slope', mu = 0, sigma = 1)
global_slope_sd = pm.Exponential('global_slope_sd', lam = 1)
# Exercise parameters
exercise_intercept = pm.Normal('exercise_intercept', mu = global_intercept, sigma = global_intercept_sd, dims = 'exercise')
exercise_intercept_sd = pm.Exponential('exercise_intercept_sd', lam = 1, dims = 'exercise')
exercise_slope = pm.Normal('exercise_slope', mu = global_slope, sigma = global_slope_sd, dims = 'exercise')
exercise_slope_sd = pm.Exponential('exercise_slope_sd', lam = 1, dims = 'exercise')
# Date parameters
date_intercept = pm.Normal('date_intercept', mu = exercise_intercept[exercise_date_idx], sigma = exercise_intercept_sd[exercise_date_idx], dims = ['exercise', 'date'])
date_slope = pm.Normal('date_slope', mu = exercise_slope[exercise_date_idx], sigma = exercise_slope_sd[exercise_date_idx], dims = ['exercise', 'date'])
# Model error
sigma = pm.Exponential('sigma', lam = 1)
# Expected value
mu = date_intercept[exercise_idx, date_idx] + date_slope[exercise_idx, date_idx]*velocity
# Likelihood of observed values
likelihood = pm.Normal('load',
mu = mu,
sigma = sigma,
observed = load,
dims = dims)
pm.model_to_graphviz(model)
pm.sample_prior_predictive(model = model)
/usr/local/lib/python3.7/dist-packages/arviz/data/base.py:141: UserWarning: In variable load_observed, there are more dims (3) given than exist (1). Passed array should have shape (chain,draw, *shape)
UserWarning,
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-64-864ebabcd045> in <module>
----> 1 pm.sample_prior_predictive(model = model)
7 frames
/usr/local/lib/python3.7/dist-packages/pymc/sampling.py in sample_prior_predictive(samples, model, var_names, random_seed, return_inferencedata, idata_kwargs, compile_kwargs)
2265 if idata_kwargs:
2266 ikwargs.update(idata_kwargs)
-> 2267 return pm.to_inference_data(prior=prior, **ikwargs)
2268
2269
/usr/local/lib/python3.7/dist-packages/pymc/backends/arviz.py in to_inference_data(trace, prior, posterior_predictive, log_likelihood, coords, dims, model, save_warmup)
588 dims=dims,
589 model=model,
--> 590 save_warmup=save_warmup,
591 ).to_inference_data()
592
/usr/local/lib/python3.7/dist-packages/pymc/backends/arviz.py in to_inference_data(self)
520 "posterior_predictive": self.posterior_predictive_to_xarray(),
521 "predictions": self.predictions_to_xarray(),
--> 522 **self.priors_to_xarray(),
523 "observed_data": self.observed_data_to_xarray(),
524 }
/usr/local/lib/python3.7/dist-packages/pymc/backends/arviz.py in priors_to_xarray(self)
473 library=pymc,
474 coords=self.coords,
--> 475 dims=self.dims,
476 )
477 )
/usr/local/lib/python3.7/dist-packages/arviz/data/base.py in dict_to_dataset(data, attrs, library, coords, dims, default_dims, index_origin, skip_event_dims)
312 default_dims=default_dims,
313 index_origin=index_origin,
--> 314 skip_event_dims=skip_event_dims,
315 )
316 return xr.Dataset(data_vars=data_vars, attrs=make_attrs(attrs=attrs, library=library))
/usr/local/lib/python3.7/dist-packages/arviz/data/base.py in numpy_to_data_array(ary, var_name, coords, dims, default_dims, index_origin, skip_event_dims)
252 # filter coords based on the dims
253 coords = {key: xr.IndexVariable((key,), data=np.asarray(coords[key])) for key in dims}
--> 254 return xr.DataArray(ary, coords=coords, dims=dims)
255
256
/usr/local/lib/python3.7/dist-packages/xarray/core/dataarray.py in __init__(self, data, coords, dims, name, attrs, indexes, fastpath)
404 data = _check_data_shape(data, coords, dims)
405 data = as_compatible_data(data)
--> 406 coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
407 variable = Variable(dims, data, attrs, fastpath=True)
408 indexes = dict(
/usr/local/lib/python3.7/dist-packages/xarray/core/dataarray.py in _infer_coords_and_dims(shape, coords, dims)
122 elif len(dims) != len(shape):
123 raise ValueError(
--> 124 "different number of dimensions on data "
125 f"and dims: {len(shape)} vs {len(dims)}"
126 )
ValueError: different number of dimensions on data and dims: 3 vs 5