Hello,
I’m trying to build a hierarchical time series model of sales for one product across 16 different locations. The model runs fine for a single location.
I followed (as closely as possible) the GLM: Hierarchical Linear Regression notebook from the PyMC example gallery, and specifically this example:
```python
county_idxs, counties = pd.factorize(data.county)
coords = {
    "county": counties,
    "obs_id": np.arange(len(county_idxs)),
}
```
When I factorize my data the same way:

```python
location_idxs, locations = pd.factorize(df['location'])
coords_ = {
    "location": locations,
    "obs_id": np.arange(len(location_idxs)),
}
```
I get 16 unique locations and 943 observation IDs.
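(As a sanity check on those two numbers: `pd.factorize` returns one integer index per row plus the array of unique labels, so the lengths are expected to differ. A toy sketch with made-up data:)

```python
import pandas as pd

# hypothetical toy frame: 3 unique locations, 6 observations
toy = pd.DataFrame({"location": ["A", "B", "A", "C", "B", "A"]})
idxs, uniques = pd.factorize(toy["location"])
print(idxs)     # [0 1 0 2 1 0] -> one entry per observation
print(uniques)  # Index(['A', 'B', 'C'], dtype='object') -> one entry per location
```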
When I run my model, sampling finishes fine, but then I get the following error:
```
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_6258/1398150208.py in <module>
92 )
93
---> 94 trace = pymc.sampling_jax.sample_numpyro_nuts(tune=2000, draws = 2000)
/opt/conda/lib/python3.7/site-packages/pymc/sampling_jax.py in sample_numpyro_nuts(draws, tune, chains, target_accept, random_seed, initvals, model, var_names, progress_bar, keep_untransformed, chain_method, postprocessing_backend, idata_kwargs, nuts_kwargs)
568 dims=dims,
569 attrs=make_attrs(attrs, library=numpyro),
--> 570 **idata_kwargs,
571 )
572
/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in from_dict(posterior, posterior_predictive, predictions, sample_stats, log_likelihood, prior, prior_predictive, sample_stats_prior, observed_data, constant_data, predictions_constant_data, warmup_posterior, warmup_posterior_predictive, warmup_predictions, warmup_log_likelihood, warmup_sample_stats, save_warmup, index_origin, coords, dims, pred_dims, pred_coords, attrs, **kwargs)
457 pred_coords=pred_coords,
458 attrs=attrs,
--> 459 **kwargs,
460 ).to_inference_data()
/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in to_inference_data(self)
333 return InferenceData(
334 **{
--> 335 "posterior": self.posterior_to_xarray(),
336 "sample_stats": self.sample_stats_to_xarray(),
337 "log_likelihood": self.log_likelihood_to_xarray(),
/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in wrapped(cls)
63 if all((getattr(cls, prop_i) is None for prop_i in prop)):
64 return None
---> 65 return func(cls)
66
67 return wrapped
/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in posterior_to_xarray(self)
102 dims=self.dims,
103 attrs=posterior_attrs,
--> 104 index_origin=self.index_origin,
105 ),
106 dict_to_dataset(
/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in dict_to_dataset(data, attrs, library, coords, dims, default_dims, index_origin, skip_event_dims)
312 default_dims=default_dims,
313 index_origin=index_origin,
--> 314 skip_event_dims=skip_event_dims,
315 )
316 return xr.Dataset(data_vars=data_vars, attrs=make_attrs(attrs=attrs, library=library))
/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in numpy_to_data_array(ary, var_name, coords, dims, default_dims, index_origin, skip_event_dims)
252 # filter coords based on the dims
253 coords = {key: xr.IndexVariable((key,), data=np.asarray(coords[key])) for key in dims}
--> 254 return xr.DataArray(ary, coords=coords, dims=dims)
255
256
/opt/conda/lib/python3.7/site-packages/xarray/core/dataarray.py in __init__(self, data, coords, dims, name, attrs, indexes, fastpath)
404 data = _check_data_shape(data, coords, dims)
405 data = as_compatible_data(data)
--> 406 coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
407 variable = Variable(dims, data, attrs, fastpath=True)
408 indexes = dict(
/opt/conda/lib/python3.7/site-packages/xarray/core/dataarray.py in _infer_coords_and_dims(shape, coords, dims)
153 if s != sizes[d]:
154 raise ValueError(
--> 155 f"conflicting sizes for dimension {d!r}: "
156 f"length {sizes[d]} on the data but length {s} on "
157 f"coordinate {k!r}"
ValueError: conflicting sizes for dimension 'location': length 943 on the data but length 16 on coordinate 'location'
```
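For what it's worth, the final `xr.DataArray` call in the traceback reproduces the complaint on its own: any length-943 array labelled with a `location` dimension whose coordinate has only 16 entries fails the same way (a minimal sketch, not my actual data):

```python
import numpy as np
import xarray as xr

# minimal sketch: 943 values assigned to dim 'location',
# but the 'location' coordinate only has 16 entries
ary = np.zeros(943)
coords = {"location": np.arange(16)}
xr.DataArray(ary, coords=coords, dims=["location"])
# ValueError: conflicting sizes for dimension 'location': ...
```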
Here is my model for reference:
```python
import numpy as np
import pandas as pd
import pymc as pm
import pymc.sampling_jax
import aesara.tensor as at

# piecewise-linear trend: changepoint locations and indicator matrix
n_changepoints = 8
t = np.arange(len(df)) / len(df)
s = np.linspace(0, np.max(t), n_changepoints + 2)[1:-1]
a = (t[:, None] > s) * 1

# Fourier features: 2*n columns (cos and sin) per time point
n_components = 10
def yearly_X(t, p=365.25, n=n_components):
    x = 2 * np.pi * (np.arange(n) + 1) * t[:, None] / p
    return np.concatenate((np.cos(x), np.sin(x)), axis=1)

monthly_n_components = 5
def monthly_X(t, p=30.5, n=monthly_n_components):
    x = 2 * np.pi * (np.arange(n) + 1) * t[:, None] / p
    return np.concatenate((np.cos(x), np.sin(x)), axis=1)

# df_sat['promo_flag_month_before'] = df_sat['promo_flag'].shift(1)
# df_sat['promo_flag_month_before'] = df_sat['promo_flag_month_before'].fillna(method='bfill')
# promo_before = np.array(df_sat['promo_flag_month_before'])
# promo = np.array(df_sat['promo_flag'])

month = np.array(df['month'])
# periods rescaled because t is normalized to [0, 1)
yearly_fourier = yearly_X(t, 365.25 / len(t))
monthly_fourier = monthly_X(t, 30.5 / len(t))

location_idxs, locations = pd.factorize(df['location'])
coords_ = {
    "location": locations,
    "obs_id": np.arange(len(location_idxs)),
}
with pm.Model(coords=coords_) as partial_pooled_model:
    location_idx = pm.ConstantData('location_idx', location_idxs, dims='obs_id')
    t_ = pm.MutableData('t', t)
    a_ = pm.MutableData('a', a)
    s_ = pm.MutableData('s', s)
    yearly_f_ = pm.MutableData('yearly_f_', yearly_fourier)
    monthly_m_ = pm.MutableData('monthly_m_', monthly_fourier)
    # sept and march are indicator arrays (defined outside this snippet)
    sept_month = pm.MutableData('sept_month', sept)
    march_month = pm.MutableData('march_month', march)
    eaches = pm.MutableData('eaches', df['eaches'])

    # piecewise-linear trend with hierarchical slope and intercept
    mu_k = pm.Normal('mu_k', 0, 100)
    sigma_k = pm.HalfNormal('sigma_k', 5)
    k = pm.Normal('k', mu_k, sigma_k)
    mu_m = pm.Normal('mu_m', 0, 100)
    sigma_m = pm.HalfNormal('sigma_m', 5)
    m = pm.Normal('m', mu_m, sigma_m)
    delta = pm.Laplace('delta', 0, 0.05, shape=n_changepoints)
    growth = k + at.dot(a_, delta)
    offset = m + at.dot(a_, -s_ * delta)
    trend = pm.Deterministic('trend', growth * t_ + offset, dims='location')

    # yearly seasonality from the Fourier features
    mu_yearly = pm.Normal('mu_yearly', 0, 100)
    sigma_yearly = pm.HalfNormal('sigma_yearly', 5)
    yearly_beta = pm.Normal('yearly_beta', mu_yearly, sigma_yearly, shape=n_components * 2)
    yearly_seasonality = pm.Deterministic('yearly_seasonality', at.dot(yearly_f_, yearly_beta), dims='location')

    # monthly seasonality
    mu_monthly = pm.Normal('mu_monthly', 0, 100)
    sigma_monthly = pm.HalfNormal('sigma_monthly', 5)
    monthly_beta = pm.Normal('monthly_beta', mu_monthly, sigma_monthly, shape=monthly_n_components * 2)
    monthly_seasonality = pm.Deterministic('monthly_seasonality', at.dot(monthly_m_, monthly_beta), dims='location')

    # per-location September and March effects
    mu_sept = pm.Normal('mu_sept', 0, 100)
    sigma_sept = pm.HalfNormal('sigma_sept', 5)
    sept_beta = pm.Normal('sept_beta', mu_sept, sigma_sept, dims='location')
    mu_march = pm.Normal('mu_march', 0, 100)
    sigma_march = pm.HalfNormal('sigma_march', 5)
    march_beta = pm.Normal('march_beta', mu_march, sigma_march, dims='location')

    # promo_beta_before = pm.Normal('promo_beta_before', 0, 1)
    # promo_beta = pm.Normal('promo_beta', 0, 1)
    # promo_beta_interaction = pm.Normal('promo_beta_interaction', 0, 1)

    predictions = pm.Deterministic(
        'predictions',
        at.exp(trend[location_idx] + monthly_seasonality[location_idx]
               + yearly_seasonality[location_idx]
               + (sept_beta[location_idx] * sept_month)
               + (march_beta[location_idx] * march_month))
    )

    error = pm.HalfCauchy('error', 0.5)  # note: not used by the Poisson likelihood below
    pm.Poisson('obs',
               predictions,
               observed=eaches,
               dims='obs_id')

    trace = pymc.sampling_jax.sample_numpyro_nuts(tune=2000, draws=2000)
```
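For reference, here is my understanding of the indexing pattern from the gallery notebook, as a toy sketch with made-up data: variables with dims='location' should have one entry per location (16), and indexing them with the per-observation index broadcasts out to length 943, which then gets dims='obs_id'.

```python
import numpy as np
import pymc as pm

rng = np.random.default_rng(0)
toy_idx = rng.integers(0, 16, size=943)  # made-up location index per observation
toy_coords = {"location": np.arange(16), "obs_id": np.arange(943)}

with pm.Model(coords=toy_coords) as toy_model:
    beta = pm.Normal("beta", 0, 1, dims="location")            # shape (16,)
    mu = pm.Deterministic("mu", beta[toy_idx], dims="obs_id")  # shape (943,)
```

Given that, I'm not sure whether my trend and seasonality Deterministics, which are per-observation (length 943), should really carry dims='location'. Any pointers appreciated.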