Why are my coords creating an error after sampling is complete?

Hello,

I’m trying to build a hierarchical time series model of sales for one product at 16 different locations. The model runs fine when fit to a single location.

I followed (as closely as possible) this article: GLM: Hierarchical Linear Regression — PyMC example gallery

I specifically followed the example:
county_idxs, counties = pd.factorize(data.county)
coords = {
    "county": counties,
    "obs_id": np.arange(len(county_idxs)),
}

When I factorize my data using:

location_idxs, locations = pd.factorize(df['location'])
coords_ = {
    "location": locations,
    "obs_id": np.arange(len(location_idxs)),
}

I get 16 unique locations and 943 observation IDs.
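As a quick sanity check (a minimal sketch; df is the same DataFrame as above), the two coordinates should come out with very different lengths:

print(len(locations))      # 16  -> length of the 'location' coordinate
print(len(location_idxs))  # 943 -> length of the 'obs_id' coordinate
assert len(locations) == df['location'].nunique()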

When I run my model, it samples fine then I get the following error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_6258/1398150208.py in <module>
     92         )
     93 
---> 94     trace = pymc.sampling_jax.sample_numpyro_nuts(tune=2000, draws = 2000)

/opt/conda/lib/python3.7/site-packages/pymc/sampling_jax.py in sample_numpyro_nuts(draws, tune, chains, target_accept, random_seed, initvals, model, var_names, progress_bar, keep_untransformed, chain_method, postprocessing_backend, idata_kwargs, nuts_kwargs)
    568         dims=dims,
    569         attrs=make_attrs(attrs, library=numpyro),
--> 570         **idata_kwargs,
    571     )
    572 

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in from_dict(posterior, posterior_predictive, predictions, sample_stats, log_likelihood, prior, prior_predictive, sample_stats_prior, observed_data, constant_data, predictions_constant_data, warmup_posterior, warmup_posterior_predictive, warmup_predictions, warmup_log_likelihood, warmup_sample_stats, save_warmup, index_origin, coords, dims, pred_dims, pred_coords, attrs, **kwargs)
    457         pred_coords=pred_coords,
    458         attrs=attrs,
--> 459         **kwargs,
    460     ).to_inference_data()

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in to_inference_data(self)
    333         return InferenceData(
    334             **{
--> 335                 "posterior": self.posterior_to_xarray(),
    336                 "sample_stats": self.sample_stats_to_xarray(),
    337                 "log_likelihood": self.log_likelihood_to_xarray(),

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in wrapped(cls)
     63                 if all((getattr(cls, prop_i) is None for prop_i in prop)):
     64                     return None
---> 65             return func(cls)
     66 
     67         return wrapped

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in posterior_to_xarray(self)
    102                 dims=self.dims,
    103                 attrs=posterior_attrs,
--> 104                 index_origin=self.index_origin,
    105             ),
    106             dict_to_dataset(

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in dict_to_dataset(data, attrs, library, coords, dims, default_dims, index_origin, skip_event_dims)
    312             default_dims=default_dims,
    313             index_origin=index_origin,
--> 314             skip_event_dims=skip_event_dims,
    315         )
    316     return xr.Dataset(data_vars=data_vars, attrs=make_attrs(attrs=attrs, library=library))

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in numpy_to_data_array(ary, var_name, coords, dims, default_dims, index_origin, skip_event_dims)
    252     # filter coords based on the dims
    253     coords = {key: xr.IndexVariable((key,), data=np.asarray(coords[key])) for key in dims}
--> 254     return xr.DataArray(ary, coords=coords, dims=dims)
    255 
    256 

/opt/conda/lib/python3.7/site-packages/xarray/core/dataarray.py in __init__(self, data, coords, dims, name, attrs, indexes, fastpath)
    404             data = _check_data_shape(data, coords, dims)
    405             data = as_compatible_data(data)
--> 406             coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
    407             variable = Variable(dims, data, attrs, fastpath=True)
    408             indexes = dict(

/opt/conda/lib/python3.7/site-packages/xarray/core/dataarray.py in _infer_coords_and_dims(shape, coords, dims)
    153             if s != sizes[d]:
    154                 raise ValueError(
--> 155                     f"conflicting sizes for dimension {d!r}: "
    156                     f"length {sizes[d]} on the data but length {s} on "
    157                     f"coordinate {k!r}"

ValueError: conflicting sizes for dimension 'location': length 943 on the data but length 16 on coordinate 'location'

Here is my model for reference:

# Imports assumed by the model below
import numpy as np
import pandas as pd
import pymc as pm
import pymc.sampling_jax
import aesara.tensor as at

n_changepoints = 8
t = np.arange(len(df)) / len(df)  # time index scaled to [0, 1)

s = np.linspace(0, np.max(t), n_changepoints + 2)[1:-1]  # interior changepoint locations

a = (t[:, None] > s) * 1  # (n_obs, n_changepoints) changepoint indicator matrix

n_components = 10

def yearly_X(t, p=365.25, n=n_components):
    # Fourier design matrix: n cosine + n sine columns -> shape (len(t), 2*n)
    x = 2 * np.pi * (np.arange(n) + 1) * t[:, None] / p
    return np.concatenate((np.cos(x), np.sin(x)), axis=1)

monthly_n_components = 5
def monthly_X(t, p=30.5, n=monthly_n_components):
    x = 2 * np.pi * (np.arange(n) + 1) * t[:, None] / p
    return np.concatenate((np.cos(x), np.sin(x)), axis=1)

# df_sat['promo_flag_month_before'] = df_sat['promo_flag'].shift(1)
# df_sat['promo_flag_month_before'] = df_sat['promo_flag_month_before'].fillna(method = 'bfill')
# promo_before = np.array(df_sat['promo_flag_month_before'])
# promo = np.array(df_sat['promo_flag'])
month = np.array(df['month'])
yearly_fourier = yearly_X(t, 365.25 / len(t))   # periods rescaled because t is normalized to [0, 1)
monthly_fourier = monthly_X(t, 30.5 / len(t))


location_idxs, locations = pd.factorize(df['location'])
coords_={
    "location":locations,
    "obs_id":np.arange(len(location_idxs))
}

with pm.Model(coords=coords_) as partial_pooled_model:
    location_idx = pm.ConstantData('location_idx', location_idxs, dims = 'obs_id')
    t_ = pm.MutableData('t', t)
    a_ = pm.MutableData('a', a)
    s_ = pm.MutableData('s', s)
    yearly_f_ = pm.MutableData('yearly_f_', yearly_fourier)
    monthly_m_ = pm.MutableData('monthly_m_',monthly_fourier)
    sept_month = pm.MutableData('sept_month', sept)    # sept and march: 0/1 month indicator arrays, defined elsewhere
    march_month = pm.MutableData('march_month', march)
    eaches = pm.MutableData('eaches', df['eaches'])
    
    mu_k = pm.Normal('mu_k', 0,100)
    sigma_k = pm.HalfNormal('sigma_k', 5)
    k = pm.Normal('k', mu_k, sigma_k)
    
    mu_m = pm.Normal('mu_m', 0, 100)
    sigma_m = pm.HalfNormal('sigma_m', 5)
    m = pm.Normal('m', mu_m, sigma_m)
    delta = pm.Laplace('delta', 0, 0.05, shape=n_changepoints)

    growth = (k + at.dot(a_, delta))
    offset = (m + at.dot(a_, -s_ * delta))
    trend = pm.Deterministic('trend', growth * t_ + offset, dims = 'location')
    
    mu_yearly = pm.Normal('mu_yearly', 0, 100)
    sigma_yearly = pm.HalfNormal('sigma_yearly', 5)
    yearly_beta = pm.Normal('yearly_beta', mu_yearly, sigma_yearly, shape = n_components*2)
    yearly_seasonality = pm.Deterministic('yearly_seasonality',at.dot(yearly_f_, yearly_beta), dims = 'location')

    mu_monthly = pm.Normal('mu_monthly', 0, 100)
    sigma_monthly = pm.HalfNormal('sigma_monthly', 5)
    monthly_beta = pm.Normal('monthly_beta', mu_monthly, sigma_monthly, shape = monthly_n_components*2)
    monthly_seasonality = pm.Deterministic('monthly_seasonality',at.dot(monthly_m_, monthly_beta), dims = 'location')
    
    mu_sept = pm.Normal('mu_sept', 0, 100)
    sigma_sept = pm.HalfNormal('sigma_sept', 5)
    sept_beta = pm.Normal('sept_beta', mu_sept,sigma_sept, dims = 'location')
    
    mu_march = pm.Normal('mu_march', 0, 100)
    sigma_march = pm.HalfNormal('sigma_march', 5)
    march_beta = pm.Normal('march_beta', mu_march, sigma_march, dims = 'location')

    # promo_beta_before = pm.Normal('promo_beta_before', 0, 1)
    # promo_beta = pm.Normal('promo_beta', 0, 1)
    # promo_beta_interaction = pm.Normal('promo_beta_interaction', 0, 1)

    predictions =  pm.Deterministic('predictions', np.exp(trend[location_idx] + monthly_seasonality[location_idx] + 
                                                          yearly_seasonality[location_idx] + (sept_beta[location_idx]*sept_month) 
                                                          + (march_beta[location_idx]*march_month)))


    error = pm.HalfCauchy('error', 0.5)  # note: unused below; the Poisson likelihood has no scale parameter

    pm.Poisson('obs',
              predictions,
              observed=eaches,
               dims='obs_id'
        )

    trace = pymc.sampling_jax.sample_numpyro_nuts(tune=2000, draws=2000)

pm.sample doesn’t actually check that the coords you provide match the dimensions of the outputs until after sampling is complete (I’ve wasted a lot of time this way). The best pre-check I’ve found is to always run pm.sample_prior_predictive before you sample.
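For example (a minimal sketch; the draw count is arbitrary), a handful of prior draws triggers the same coords/shape validation in seconds, before any NUTS tuning happens:

with partial_pooled_model:
    # Fails fast with the same "conflicting sizes" error if coords and shapes disagree
    prior = pm.sample_prior_predictive(samples=10)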

As for why the coords are the wrong shape, I suspect the culprit is this line:

trend = pm.Deterministic('trend', growth * t_ + offset, dims = 'location')

This looks like it should have a shape like time, not like locations. Is the number 943 the number of time steps or the number of observations? That number is a hint as to what is going wrong.
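Tracing the shapes with plain NumPy makes the point (a sketch using the sizes from the traceback: 943 observations, 8 changepoints; the scalar values are placeholders):

a = np.zeros((943, 8))   # changepoint indicator matrix
delta = np.zeros(8)
t = np.zeros(943)
trend = (1.0 + a @ delta) * t + 1.0
print(trend.shape)       # (943,) -- one value per observation/time step, not per location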

943 is the number of timesteps summed across all locations. I’m not sure if it matters, but it works out to roughly 60 timesteps per location.
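For what it’s worth, that breakdown is easy to confirm (assuming one row per location/timestep pair):

df.groupby('location').size()   # ~60 rows for each of the 16 locations, 943 in total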

When I take the dims out of “trend”, I now get a shape error around the Fourier terms instead:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_6258/1969810355.py in <module>
     92         )
     93 
---> 94     trace = pymc.sampling_jax.sample_numpyro_nuts(tune=2000, draws = 2000)
     95 # pm.model_to_graphviz( partial_pooled_model)
     96 

/opt/conda/lib/python3.7/site-packages/pymc/sampling_jax.py in sample_numpyro_nuts(draws, tune, chains, target_accept, random_seed, initvals, model, var_names, progress_bar, keep_untransformed, chain_method, postprocessing_backend, idata_kwargs, nuts_kwargs)
    568         dims=dims,
    569         attrs=make_attrs(attrs, library=numpyro),
--> 570         **idata_kwargs,
    571     )
    572 

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in from_dict(posterior, posterior_predictive, predictions, sample_stats, log_likelihood, prior, prior_predictive, sample_stats_prior, observed_data, constant_data, predictions_constant_data, warmup_posterior, warmup_posterior_predictive, warmup_predictions, warmup_log_likelihood, warmup_sample_stats, save_warmup, index_origin, coords, dims, pred_dims, pred_coords, attrs, **kwargs)
    457         pred_coords=pred_coords,
    458         attrs=attrs,
--> 459         **kwargs,
    460     ).to_inference_data()

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in to_inference_data(self)
    333         return InferenceData(
    334             **{
--> 335                 "posterior": self.posterior_to_xarray(),
    336                 "sample_stats": self.sample_stats_to_xarray(),
    337                 "log_likelihood": self.log_likelihood_to_xarray(),

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in wrapped(cls)
     63                 if all((getattr(cls, prop_i) is None for prop_i in prop)):
     64                     return None
---> 65             return func(cls)
     66 
     67         return wrapped

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in posterior_to_xarray(self)
    102                 dims=self.dims,
    103                 attrs=posterior_attrs,
--> 104                 index_origin=self.index_origin,
    105             ),
    106             dict_to_dataset(

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in dict_to_dataset(data, attrs, library, coords, dims, default_dims, index_origin, skip_event_dims)
    312             default_dims=default_dims,
    313             index_origin=index_origin,
--> 314             skip_event_dims=skip_event_dims,
    315         )
    316     return xr.Dataset(data_vars=data_vars, attrs=make_attrs(attrs=attrs, library=library))

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in numpy_to_data_array(ary, var_name, coords, dims, default_dims, index_origin, skip_event_dims)
    252     # filter coords based on the dims
    253     coords = {key: xr.IndexVariable((key,), data=np.asarray(coords[key])) for key in dims}
--> 254     return xr.DataArray(ary, coords=coords, dims=dims)
    255 
    256 

/opt/conda/lib/python3.7/site-packages/xarray/core/dataarray.py in __init__(self, data, coords, dims, name, attrs, indexes, fastpath)
    404             data = _check_data_shape(data, coords, dims)
    405             data = as_compatible_data(data)
--> 406             coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
    407             variable = Variable(dims, data, attrs, fastpath=True)
    408             indexes = dict(

/opt/conda/lib/python3.7/site-packages/xarray/core/dataarray.py in _infer_coords_and_dims(shape, coords, dims)
    153             if s != sizes[d]:
    154                 raise ValueError(
--> 155                     f"conflicting sizes for dimension {d!r}: "
    156                     f"length {sizes[d]} on the data but length {s} on "
    157                     f"coordinate {k!r}"

ValueError: conflicting sizes for dimension 'location': length 20 on the data but length 943 on coordinate 'location'

So there are 943 unique locations? But there are only 20 location labels?

16 locations, 943 rows in the data. The 20 is the yearly n_components*2, i.e. the length of yearly_beta.
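That pins it down: yearly_beta has length n_components*2 = 20, and at.dot(yearly_f_, yearly_beta) produces one value per row, so its natural dim is 'obs_id', not 'location'. A minimal sketch of how the dims could line up (the 'fourier_yearly' coordinate name is made up for illustration):

coords_ = {
    "location": locations,                         # 16
    "obs_id": np.arange(len(location_idxs)),       # 943
    "fourier_yearly": np.arange(n_components * 2), # 20
}

with pm.Model(coords=coords_) as partial_pooled_model:
    ...
    yearly_beta = pm.Normal('yearly_beta', mu_yearly, sigma_yearly,
                            dims='fourier_yearly')                   # shape (20,)
    yearly_seasonality = pm.Deterministic('yearly_seasonality',
                                          at.dot(yearly_f_, yearly_beta),
                                          dims='obs_id')             # shape (943,)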