Why are my coords creating an error after sampling is complete?

Hello,

I’m trying to build a hierarchical time series model of sales for one product at 16 different locations. The model runs fine when fit to a single location.

I followed (as closely as possible) this article: GLM: Hierarchical Linear Regression — PyMC example gallery

I specifically followed the example:
county_idxs, counties = pd.factorize(data.county)
coords = {
    "county": counties,
    "obs_id": np.arange(len(county_idxs)),
}

When I factorize my data using:

location_idxs, locations = pd.factorize(df['location'])
coords_ = {
    "location": locations,
    "obs_id": np.arange(len(location_idxs)),
}

I get 16 unique locations and 943 observation IDs.
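As a quick sanity check (a minimal sketch; df is the same DataFrame as above), the two coordinates should come out with very different lengths:

print(len(locations))      # 16  -> length of the 'location' coordinate
print(len(location_idxs))  # 943 -> length of the 'obs_id' coordinate
assert len(locations) == df['location'].nunique()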

When I run my model, it samples fine then I get the following error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_6258/1398150208.py in <module>
     92         )
     93 
---> 94     trace = pymc.sampling_jax.sample_numpyro_nuts(tune=2000, draws = 2000)

/opt/conda/lib/python3.7/site-packages/pymc/sampling_jax.py in sample_numpyro_nuts(draws, tune, chains, target_accept, random_seed, initvals, model, var_names, progress_bar, keep_untransformed, chain_method, postprocessing_backend, idata_kwargs, nuts_kwargs)
    568         dims=dims,
    569         attrs=make_attrs(attrs, library=numpyro),
--> 570         **idata_kwargs,
    571     )
    572 

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in from_dict(posterior, posterior_predictive, predictions, sample_stats, log_likelihood, prior, prior_predictive, sample_stats_prior, observed_data, constant_data, predictions_constant_data, warmup_posterior, warmup_posterior_predictive, warmup_predictions, warmup_log_likelihood, warmup_sample_stats, save_warmup, index_origin, coords, dims, pred_dims, pred_coords, attrs, **kwargs)
    457         pred_coords=pred_coords,
    458         attrs=attrs,
--> 459         **kwargs,
    460     ).to_inference_data()

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in to_inference_data(self)
    333         return InferenceData(
    334             **{
--> 335                 "posterior": self.posterior_to_xarray(),
    336                 "sample_stats": self.sample_stats_to_xarray(),
    337                 "log_likelihood": self.log_likelihood_to_xarray(),

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in wrapped(cls)
     63                 if all((getattr(cls, prop_i) is None for prop_i in prop)):
     64                     return None
---> 65             return func(cls)
     66 
     67         return wrapped

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in posterior_to_xarray(self)
    102                 dims=self.dims,
    103                 attrs=posterior_attrs,
--> 104                 index_origin=self.index_origin,
    105             ),
    106             dict_to_dataset(

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in dict_to_dataset(data, attrs, library, coords, dims, default_dims, index_origin, skip_event_dims)
    312             default_dims=default_dims,
    313             index_origin=index_origin,
--> 314             skip_event_dims=skip_event_dims,
    315         )
    316     return xr.Dataset(data_vars=data_vars, attrs=make_attrs(attrs=attrs, library=library))

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in numpy_to_data_array(ary, var_name, coords, dims, default_dims, index_origin, skip_event_dims)
    252     # filter coords based on the dims
    253     coords = {key: xr.IndexVariable((key,), data=np.asarray(coords[key])) for key in dims}
--> 254     return xr.DataArray(ary, coords=coords, dims=dims)
    255 
    256 

/opt/conda/lib/python3.7/site-packages/xarray/core/dataarray.py in __init__(self, data, coords, dims, name, attrs, indexes, fastpath)
    404             data = _check_data_shape(data, coords, dims)
    405             data = as_compatible_data(data)
--> 406             coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
    407             variable = Variable(dims, data, attrs, fastpath=True)
    408             indexes = dict(

/opt/conda/lib/python3.7/site-packages/xarray/core/dataarray.py in _infer_coords_and_dims(shape, coords, dims)
    153             if s != sizes[d]:
    154                 raise ValueError(
--> 155                     f"conflicting sizes for dimension {d!r}: "
    156                     f"length {sizes[d]} on the data but length {s} on "
    157                     f"coordinate {k!r}"

ValueError: conflicting sizes for dimension 'location': length 943 on the data but length 16 on coordinate 'location'

Here is my model for reference:

# Imports assumed by the model below
import numpy as np
import pandas as pd
import pymc as pm
import pymc.sampling_jax
import aesara.tensor as at

n_changepoints = 8
t = np.arange(len(df)) / len(df)  # time index scaled to [0, 1)

s = np.linspace(0, np.max(t), n_changepoints + 2)[1:-1]  # interior changepoint locations

a = (t[:, None] > s) * 1  # (n_obs, n_changepoints) changepoint indicator matrix

n_components = 10

def yearly_X(t, p=365.25, n=n_components):
    # Fourier design matrix: n cosine + n sine columns -> shape (len(t), 2*n)
    x = 2 * np.pi * (np.arange(n) + 1) * t[:, None] / p
    return np.concatenate((np.cos(x), np.sin(x)), axis=1)

monthly_n_components = 5
def monthly_X(t, p=30.5, n=monthly_n_components):
    x = 2 * np.pi * (np.arange(n) + 1) * t[:, None] / p
    return np.concatenate((np.cos(x), np.sin(x)), axis=1)

# df_sat['promo_flag_month_before'] = df_sat['promo_flag'].shift(1)
# df_sat['promo_flag_month_before'] = df_sat['promo_flag_month_before'].fillna(method = 'bfill')
# promo_before = np.array(df_sat['promo_flag_month_before'])
# promo = np.array(df_sat['promo_flag'])
month = np.array(df['month'])
yearly_fourier = yearly_X(t, 365.25 / len(t))   # periods rescaled because t is normalized to [0, 1)
monthly_fourier = monthly_X(t, 30.5 / len(t))


location_idxs, locations = pd.factorize(df['location'])
coords_={
    "location":locations,
    "obs_id":np.arange(len(location_idxs))
}

with pm.Model(coords=coords_) as partial_pooled_model:
    location_idx = pm.ConstantData('location_idx', location_idxs, dims = 'obs_id')
    t_ = pm.MutableData('t', t)
    a_ = pm.MutableData('a', a)
    s_ = pm.MutableData('s', s)
    yearly_f_ = pm.MutableData('yearly_f_', yearly_fourier)
    monthly_m_ = pm.MutableData('monthly_m_',monthly_fourier)
    sept_month = pm.MutableData('sept_month', sept)    # sept and march: 0/1 month indicator arrays, defined elsewhere
    march_month = pm.MutableData('march_month', march)
    eaches = pm.MutableData('eaches', df['eaches'])
    
    mu_k = pm.Normal('mu_k', 0,100)
    sigma_k = pm.HalfNormal('sigma_k', 5)
    k = pm.Normal('k', mu_k, sigma_k)
    
    mu_m = pm.Normal('mu_m', 0, 100)
    sigma_m = pm.HalfNormal('sigma_m', 5)
    m = pm.Normal('m', mu_m, sigma_m)
    delta = pm.Laplace('delta', 0, 0.05, shape=n_changepoints)

    growth = (k + at.dot(a_, delta))
    offset = (m + at.dot(a_, -s_ * delta))
    trend = pm.Deterministic('trend', growth * t_ + offset, dims = 'location')
    
    mu_yearly = pm.Normal('mu_yearly', 0, 100)
    sigma_yearly = pm.HalfNormal('sigma_yearly', 5)
    yearly_beta = pm.Normal('yearly_beta', mu_yearly, sigma_yearly, shape = n_components*2)
    yearly_seasonality = pm.Deterministic('yearly_seasonality',at.dot(yearly_f_, yearly_beta), dims = 'location')

    mu_monthly = pm.Normal('mu_monthly', 0, 100)
    sigma_monthly = pm.HalfNormal('sigma_monthly', 5)
    monthly_beta = pm.Normal('monthly_beta', mu_monthly, sigma_monthly, shape = monthly_n_components*2)
    monthly_seasonality = pm.Deterministic('monthly_seasonality',at.dot(monthly_m_, monthly_beta), dims = 'location')
    
    mu_sept = pm.Normal('mu_sept', 0, 100)
    sigma_sept = pm.HalfNormal('sigma_sept', 5)
    sept_beta = pm.Normal('sept_beta', mu_sept,sigma_sept, dims = 'location')
    
    mu_march = pm.Normal('mu_march', 0, 100)
    sigma_march = pm.HalfNormal('sigma_march', 5)
    march_beta = pm.Normal('march_beta', mu_march, sigma_march, dims = 'location')

    # promo_beta_before = pm.Normal('promo_beta_before', 0, 1)
    # promo_beta = pm.Normal('promo_beta', 0, 1)
    # promo_beta_interaction = pm.Normal('promo_beta_interaction', 0, 1)

    predictions =  pm.Deterministic('predictions', np.exp(trend[location_idx] + monthly_seasonality[location_idx] + 
                                                          yearly_seasonality[location_idx] + (sept_beta[location_idx]*sept_month) 
                                                          + (march_beta[location_idx]*march_month)))


    error = pm.HalfCauchy('error', 0.5)  # note: unused below; the Poisson likelihood has no scale parameter

    pm.Poisson('obs',
              predictions,
              observed=eaches,
               dims='obs_id'
        )

    trace = pymc.sampling_jax.sample_numpyro_nuts(tune=2000, draws=2000)

pm.sample doesn’t actually check that the coords you provide match the dimensions of the outputs until after sampling is complete (I’ve wasted a lot of time this way). The best pre-check I’ve found is to always run pm.sample_prior_predictive before you sample.
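For example (a minimal sketch; the draw count is arbitrary), a handful of prior draws triggers the same coords/shape validation in seconds, before any NUTS tuning happens:

with partial_pooled_model:
    # Fails fast with the same "conflicting sizes" error if coords and shapes disagree
    prior = pm.sample_prior_predictive(samples=10)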

As for why the coords are the wrong shape, I suspect the culprit is this line:

trend = pm.Deterministic('trend', growth * t_ + offset, dims = 'location')

This looks like it should have a shape like time, not like locations. Is the number 943 the number of time steps or the number of observations? That number is a hint as to what is going wrong.
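Tracing the shapes with plain NumPy makes the point (a sketch using the sizes from the traceback: 943 observations, 8 changepoints; the scalar values are placeholders):

a = np.zeros((943, 8))   # changepoint indicator matrix
delta = np.zeros(8)
t = np.zeros(943)
trend = (1.0 + a @ delta) * t + 1.0
print(trend.shape)       # (943,) -- one value per observation/time step, not per location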

943 is the number of timesteps summed across all locations. I’m not sure if it matters, but it works out to roughly 60 timesteps per location.
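For what it’s worth, that breakdown is easy to confirm (assuming one row per location/timestep pair):

df.groupby('location').size()   # ~60 rows for each of the 16 locations, 943 in total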

When I take the dims out of “trend”, I now get a shape error around the Fourier terms instead:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_6258/1969810355.py in <module>
     92         )
     93 
---> 94     trace = pymc.sampling_jax.sample_numpyro_nuts(tune=2000, draws = 2000)
     95 # pm.model_to_graphviz( partial_pooled_model)
     96 

/opt/conda/lib/python3.7/site-packages/pymc/sampling_jax.py in sample_numpyro_nuts(draws, tune, chains, target_accept, random_seed, initvals, model, var_names, progress_bar, keep_untransformed, chain_method, postprocessing_backend, idata_kwargs, nuts_kwargs)
    568         dims=dims,
    569         attrs=make_attrs(attrs, library=numpyro),
--> 570         **idata_kwargs,
    571     )
    572 

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in from_dict(posterior, posterior_predictive, predictions, sample_stats, log_likelihood, prior, prior_predictive, sample_stats_prior, observed_data, constant_data, predictions_constant_data, warmup_posterior, warmup_posterior_predictive, warmup_predictions, warmup_log_likelihood, warmup_sample_stats, save_warmup, index_origin, coords, dims, pred_dims, pred_coords, attrs, **kwargs)
    457         pred_coords=pred_coords,
    458         attrs=attrs,
--> 459         **kwargs,
    460     ).to_inference_data()

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in to_inference_data(self)
    333         return InferenceData(
    334             **{
--> 335                 "posterior": self.posterior_to_xarray(),
    336                 "sample_stats": self.sample_stats_to_xarray(),
    337                 "log_likelihood": self.log_likelihood_to_xarray(),

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in wrapped(cls)
     63                 if all((getattr(cls, prop_i) is None for prop_i in prop)):
     64                     return None
---> 65             return func(cls)
     66 
     67         return wrapped

/opt/conda/lib/python3.7/site-packages/arviz/data/io_dict.py in posterior_to_xarray(self)
    102                 dims=self.dims,
    103                 attrs=posterior_attrs,
--> 104                 index_origin=self.index_origin,
    105             ),
    106             dict_to_dataset(

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in dict_to_dataset(data, attrs, library, coords, dims, default_dims, index_origin, skip_event_dims)
    312             default_dims=default_dims,
    313             index_origin=index_origin,
--> 314             skip_event_dims=skip_event_dims,
    315         )
    316     return xr.Dataset(data_vars=data_vars, attrs=make_attrs(attrs=attrs, library=library))

/opt/conda/lib/python3.7/site-packages/arviz/data/base.py in numpy_to_data_array(ary, var_name, coords, dims, default_dims, index_origin, skip_event_dims)
    252     # filter coords based on the dims
    253     coords = {key: xr.IndexVariable((key,), data=np.asarray(coords[key])) for key in dims}
--> 254     return xr.DataArray(ary, coords=coords, dims=dims)
    255 
    256 

/opt/conda/lib/python3.7/site-packages/xarray/core/dataarray.py in __init__(self, data, coords, dims, name, attrs, indexes, fastpath)
    404             data = _check_data_shape(data, coords, dims)
    405             data = as_compatible_data(data)
--> 406             coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
    407             variable = Variable(dims, data, attrs, fastpath=True)
    408             indexes = dict(

/opt/conda/lib/python3.7/site-packages/xarray/core/dataarray.py in _infer_coords_and_dims(shape, coords, dims)
    153             if s != sizes[d]:
    154                 raise ValueError(
--> 155                     f"conflicting sizes for dimension {d!r}: "
    156                     f"length {sizes[d]} on the data but length {s} on "
    157                     f"coordinate {k!r}"

ValueError: conflicting sizes for dimension 'location': length 20 on the data but length 943 on coordinate 'location'

So there are 943 unique locations? But there are only 20 location labels?

16 locations, 943 rows in the data. The 20 is the yearly n_components*2, i.e. the length of yearly_beta.
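That pins it down: yearly_beta has length n_components*2 = 20, and at.dot(yearly_f_, yearly_beta) produces one value per row, so its natural dim is 'obs_id', not 'location'. A minimal sketch of how the dims could line up (the 'fourier_yearly' coordinate name is made up for illustration):

coords_ = {
    "location": locations,                         # 16
    "obs_id": np.arange(len(location_idxs)),       # 943
    "fourier_yearly": np.arange(n_components * 2), # 20
}

with pm.Model(coords=coords_) as partial_pooled_model:
    ...
    yearly_beta = pm.Normal('yearly_beta', mu_yearly, sigma_yearly,
                            dims='fourier_yearly')                   # shape (20,)
    yearly_seasonality = pm.Deterministic('yearly_seasonality',
                                          at.dot(yearly_f_, yearly_beta),
                                          dims='obs_id')             # shape (943,)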