How do I make future predictions in a time series model with out-of-sample data?

Hello.

I’m trying to sample the posterior with out-of-sample data to test my model. I thought I would have to update the coords along with the shared variables to account for the new time values, but I’m getting the following error when running the code block below.

#update time, location, item, and month indices
time_idxs_test, times_test = pd.factorize(df_test.index.get_level_values(0))
location_idxs_test, locations_test = pd.factorize(df_test.index.get_level_values(1))
item_idxs_test, items_test = pd.factorize(df_test.index.get_level_values(2))
month_idxs_test, months_test = pd.factorize(df_test.index.get_level_values(0).month)

#update trend and changepoint matrices
t_test = time_idxs_test/max(time_idxs_test)
n_changepoints_test = 8
s_test = np.linspace(0, np.max(t_test), n_changepoints_test+2)[1:-1]
A_test = (t_test[:, None] > s_test)*1


#update target variable
y_test = np.array(df_test['eaches'])

#update fourier matrix
yearly_fourier_test = create_fourier_features(t_test, n=5,  p=12/max(time_idxs_test))

#update model
coords_test={"locations":locations_test,
            "items":items_test,
            'months':months_test,
            'changepoints':df_test.index.get_level_values(0)[np.argwhere(np.diff(A_test, axis=0) != 0)[:, 0]],
            "yearly_components": [f'yearly_{f}_{i+1}' for f in ['cos', 'sin'] for i in range(yearly_fourier_test.shape[1] // 2)],
            "obs_id":[f'{loc}_{time.year}_month_{time.month}_item_{item}' for time, loc, item in df_test.index.values]}


pm.set_data(new_data = {'t':t_test,
                       's':s_test,
                       'A':A_test,
                       'yearly_season':yearly_fourier_test},
            coords=coords_test,
            model = model)


---------------------------------------------------------------------------
ShapeError                                Traceback (most recent call last)
/tmp/ipykernel_3891/2405461556.py in <module>
     28                        'yearly_season':yearly_fourier_test},
     29             coords=coords_test,
---> 30             model = model)
     31 
     32 # test_ppc = pm.sample_posterior_predictive(trace, model=model)

/opt/conda/lib/python3.7/site-packages/pymc/model.py in set_data(new_data, model, coords)
   1873 
   1874     for variable_name, new_value in new_data.items():
-> 1875         model.set_data(variable_name, new_value, coords=coords)
   1876 
   1877 

/opt/conda/lib/python3.7/site-packages/pymc/model.py in set_data(self, name, values, coords)
   1261                     # definitely lead to shape problems.
   1262                     raise ShapeError(
-> 1263                         f"Resizing dimension '{dname}' is impossible, because "
   1264                         "a `TensorConstant` stores its length. To be able "
   1265                         "to change the dimension length, pass `mutable=True` when "

ShapeError: Resizing dimension 'obs_id' is impossible, because a `TensorConstant` stores its length. To be able to change the dimension length, pass `mutable=True` when registering the dimension via `model.add_coord`, or define it via a `pm.MutableData` variable.

At this point I tried the following:

with model:
    model.add_coords(coords_test)

That threw the following error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_3891/1802801453.py in <module>
     23 
     24 with model:
---> 25     model.add_coords(coords_test)
     26 
     27 pm.set_data(new_data = {'t':t_test,

/opt/conda/lib/python3.7/site-packages/pymc/model.py in add_coords(self, coords, lengths)
   1163 
   1164         for name, values in coords.items():
-> 1165             self.add_coord(name, values, length=lengths.get(name, None))
   1166 
   1167     def set_dim(self, name: str, new_length: int, coord_values: Optional[Sequence] = None):

/opt/conda/lib/python3.7/site-packages/pymc/model.py in add_coord(self, name, values, mutable, length)
   1136         if name in self.coords:
   1137             if not np.array_equal(values, self.coords[name]):
-> 1138                 raise ValueError(f"Duplicate and incompatible coordinate: {name}.")
   1139         if length is not None and not isinstance(length, (int, Variable)):
   1140             raise ValueError(

ValueError: Duplicate and incompatible coordinate: items.

So I’m not sure how to update all the coords with the out-of-sample dataset without duplicating items or obs_id. Is there a proper way to do this?

Hi! Pretty sure the answer is to define the coords within the model context, rather than passing them to the context manager.

WRONG:

coords = {'c': c}
with pm.Model(coords=coords) as model:
    etc...

RIGHT:

with pm.Model() as model:
    model.add_coord('c', c, mutable=True)

Then, when setting out of sample data:

with model:
    model.set_data('data', data, coords={'c': new_c})

See:

https://www.pymc.io/projects/docs/en/latest/api/generated/pymc.set_data.html

The fact that you can set coords through the model.set_data() method isn’t well documented.

I had a more detailed example, but the page refreshed. Hope this helps.
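
Roughly, though, the whole pattern looks like this on toy data (a minimal sketch with illustrative names, not your model):

import numpy as np
import pymc as pm

x_train, y_train = np.arange(10.0), 2 * np.arange(10.0) + 1
x_test = np.arange(10.0, 15.0)

with pm.Model() as model:
    # register the resizable dim up front
    model.add_coord('obs_id', np.arange(len(x_train)), mutable=True)
    x = pm.MutableData('x', x_train, dims='obs_id')
    y = pm.MutableData('y', y_train, dims='obs_id')
    beta = pm.Normal('beta', 0, 5)
    sigma = pm.HalfNormal('sigma', 1)
    pm.Normal('obs', mu=beta * x, sigma=sigma, observed=y, dims='obs_id')
    trace = pm.sample()

# out of sample: swap in the test data and a matching coord, then forward-sample
with model:
    pm.set_data({'x': x_test, 'y': np.zeros_like(x_test)},
                coords={'obs_id': np.arange(len(x_train), len(x_train) + len(x_test))})
    ppc = pm.sample_posterior_predictive(trace)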

Thank you. I changed it as you specified, to this…

coords={"items":items,
        'months':months,
        'changepoints':df_train.index.get_level_values(0)[np.argwhere(np.diff(A, axis=0) != 0)[:, 0]],
        "yearly_components": [f'yearly_{f}_{i+1}' for f in ['cos', 'sin'] for i in range(yearly_fourier.shape[1] // 2)],
        "obs_id":[f'{time.year}_month_{time.month}_item_{item}' for time, item in df_train.index.values]}

with pm.Model() as monthly_model:
        
    model.add_coord('coords', coords, mutable = True)
    A_ = pm.Data('A', A, mutable=True, dims=['time', 'changepoints'])
    s_ = pm.Data('s', s, mutable=True, dims=['changepoints'])
    t_ = pm.Data('t', t, mutable=True, dims=['time'])
    yearly = pm.Data('yearly_season', yearly_fourier, mutable=True, dims=['obs_id', 'yearly_components'])
    ...the rest of the model

But when I do this, the dims don’t seem to pick up the coords…

…and I got this error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/tmp/ipykernel_74304/458011986.py in <module>
     33     sigma_item_slope = pm.HalfNormal('sigma_item_slope', sigma=0.1)
     34     # offset_loc_slope = pm.Normal('offset_loc_slope', mu=0, sigma=0.1, dims=['locations'])
---> 35     offset_item_slope = pm.Normal('offset_item_slope', mu=0, sigma=0.1, dims=['items'])
     36 
     37 

/opt/conda/lib/python3.7/site-packages/pymc/distributions/distribution.py in __new__(cls, name, rng, dims, initval, observed, total_size, transform, *args, **kwargs)
    262         # a shape by which the created RV may need to be resized.
    263         rv_out, dims, observed, resize_shape = _make_rv_and_resize_shape(
--> 264             cls=cls, dims=dims, model=model, observed=observed, args=args, **kwargs
    265         )
    266 

/opt/conda/lib/python3.7/site-packages/pymc/distributions/distribution.py in _make_rv_and_resize_shape(cls, dims, model, observed, args, **kwargs)
    173     if dims is not None:
    174         if dims_can_resize:
--> 175             resize_shape, dims = resize_from_dims(dims, ndim_actual, model)
    176         elif Ellipsis in dims:
    177             # Replace ... with None entries to match the actual dimensionality.

/opt/conda/lib/python3.7/site-packages/pymc/distributions/shape_utils.py in resize_from_dims(dims, ndim_implied, model)
    529     if unknowndim_resize_dims:
    530         raise KeyError(
--> 531             f"Dimensions {unknowndim_resize_dims} are unknown to the model and cannot be used to specify a `size`."
    532         )
    533 

KeyError: "Dimensions {'items'} are unknown to the model and cannot be used to specify a `size`."

I also tried…

with pm.Model() as monthly_model:
        
    model.add_coord('items', items, mutable=True)
    model.add_coord('months', months, mutable=True)
    model.add_coord('changepoints', df_train.index.get_level_values(0)[np.argwhere(np.diff(A, axis=0) != 0)[:, 0]], mutable=True)
    model.add_coord('yearly_components', [f'yearly_{f}_{i+1}' for f in ['cos', 'sin'] for i in range(yearly_fourier.shape[1] // 2)], mutable=True)
    model.add_coord('obs_id', [f'{time.year}_month_{time.month}_item_{item}' for time, item in df_train.index.values], mutable=True)
    A_ = pm.Data('A', A, mutable=True, dims=['time', 'changepoints'])
    s_ = pm.Data('s', s, mutable=True, dims=['changepoints'])
    t_ = pm.Data('t', t, mutable=True, dims=['time'])
    yearly = pm.Data('yearly_season', yearly_fourier, mutable=True, dims=['obs_id', 'yearly_components'])

and got the same error.

model.add_coord should be monthly_model.add_coord

Namespaces still matter inside the context manager.
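
In other words, the second attempt should read (same variables, only the receiver changed):

with pm.Model() as monthly_model:
    # call add_coord on monthly_model, not on the earlier `model` object
    monthly_model.add_coord('items', items, mutable=True)
    monthly_model.add_coord('months', months, mutable=True)
    # ...and likewise for changepoints, yearly_components, and obs_id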

Well, that’s embarrassing. Refitting now.

Thank you.

Using the model.add_coord method with mutable=True appears to break pm.sample_posterior_predictive. Here’s a simple example, estimating the loc and scale of three independent normals:

# Generate data
data = np.random.normal(loc=np.array([3, 5, 8]), scale=np.array([1.1, 6.3, 9.1]), size=(1000, 3))

# Model 1: No coords
with pm.Model() as no_coords_model:
    mu = pm.Normal('mu', mu=0, sigma=10, size=3)
    sigma = pm.HalfNormal('sigma', sigma=10, size=3)
    
    ll = pm.Normal('obs', mu=mu, sigma=sigma, observed=data)
    no_coords_trace = pm.sample()
    no_coords_post = pm.sample_posterior_predictive(no_coords_trace)

# Model 2: Context manager
coords = {'name': ['A', 'B', 'C']}
with pm.Model(coords=coords) as context_model:
    mu = pm.Normal('mu', mu=0, sigma=10, dims=['name'])
    sigma = pm.HalfNormal('sigma', sigma=10, dims=['name'])
    ll = pm.Normal('obs', mu=mu, sigma=sigma, observed=data)
    
    context_trace = pm.sample()
    context_post = pm.sample_posterior_predictive(context_trace)

# Model 3: Within model
with pm.Model() as within_model:
    within_model.add_coord('name', ['A', 'B', 'C'], mutable=True)
    mu = pm.Normal('mu', mu=0, sigma=10, dims=['name'])
    sigma = pm.HalfNormal('sigma', sigma=10, dims=['name'])
    
    ll = pm.Normal('obs', mu=mu, sigma=sigma, observed=data)
    within_trace = pm.sample()
    within_post = pm.sample_posterior_predictive(within_trace)

The mean posterior values for mu and sigma are essentially identical across the three models:

traces = [no_coords_trace, context_trace, within_trace]
mus = [trace.posterior.mu.values[..., i].mean() for trace in traces for i in range(3)]
sigmas = [trace.posterior.sigma.values[..., i].mean() for trace in traces for i in range(3)]
post_df = pd.DataFrame(np.c_[mus, sigmas], columns=['mu', 'sigma'], index=pd.MultiIndex.from_product([['no_coords', 'context', 'within'], ['A', 'B', 'C']]))
print(post_df.unstack(1).to_string())

                 mu                         sigma                    
                  A         B         C         A         B         C
context    2.977460  4.982624  7.826642  1.081710  6.287514  9.165928
no_coords  2.976785  4.984743  7.827109  1.081657  6.289910  9.174939
within     2.976568  4.990646  7.825051  1.081552  6.286198  9.167916

But something appears to be happening with the posterior predictive values:

pps = [no_coords_post, context_post, within_post]
mean_value = [post.posterior_predictive.obs.values[..., i].mean() for post in pps for i in range(3)]
post_df = pd.DataFrame(mean_value, columns=['mean_ppc'], index=pd.MultiIndex.from_product([['no_coords', 'context', 'within'], ['A', 'B', 'C']]))
print(post_df.unstack(1).to_string())

           mean_ppc                    
                  A         B         C
context    2.977167  4.985852  7.825006
no_coords  2.976837  4.982244  7.818495
within    -0.045788 -0.594845 -0.270400

The dims on within_post are the same as the others, but it seems like totally wrong values are getting sampled. Tellingly, the mean of each column is nowhere near the grand mean of the data, which rules out correct values simply being shuffled between columns.
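
A quick sanity check of that reasoning against the simulated data:

# if the 'within' ppc were just shuffling correct draws across columns, each
# column mean would sit near the grand mean (~5.3), not near 0 as observed
print(data.mean())         # grand mean over all observations, roughly 5.3
print(data.mean(axis=0))   # per-column means, roughly [3, 5, 8]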


Just tested this out, same results for me. I’m hoping this is just a bug. I don’t know of any other way to do out-of-sample predictions.

Hoping that you don’t mind if I repost your MWE to the GitHub issue tracker:


Back to the main question of how to make out-of-sample predictions: I’ve put some thought into it, and I think the method of updating coords is correct if you’re using PyMC itself to do OOS prediction.

However, after sampling the posterior with pm.sample()/JAX, you have full distributions for your RVs. So if you have coded up a data generating process that matches your model, you can plug your sampled RVs, along with your OOS data, into that data generating process, then sample it as much as you want to get a distribution of predictions.
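
For example, a rough sketch of that manual approach for a simple linear data generating process (hypothetical names: trace from pm.sample(), RVs betas and sigma, X_test as the OOS design matrix):

import numpy as np

# stack (chain, draw) into one sample axis: shape (n_feats, n_samples)
beta_draws = trace.posterior['betas'].stack(sample=('chain', 'draw')).values
sigma_draws = trace.posterior['sigma'].stack(sample=('chain', 'draw')).values

# push every posterior draw through the data generating process
mu_pred = X_test @ beta_draws                    # (n_obs, n_samples)
y_pred = np.random.normal(mu_pred, sigma_draws)  # add observation noise per draw

y_mean = y_pred.mean(axis=1)                     # predictive mean per OOS point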

This is coming from someone who is very new to Bayesian inference and PyMC.

Thanks. If I have OOS data that was just a test data set broken off from the training set, with its own dims and coords, what would that look like syntactically?


I followed this thread’s solution, but it doesn’t seem to work. My MWE code gives the following error:

ValueError: conflicting sizes for dimension 'date': length 150 on the data but length 50 on coordinate 'date'

What should I do?

MWE code:

import pymc as pm
import arviz as az
import numpy as np
import pandas as pd
from scipy.signal import medfilt

np.random.seed(1337)

n_size = 200
st_date = "2020-01-01"
data = pd.DataFrame({
    "date": pd.date_range(st_date, pd.to_datetime(st_date) + pd.Timedelta(days=n_size - 1)),
    "x1": medfilt(np.random.randn(n_size).cumsum(), 5),
    "x2": medfilt(np.random.randn(n_size).cumsum(), 5),
    "x3": medfilt(np.random.randn(n_size).cumsum(), 5),
    "beta_1": np.random.normal(0.5, 0.8, size=n_size),
    "beta_2": np.random.normal(-2, 1, size=n_size),
    "beta_3": np.random.normal(2, 2, size=n_size),
    "noise": 0.5*np.random.rand(n_size)
}).assign(**{
    "y": lambda df: df["beta_1"] * df["x1"] + df["beta_2"] * df["x2"] + df["beta_3"] * df["x3"] + df["noise"]
})

train_data = data.iloc[:int(0.75 * n_size)]
test_data = data.iloc[int(0.75 * n_size):]

coords = {"date": train_data["date"], "feats": ["x1", "x2", "x3"]}
with pm.Model() as model:
    for k in coords.keys():
        model.add_coord(k, coords[k], mutable=True)
    xs_ = pm.MutableData(name="x_inputs", value=train_data[["x1", "x2", "x3"]].to_numpy(), dims=["date", "feats"])
    
    betas = pm.Normal("betas", mu=0, sigma=1, dims="feats")
    mu = pm.Deterministic("mu", var=(betas.reshape((-1, len(coords["feats"]))) * xs_).sum(axis=1), dims="date")
    sigma = pm.HalfNormal("sigma", sigma=1)
    
    pm.Normal("likelihood", mu=mu, sigma=sigma, observed=train_data["y"].to_numpy(), dims="date")

    # Fit to the data
    model_trace = pm.sample(
        nuts_sampler="numpyro",
        chains=4,
        idata_kwargs={"log_likelihood": True},
    )
    model_posterior_predictive = pm.sample_posterior_predictive(
        trace=model_trace
    )
    
display(pm.model_to_graphviz(model=model))
display(az.summary(data=model_trace, var_names=["betas"]))

# Predict
test_coords = {"date": test_data["date"], "feats": ["x1", "x2", "x3"]}
with model:
    for k in test_coords.keys():
        model.set_dim(k, len(test_coords[k]), test_coords[k])
    model.set_data("x_inputs", test_data[["x1", "x2", "x3"]].to_numpy(), coords=test_coords)
    
    pred_posterior_predictive = pm.sample_posterior_predictive(
        model_trace.posterior
    )

My bad, I forgot to add predictions=True, like so:

pred_posterior_predictive = pm.sample_posterior_predictive(
    model_trace.posterior, predictions=True
)
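
With predictions=True, the out-of-sample draws land in a separate predictions group of the returned InferenceData rather than in posterior_predictive, e.g.:

# 'likelihood' is the name of the observed RV in the model above
y_oos = pred_posterior_predictive.predictions["likelihood"]
print(y_oos.mean(dim=["chain", "draw"]))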

Instead of dims="date" on the likelihood term, use size=xs_.shape[0]

You can use both dims and size simultaneously. Dims are useful if you want to manipulate the resulting xarray objects.
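
Per that suggestion, the likelihood line from the MWE above could then read:

# `size` fixes the length, so dims won't try to resize it;
# `dims` still labels the dimension in the xarray output
pm.Normal("likelihood", mu=mu, sigma=sigma,
          observed=train_data["y"].to_numpy(),
          size=xs_.shape[0], dims="date")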
