I know the notebook - and it is good - but it does not cover predictions. But you are right for the plots. Thank you for that
The best practice for out of sample prediction is to:
- Use
pm.MutableData
objects to hold your data - Include
size=X.shape[0]
in the likelihood term - Use
pm.set_data
together withpm.sample_posterior_predictive
to get out-of-sample predictions.
Here is how it looks with your model:
with pm.Model() as model:
# Here's the mutable data
X_pt = pm.MutableData('X', X_train)
y_pt = pm.MutableData('y', y_train)
α = pm.Normal("α", mu=0, sigma=1)
β = pm.Normal("β", mu=0, sigma=1)
mu = α + β * X_pt
sigma = pm.HalfCauchy("sigma", beta=1)
# The size=X_pt.shape[0] part is very important, or else you will get some errors later on
obs = pm.Normal('obs', mu=mu, sigma=sigma, observed=y_pt, size=X_pt.shape[0])
with model:
trace = pm.sample(1000, tune=1000)
# In-sample prediction
trace = pm.sample_posterior_predictive(trace, extend_inferencedata=True)
# Out-of-sample prediction
pm.set_data({'X':X_test})
# predictions=True makes a new group in the InferenceData object, so your in-sample predictions (saved in "posterior_predictive") won't be overwritten.
trace = pm.sample_posterior_predictive(trace, extend_inferencedata=True, predictions=True)
Plotting the results:
fig, ax = plt.subplots()
for [group, X_data] in zip(['posterior_predictive', 'predictions'], [X_train, X_test]):
hdi = az.hdi(trace[group])
idata = az.extract(trace, group)
ax.plot(X_data, idata.obs.mean(dim=['sample']), label=group)
ax.fill_between(X_data, hdi.obs.sel(hdi='lower'), hdi.obs.sel(hdi='higher'), alpha=0.25)
ax.plot(X, y_data_scaled, color='tab:red', ls='--')
ax.legend()
plt.show()
Wau AMAZING jesse! You are the chief - chiefchecker
Can you give me the whole code? I gives me a bug, when I put it back in my notebook!
What is the error? Check that I didn’t change any names. The only things I did differently to you was to scale the data and use a formal train/test split:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
scaler = StandardScaler()
y_data_scaled = scaler.fit_transform(y_data[:, None]).ravel()
X = np.linspace(0, 1, y_data.shape[0])
X_train, X_test, y_train, y_test = train_test_split(X, y_data_scaled, shuffle=False, test_size=0.1)
fig, ax = plt.subplots()
ax.plot(X_train, y_train)
ax.plot(X_test, y_test)
ValueError Traceback (most recent call last)
Input In [23], in <cell line: 56>()
65 sigma = pm.HalfCauchy(“sigma”, beta=1)
67 # The size=X_pt.shape[0] part is very important, or else you will get some errors later on
—> 68 obs = pm.Normal(‘obs’, mu=mu, sigma=sigma, observed=y_pt, size=X_pt.shape[0])
70 with model:
71 trace = pm.sample(1000, tune=1000)
File ~\anaconda3\lib\site-packages\pymc\distributions\distribution.py:308, in Distribution.new(cls, name, rng, dims, initval, observed, total_size, transform, *args, **kwargs)
305 elif observed is not None:
306 kwargs[“shape”] = tuple(observed.shape)
→ 308 rv_out = cls.dist(*args, **kwargs)
310 rv_out = model.register_rv(
311 rv_out,
312 name,
(…)
317 initval=initval,
318 )
320 # add in pretty-printing support
File ~\anaconda3\lib\site-packages\pymc\distributions\continuous.py:512, in Normal.dist(cls, mu, sigma, tau, **kwargs)
506 sigma = at.as_tensor_variable(sigma)
508 # tau = at.as_tensor_variable(tau)
509 # mean = median = mode = mu = at.as_tensor_variable(floatX(mu))
510 # variance = 1.0 / self.tau
→ 512 return super().dist([mu, sigma], **kwargs)
File ~\anaconda3\lib\site-packages\pymc\distributions\distribution.py:387, in Distribution.dist(cls, dist_params, shape, **kwargs)
385 ndim_supp = cls.rv_op(*dist_params, **kwargs).owner.op.ndim_supp
386 create_size = find_size(shape=shape, size=size, ndim_supp=ndim_supp)
→ 387 rv_out = cls.rv_op(*dist_params, size=create_size, **kwargs)
389 rv_out.logp = _make_nice_attr_error(“rv.logp(x)”, “pm.logp(rv, x)”)
390 rv_out.logcdf = _make_nice_attr_error(“rv.logcdf(x)”, “pm.logcdf(rv, x)”)
File ~\anaconda3\lib\site-packages\pytensor\tensor\random\basic.py:268, in NormalRV.call(self, loc, scale, size, **kwargs)
247 def call(self, loc=0.0, scale=1.0, size=None, **kwargs):
248 r""“Draw samples from a normal distribution.
249
250 Signature
(…)
266
267 “””
→ 268 return super().call(loc, scale, size=size, **kwargs)
File ~\anaconda3\lib\site-packages\pytensor\tensor\random\op.py:290, in RandomVariable.call(self, size, name, rng, dtype, *args, **kwargs)
289 def call(self, *args, size=None, name=None, rng=None, dtype=None, **kwargs):
→ 290 res = super().call(rng, size, dtype, *args, **kwargs)
292 if name is not None:
293 res.name = name
File ~\anaconda3\lib\site-packages\pytensor\graph\op.py:296, in Op.call(self, *inputs, **kwargs)
254 r""“Construct an Apply
node using :meth:Op.make_node
and return its outputs.
255
256 This method is just a wrapper around :meth:Op.make_node
.
(…)
293
294 “””
295 return_list = kwargs.pop(“return_list”, False)
→ 296 node = self.make_node(*inputs, **kwargs)
298 if config.compute_test_value != “off”:
299 compute_test_value(node)
File ~\anaconda3\lib\site-packages\pytensor\tensor\random\op.py:335, in RandomVariable.make_node(self, rng, size, dtype, *dist_params)
330 elif not isinstance(rng.type, RandomType):
331 raise TypeError(
332 “The type of rng should be an instance of either RandomGeneratorType or RandomStateType”
333 )
→ 335 shape = self._infer_shape(size, dist_params)
336 _, static_shape = infer_static_shape(shape)
337 dtype = self.dtype or dtype
File ~\anaconda3\lib\site-packages\pytensor\tensor\random\op.py:202, in RandomVariable._infer_shape(self, size, dist_params, param_shapes)
200 param_batched_dims = getattr(param, “ndim”, 0) - param_ndim_supp
201 if param_batched_dims > size_len:
→ 202 raise ValueError(
203 f"Size length is incompatible with batched dimensions of parameter {i} {param}:\n"
204 f"len(size) = {size_len}, len(batched dims {param}) = {param_batched_dims}. "
205 f"Size length must be 0 or >= {param_batched_dims}"
206 )
208 if self.ndim_supp == 0:
209 return size
ValueError: Size length is incompatible with batched dimensions of parameter 0 Elemwise{add,no_inplace}.0:
len(size) = 1, len(batched dims Elemwise{add,no_inplace}.0) = 2. Size length must be 0 or >= 2
I will need to try more and it will take me some time - anyway this is a perfect blueprint!!!
Got it - my next donation - when I have a bit of money while be in your name
Paying it forward by helping others on the forums here is good too : )
Yes you are right of course but first I will need to get better.
I want to build the BART coalmine example with a forecast of 10 steps and I try to adopt it like you show with:
with pm.Model() as model:
# Define mutable data
data_X = pm.MutableData("data_X", X)
# Define prior for mu
mu = pmb.BART("mu", data_X, Y.shape[0])
# Define likelihood
y_pred = pm.Poisson("y_pred", mu=mu, observed=Y)
# Sampling
idata = pm.sample(random_seed=RANDOM_SEED)
# Generate new data
new_X = generate_new_data(...)
# Use the existing model to generate posterior predictive samples for the new data
with model:
pm.set_data({"data_X": new_X})
ppc = pm.sample_posterior_predictive(idata, var_names=["mu", "y_pred"], samples=10)
but I get
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Input In [21], in <cell line: 1>()
3 data_X = pm.MutableData("data_X", X)
5 # Define prior for mu
----> 6 mu = pmb.BART("mu", data_X, Y.shape[0])
8 # Define likelihood
9 y_pred = pm.Poisson("y_pred", mu=mu, observed=Y)
File ~\anaconda3\lib\site-packages\pymc_bart\bart.py:95, in BART.__new__(cls, name, X, Y, m, alpha, split_prior, **kwargs)
92 manager = Manager()
93 cls.all_trees = manager.list()
---> 95 X, Y = preprocess_xy(X, Y)
97 if split_prior is None:
98 split_prior = []
File ~\anaconda3\lib\site-packages\pymc_bart\bart.py:156, in preprocess_xy(X, Y)
153 if isinstance(X, (Series, DataFrame)):
154 X = X.to_numpy()
--> 156 Y = Y.astype(float)
157 X = X.astype(float)
159 return X, Y
AttributeError: 'int' object has no attribute 'astype'
I put the mutable data and the Y.shape[0] - but there is still a way to go. From your answer some days before I found out that my linear model is to linear in a way … which is an insight by itself - if you know what i mean.
Why did you use Y.shape[0] in the mu
line? What are the arguments to pmb.BART
?
It is my first trial and somewhere I had to put it. I will look into it and write again.
It should be like:
with pm.Model() as model_coal:
μ_ = pmb.BART("μ_", X=x_data, Y=y_data, m=20)
μ = pm.Deterministic("μ", pm.math.abs(μ_))
y_pred = pm.Poisson("y_pred", mu=μ, observed=y_data)
idata_coal = pm.sample(random_seed=RANDOM_SEED)
but with mutable and shape[0] to do the forecast.
ah I see! You were putting the shape[0] in the wrong place, it should always go on the line where you add observed
. You still need to give the actual y_data to pmb.BART
, which was the source of your error (it was looking for data but found an integer (the shape)).
From this discussion, it looks like you can then just proceed as normal, without worrying about the Y data. Make your X data mutable, put X.shape[0] in the observed
part, sample, set_data, sample_posterior_predictive.
It seems I am not to far from a result:
# Create the model
with pm.Model() as model_disasters:
μ_ = pmb.BART("μ_", X=x_data, Y=y_data, m=20)
μ = pm.Deterministic("μ", pm.math.abs(μ_))
y_pred = pm.NegativeBinomial("y_pred", mu=μ, alpha=1, observed=y_data)
idata_disasters = pm.sample(random_seed=RANDOM_SEED)
# Forecast 10 steps into the future
future_years = 10
x_future = np.arange(x_data.min(), x_data.max() + future_years + 1)[:, None]
y_future = np.zeros((x_future.shape[0],))
with model_disasters:
# Generate samples from the posterior predictive distribution
post_pred = pm.sample_posterior_predictive(idata_disasters, var_names=["y_pred"], random_seed=RANDOM_SEED)
# Take the mean of the posterior predictive distribution as the forecast
y_future = post_pred["y_pred"].mean(axis=0)
# Plot the forecast
_, ax = plt.subplots(figsize=(10, 6))
ax.plot(x_centers, disaster_rate_mean, "w", lw=3)
az.plot_hdi(x_centers, disaster_rates, smooth=False)
az.plot_hdi(x_centers, disaster_rates, hdi_prob=0.5, smooth=False, plot_kwargs={"alpha": 0})
ax.plot(disasters, np.zeros_like(disasters) - 0.5, "k|")
ax.plot(x_future, y_future, 'g--', label='forecast')
ax.set_xlabel("years")
ax.set_ylabel("disaster_rate")
ax.legend()
plt.show()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [16], in <cell line: 13>()
11 y_future = np.zeros((x_future.shape[0],))
13 with model_disasters:
14 # Generate samples from the posterior predictive distribution
---> 15 post_pred = pm.sample_posterior_predictive(idata_disasters, var_names=["y_pred"], random_seed=RANDOM_SEED)
16 # Take the mean of the posterior predictive distribution as the forecast
17 y_future = post_pred["y_pred"].mean(axis=0)
File ~\anaconda3\lib\site-packages\pymc\sampling\forward.py:673, in sample_posterior_predictive(trace, model, var_names, sample_dims, random_seed, progressbar, return_inferencedata, extend_inferencedata, predictions, idata_kwargs, compile_kwargs)
671 ikwargs.setdefault("inplace", True)
672 return pm.predictions_to_inference_data(ppc_trace, **ikwargs)
--> 673 idata_pp = pm.to_inference_data(posterior_predictive=ppc_trace, **ikwargs)
675 if extend_inferencedata and idata is not None:
676 idata.extend(idata_pp)
File ~\anaconda3\lib\site-packages\pymc\backends\arviz.py:485, in to_inference_data(trace, prior, posterior_predictive, log_likelihood, coords, dims, sample_dims, model, save_warmup, include_transformed)
482 if isinstance(trace, InferenceData):
483 return trace
--> 485 return InferenceDataConverter(
486 trace=trace,
487 prior=prior,
488 posterior_predictive=posterior_predictive,
489 log_likelihood=log_likelihood,
490 coords=coords,
491 dims=dims,
492 sample_dims=sample_dims,
493 model=model,
494 save_warmup=save_warmup,
495 include_transformed=include_transformed,
496 ).to_inference_data()
File ~\anaconda3\lib\site-packages\pymc\backends\arviz.py:406, in InferenceDataConverter.to_inference_data(self)
396 def to_inference_data(self):
397 """Convert all available data to an InferenceData object.
398
399 Note that if groups can not be created (e.g., there is no `trace`, so
400 the `posterior` and `sample_stats` can not be extracted), then the InferenceData
401 will not have those groups.
402 """
403 id_dict = {
404 "posterior": self.posterior_to_xarray(),
405 "sample_stats": self.sample_stats_to_xarray(),
--> 406 "posterior_predictive": self.posterior_predictive_to_xarray(),
407 "predictions": self.predictions_to_xarray(),
408 **self.priors_to_xarray(),
409 "observed_data": self.observed_data_to_xarray(),
410 }
411 if self.predictions:
412 id_dict["predictions_constant_data"] = self.constant_data_to_xarray()
File ~\anaconda3\lib\site-packages\arviz\data\base.py:65, in requires.__call__.<locals>.wrapped(cls)
63 if all((getattr(cls, prop_i) is None for prop_i in prop)):
64 return None
---> 65 return func(cls)
File ~\anaconda3\lib\site-packages\pymc\backends\arviz.py:327, in InferenceDataConverter.posterior_predictive_to_xarray(self)
325 data = self.posterior_predictive
326 dims = {var_name: self.sample_dims + self.dims.get(var_name, []) for var_name in data}
--> 327 return dict_to_dataset(
328 data, library=pymc, coords=self.coords, dims=dims, default_dims=self.sample_dims
329 )
File ~\anaconda3\lib\site-packages\arviz\data\base.py:307, in dict_to_dataset(data, attrs, library, coords, dims, default_dims, index_origin, skip_event_dims)
305 data_vars = {}
306 for key, values in data.items():
--> 307 data_vars[key] = numpy_to_data_array(
308 values,
309 var_name=key,
310 coords=coords,
311 dims=dims.get(key),
312 default_dims=default_dims,
313 index_origin=index_origin,
314 skip_event_dims=skip_event_dims,
315 )
316 return xr.Dataset(data_vars=data_vars, attrs=make_attrs(attrs=attrs, library=library))
File ~\anaconda3\lib\site-packages\arviz\data\base.py:254, in numpy_to_data_array(ary, var_name, coords, dims, default_dims, index_origin, skip_event_dims)
252 # filter coords based on the dims
253 coords = {key: xr.IndexVariable((key,), data=np.asarray(coords[key])) for key in dims}
--> 254 return xr.DataArray(ary, coords=coords, dims=dims)
File ~\anaconda3\lib\site-packages\xarray\core\dataarray.py:428, in DataArray.__init__(self, data, coords, dims, name, attrs, indexes, fastpath)
426 data = _check_data_shape(data, coords, dims)
427 data = as_compatible_data(data)
--> 428 coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
429 variable = Variable(dims, data, attrs, fastpath=True)
430 indexes, coords = _create_indexes_from_coords(coords)
File ~\anaconda3\lib\site-packages\xarray\core\dataarray.py:142, in _infer_coords_and_dims(shape, coords, dims)
140 dims = tuple(dims)
141 elif len(dims) != len(shape):
--> 142 raise ValueError(
143 "different number of dimensions on data "
144 f"and dims: {len(shape)} vs {len(dims)}"
145 )
146 else:
147 for d in dims:
ValueError: different number of dimensions on data and dims: 3 vs 2
If I find out how to put the mutable data the model should be ready and running without errors. I am using Chat GPT as an assistent, but it is only trained up to data from 2021. So not on the new version of BART v. 0.4.0 which still makes it tricky for me but for today I give up …
You got it right that I try to reproduce the disscusion and the advise from Osvaldo, but in his solution there is nothing about obs…
The last post in the linked thread shows how to use pm.MutableData
to do out-of-sample predictions in a BART model, and I don’t see any of that in the code block you just posted here. Am I missing something?
Yes - that i often confuse myself to a degree were I am on the right track and after this getting lost again.
Now if I make it mutable and put the shape[0] to observed and leave out y after the beginning were it is requested it looks like this:
with pm.Model() as model:
# Define mutable data
x_data = pm.MutableData("x_data", x_data)
# Define prior for mu
mu = pmb.BART("mu", X= x_data, Y=y_data)
# Define likelihood
y_pred = pm.Poisson("y_pred", mu=mu, observed=X.shape[0])
# Sampling
idata = pm.sample(random_seed=RANDOM_SEED)
# Generate new data
new_X = generate_new_data(...)
# Use the existing model to generate posterior predictive samples for the new data
with model:
pm.set_data({"data_X": new_X})
ppc = pm.sample_posterior_predictive(idata, var_names=["mu", "y_pred"], samples=10)
and get:
--------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [18], in <cell line: 1>()
1 with pm.Model() as model:
2 # Define mutable data
----> 3 x_data = pm.MutableData("x_data", x_data)
5 # Define prior for mu
6 mu = pmb.BART("mu", X= x_data, Y=y_data)
File ~\anaconda3\lib\site-packages\pymc\data.py:295, in MutableData(name, value, dims, coords, export_index_as_coords, **kwargs)
281 def MutableData(
282 name: str,
283 value,
(...)
288 **kwargs,
289 ) -> SharedVariable:
290 """Alias for ``pm.Data(..., mutable=True)``.
291
292 Registers the ``value`` as a :class:`~pytensor.compile.sharedvalue.SharedVariable`
293 with the model. For more information, please reference :class:`pymc.Data`.
294 """
--> 295 var = Data(
296 name,
297 value,
298 dims=dims,
299 coords=coords,
300 export_index_as_coords=export_index_as_coords,
301 mutable=True,
302 **kwargs,
303 )
304 return cast(SharedVariable, var)
File ~\anaconda3\lib\site-packages\pymc\data.py:415, in Data(name, value, dims, coords, export_index_as_coords, mutable, **kwargs)
413 mutable = False
414 if mutable:
--> 415 x = pytensor.shared(arr, name, **kwargs)
416 else:
417 x = at.as_tensor_variable(arr, name, **kwargs)
File ~\anaconda3\lib\site-packages\pytensor\compile\sharedvalue.py:199, in shared(value, name, strict, allow_downcast, **kwargs)
171 r"""Create a `SharedVariable` initialized with a copy or reference of `value`.
172
173 This function iterates over constructor functions to find a
(...)
195
196 """
198 if isinstance(value, Variable):
--> 199 raise TypeError("Shared variable values can not be symbolic.")
201 try:
202 var = shared_constructor(
203 value,
204 name=name,
(...)
207 **kwargs,
208 )
TypeError: Shared variable values can not be symbolic.
What is the datatype of the x_data
you are passing into pm.MutableData
?
In the coal_mine example from BART it gets defined to discretize the data:
# discretize data
years = int(X.max() - X.min())
bins = years // 4
hist, x_edges = np.histogram(Y, bins=bins)
# compute the location of the centers of the discretized data
x_centers = x_edges[:-1] + (x_edges[1] - x_edges[0]) / 2
# xdata needs to be 2D for BART
x_data = x_centers[:, None]
# express data as the rate number of disaster per year
y_data = hist / 4
So I keep the style
The error says “Shared variable values can not be symbolic”. “Symbolic” means it’s a pytensor tensor, plus the error is a “TypeError”, so you should check type(x_data)
and ensure that it is what you expect. In the code you posted, it is a numpy array, but I think you ran another model or something that overrode the name x_data
.
You are very right with all you say. It is a NumPy array - but no idea where it gets changed. I will have to check this out in more deep.
I saw in your LinkedIn profile that you did a course in Reinforcement Learning and speak Chinese as well, Amazing!
Would you mind if I write you an email tomorrow? - I have a question regarding the MCTS for finance, I also try to build and would like to ask you about your opinion.
Yeah of course, always happy to help
Happy Easter holiday! Frohe Ostern.