Unpooled Accelerated Failure Time Model

I’ve been working through the Frailty and Survival Regression Models example, which includes an accelerated failure time (AFT) model. I’d like to extend this model for my own use case with an unpooled dimension. However, the AFT model already uses an index to split censored/uncensored data. I’m unsure how to add an index that refers to the unpooled groups. Below I’ve taken my best guess at how to do this but it is invalid.

# First (self-described invalid) sketch of the unpooled AFT model.
# Integer codes mapping each observation to its unpooled group.
idx = pd.Categorical(group_values, categories=unique_groups).codes
# NOTE(review): the coords are named "group"/"group_flat" here, but the model
# below uses dims "groups"/"groups_flat" — the names must match exactly.
coords = {
    "intervals": intervals,
    "group": unique_groups,
    "group_flat": group_values,
    "preds": [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ],
}

# Predictor design matrix (two scores plus dummy-coded categories).
X = retention_df[
    [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ]
].copy()
# Survival time in months, and a boolean right-censoring mask
# (left == 0.0 marks a censored row — TODO confirm against the data docs).
y = retention_df["month"].values
cens = retention_df.left.values == 0.0


def weibull_lccdf(x, alpha, beta):
    """Log complementary cdf of Weibull distribution."""
    # log S(t) = -(t / beta) ** alpha; used to add the censored rows'
    # likelihood contribution through pm.Potential.
    return -((x / beta) ** alpha)

# NOTE(review): `floor_measure` is undefined in this sketch, and
# reg[~cens][idx] applies a full-length integer index to an already
# boolean-subset array — the lengths cannot agree.
with pm.Model(coords=coords, check_bounds=False) as aft_model:
    idx = pm.MutableData("idx", floor_measure, dims="group_flat")
    X_data = pm.MutableData("X_data_obs", X)
    beta = pm.Normal("beta", 0.0, 1, dims="preds")
    mu = pm.Normal("mu", 0, 1, dims="groups")
    s = pm.HalfNormal("s", 5.0, dims="groups")
    eta = pm.Deterministic("eta", pm.math.dot(beta, X_data.T))
    reg = pm.Deterministic("reg", pt.exp(-(mu + eta) / s))
    y_obs = pm.Weibull("y_obs", beta=reg[~cens][idx], alpha=s[idx], observed=y[~cens], dims="groups_flat")
    y_cens = pm.Potential("y_cens", weibull_lccdf(y[cens], alpha=s, beta=reg[cens]))

In trying to resolve this issue myself, I found this forum discussion very helpful. It helped me see a couple of problems with my original best guess. However, I still haven’t gotten it to work. Below is my current best guess. The error I get now comes from prior predictive sampling. The error message is: IndexError: shape mismatch: indexing arrays could not be broadcast together with (X,) (Y,). In this case X corresponds to the length of the uncensored data and Y corresponds to the length of all observations (censored and uncensored).

# Second attempt: the one that raises the IndexError during prior
# predictive sampling.
idx = pd.Categorical(group_values, categories=unique_groups).codes
# NOTE(review): coords still say "group"/"group_flat" while the model uses
# dims "groups"/"groups_flat" — these names must match exactly.
coords = {
    "intervals": intervals,
    "group": unique_groups,
    "group_flat": group_values,
    "preds": [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ],
}

# Predictor design matrix.
X = retention_df[
    [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ]
].copy()
# Survival time (months) and boolean right-censoring mask.
y = retention_df["month"].values
cens = retention_df.left.values == 0.0


def weibull_lccdf(x, alpha, beta):
    """Log complementary cdf of Weibull distribution."""
    # log S(t) of the Weibull — the censored rows' likelihood contribution.
    return -((x / beta) ** alpha)

with pm.Model(coords=coords, check_bounds=False) as aft_model:
    X_data = pm.MutableData("X_data_obs", X)
    beta = pm.Normal("beta", 0.0, 1, dims="preds")
    mu = pm.Normal("mu", 0, 1, dims="groups")
    s = pm.HalfNormal("s", 5.0, dims="groups")
    eta = pm.Deterministic("eta", pm.math.dot(beta, X_data.T))
    # reg broadcasts to shape (n_obs, n_groups).
    reg = pm.Deterministic("reg", pt.exp(-(mu[None, :] + eta[:, None]) / s))
    # NOTE(review): reg[~cens, idx] mixes a boolean mask with a full-length
    # integer array. NumPy replaces the mask with the positions of its True
    # entries (length = #uncensored) and then cannot broadcast them against
    # idx (length = all observations) — the reported IndexError.
    y_obs = pm.Weibull("y_obs", beta=reg[~cens, idx], alpha=s[idx], observed=y[~cens], dims="groups_flat")
    y_cens = pm.Potential("y_cens", weibull_lccdf(y[cens], alpha=s[idx], beta=reg[cens, idx]), dims="groups_flat")

Can you provide a runnable example?

Thanks for following up @cluhmann! I tweaked the AFT example for my use case.

import os

import numpy as np
import pandas as pd
import pymc as pm
import pytensor.tensor as pt

# Load the attrition data, falling back to the packaged PyMC example data
# when the local relative path does not exist.
try:
    retention_df = pd.read_csv(os.path.join("..", "data", "time_to_attrition.csv"))
except FileNotFoundError:
    retention_df = pd.read_csv(pm.get_data("time_to_attrition.csv"))


# Dummy-code the categorical columns, dropping one reference level each.
dummies = pd.concat(
    [
        pd.get_dummies(retention_df["gender"], drop_first=True),
        pd.get_dummies(retention_df["level"], drop_first=True),
        pd.get_dummies(retention_df["field"], drop_first=True),
    ],
    axis=1,
).rename({"M": "Male"}, axis=1)

retention_df = pd.concat([retention_df, dummies], axis=1).sort_values("Male").reset_index(drop=True)

intervals = np.arange(12)
# Synthetic unpooled grouping: 5 groups assigned at random per observation.
n_groups = 5
unique_groups = list(range(n_groups))
group_values = np.random.randint(n_groups, size=retention_df.shape[0])
group_idx = pd.Categorical(group_values, categories=unique_groups).codes
coords = {
    "intervals": intervals,
    "groups": unique_groups,
    "groups_flat": group_values,
    "preds": [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ],
}

# Predictor design matrix.
X = retention_df[
    [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ]
].copy()
# Survival time (months) and boolean right-censoring mask
# (left == 0.0 marks a censored row).
y = retention_df["month"].values
cens = retention_df.left.values == 0.0


def weibull_lccdf(x, alpha, beta):
    """Log complementary cdf of Weibull distribution."""
    # log S(t) of the Weibull — the censored rows' likelihood contribution.
    return -((x / beta) ** alpha)

with pm.Model(coords=coords, check_bounds=False) as aft_model:
    X_data = pm.MutableData("X_data_obs", X)
    beta = pm.Normal("beta", 0.0, 1, dims="preds")
    mu = pm.Normal("mu", 0, 1, dims="groups")
    s = pm.HalfNormal("s", 5.0, dims="groups")
    eta = pm.Deterministic("eta", pm.math.dot(beta, X_data.T))
    # reg broadcasts to shape (n_obs, n_groups).
    reg = pm.Deterministic("reg", pt.exp(-(mu[None, :] + eta[:, None]) / s))
    # NOTE(review): reg[~cens, group_idx] mixes a boolean mask with a
    # full-length integer array; NumPy turns the mask into the positions of
    # its True entries (length = #uncensored) and cannot broadcast them
    # against group_idx (length = all observations) — the IndexError source.
    # Also, dims="groups_flat" has length n_obs while this variable only
    # covers the uncensored subset.
    y_obs = pm.Weibull(
        "y_obs",
        beta=reg[~cens, group_idx],
        alpha=s[group_idx],
        observed=y[~cens],
        dims="groups_flat"
    )
    y_cens = pm.Potential(
        "y_cens",
        weibull_lccdf(y[cens], alpha=s[group_idx], beta=reg[cens, group_idx]),
        dims="groups_flat"
    )
    
    idata = pm.sample()

Took a quick look and I assume the issue is with this?

beta=reg[cens, group_idx]

beta is defined with dims="preds" (so shape=array([10])), but you are indexing into it as if it is shape array([2, 3770]) (or something like that).

The nomenclature in this model isn’t ideal. The beta parameter of the Weibull distribution is defined by the deterministic variable “reg”. The “reg” variable is based on the “mu”, “s” & “eta” variables, which are based on the “groups” dimension. When I evaluate aft_model["reg"].shape.eval() it yields array([3770, 5], dtype=int64). So the first dimension corresponds to the observations and the second to the unpooled groups.

Also, I noticed a mistake I made in defining the “reg” variable. Below is an updated example but it produces the same error as above.

intervals = np.arange(12)
# Synthetic unpooled grouping: 5 groups assigned at random per observation.
n_groups = 5
unique_groups = list(range(n_groups))
group_values = np.random.randint(n_groups, size=retention_df.shape[0])
group_idx = pd.Categorical(group_values, categories=unique_groups).codes
coords = {
    "intervals": intervals,
    "groups": unique_groups,
    "groups_flat": group_values,
    "preds": [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ],
}

# Predictor design matrix.
X = retention_df[
    [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ]
].copy()
# Survival time (months) and boolean right-censoring mask.
y = retention_df["month"].values
cens = retention_df.left.values == 0.0


def weibull_lccdf(x, alpha, beta):
    """Log complementary cdf of Weibull distribution."""
    # log S(t) of the Weibull — the censored rows' likelihood contribution.
    return -((x / beta) ** alpha)

with pm.Model(coords=coords, check_bounds=False) as aft_model:
    X_data = pm.MutableData("X_data_obs", X)
    beta = pm.Normal("beta", 0.0, 1, dims="preds")
    mu = pm.Normal("mu", 0, 1, dims="groups")
    s = pm.HalfNormal("s", 5.0, dims="groups")
    eta = pm.Deterministic("eta", pm.math.dot(beta, X_data.T))
    # Explicit s[None, :] broadcast; reg has shape (n_obs, n_groups).
    reg = pm.Deterministic("reg", pt.exp(-(mu[None, :] + eta[:, None]) / s[None, :]))
    # NOTE(review): same failure as before — reg[~cens, group_idx] pairs a
    # boolean mask (converted to #uncensored integer positions) with a
    # full-length integer array; the two index arrays' lengths differ.
    y_obs = pm.Weibull(
        "y_obs",
        beta=reg[~cens, group_idx],
        alpha=s[group_idx],
        observed=y[~cens],
        dims="groups_flat"
    )
    y_cens = pm.Potential(
        "y_cens",
        weibull_lccdf(y[cens], alpha=s[group_idx], beta=reg[cens, group_idx]),
        dims="groups_flat"
    )
    
    idata = pm.sample()

Ah. I think your issue is that cens is an array of booleans. Cast it to int and I think you’re good?

Well that is surprising! Your suggestion worked. However, the cens variable was not something I modified from the original example code. I re-ran the original example code (pasted below) and it runs fine with the cens variable as a boolean. So it seems PyMC handles the indexing variable differently in a one-dimensional vs. a multi-dimensional array?

# Original pooled AFT example (no group dimension) — runs as-is.
intervals = np.arange(12)
coords = {
    "intervals": intervals,
    "preds": [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ],
}

# Predictor design matrix.
X = retention_df[
    [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ]
].copy()
# Survival time (months) and boolean right-censoring mask.
y = retention_df["month"].values
cens = (retention_df.left.values == 0.0)


def weibull_lccdf(x, alpha, beta):
    """Log complementary cdf of Weibull distribution."""
    # log S(t) of the Weibull — the censored rows' likelihood contribution.
    return -((x / beta) ** alpha)

with pm.Model(coords=coords, check_bounds=False) as aft_model:
    X_data = pm.MutableData("X_data_obs", X)
    beta = pm.Normal("beta", 0.0, 1, dims="preds")
    # Pooled: scalar intercept and scalar shape, shared by all observations.
    mu = pm.Normal("mu", 0, 1)
    s = pm.HalfNormal("s", 5.0)
    eta = pm.Deterministic("eta", pm.math.dot(beta, X_data.T))
    # reg is one-dimensional (n_obs,), so plain boolean masking suffices.
    reg = pm.Deterministic("reg", pt.exp(-(mu + eta) / s))
    y_obs = pm.Weibull(
        "y_obs",
        beta=reg[~cens],
        alpha=s,
        observed=y[~cens],
    )
    # Censored rows enter the likelihood through log S(t) via a Potential.
    y_cens = pm.Potential(
        "y_cens",
        weibull_lccdf(y[cens], alpha=s, beta=reg[cens]),
    )
    
    idata = pm.sample()

numpy indexing is odd when you mix integer and boolean indexing.

On another note can’t you use Censored here? Censored — PyMC dev documentation

The censored distribution isn’t applicable to my accelerated failure time model because the censoring does not occur at a fixed bound. For instance, if we are trying to predict the survival time of patients diagnosed with cancer, some patients were just diagnosed yesterday while others are still alive decades after diagnosis. Both are right censored but at different times.

You can pass a vector of censoring points (one per observation), and in those cases where the points are larger/smaller than the observations (you can use np.inf) the likelihood won’t be censored

PS casting the boolean mask to integer will definitely not behave as the original mask

Oh okay. It sounds like the example model’s use of a boolean mask for isolating censored and uncensored is probably not feasible when I try to extend it to an unpooled model. I will look into using the pm.Censored function with a vector of upper bounds and share what I come up with. I’m glad you caught my false assumption that an integer mask was functioning identically with a boolean mask! I hadn’t thought to question that assumption. Note, the example model is using a pm.Potential to handle the censored data. I suspect the model I produce using pm.Censored will be substantially different, but it seems like a good alternative approach. Thanks for your help!

The model using pm.Censored will do exactly the same without the Potential. It already accounts for it in the logp. The Potential was a trick to do the same before Censored existed

1 Like

I’ve tried to convert the model to use pm.Censored. What I don’t understand is how to specify the unpooled groups dimension in this situation. Here’s my best guess. The Weibull likelihood is ill-defined.

intervals = np.arange(12)
# Synthetic unpooled grouping: 5 groups assigned at random per observation.
n_groups = 5
unique_groups = list(range(n_groups))
group_values = np.random.randint(n_groups, size=retention_df.shape[0])
group_idx = pd.Categorical(group_values, categories=unique_groups).codes
coords = {
    "intervals": intervals,
    "groups": unique_groups,
    "groups_flat": group_values,
    "preds": [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ],
}

# Predictor design matrix.
X = retention_df[
    [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ]
].copy()
y_obs = retention_df["month"].values
is_cens = retention_df.left.values == 0.0
# Per-observation upper censoring bound for pm.Censored.
# NOTE(review): np.nan is not a valid "not censored" bound — the
# conventional value for uncensored rows is np.inf (see pm.Censored docs),
# which leaves the likelihood uncensored for those rows.
cens_upper = [x if c else np.nan for x, c in zip(y_obs, is_cens)]

def weibull_lccdf(x, alpha, beta):
    """Log complementary cdf of Weibull distribution."""
    # Unused in this pm.Censored variant; kept from the Potential version.
    return -((x / beta) ** alpha)

with pm.Model(coords=coords, check_bounds=False) as aft_model:
    X_data = pm.MutableData("X_data_obs", X)
    beta = pm.Normal("beta", 0.0, 1, dims="preds")
    mu = pm.Normal("mu", 0, 1, dims="groups")
    s = pm.HalfNormal("s", 5.0, dims="groups")
    eta = pm.Deterministic("eta", pm.math.dot(beta, X_data.T))
    # reg has shape (n_obs, n_groups).
    reg = pm.Deterministic("reg", pt.exp(-(mu[None, :] + eta[:, None]) / s[None, :]))
    # NOTE(review): reg[:, group_idx] combines a full slice with an integer
    # array of length n_obs, yielding shape (n_obs, n_obs) — not one scale
    # per observation. Paired indexing (row index with group_idx) is needed.
    # Also, `dims` belongs on the registered pm.Censored variable, not on
    # the unregistered .dist() — as pointed out in the reply below.
    y_uncens = pm.Weibull.dist(
        beta=reg[:, group_idx],
        alpha=s[group_idx],
        dims="groups_flat"
    )
    y = pm.Censored("y", y_uncens, lower=None, upper=cens_upper, observed=y_obs)
    
    idata = pm.sample()

the dims should go on the Censored, not y_uncens

I should have mentioned that I tried moving the dims argument to pm.Censored and it also gives an error: ValueError: Size length is incompatible with batched dimensions of parameter 0 ExpandDims{axis=0}.0: len(size) = 1, len(batched dims ExpandDims{axis=0}.0) = 2. Size must be None or have length >= 2. Here’s the corresponding code:

intervals = np.arange(12)
# Synthetic unpooled grouping: 5 groups assigned at random per observation.
n_groups = 5
unique_groups = list(range(n_groups))
group_values = np.random.randint(n_groups, size=retention_df.shape[0])
group_idx = pd.Categorical(group_values, categories=unique_groups).codes
coords = {
    "intervals": intervals,
    "groups": unique_groups,
    "groups_flat": group_values,
    "preds": [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ],
}

# Predictor design matrix.
X = retention_df[
    [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ]
].copy()
y_obs = retention_df["month"].values
is_cens = retention_df.left.values == 0.0
# NOTE(review): np.nan for uncensored rows is suspect — np.inf is the
# conventional "no upper censoring" bound for pm.Censored.
cens_upper = [x if c else np.nan for x, c in zip(y_obs, is_cens)]

def weibull_lccdf(x, alpha, beta):
    """Log complementary cdf of Weibull distribution."""
    # Unused in this pm.Censored variant; kept from the Potential version.
    return -((x / beta) ** alpha)

with pm.Model(coords=coords, check_bounds=False) as aft_model:
    X_data = pm.MutableData("X_data_obs", X)
    beta = pm.Normal("beta", 0.0, 1, dims="preds")
    mu = pm.Normal("mu", 0, 1, dims="groups")
    s = pm.HalfNormal("s", 5.0, dims="groups")
    eta = pm.Deterministic("eta", pm.math.dot(beta, X_data.T))
    # reg has shape (n_obs, n_groups).
    reg = pm.Deterministic("reg", pt.exp(-(mu[None, :] + eta[:, None]) / s[None, :]))
    # NOTE(review): reg[:, group_idx] still produces shape (n_obs, n_obs)
    # (full slice + integer array), so the 1-D dims on pm.Censored cannot
    # match the 2-D batch shape — hence the "Size length is incompatible
    # with batched dimensions" ValueError.
    y_uncens = pm.Weibull.dist(
        beta=reg[:, group_idx],
        alpha=s[group_idx]
    )
    y = pm.Censored("y", y_uncens, lower=None, upper=cens_upper, observed=y_obs, dims="groups_flat")
    
    idata = pm.sample()

I assume this isn’t what you want?

# A full slice plus an integer index array does outer-style selection:
# every row is combined with every entry of group_idx, giving (n_obs, n_obs).
reg[:, group_idx].shape.eval()
# array([3770, 3770])

I don’t think reg[:, group_idx].shape.eval() is helpful. I ran that evaluation against every model variant I have defined above. It yields the same result as what you showed. But when I evaluate aft_model["reg"].shape.eval() I get array([3770, 5], dtype=int64). I’m not sure why the results are different, but the former evaluation doesn’t make sense to me.

I finally got a valid version of the unpooled model working with the code below. It reverts to using integer indices to subset the censored/uncensored data instead of using pm.Censored. Thanks for your help! I wouldn’t have been able to figure this out without your responses!

# Final working unpooled AFT model: integer index arrays split the
# censored/uncensored rows, and paired fancy indexing picks each row's own
# group column out of the (n_obs, n_groups) scale matrix.
intervals = np.arange(12)
# Synthetic unpooled grouping: 5 groups assigned at random per observation.
n_groups = 5
unique_groups = list(range(n_groups))
group_values = np.random.randint(n_groups, size=retention_df.shape[0])
group_idx = pd.Categorical(group_values, categories=unique_groups).codes
coords = {
    "intervals": intervals,
    "groups": unique_groups,
    "groups_flat": group_values,
    "preds": [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ],
}

# Predictor design matrix.
X = retention_df[
    [
        "sentiment",
        "intention",
        "Male",
        "Low",
        "Medium",
        "Finance",
        "Health",
        "Law",
        "Public/Government",
        "Sales/Marketing",
    ]
].copy()
# Survival time in months.
y = retention_df["month"].values
# Integer positions of censored (left == 0.0) and uncensored (left == 1.0)
# rows. np.where replaces the hand-rolled enumerate loops and returns
# integer arrays, which combine correctly with group_idx in paired fancy
# indexing (unlike a boolean mask, whose implicit positions have a
# different length than group_idx).
cens = np.where(retention_df.left.values == 0.0)[0]
uncens = np.where(retention_df.left.values == 1.0)[0]


def weibull_lccdf(x, alpha, beta):
    """Log complementary cdf of Weibull distribution.

    Returns log S(x) = -(x / beta) ** alpha, the censored rows'
    log-likelihood contribution (added to the model via pm.Potential).
    """
    return -((x / beta) ** alpha)

with pm.Model(coords=coords, check_bounds=False) as aft_model:
    X_data = pm.MutableData("X_data_obs", X)
    beta = pm.Normal("beta", 0.0, 1, dims="preds")
    # One intercept and one Weibull shape per unpooled group.
    mu = pm.Normal("mu", 0, 1, dims="groups")
    s = pm.HalfNormal("s", 5.0, dims="groups")
    # Linear predictor per observation.
    eta = pm.Deterministic("eta", pm.math.dot(beta, X_data.T))
    # reg has shape (n_obs, n_groups): each row holds that observation's
    # Weibull scale under every group's (mu, s).
    reg = pm.Deterministic("reg", pt.exp(-(mu[None, :] + eta[:, None]) / s[None, :]))
    # Paired integer indexing: for each uncensored row, select its own
    # group's column. No `dims` here — the "groups_flat" coord has length
    # n_obs while this variable only covers the uncensored subset, so
    # attaching that dim would declare a mismatched shape.
    y_obs = pm.Weibull(
        "y_obs",
        beta=reg[uncens, group_idx[uncens]],
        alpha=s[group_idx[uncens]],
        observed=y[uncens],
    )
    # Censored rows contribute log S(t) through a Potential (same reason to
    # omit the full-length "groups_flat" dims on this subset-length term).
    y_cens = pm.Potential(
        "y_cens",
        weibull_lccdf(y[cens], alpha=s[group_idx[cens]], beta=reg[cens, group_idx[cens]]),
    )

    idata = pm.sample()