Sampling options with many observations and a HurdleGamma likelihood

Sure, this should run for you with the previously shared csv. My only issue right now is that each group will have different length observed values for the Bernoulli and the Gamma since the Bernoulli has all records and the Gamma has only the “successful” records. So when I use sample_posterior_predictive I get different length arrays for y_bernoulli and y_gamma and I’m not sure how to handle that to get to an HDI for each group. I think I could just use the values from the posterior trace with numpy or scipy but I’d prefer to use sample_posterior_predictive if possible.

df = pl.read_csv(
    "sample_data.csv",
    schema_overrides={"group": pl.String, "record": pl.String}
)

gamma_obs = df.filter(pl.col("observed_value") > 0).select("observed_value").to_numpy().squeeze()
bernoulli_obs = df.select("observed_value").to_numpy().squeeze()

def factorize(dataset, col_name):
    dataset = dataset.with_columns(pl.col(col_name).cast(pl.Categorical).to_physical().alias("idx"))
    categories = dataset.select([col_name, "idx"]).unique().sort("idx").select(col_name).to_numpy().squeeze()
    idx = dataset.select("idx").to_numpy().squeeze()

    return idx, categories


group_idx, groups = factorize(df, "group")
record_idx, records = factorize(df, "record")
nonzero_group_idx, non_zero_groups = factorize(df.filter(pl.col("observed_value") > 0), "group")
nonzero_record_idx, nonzero_records = factorize(df.filter(pl.col("observed_value") > 0), "record")
coords = {
    "groups": groups,
    "records": records,
    "non_zero_groups": non_zero_groups,
    "non_zero_records": nonzero_records,
}

with pm.Model(coords=coords) as model:

    # HYPERPRIORS ON ALPHA
    a_alpha = pm.Gamma("a_alpha", alpha=26.4, beta=3.25)
    a_beta = pm.Gamma("a_beta", alpha=26.4, beta=3.25)
    # ALPHA
    alpha = pm.Gamma("alpha", alpha=a_alpha, beta=a_beta, dims="groups") 
    
    # HYPERPRIORS ON BETA
    b_alpha = pm.Gamma("b_alpha", alpha=26.4, beta=3.25) 
    b_beta = pm.Gamma("b_beta", alpha=26.4, beta=3.25)  
    # BETA
    beta = pm.Gamma("beta", alpha=b_alpha, beta=b_beta, dims="groups") 
    
    # HYPERPRIORS ON PSI
    p_alpha = pm.Gamma("p_alpha", alpha=2.78, beta=7.37)
    p_beta = pm.Gamma("p_beta", alpha=22.2, beta=24)
    # PSI
    psi = pm.Beta("psi", alpha=p_alpha, beta=p_beta, dims="groups") 
    
    # MU CALC
    mu = pm.Deterministic("mu", alpha / beta, dims="groups")
    
    y_bernoulli = pm.Bernoulli(
        "y_bernoulli",
        p=psi[group_idx],
        observed=bernoulli_obs,
        dims="records"
    )
    
    y_gamma = pm.Gamma(
        "y_gamma",
        alpha=alpha[nonzero_group_idx],
        beta=beta[nonzero_group_idx],
        observed=gamma_obs,
        dims="non_zero_records"
    )

pm.model_to_graphviz(model)
1 Like