Constraints for Parameters / Solve underdetermined model / Multilevel Model

Hey all,
What is the best way to implement constraints when trying the model below?
a_i \sim N(\mu_a , \sigma_a)
c_{ij} \sim N(\mu_{j}, \sigma_{j})
for x_i = a_i + \sum_{j=1}^{n} b_{ij} c_{ij}, with known indicator coefficients b_{ij} \in \{0, 1\} and \sum_{j} b_{ij} = 1.

Since this is underdetermined (any constant can be shifted between \mu_a and the group means \mu_j), I wanted to get out of the dilemma by setting
\sum_{j} \mu_{j} = 0 via \mu_{n} = -\sum_{j=1}^{n-1} \mu_{j}
(see constrained_model)

But this does not converge as well anymore (r_hat up to 1.03).

I also tried to do a reparametrization (see constrained_model_rp) via

c_dist = pm.Normal("c_dist", mu=0, sigma=1, shape=n)
c = pm.Deterministic("c", mu_c + sigma_c * c_dist)

This one runs and converges, but the reparametrization softens the constraint, since the means of c_dist differ from 0.
Is there a good alternative to implement this constraint?

Ultimately, I want to stack several of these layers; what would be the best way to get there?

import numpy as np
import pymc as pm
import pytensor.tensor as pt

n = 5
samples = 10000

def data_faker(n, samples, mu_a=3, sig_a=5, sig_b=1):
    xdata = np.random.normal(mu_a, sig_a, samples)
    bdata = np.random.randint(low=0, high=n, size=(samples))
    mu = np.arange(0, n)-(n-1)/2
    for i,m in enumerate(mu):
        xdata += np.where(bdata == i, np.random.normal(m, sig_b, samples), 0)
    
    return xdata, bdata

xdata, bdata = data_faker(n, samples)

with pm.Model() as model:
    #Input data
    x = pm.Data("x", xdata, dims="obs_id")
    b = pm.Data("b", bdata, dims="obs_id")
    #baseline
    mu_a = pm.Normal("mu_a ", mu=0.0, sigma=10)
    sigma_a= pm.Exponential("sigma_a", 1)
    # level 1
    mu_c = pm.Exponential("mu_c", 1, shape=n)
    sigma_c= pm.Exponential("sigma_c", 1, shape=n)
    c = pm.Normal("c", sigma_c, shape=n)

    y_hat = mu_a + c[b]
    # y_hat = c[b] # --> this one runs stable

    # value
    value = pm.Normal("value", 
                      mu=y_hat,
                      sigma = sigma_a,
                      observed = x)

with pm.Model() as constrained_model:
    #Input data
    x = pm.Data("x", xdata, dims="obs_id")
    b = pm.Data("b", bdata, dims="obs_id")
    
    #baseline
    mu_a = pm.Normal("mu_a ", mu=0.0, sigma=10)
    sigma_a= pm.Exponential("sigma_a", 1)

    # level 1
    mu_c_raw = [pm.Exponential(f"mu_c_raw_{i}", 1) for i in range(n-1)]
    mu_c = pytensor.tensor.stacklists(mu_c_raw + [-pt.tensor.sum(mu_c_raw )])
    # mu_c = pytensor.tensor.stacklists(mu_c_raw + [-pt.tensor.sum(pytensor.tensor.stacklists(mu_c_raw ))]) # same result
    sigma_c= pm.Exponential("sigma_c", 1, shape=n)

    c = pm.Normal("c", mu=mu_c, sigma=sigma_c, shape=n)

    y_hat = mu_a + c[b]

    # value
    value = pm.Normal("value", 
                      mu=y_hat,
                      sigma = sigma_a,
                      observed = x)

with pm.Model() as constrained_model_rp:
    #Input data
    x = pm.Data("x", xdata, dims="obs_id")
    b = pm.Data("b", bdata, dims="obs_id")
    
    #baseline
    mu_a = pm.Normal("mu_a ", mu=0.0, sigma=10)
    sigma_a= pm.Exponential("sigma_a", 1)

    # level 1
    mu_c_raw = [pm.Exponential(f"mu_c_raw_{i}", 1) for i in range(n-1)]
    mu_c = pytensor.tensor.stacklists(mu_c_raw + [-pt.tensor.sum(mu_c_raw )])
    # mu_c = pytensor.tensor.stacklists(mu_c_raw + [-pt.tensor.sum(pytensor.tensor.stacklists(mu_c_raw ))])
    sigma_c= pm.Exponential("sigma_c", 1, shape=n)

    # reparametrization
    c_dist = pm.Normal("c_dist", mu=0, sigma=1, shape=n)
    c = pm.Deterministic("c", mu_c + sigma_c * c_dist)

    y_hat = mu_a + c[b]

    # value
    value = pm.Normal("value", 
                      mu=y_hat,
                      sigma = sigma_a,
                      observed = x)
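
For completeness, this is roughly how I check the convergence (a sketch; sampler settings are the defaults and the variable names match the models above):

import arviz as az

with constrained_model:
    idata_constrained = pm.sample()

# r_hat close to 1 indicates convergence; here I see values up to ~1.03
az.summary(idata_constrained, var_names=["mu_a", "sigma_c", "c"])["r_hat"].max()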

I am not great at PyMC directly (I work mostly in Bambi), but after messing around in ChatGPT and correcting a couple of things, this seemed to work (the trace and PPC look good too):

import numpy as np
import pymc as pm
import pytensor.tensor as pt

n = 5
samples = 10000

# Data generation function
def data_faker(n, samples, mu_a=3, sig_a=5, sig_b=1):
    xdata = np.random.normal(mu_a, sig_a, samples)
    bdata = np.random.randint(low=0, high=n, size=samples)
    mu = np.arange(0, n) - (n - 1) / 2  # Group-specific effects
    for i, m in enumerate(mu):
        xdata += np.where(bdata == i, np.random.normal(m, sig_b, samples), 0)
    return xdata, bdata

# Generate the data
xdata, bdata = data_faker(n, samples)

# Define coordinates for groups
coords = {"group": np.arange(n)}  # [0, 1, 2, 3, 4]

with pm.Model(coords=coords) as model:
    # Data containers
    bdata_shared = pm.Data('bdata_shared', bdata)    # Length: samples
    x_obs_data = pm.Data('x_obs_data', xdata)        # Length: samples

    # Prior for overall mean effect
    a_i = pm.Normal('a_i', mu=3, sigma=5)

    # Prior for group-specific weights (normalized to sum to 1)
    b_ij = pm.Normal('b_ij', mu=0, sigma=1, dims='group', initval=np.zeros(n))
    b_softplus = pt.softplus(b_ij)
    b_ij_binary = pm.Deterministic('b_ij_binary', b_softplus / pt.sum(b_softplus))

    # Prior for group-specific effects
    c = pm.Normal('c', mu=0, sigma=5, dims='group')

    # Expected value per data point
    b_ij_binary_i = b_ij_binary[bdata_shared]
    c_i = c[bdata_shared]
    x_i = a_i + b_ij_binary_i * c_i

    # Likelihood
    sigma_obs = pm.HalfNormal('sigma_obs', sigma=1)
    x_obs = pm.Normal('x_obs', mu=x_i, sigma=sigma_obs, observed=x_obs_data)

    # Sample from the posterior
    idata = pm.sample()
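
A quick way to check the fit (a sketch; idata is the trace sampled above, and pulling the posterior predictive is just an optional extra):

import arviz as az

with model:
    idata.extend(pm.sample_posterior_predictive(idata))

az.summary(idata, var_names=["a_i", "c", "b_ij_binary", "sigma_obs"])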



Hi Zweli,
thanks for your support :slight_smile:

The convergence is indeed very good.

And I am surprised that the b_ij_binary still converge so well. I only need to recalculate the per-sample weights from b_ij_binary and c.
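
For example (a sketch, assuming the trace from that model is called idata), the effective per-group contribution is just the elementwise product of the two posteriors:

post = idata.posterior
effective = post["b_ij_binary"] * post["c"]          # dims: chain, draw, group
print(effective.mean(dim=("chain", "draw")).values)  # posterior mean effect per group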

Another approach I identified is via the ZeroSumNormal. That works decently well.
Benefit: I can directly see the means in the sampled parameters. However, that does not work for the standard deviations, which are defined for the ZSN via the covariance
\Sigma = \sigma^2 \left( I - \tfrac{1}{n} J \right), \quad \text{where} \quad J_{ij} = 1 \quad \text{and} \quad n = \text{number of zero-sum axes}
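That means the marginal standard deviation of each component is the same \sigma \sqrt{1 - \tfrac{1}{n}} for all groups, so per-group standard deviations have to come from rescaling outside the ZSN, as in the reparametrization below.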

with pm.Model() as constrained_model_rp_zs:
    """constrained model with reparametrization and zero sum normal distribution"""
    #Input data
    x = pm.Data("x", xdata, dims="obs_id")
    b = pm.Data("b", bdata, dims="obs_id")
    
    #baseline
    mu_a = pm.Normal("mu_a ", mu=0.0, sigma=10)
    sigma_a= pm.Exponential("sigma_a", 1)

    # level 1
    sigma_c= pm.Exponential("sigma_c", 1, shape=n)

    # reparametrization is needed since ZSN has the same sigma for all dimensions
    c_dist = pm.ZeroSumNormal("c_dist", sigma=1, shape=n)
    c = pm.Deterministic("c", sigma_c * c_dist)

    y_hat = mu_a + c[b]

    # value
    value = pm.Normal("value", 
                      mu=y_hat,
                      sigma = sigma_a,
                      observed = x)
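
On the original question of stacking multiple of these layers, here is a minimal sketch of how this pattern could be repeated per level (purely illustrative; n2 and b2data are a hypothetical second grouping):

n2 = 3
b2data = np.random.randint(low=0, high=n2, size=samples)  # hypothetical second grouping

with pm.Model() as two_level_model:
    x = pm.Data("x", xdata, dims="obs_id")
    b = pm.Data("b", bdata, dims="obs_id")
    b2 = pm.Data("b2", b2data, dims="obs_id")

    # baseline
    mu_a = pm.Normal("mu_a", mu=0.0, sigma=10)
    sigma_a = pm.Exponential("sigma_a", 1)

    # level 1: zero-sum group effects, rescaled per group
    sigma_c = pm.Exponential("sigma_c", 1, shape=n)
    c_dist = pm.ZeroSumNormal("c_dist", sigma=1, shape=n)
    c = pm.Deterministic("c", sigma_c * c_dist)

    # level 2: same pattern for the second grouping
    sigma_d = pm.Exponential("sigma_d", 1, shape=n2)
    d_dist = pm.ZeroSumNormal("d_dist", sigma=1, shape=n2)
    d = pm.Deterministic("d", sigma_d * d_dist)

    y_hat = mu_a + c[b] + d[b2]
    value = pm.Normal("value", mu=y_hat, sigma=sigma_a, observed=x)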

Since you want all your coefficients to be on a simplex, it makes more sense to pump them through a normalizing transformation (softmax, or the normalized softplus @zweli used). Alternatively, you could directly parameterize your coefficients with a Dirichlet prior. Draws from a Dirichlet naturally have the properties you want.
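
A minimal sketch of the Dirichlet alternative (reusing the data, coords, and variable names from @zweli's model above; the flat concentration a=np.ones(n) is just an assumption):

with pm.Model(coords=coords) as dirichlet_model:
    bdata_shared = pm.Data("bdata_shared", bdata)
    x_obs_data = pm.Data("x_obs_data", xdata)

    a_i = pm.Normal("a_i", mu=3, sigma=5)

    # weights live on the simplex by construction: non-negative and summing to 1
    w = pm.Dirichlet("w", a=np.ones(n), dims="group")
    c = pm.Normal("c", mu=0, sigma=5, dims="group")

    mu_i = a_i + w[bdata_shared] * c[bdata_shared]

    sigma_obs = pm.HalfNormal("sigma_obs", sigma=1)
    pm.Normal("x_obs", mu=mu_i, sigma=sigma_obs, observed=x_obs_data)

    idata_dirichlet = pm.sample()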


thanks :slight_smile:
both options work with good convergence.


Hey all,
with three weeks more understanding, here is my summary of what helped me.
