I am new to PyMC. I started from this tutorial on RL to implement hierarchical Bayesian modelling for an RL model where I need to update both Q and V at each step. I am currently coding the likelihood. I understand that I need to set two outputs for pytensor.scan() (one for Q and one for V). Although the code runs, Qs.eval() and V.eval() give me "nan" entries after pytensor.scan(). I tried several variations, adapting the code from these Theano examples to handle multiple outputs, but I cannot solve the issue. Could you please help me with this?
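To make sure I understand the two-output pattern itself, here is a toy sketch of how I think pytensor.scan() with two carried states should work (the names `step`, `q_trace`, `v_trace` and the update rules are just placeholders, not my actual model). This toy version evaluates without nans, so I assume the problem is in my likelihood below rather than in the pattern:

```python
import numpy as np
import pytensor
import pytensor.tensor as pt

# Toy step function carrying two states (q, v) through the scan.
def step(x, q, v):
    q_new = q + x              # accumulate the sequence into q
    v_new = 0.5 * (q_new + v)  # v depends on the updated q
    return q_new, v_new

x = pt.vector("x", dtype="float64")
q0 = pt.constant(0.0, dtype="float64")
v0 = pt.constant(0.0, dtype="float64")

(q_trace, v_trace), updates = pytensor.scan(
    fn=step,
    sequences=[x],
    outputs_info=[q0, v0],
)

print(q_trace.eval({x: np.array([1.0, 2.0, 3.0])}))  # [1. 3. 6.]
print(v_trace.eval({x: np.array([1.0, 2.0, 3.0])}))  # [0.5   1.75  3.875]
```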
My likelihood and update functions:
```python
import pytensor
import pytensor.tensor as pt


def pytensor_llik_td(parameters, df):
    alpha_v = parameters["alphav"]
    alpha_conf = parameters["alphaconf"]
    alpha_disc = parameters["alphadisc"]
    beta = parameters["beta"]

    actions = df.choice.values
    rewards = df.outcome.values
    rewards_unchosen = df.outcome_unchosen.values
    information = df.information.values
    alternative_actions = abs(1 - actions)

    rewards = pt.as_tensor_variable(rewards, dtype="float64")
    rewards_unchosen = pt.as_tensor_variable(rewards_unchosen, dtype="float64")
    actions = pt.as_tensor_variable(actions, dtype="int32")
    alternative_actions = pt.as_tensor_variable(alternative_actions, dtype="int32")
    information = pt.as_tensor_variable(information, dtype="int32")

    # Compute the Q and V values
    Qs = pt.zeros((2,), dtype="float64")
    V = pt.as_tensor_variable(0, dtype="float64")
    [Qs, V], updates = pytensor.scan(
        fn=update_Q,
        sequences=[actions, rewards, alternative_actions, rewards_unchosen, information],
        outputs_info=[Qs, V],
        non_sequences=[alpha_conf, alpha_disc, alpha_v],
        strict=True,
    )

    # Apply the softmax transformation
    Qs = Qs[:-1] * beta
    logp_actions = Qs - pt.logsumexp(Qs, axis=1, keepdims=True)

    # Calculate the log-likelihood of the observed actions
    logp_actions = logp_actions[pt.arange(actions.shape[0] - 1), actions[1:]]
    return pt.sum(logp_actions)  # PyMC expects the standard log-likelihood


def update_Q(action, reward, alternative_action, reward_unchosen, information,
             Qs, V, alpha_conf, alpha_disc, alpha_v):
    """
    Update the Q table and the value V according to the RL update rule.
    This function is called by pytensor.scan to do so recursively, given the
    observed data and the learning-rate parameters.
    """
    r_v = (reward + reward_unchosen) * information + (reward + Qs[alternative_action]) * (1 - information)
    r_v *= 0.5
    V += alpha_v * (r_v - V)

    alpha = alpha_conf * (reward > Qs[action]) + alpha_disc * (reward <= Qs[action])
    Qs = pt.set_subtensor(Qs[action], Qs[action] + alpha * (reward - V - Qs[action]))

    alpha = alpha_conf * (reward_unchosen <= Qs[alternative_action]) + alpha_disc * (reward_unchosen > Qs[alternative_action])
    Qs = pt.set_subtensor(
        Qs[alternative_action],
        Qs[alternative_action] + alpha * (reward_unchosen - V - Qs[alternative_action]) * information,
    )
    return [Qs, V]
```
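For completeness, this is roughly how I build the graph and evaluate it while debugging (the data frame and parameter values below are made-up placeholders; in the real model the alphas and beta come from my hierarchical PyMC priors):

```python
import numpy as np
import pandas as pd

# Placeholder behavioural data, only to illustrate how I inspect the output.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "choice": rng.integers(0, 2, size=100),
    "outcome": rng.integers(0, 2, size=100).astype(float),
    "outcome_unchosen": rng.integers(0, 2, size=100).astype(float),
    "information": rng.integers(0, 2, size=100),
})
parameters = {"alphav": 0.1, "alphaconf": 0.2, "alphadisc": 0.1, "beta": 3.0}

logp = pytensor_llik_td(parameters, df)
print(logp.eval())  # when I debug inside the function, Qs.eval() and V.eval() already contain nan here
```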