Hi everyone,
Thank you for the amazing work you’ve been doing! I am writing a model to fit data from a behavioural experiment. Sampling is extremely slow and the model is fairly complicated, so I was wondering whether there is some way to make it more efficient.
import pymc3 as pm
import theano as tt
import theano.tensor as T
import numpy as np
import scipy.stats as stats
def theano_RSA(
possible_signals_array=T.lmatrix("possible_signals_array"),
real_signals_indices=T.lvector("real_signals_indices"),
alphas=T.dvector("alphas"),
choice_alphas=T.dvector("choice_alphas"),
cost_factors= T.dvector("cost_factors"),
objective_costs_possible=T.dvector("objective_costs_possible"),
at_most_as_costly=T.lmatrix("at_most_as_costly"),
types=T.lvector("types"),
distances=T.dmatrix("distances")):
"""
Parameters
----------
possible_signals_array:
Shape (# possible signals, # states)
real_signals_indices:
the indices of the real signals in the possible_signals_array.
shape (# real signals)
alphas:
shape (# participants)
choice_alphas:
shape (# participants)
cost_factors:
How much each participant weight the cost of the messages in production.
shape (# participants)
objective_costs_possible:
The "objective" cost of the signals
shape (# signals)
at_most_as_costly:
for each real signal, whether each possible signal is at most as costly as the real signal
shape (# real signals available to participant, # signals considered by listener).
i,j is 1 iff cost(i) >= cost(j)
distances:
shape (# states, # states)
Returns
-------
theano variable
"""
    real_signals_array = possible_signals_array[real_signals_indices]
    objective_costs_real = objective_costs_possible[real_signals_indices]
    # for each real signal, which possible signals are considered as alternatives
    considered_signals = at_most_as_costly & types.dimshuffle("x", 0)
    # literal semantics, normalized over states: shape (# possible signals, # states)
    language_l = possible_signals_array / possible_signals_array.sum(axis=-1, keepdims=True)
    # literal listener l0, one per participant (soft-min of the expected distance)
    expected_dist_l0 = T.dot(language_l, distances)
    unnorm_l0 = T.exp(choice_alphas[:, np.newaxis, np.newaxis] * -expected_dist_l0)
    l0 = unnorm_l0 / T.sum(unnorm_l0, axis=-1, keepdims=True)
    l0_extended = l0[:, np.newaxis, :, :]
    # speaker s1 (utility_l0 and costs_possible are computed in intermediate steps not shown here)
    unnorm_s1 = T.exp(
        alphas[:, np.newaxis, np.newaxis, np.newaxis] *
        (utility_l0 - costs_possible[:, np.newaxis, :, np.newaxis])
    )
    # pragmatic listener l2 (expected_dist_l2 comes from intermediate steps not shown here)
    unnorm_l2 = T.exp(choice_alphas[:, np.newaxis, np.newaxis, np.newaxis] * -expected_dist_l2)
    l2 = unnorm_l2 / T.sum(unnorm_l2, axis=-1, keepdims=True)
    # speaker s3: normalize over the real signals (axis=-2)
    unnorm_s3 = T.exp(
        alphas[:, np.newaxis, np.newaxis] *
        (utility_l2 - costs_real[:, :, np.newaxis])
    )
    s3 = unnorm_s3 / unnorm_s3.sum(axis=-2, keepdims=True)
    # shape (# participants, # real signals, # states)
    return s3
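(A note on the data-generation code further down: produce_artificial_data calls theano_RSA(return_symbolic=False), i.e. a version of the function above that compiles the graph and returns an ordinary function operating on NumPy arrays. Roughly, such a wrapper looks like the sketch below; compile_RSA is just an illustrative name, not the code I actually use.)
def compile_RSA():
    # Illustrative sketch of what theano_RSA(return_symbolic=False) boils down to:
    # compile the symbolic graph into a function that takes NumPy arrays.
    inputs = [
        T.lmatrix("possible_signals_array"),
        T.lvector("real_signals_indices"),
        T.dvector("alphas"),
        T.dvector("choice_alphas"),
        T.dvector("cost_factors"),
        T.dvector("objective_costs_possible"),
        T.lmatrix("at_most_as_costly"),
        T.lvector("types"),
        T.dmatrix("distances"),
    ]
    s3 = theano_RSA(*inputs)
    # the inputs are named, so the compiled function can be called with keyword arguments
    return tt.function(inputs, s3, on_unused_input="warn")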
def create_model(num_participants, num_states, possible_signals_array, real_signals_indices, costs_possible,
at_most_as_costly, types, distances, picked_signals_indices, picsizes_values, participants_indices, states_values):
states = np.arange(num_states)
with pm.Model() as model:
### hyperprior for population-level parameters over alphas
pop_alpha_mu = pm.HalfNormal("pop_alpha_mu", sigma=1)
pop_alpha_sigma = pm.HalfNormal("pop_alpha_sigma", sigma=1)
alphas = pm.Gamma("alphas", mu=pop_alpha_mu, sigma=pop_alpha_sigma, shape=num_participants)
### hyperprior for population-level parameters over choice_alphas
pop_choice_alpha_mu = pm.HalfNormal("pop_choice_alpha_mu", sigma=1)
pop_choice_alpha_sigma = pm.HalfNormal("pop_choice_alpha_sigma", sigma=1)
choice_alphas = pm.Gamma("choice_alphas", mu=pop_choice_alpha_mu, sigma=pop_choice_alpha_sigma, shape=num_participants)
### hyperprior for population-level parameters over cost_factors
pop_cost_factors_mu = pm.HalfNormal("pop_cost_factors_mu", sigma=1)
pop_cost_factors_sigma = pm.HalfNormal("pop_cost_factors_sigma", sigma=1)
cost_factors = pm.Gamma("cost_factors", mu=pop_cost_factors_mu, sigma=pop_cost_factors_sigma, shape=num_participants)
        # one entry per picture size; index 0 is a dummy, since picture sizes start at 1
        s3_list = [[],]
for state in states[1:]:
s3 = theano_RSA(
possible_signals_array=tt.shared(possible_signals_array[state], name="possible_signals_array"),
real_signals_indices=tt.shared(real_signals_indices, name="real_signals_indices"),
objective_costs_possible=tt.shared(costs_possible, name="objective_costs_possible"),
at_most_as_costly=tt.shared(at_most_as_costly, name="at_most_as_costly"),
types=tt.shared(types, name="types"),
distances=tt.shared(distances[:state+1,:state+1], name="distances"),
alphas=alphas,
choice_alphas=choice_alphas,
cost_factors=cost_factors
)
s3_list.append(s3)
probability_accept = T.stack([
s3_list[picsize_value][participant_index, :, state_value]
for picsize_value, participant_index, state_value in zip(picsizes_values, participants_indices, states_values)
])
# save the probability of acceptance
pm.Deterministic("probability_accept", probability_accept)
### observed
obs = pm.Categorical(
"picked_signals",
p=probability_accept,
shape=len(picsizes_values),
# observed=picked_signals_indices
)
return model
def run_model(model, cores, draws=1000, tune=1000):
with model:
step = pm.NUTS(target_accept=0.99)
trace=pm.sample(
cores=cores,
step=step,
draws=draws,
tune=tune
)
return trace
As you can see, the model itself is not very complex at all; most of the complexity is coming from the theano_RSA function.
One obvious way to make sampling more efficient would be to eliminate the loop in the model that builds up s3_list. Initially I had a fully vectorised function, but since the different elements have different sizes along one dimension, the RSA function produced a bunch of NaN values, which broke the theano gradient calculation and therefore didn’t work with NUTS. I tried setting the NaNs to 0 with switch, but that also broke the gradient (a minimal illustration of what I mean is below).
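To illustrate the gradient problem with a minimal, self-contained sketch (this is not the RSA code; I’m assuming here that the NaNs come from normalising rows that are entirely padding): switching the NaNs away after the division still leaks NaN into the gradient, whereas making the inputs safe before the division keeps it finite.
X = T.dmatrix("X")                 # padded data: some rows are all zeros (padding)
row_mask = T.dvector("row_mask")   # 1 for real rows, 0 for padded rows

# roughly what I tried: normalise first, then switch the NaNs to 0 afterwards
norm = X / X.sum(axis=-1, keepdims=True)            # padded rows give 0/0 = NaN
fixed_after = T.switch(T.isnan(norm), 0.0, norm)    # NaN still reaches the gradient

# alternative: make the denominator safe *before* dividing, then mask the padded rows
safe_denom = T.switch(T.eq(row_mask, 1), X.sum(axis=-1), 1.0)[:, np.newaxis]
fixed_before = row_mask[:, np.newaxis] * (X / safe_denom)

x_val = np.array([[1., 2.], [0., 0.]])   # second row is padding
m_val = np.array([1., 0.])

g_after = tt.function([X], tt.grad(fixed_after.sum(), X))
g_before = tt.function([X, row_mask], tt.grad(fixed_before.sum(), X))
print(g_after(x_val))           # contains NaN
print(g_before(x_val, m_val))   # all finite
Maybe masking the inputs like this is the way to make the fully vectorised version differentiable, but I might be missing something.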
Part of the problem is that the tensors involved get pretty large (e.g. shape (120, 13, 13, 20)), and there are multiple such tensors in the RSA function. I also tried variational inference, since as far as I know it is a faster option, but it wouldn’t work because of the observed categorical distribution at the end.
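For reference, by variational inference I mean something along these lines (just a sketch of the call, using pm.fit with the default ADVI; the exact settings may have differed):
with model:
    # ADVI as a (hopefully) faster alternative to NUTS; this is where it errors out
    approx = pm.fit(n=20000, method="advi")
    trace_vi = approx.sample(1000)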
Please let me know if more detail on the various variables would be helpful!
PS. Here are some additional functions needed to run the model; they are not involved in the actual sampling, so they don’t really matter for the pymc3 part:
def produce_artificial_data(num_participants, num_states, num_trials):
### produce language information
states = np.arange(num_states)
possible_signals_array, real_signals_indices, costs_possible, at_most_as_costly, types = produce_experiment_language(states)
distances = create_distance(num_states)
### produce participant information
participants_indices = np.repeat(np.arange(num_participants), repeats=num_trials)
# size of the picture for each trial
picsizes_values = np.random.choice(np.arange(1,num_states), size=num_participants*num_trials)
# the number of objects in the target set (<= picsize)
states_values = np.apply_along_axis(
lambda x: np.random.choice(np.arange(x)),
arr=picsizes_values.reshape(-1,1)+1,
axis=1
)
alphas = stats.gamma.rvs(a=1.3, scale=1/1.3, size=num_participants)
choice_alphas = stats.gamma.rvs(a=3., scale=1/2.4, size=num_participants)
cost_factors = stats.gamma.rvs(a=3., scale=1/2.4, size=num_participants)
# WARNING: this might create a HUGE array, so be careful with the # of participants parameter
s3_list = [[],]
for state in states[1:]:
        # compiled (non-symbolic) version of the RSA function, applied to NumPy arrays
        s3 = theano_RSA(return_symbolic=False)(**{
"possible_signals_array": possible_signals_array[state],
"real_signals_indices": real_signals_indices,
"alphas": alphas,
"choice_alphas": choice_alphas,
"cost_factors": cost_factors,
"objective_costs_possible": costs_possible,
"at_most_as_costly": at_most_as_costly,
"types": types,
"distances": distances[:state+1, :state+1]
})
s3_list.append(s3)
# probability of accepting
# shape: (# datapoints, # real signals)
probs_accept = np.stack([
s3_list[picsize_value][participant_index, :, state_value]
for picsize_value, participant_index, state_value in zip(picsizes_values, participants_indices, states_values)
])
# TODO: make this more elegant. Not so nice now.
    # unfortunately np.random.multinomial doesn't accept an array for the p parameter
picked_signals_indices = np.apply_along_axis(
lambda p: np.random.choice(len(p), p=p),
arr=probs_accept,
axis=1
)
return {
"num_participants": num_participants,
"num_states": num_states,
"possible_signals_array": possible_signals_array,
"real_signals_indices": real_signals_indices,
"costs_possible": costs_possible,
"at_most_as_costly": at_most_as_costly,
"types": types,
"distances": distances,
"picked_signals_indices": picked_signals_indices,
"picsizes_values": picsizes_values,
"participants_indices": participants_indices,
"states_values": states_values
}, {"alphas": alphas, "choice_alphas": choice_alphas, "cost_factors": cost_factors}
def produce_experiment_language(states):
"""
Produce various information concerning the language used by participants in experiment
The important thing about this function is that it depends on data only through states,
which is the maximum number of states. Conceptually convenient to separate from data.
"""
    # proportions_arrays: for each picture size, the possible proportions of objects in the target set
proportions_arrays = [np.linspace(0, 1, num_objects) for num_objects in states+1]
#### for each signal, specify name, array, cost, type
# make sure that the name is the same as the name recorded in the experimental data
#### signals in experiment
# for now, wrt types: 1=upward monotonic, 2=point, 3=downward monotonic
most = ("Most", [p > 0.5 for p in proportions_arrays], 1, 1)
mth = ("More than half", [p > 0.5 for p in proportions_arrays], 3, 1)
every = ("All", [p == 1. for p in proportions_arrays], 1, 1)
# approximate "half" for the cases where there is no perfect mid?
half = ("Half", [np.abs(p-0.5) == np.min(np.abs(p-0.5)) if p.size!=0 else np.array([]) for p in proportions_arrays], 1, 1)
many = ("Many", [p > 0.4 for p in proportions_arrays], 1, 1)
none = ("None", [p == 0. for p in proportions_arrays], 1, 3)
lth = ("Less than half", [p < 0.5 for p in proportions_arrays], 3, 3)
few = ("Few", [p < 0.2 for p in proportions_arrays], 1, 3)
some = ("Some", [p > 0. for p in proportions_arrays], 1, 1)
    # signals that I am assuming the speaker thinks the listener considers,
    # but which are not real options in the experiment
more_than_2_3rds = ("more_than_2_3rds", [p > 0.66 for p in proportions_arrays], 3, 1)
more_than_3_4ths = ("more_than_3_4ths", [p > 0.75 for p in proportions_arrays], 3, 1)
less_than_1_3rds = ("less_than_1_3rds", [p < 0.33 for p in proportions_arrays], 3, 3)
less_than_1_4ths = ("less_than_1_4ths", [p < 0.25 for p in proportions_arrays], 3, 3)
signals = [
most, mth, every, half, many, none, lth, few, some,
more_than_2_3rds, more_than_3_4ths,
less_than_1_3rds, less_than_1_4ths
]
real_signals_indices = np.arange(9).astype(int)
names = [signal[0] for signal in signals]
    # possible_signals_array: for each picsize, an array with shape (# signals, picsize)
possible_signals_array = [
np.array(a).astype(int)
for a in list(zip(*[s[1] for s in signals]))
]
costs_possible = np.array([s[2] for s in signals])
types = np.array([s[3] for s in signals])
# (real signals)
costs_real = costs_possible[real_signals_indices]
# (real signals, possible signals)
at_most_as_costly = ((costs_real.reshape(-1,1) - costs_possible) >= 0).astype(int)
return possible_signals_array, real_signals_indices, costs_possible, at_most_as_costly, types
def create_distance(num_states):
# (states, states)
a = np.tile(np.arange(num_states), reps=(num_states,1))
distances = np.abs(a - np.arange(num_states).reshape(-1,1))
return distances
num_participants = 10 # will be 120
num_states = 5 # will be 20
num_trials = 10 # will be 300
model_input, real_parameters = produce_artificial_data(
num_participants,
num_states,
num_trials
)
model = create_model(**model_input)
trace = run_model(model, 1)