I am making a hierarchical logistic regression model that’s meant to predict the election winner based on ‘fundamentals’ (demographics, the economy, etc).
num_states = len(df_model['state_fips'].unique())
states_lookup = dict(zip(df['state_fips'].unique() , range(len(df['state_fips'].unique()))))
states = df.state_fips.replace(states_lookup).values
with pm.Model() as election_model:
# Priors
mu_a = pm.Normal('mu_a', mu=0, sigma=5)
sigma_a = pm.HalfCauchy('sigma_a', 5)
mu_b = pm.Normal('mu_b', mu=0, sigma=3)
sigma_b = pm.HalfCauchy('sigma_b', 5)
alpha = pm.Normal('alpha', mu_a, sd=sigma_a, shape=num_states)
beta_income = pm.Normal('beta_income', mu_b, sd=sigma_b, shape=num_states)
beta_consumer = pm.Normal('beta_consumer', mu_b, sd=sigma_b)
beta_urban = pm.Normal('beta_urban', mu_b, sd=sigma_b, shape=num_states)
beta_density = pm.Normal('beta_density', mu_b, sd=sigma_b, shape=num_states)
beta_nonHS = pm.Normal('beta_nonHS', mu_b, sd=sigma_b, shape=num_states)
beta_pctasian = pm.Normal('beta_pctasian', mu_b, sd=sigma_b, shape=num_states)
beta_advanced = pm.Normal('beta_advanced', mu_b, sd=sigma_b, shape=num_states)
beta_approval = pm.Normal('beta_approval', mu_b, sd=sigma_b)
beta_evan = pm.Normal('beta_evan', mu_b, sd=sigma_b, shape=num_states)
income = pm.Data('income', df.Pct_Over_National_Average.values)
consumer = pm.Data('consumer', df.Idx_Consumer_Sentiment.values)
urban = pm.Data('urban', df.urban_pct.values)
density = pm.Data('density', df.pop_density.values)
nonHS = pm.Data('nonHS', df.nonHS_graduate.values)
pct_asian = pm.Data('pct_asian', df.pct_asian.values)
adv_degree = pm.Data('adv_degree', df.advanced_degree_or_more.values)
approval = pm.Data('approval', df.Y4_avg_net_approval.values)
evang = pm.Data('evang', df.evangelical_pct.values)
sigma_y = pm.HalfCauchy('sigma_y',0.001)
mu = alpha[states] + beta_income[states] * income + beta_consumer * consumer + beta_urban[states] * urban + beta_density[states] * density + beta_nonHS[states] * nonHS + beta_pctasian[states] * pct_asian + beta_advanced[states] * adv_degree + beta_approval * approval + beta_evan[states] * evang + sigma_y
#theta = pm.Deterministic('theta', pm.invlogit(mu))
theta = pm.Deterministic('theta', pm.math.sigmoid(mu))
#Y_obs = pm.Binomial('Y_obs', p=theta, n=num_states, observed=df['dem_state_win'].values)
Y_obs = pm.Bernoulli('Y_obs', theta, observed=df['dem_state_win'].values)
I standardize the parameters first. I’m having two problems. The first is that when I run samples, it seems to have some problems with convergence and some of the rhats are bigger than I would like. The second problem is that I’m trying to enter 2020 data to make a prediction:
with election_model:
pm.set_data({'income': df20.Pct_Over_National_Average.values, 'consumer': df20.Idx_Consumer_Sentiment.values,
'urban': df20.urban_pct.values, 'density': df20.pop_density.values, 'nonHS': df20.nonHS_graduate.values,
'pct_asian': df20.pct_asian.values, 'adv_degree': df20.advanced_degree_or_more.values,
'approval': df20.Y4_avg_net_approval.values, 'evang': df20.evangelical_pct.values})
y_test = pm.sample_posterior_predictive(trace)
I get an input dimension mismatch:
ValueError: Input dimension mis-match. (input[0].shape[0] = 357, input[2].shape[0] = 51)
357 is the length of the original parameters (51 states plus DC x 7 elections) and the new data is for 2020 for the 51 states. Obviously I don’t have much experience with PyMC3, can anyone shed some light on this for me?