Hi,
I’m trying to train a simple toy model with ADVI.
The ELBO seems to have converged, but the predictions are wrong.
The same model works with the NUTS sampler, but not with ADVI.
My code is here: ADVI PYMC3 · GitHub
Thanks!
# Toy model
import numpy as np
import matplotlib.pyplot as plt
import theano
import pymc3 as pm
floatX = theano.config.floatX

def build_toy_dataset(N=50, noise_std=0.2):
    x = np.linspace(-3, 3, num=N)
    y = np.cos(x) + np.random.normal(0, noise_std, size=N)
    x = x.astype(floatX).reshape((N, 1))
    y = y.astype(floatX)
    return x, y

N = 50  # number of data points
D = 1   # number of features
X_train, Y_train = build_toy_dataset(N)
X_test, Y_test = build_toy_dataset(N)

fig, ax = plt.subplots()
ax.plot(X_test, Y_test, 'ro', X_train, Y_train, 'bx', alpha=0.2)
ax.legend(['Y_test', 'Y_train'])
ax.set(xlabel='X', ylabel='Y', title='Toy Regression data set');
Construct the NN: 2 hidden layers with 5 nodes each
def construct_nn_2Layers_with_b(ann_input, ann_output):
    n_hidden = 5
    n_features = ann_input.get_value().shape[1]

    # Initialize random weights between each layer
    init_1 = np.random.randn(n_features, n_hidden).astype(floatX)
    init_2 = np.random.randn(n_hidden, n_hidden).astype(floatX)
    init_out = np.random.randn(n_hidden).astype(floatX)
    init_b_1 = np.random.randn(n_hidden).astype(floatX)
    init_b_2 = np.random.randn(n_hidden).astype(floatX)
    init_b_out = np.random.randn(1).astype(floatX)

    with pm.Model() as neural_network:
        # Weights from input to hidden layer
        weights_in_1 = pm.Normal('w_in_1', 0, sd=1,
                                 shape=(n_features, n_hidden), testval=init_1)
        bias_1 = pm.Normal('b_1', mu=0, sd=1, shape=(n_hidden), testval=init_b_1)

        # Weights from 1st to 2nd layer
        weights_1_2 = pm.Normal('w_1_2', 0, sd=1,
                                shape=(n_hidden, n_hidden), testval=init_2)
        bias_2 = pm.Normal('b_2', mu=0, sd=1, shape=(n_hidden), testval=init_b_2)

        # Weights from hidden layer to output
        weights_2_out = pm.Normal('w_2_out', 0, sd=1, shape=(n_hidden,), testval=init_out)
        bias_out = pm.Normal('b_out', mu=0, sd=1, shape=(1), testval=init_b_out)

        # Build neural network using tanh activation function
        act_1 = pm.math.tanh(pm.math.dot(ann_input, weights_in_1) + bias_1)
        act_2 = pm.math.tanh(pm.math.dot(act_1, weights_1_2) + bias_2)
        act_out = pm.math.dot(act_2, weights_2_out) + bias_out

        sd = pm.HalfNormal('sd', sd=1)
        out = pm.Normal('out', mu=act_out, sd=sd,
                        observed=ann_output, total_size=Y_train.shape[0])

    return neural_network
ann_input = theano.shared(X_train)
ann_output = theano.shared(Y_train)
neural_network = construct_nn_2Layers_with_b(ann_input, ann_output)
ADVI:
with neural_network:
    s = theano.shared(pm.floatX(1))
    inference = pm.ADVI(cost_part_grad_scale=s)
    tracker = pm.callbacks.Tracker(
        mean=inference.approx.mean.eval,  # callable that returns mean
        std=inference.approx.std.eval     # callable that returns std
    )
    pm.fit(n=20000, method=inference, callbacks=[tracker])

    # It is time to set s to zero
    s.set_value(0)
    approx = pm.fit(n=30000, method=inference, callbacks=[tracker])
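In case it matters, this is how I judged that the ELBO had converged, just by looking at the loss history and the tracked parameters (nothing here beyond the inference and tracker objects defined above):

# ADVI loss (negative ELBO) per iteration, plus the tracked mean/std history
fig, ax = plt.subplots()
ax.plot(inference.hist, alpha=0.5)
ax.set(xlabel='Iteration', ylabel='Loss (-ELBO)', title='ADVI convergence');

tracked_means = np.asarray(tracker['mean'])  # one row per recorded iteration
tracked_stds = np.asarray(tracker['std'])
print(tracked_means.shape, tracked_stds.shape)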
Prediction:
ann_input.set_value(X_test)
ann_output.set_value(Y_test)

with neural_network:
    # Draw a trace from the fitted approximation before posterior predictive sampling
    trace = approx.sample(500)
    ppc = pm.sample_posterior_predictive(trace, samples=500, progressbar=False)

pred = ppc['out'].mean(axis=0)
Plots:
fig, ax = plt.subplots()
ax.plot(X_test, Y_test, 'bx', alpha=0.5, label='Observed')
ax.plot(X_test, pred, 'r--', alpha=0.9, label='Posterior predictive means')
ax.legend()
ax.set(xlabel='X', ylabel='Y', title='Test set');
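For reference, the NUTS run that does give sensible predictions looks roughly like this (a sketch; the draw/tune counts here are placeholders rather than the exact values I used):

# NUTS comparison run on the same model (draw counts are placeholders)
ann_input.set_value(X_train)
ann_output.set_value(Y_train)
with neural_network:
    trace_nuts = pm.sample(1000, tune=1000, progressbar=False)

# Posterior predictive on the test set, same as the ADVI case above
ann_input.set_value(X_test)
ann_output.set_value(Y_test)
with neural_network:
    ppc_nuts = pm.sample_posterior_predictive(trace_nuts, samples=500, progressbar=False)
pred_nuts = ppc_nuts['out'].mean(axis=0)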