Hi,

I’m trying to train a simple toy model with ADVI.

The ELBO seems to have converged, but the predictions are wrong.

The same model works with the NUTS sampler, but not with ADVI.

My code is here: https://gist.github.com/adkoo/99d960455258166b247d3310f439e6db

Thanks!

# Toy model

```python
import numpy as np
import matplotlib.pyplot as plt
import theano
import pymc3 as pm

floatX = theano.config.floatX  # assumed; I cast everything to Theano's float type


def build_toy_dataset(N=50, noise_std=0.2):
    x = np.linspace(-3, 3, num=N)
    y = np.cos(x) + np.random.normal(0, noise_std, size=N)
    x = x.astype(floatX).reshape((N, 1))
    y = y.astype(floatX)
    return x, y


N = 50  # number of data points
D = 1   # number of features

X_train, Y_train = build_toy_dataset(N)
X_test, Y_test = build_toy_dataset(N)

fig, ax = plt.subplots()
ax.plot(X_test, Y_test, 'ro', X_train, Y_train, 'bx', alpha=0.2)
ax.legend(['Y_test', 'Y_train'])
ax.set(xlabel='X', ylabel='Y', title='Toy Regression data set');
```

Construct NN - 2 layers with 5 nodes each:

```python
def construct_nn_2Layers_with_b(ann_input, ann_output):
    n_hidden = 5
    n_features = ann_input.get_value().shape[1]

    # Initialize random weights between each layer
    init_1 = np.random.randn(n_features, n_hidden).astype(floatX)
    init_2 = np.random.randn(n_hidden, n_hidden).astype(floatX)
    init_out = np.random.randn(n_hidden).astype(floatX)
    init_b_1 = np.random.randn(n_hidden).astype(floatX)
    init_b_2 = np.random.randn(n_hidden).astype(floatX)
    init_b_out = np.random.randn(1).astype(floatX)

    with pm.Model() as neural_network:
        # Weights from input to hidden layer
        weights_in_1 = pm.Normal('w_in_1', 0, sd=1,
                                 shape=(n_features, n_hidden), testval=init_1)
        bias_1 = pm.Normal('b_1', mu=0, sd=1, shape=(n_hidden,),
                           testval=init_b_1)

        # Weights from 1st to 2nd layer
        weights_1_2 = pm.Normal('w_1_2', 0, sd=1,
                                shape=(n_hidden, n_hidden), testval=init_2)
        bias_2 = pm.Normal('b_2', mu=0, sd=1, shape=(n_hidden,),
                           testval=init_b_2)

        # Weights from hidden layer to output
        weights_2_out = pm.Normal('w_2_out', 0, sd=1, shape=(n_hidden,),
                                  testval=init_out)
        bias_out = pm.Normal('b_out', mu=0, sd=1, shape=(1,),
                             testval=init_b_out)

        # Build neural network using tanh activation function
        act_1 = pm.math.tanh(pm.math.dot(ann_input, weights_in_1) + bias_1)
        act_2 = pm.math.tanh(pm.math.dot(act_1, weights_1_2) + bias_2)
        act_out = pm.math.dot(act_2, weights_2_out) + bias_out

        sd = pm.HalfNormal('sd', sd=1)
        out = pm.Normal('out', mu=act_out, sd=sd,
                        observed=ann_output, total_size=Y_train.shape[0])

    return neural_network
```

```python
ann_input = theano.shared(X_train)
ann_output = theano.shared(Y_train)

neural_network = construct_nn_2Layers_with_b(ann_input, ann_output)
```
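(Optional) A quick sanity check that the model starts from a sensible point; `check_test_point` reports each variable's log-probability at the initial values:

```python
# Sanity check: every entry should be finite at the initial point
print(neural_network.check_test_point())
```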

ADVI:

```python
with neural_network:
    s = theano.shared(pm.floatX(1))
    inference = pm.ADVI(cost_part_grad_scale=s)

    tracker = pm.callbacks.Tracker(
        mean=inference.approx.mean.eval,  # callable that returns mean
        std=inference.approx.std.eval,    # callable that returns std
    )

    pm.fit(n=20000, method=inference, callbacks=[tracker])

    # It is time to set `s` to zero
    s.set_value(0)
    approx = pm.fit(n=30000, method=inference, callbacks=[tracker])
```
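In case it helps, a quick way to inspect convergence with the tracker set up above (a sketch; `inference.hist` holds the loss per iteration):

```python
# Convergence diagnostics: tracked mean/std trajectories and the loss
fig, (ax_mu, ax_std, ax_loss) = plt.subplots(1, 3, figsize=(15, 4))
ax_mu.plot(tracker['mean'])
ax_mu.set_title('Mean track')
ax_std.plot(tracker['std'])
ax_std.set_title('Std track')
ax_loss.plot(inference.hist)
ax_loss.set_title('Loss (negative ELBO)')
```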

Prediction:

```python
ann_input.set_value(X_test)
ann_output.set_value(Y_test)

with neural_network:
    # Draw a trace from the fitted approximation (5000 draws is arbitrary)
    trace = approx.sample(5000)
    ppc = pm.sample_posterior_predictive(trace, samples=500, progressbar=False)

pred = ppc['out'].mean(axis=0)
```

Plots:

```python
fig, ax = plt.subplots()
ax.plot(X_test, Y_test, 'bx', alpha=0.5, label='Observed')
ax.plot(X_test, pred, 'r--', alpha=0.9, label='Posterior predictive means')
ax.legend()
ax.set(xlabel='X', ylabel='Y', title='Test set');
```
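For comparison, a sketch of the NUTS run that does give sensible predictions (the draw counts here are illustrative, not the exact settings in my gist):

```python
# NUTS baseline: fit on the training data...
ann_input.set_value(X_train)
ann_output.set_value(Y_train)
with neural_network:
    trace_nuts = pm.sample(draws=1000, tune=1000)

# ...then predict on the test data, same as above
ann_input.set_value(X_test)
ann_output.set_value(Y_test)
with neural_network:
    ppc_nuts = pm.sample_posterior_predictive(trace_nuts, samples=500,
                                              progressbar=False)
pred_nuts = ppc_nuts['out'].mean(axis=0)
```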