Hello folks,
I have a problem setting up a basic Bayesian Neural Network. Let’s say I have a data set produced by:
import numpy as np
import theano
import theano.tensor as tt
import pymc3 as pm
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale

floatX = theano.config.floatX

np.random.seed(40)
n_samples = 100
X_train = np.random.normal(size=(n_samples, 1))
y_train = np.random.normal(np.cos(5.*X_train) / (np.abs(X_train) + 1.), 0.1).ravel()
X_train = np.hstack((X_train, X_train**2, X_train**3))
X_test = np.atleast_2d(np.linspace(-3., 3., num=n_samples)).T
y_test = np.random.normal(np.cos(5.*X_test) / (np.abs(X_test) + 1.), 0.1)
X_test = np.hstack((X_test, X_test**2, X_test**3))
X_train = scale(X_train, axis=1)
X_train = X_train.astype(floatX)
y_train = y_train.astype(floatX)
X_test = X_test.astype(floatX)
y_test = y_test.astype(floatX)
I then define my neural network, with Normal priors on the weights and biases:
def createbayesianNN(inputs, outputs):
    basic_model = pm.Model()
    n_hidden = 100
    # Initialize random weights and biases (used as test values):
    init_1 = np.random.randn(3, n_hidden).astype(floatX)
    init_2 = np.random.randn(n_hidden, 1).astype(floatX)
    binit_1 = np.random.randn(n_hidden).astype(floatX)
    binit_2 = np.random.randn(1).astype(floatX)
    with basic_model:
        # Weights from input to hidden layer:
        weights_in_1 = pm.Normal('w_in_1', 0, sd=1, shape=(3, n_hidden), testval=init_1)
        # Weights from hidden layer to output:
        weights_1_out = pm.Normal('w_1_out', 0, sd=1, shape=(n_hidden, 1), testval=init_2)
        # Bias for the hidden layer:
        bias_a = pm.Normal('bias_a', 0, sd=1, shape=n_hidden, testval=binit_1)
        # Bias for the output layer:
        bias_b = pm.Normal('bias_b', 0, sd=1, shape=1, testval=binit_2)
        # Build the NN structure:
        act_1 = tt.nnet.relu(pm.math.dot(inputs, weights_in_1) + bias_a)
        act_out = pm.math.sigmoid(pm.math.dot(act_1, weights_1_out) + bias_b)
        # Likelihood:
        out = pm.Normal('out', mu=act_out, sd=0.001, observed=outputs)
    return basic_model
I put the data into theano.shared variables and build the model:
bnn_in = theano.shared(X_train)
bnn_out = theano.shared(y_train)
bayesianNN = createbayesianNN(bnn_in, bnn_out)
And finally I fit it with ADVI:
%%time
with bayesianNN:
    inference = pm.ADVI()
    approx = pm.fit(n=100000, method=inference)
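(As a side note, I have also seen the variational API used with a convergence callback; sketching that variant here, though it is not what I ran above:)
with bayesianNN:
    inference = pm.ADVI()
    # Stop fitting early once the parameter updates fall below the tolerance
    approx = pm.fit(n=100000, method=inference,
                    callbacks=[pm.callbacks.CheckParametersConvergence(diff='absolute')])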
I plot the ELBO to check whether it has converged:
plt.plot(-inference.hist, alpha=.3)
plt.ylabel('ELBO')
plt.xlabel('iteration');
But unfortunately the ELBO is nowhere close to convergence.
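(In case it helps, here is a rough sketch of how I would inspect the fit on the test grid afterwards, just using the posterior means of the weights and redoing the forward pass in numpy; the names mirror the model above:)
# Draw samples from the fitted variational approximation
trace = approx.sample(draws=5000)
# Posterior-mean weights as a crude point estimate
w1 = trace['w_in_1'].mean(axis=0)
w2 = trace['w_1_out'].mean(axis=0)
b1 = trace['bias_a'].mean(axis=0)
b2 = trace['bias_b'].mean(axis=0)
# Forward pass on the test inputs, mirroring the relu + sigmoid structure of the model
hidden = np.maximum(np.dot(X_test, w1) + b1, 0.)
logits = np.dot(hidden, w2) + b2
y_pred = (1. / (1. + np.exp(-logits))).ravel()
plt.plot(X_test[:, 0], y_pred)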
My questions: Have I done something wrong in the model specification?
Do I have to initialize the weights via testval? I followed examples in this forum as well as on the website, and some do it while some do not.
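For concreteness, the two variants I mean look like this (taking the input-to-hidden weights as an example, inside the model context):
# Variant 1: no explicit initialization, let PyMC3 choose the starting point
weights_in_1 = pm.Normal('w_in_1', 0, sd=1, shape=(3, n_hidden))
# Variant 2: pass an explicit random initialization via testval (as I did above)
init_1 = np.random.randn(3, n_hidden).astype(floatX)
weights_in_1 = pm.Normal('w_in_1', 0, sd=1, shape=(3, n_hidden), testval=init_1)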