# Bayesian Neural Network - No convergence

Hello folks,
I have a problem setting up a basic Bayesian neural network. Let's say I have a data set produced by:

```python
# Imports assumed by the snippets below (not shown in the original post):
import numpy as np
import theano
import theano.tensor as tt
import pymc3 as pm
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale

floatX = theano.config.floatX

np.random.seed(40)
n_samples = 100

# Noisy damped-cosine target with polynomial features
X_train = np.random.normal(size=(n_samples, 1))
y_train = np.random.normal(np.cos(5.*X_train) / (np.abs(X_train) + 1.), 0.1).ravel()
X_train = np.hstack((X_train, X_train**2, X_train**3))
X_test  = np.atleast_2d(np.linspace(-3., 3., num=n_samples)).T
y_test  = np.random.normal(np.cos(5.*X_test) / (np.abs(X_test) + 1.), 0.1)
X_test  = np.hstack((X_test, X_test**2, X_test**3))

X_train = scale(X_train, axis=1)
X_train = X_train.astype(floatX)
y_train = y_train.astype(floatX)
X_test  = X_test.astype(floatX)
y_test  = y_test.astype(floatX)
```

I then define my neural network, with prior distributions on the weights and biases:

```python
def createbayesianNN(inputs, outputs):
    basic_model = pm.Model()
    n_hidden = 100

    # Initialize random weights:
    init_1 = np.random.randn(3, n_hidden).astype(floatX)
    init_2 = np.random.randn(n_hidden, 1).astype(floatX)
    binit_1 = np.random.randn(n_hidden).astype(floatX)
    binit_2 = np.random.randn(1).astype(floatX)

    with basic_model:
        # Weights from input to hidden layer:
        weights_in_1 = pm.Normal('w_in_1', 0, sd=1, shape=(3, n_hidden), testval=init_1)

        # Weights from hidden layer to output:
        weights_1_out = pm.Normal('w_1_out', 0, sd=1, shape=(n_hidden, 1), testval=init_2)

        # Bias of the hidden layer:
        bias_a = pm.Normal('bias_a', 0, sd=1, shape=n_hidden, testval=binit_1)

        # Bias of the output layer:
        bias_b = pm.Normal('bias_b', 0, sd=1, shape=(1,), testval=binit_2)

        # Build the network structure:
        act_1   = tt.nnet.relu(pm.math.dot(inputs, weights_in_1) + bias_a)
        act_out = pm.math.sigmoid(pm.math.dot(act_1, weights_1_out) + bias_b)

        # Likelihood:
        out = pm.Normal('out', mu=act_out, sd=0.001, observed=outputs)

    return basic_model
```

I put the data into `theano.shared` variables and construct the model:

```python
bnn_in  = theano.shared(X_train)
bnn_out = theano.shared(y_train)

# Build the model on the shared variables (implied by the `with bayesianNN:` block below)
bayesianNN = createbayesianNN(bnn_in, bnn_out)
```

And finally run it with ADVI:

```python
%%time
with bayesianNN:
    inference = pm.ADVI()   # referenced below via inference.hist
    approx = pm.fit(n=100000, method=inference)
```

I plot the ELBO to check whether it has converged:

```python
plt.plot(-inference.hist, label='new ADVI', alpha=.3)
plt.legend()
plt.ylabel('ELBO')
plt.xlabel('iteration');
```

But unfortunately it is nowhere close to convergence.

My question: have I done something wrong in the model specification? Do I have to initialize the weights? I followed examples from this forum as well as from the website; some initialize them, some do not.

I think this is more a property of the task than anything else. Convergence itself is fine (you can check this by putting a hyperprior on the observation `sd`; see the short sketch after the code block below). Ultimately, though, I can't get the network to learn anything but the mean:

```python
N_ITER = 50000
layer_sizes = [50] * 3
lsd = 1
bsd = 0.1
noise_sd = 1e-3

# Standardize the raw (first-column) feature
X_train_sub = X_train[:, 0]
X_train_sub = (X_train_sub - np.mean(X_train_sub)) / np.std(X_train_sub)
X_train_sub = X_train_sub.reshape((X_train_sub.shape[0], 1))
X = theano.shared(X_train_sub)
minibatch_x = pm.Minibatch(X_train_sub, batch_size=50)
minibatch_y = pm.Minibatch(y_train, batch_size=50)

with pm.Model() as model:
    prev_lyr = X_train_sub.shape[1]
    layers = [minibatch_x]
    for j, n_hidden in enumerate(layer_sizes):
        winit = np.random.uniform(size=(prev_lyr, n_hidden)) / np.sqrt(n_hidden)
        weights = pm.Normal('w_%d' % (j + 1), mu=0, sd=lsd, shape=(prev_lyr, n_hidden), testval=winit)  # 1 x nhidden
        biases = pm.Normal('b_%d' % (j + 1), mu=0, sd=bsd, shape=(n_hidden,), testval=0)
        layer = tt.nnet.relu(tt.dot(layers[j], weights) + biases, alpha=0.25)
        prev_lyr = n_hidden
        layers.append(layer)
    # output layer
    winit = np.random.uniform(size=(prev_lyr, 1))
    weights = pm.Normal('w_out', mu=0, sd=lsd, shape=(prev_lyr, 1), testval=winit)  # should be y_train.shape[1] but is a vector
    layer = tt.dot(layers[-1], weights)
    output = pm.Normal('output', mu=layer, sd=noise_sd, observed=minibatch_y, total_size=y_train.shape[0])

plt.yscale('log');  # (apparently left over from an ELBO plot not shown in the post)
plt.figure()

with model:
    # NOTE: the ADVI fit (presumably N_ITER iterations) and the posterior
    # predictive sampling that fills `ppc` below are not shown in the original post.
    pass

# Rebuild the network on the test inputs for prior/posterior predictive checks
X.set_value(X_test[:, 0].reshape((X_test.shape[0], 1)))
with pm.Model() as model_full:
    prev_lyr = X_train_sub.shape[1]
    layers = [X]
    for j, n_hidden in enumerate(layer_sizes):
        weights = pm.Normal('w_%d' % (j + 1), mu=0, sd=lsd, shape=(prev_lyr, n_hidden), testval=lsd)  # 1 x nhidden
        biases = pm.Normal('b_%d' % (j + 1), mu=0, sd=bsd, shape=(n_hidden,))
        layer = tt.nnet.relu(tt.dot(layers[j], weights) + biases, alpha=0.25)
        prev_lyr = n_hidden
        layers.append(layer)
    # output layer
    weights = pm.Normal('w_out', mu=0, sd=lsd, shape=(prev_lyr, 1), testval=lsd)  # should be y_train.shape[1] but is a vector
    bias = pm.Normal('b_out', mu=0, sd=bsd, testval=0)
    layer = pm.Deterministic('output', tt.dot(layers[-1], weights) + bias)
    prpc = pm.sample_prior_predictive(1000)

y_pred = ppc['output'][:, :, 0]
odr = np.argsort(X_test[:, 0])
plt.scatter(X_test[odr, 0], y_test[odr])
y_mean = np.mean(y_pred, axis=0)
y_ul = np.percentile(y_pred, 97.5, axis=0)
y_ll = np.percentile(y_pred, 2.5, axis=0)
plt.plot(X_test[odr, 0], y_mean[odr], color='red')
y_prpred = prpc['output'][:, :, 0]
y_pr_mean = np.mean(y_prpred, axis=0)
plt.plot(X_test[odr, 0], y_pr_mean[odr], color='orange')
```
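For reference, the hyperprior on the observation `sd` mentioned above would look roughly like the sketch below. This is only a minimal illustration with a linear stand-in for the network output; the names (`model_hyper`, `noise_sd`) and the HalfNormal prior are my choices, not something from the thread.

```python
# Minimal sketch: give the observation noise a hyperprior instead of a fixed sd.
with pm.Model() as model_hyper:
    w = pm.Normal('w', 0., sd=1., shape=(X_train.shape[1],))
    mu = pm.math.dot(X_train, w)                   # stand-in for the network output
    noise_sd = pm.HalfNormal('noise_sd', sd=1.)    # hyperprior on the observation sd
    out = pm.Normal('out', mu=mu, sd=noise_sd, observed=y_train)
    approx = pm.fit(n=20000, method=pm.ADVI())
```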


Actually I take that back. I don't think it's the task: a simple (non-Bayesian) network converges easily:

```python
# pure theano
theano.config.compute_test_value = 'ignore'

layer_sizes = [50, 50]
Wvars = list()
Wvals = [X_train_sub.reshape((X_train_sub.shape[0], 1))]
Bvars = list()
Xv = tt.matrix('X')
Yv = tt.matrix('Y')
activity = [Xv]
activity_fx = lambda x: tt.nnet.relu(x, alpha=0.1)  # tt.tanh(x) # tt.nnet.relu(x, alpha=0.1)

# Hidden layers with Glorot-style uniform initialization
for j, size in enumerate(layer_sizes):
    ell = np.sqrt(6/(size + Wvals[-1].shape[1]))
    Winit = np.random.uniform(-ell, ell, size=(Wvals[-1].shape[1], size))
    print(Winit.shape)
    var = theano.shared(Winit, name='w_%d' % (j + 1))
    Wvars.append(var)
    Wvals.append(Winit)
    binit = np.random.normal(1e-3, 0.1, size=size)
    bvar = theano.shared(binit, name='b_%d' % (j + 1))
    Bvars.append(bvar)
    act = activity_fx(tt.dot(activity[-1], var) + bvar)
    activity.append(act)

# Output layer
ell = np.sqrt(6/(Wvals[-1].shape[1] + 1))
Winit = np.random.uniform(-ell, ell, size=(Wvals[-1].shape[1], 1))
var = theano.shared(Winit, name='w_out')
Wvars.append(var)
Wvals.append(Winit)
output = tt.dot(activity[-1], var)

# Shape sanity check on the initial weights
test_ = Wvals[0]
for W in Wvals[1:]:
    test_ = np.dot(test_, W)
print(test_.shape)

alpha = 1e-6 * 500
cost = tt.sum(tt.sqr(output - Yv))
predict = theano.function([Xv], output)
# NOTE: the `train` function used in the loop below is not defined in the original
# post; presumably a theano.function applying SGD-style updates to Wvars/Bvars on
# `cost` with step size `alpha`.

def gen_mb(X, y, s):
    # Infinite minibatch generator
    n_points = y.shape[0]
    m = int(n_points/s)*s
    r = int(m/s)
    while True:
        idx = np.random.choice(n_points, m)
        for i in range(s):
            start = i*r
            end = (i+1)*r
            yield X[idx[start:end], :], y[idx[start:end], :]

niter = 64*15000
bs = 64
minibatch_iter = gen_mb(X_train_sub.reshape((X_train_sub.shape[0], 1)),
                        y_train.reshape((y_train.shape[0], 1)), bs)
bcost_vals, cost_vals = list(), list()
q = int(niter/(bs*10))
ui = [0, 1, 10, 100, 500] + [q*t for t in range(1, 10)]
u = [bs*t for t in ui]
for it_ in range(niter):
    Xb, yb = next(minibatch_iter)
    costM = train(Xb, yb)
    if np.isnan(costM):
        print(predict(Xb))
        break
    cost_vals.append(costM)
    if it_ % bs == 0:
        bcost = np.mean(cost_vals)
        cost_vals = list()
        bcost_vals.append(bcost)
        if it_ in u:
            print('iter: %d mean mse: %.4f' % (it_, bcost), end='\n')
            y_pred = predict(X_train_sub.reshape((X_train_sub.shape[0], 1)))
            plt.scatter(X_train_sub, y_train, alpha=0.5)
            plt.scatter(X_train_sub, y_pred[:, 0], color='orange', alpha=0.5)
            plt.title('iter: %d cost: %f' % (it_, bcost))
            plt.figure()

y_pred = predict(X_train_sub.reshape((X_train_sub.shape[0], 1)))
# NOTE: X_test_sub is not defined in the original post; presumably the standardized
# first column of X_test, analogous to X_train_sub.
y_pred_test = predict(X_test_sub.reshape(X_test_sub.shape[0], 1))

bcost_vals = np.array(bcost_vals)
plt.plot(bcost_vals)

theano.config.compute_test_value = 'raise'
```

@twiecki It seems like “bayesianifying” a neural network is not necessarily straightforward. An architecture that works when fit to minimize MSE may not converge with a trivial “replace weights with N(0,1) variables” approach. Are there any notebooks or gists about such pitfalls?


Hey @chartl,
thanks for looking into it; it helped me a lot. I will play around with your code a bit more and maybe I can get it to converge somehow. If I do, I will report back here!

@bay

If you run your original code with the SGD or Nesterov optimizer, do you find that it instantly fails? See:

I noticed the same thing on the same regression task, but unfortunately I couldn't figure out why it wouldn't fit either. Understanding this / getting it to work would be very fruitful.
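(For reference, swapping the optimizer used by ADVI can be done through `obj_optimizer` in `pm.fit`; the sketch below assumes PyMC3's built-in optimizer helpers in `pymc3.variational.updates`, and the learning rates are arbitrary placeholders.)

```python
# Sketch: fit the original model with plain SGD or Nesterov momentum instead of
# the default adagrad_window optimizer. Learning rates are placeholders.
from pymc3.variational.updates import sgd, nesterov_momentum

with bayesianNN:
    approx_sgd = pm.fit(n=100000, method=pm.ADVI(),
                        obj_optimizer=sgd(learning_rate=1e-3))
    # or:
    # approx_nes = pm.fit(n=100000, method=pm.ADVI(),
    #                     obj_optimizer=nesterov_momentum(learning_rate=1e-3))
```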

I haven’t read it yet but this paper might have some clues: https://arxiv.org/abs/1906.02506

So this turns out to be (weirdly?!) a broadcasting issue. This block of code works fine:

```python
N_FIT = 35000
layer_sizes = [120]
lsd = 1.
bsd = 1.
noise_sd = 1e-2

with pm.Model() as model_full:
    prev_lyr = X_train.shape[1]
    layers = [X_train]
    for j, n_hidden in enumerate(layer_sizes):
        ell = np.sqrt(6/(prev_lyr + n_hidden))
        winit = np.random.uniform(-ell, ell, size=(prev_lyr, n_hidden))
        weights = pm.Normal('w_%d' % (j+1), 0, lsd, testval=winit, shape=winit.shape)
        biases = pm.Normal('b_%d' % (j+1), 0, bsd, shape=n_hidden)
        layer = tt.nnet.relu(tt.dot(layers[j], weights) + biases, alpha=0.1)
        prev_lyr = n_hidden
        layers.append(layer)
    # output layer: note the *vector* weight, shape=(prev_lyr,)
    ell = np.sqrt(6./(prev_lyr + 1))
    winit = np.random.uniform(-ell, ell, size=(prev_lyr,))
    wout = pm.Normal('w_out', 0, lsd, shape=(prev_lyr,), testval=winit)
    out_layer = pm.Deterministic('net_output', tt.dot(layers[-1], wout))
    output = pm.Normal('output', mu=out_layer, sd=noise_sd, observed=y_train, total_size=y_train.shape[0])
    layers.append(output)
    # NOTE: the ADVI setup (`advi = pm.ADVI()`), the fit for N_FIT iterations, and
    # drawing `trace` from the approximation are not shown in the original post.
    tracker = pm.callbacks.Tracker(
        mean=advi.approx.mean.eval,  # callable that returns mean
        std=advi.approx.std.eval     # callable that returns std
    )
    ppc = pm.sample_posterior_predictive(trace, vars=[out_layer, output])
    prpc = pm.sample_prior_predictive(1000)

plt.yscale('log');
plt.figure()
y_pred = ppc['output']
odr = np.argsort(X_train[:, 0])
plt.scatter(X_train[odr, 0], y_train[odr])
y_mean = np.mean(y_pred, axis=0)
plt.plot(X_train[odr, 0], y_mean[odr], color='red')
```

But this small change

```python
    winit = np.random.uniform(-ell, ell, size=(prev_lyr, 1))
    wout = pm.Normal('w_out', 0, lsd, shape=(prev_lyr, 1), testval=winit)
```

breaks convergence. PyMC4 could definitely use some kind of warning or better documentation about broadcasting pitfalls…
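A plausible mechanism (my reading of it, not something stated explicitly in this thread): with `shape=(prev_lyr, 1)` the network output `tt.dot(layers[-1], wout)` ends up with shape `(n, 1)` instead of `(n,)`, and broadcasting that against the `(n,)` observed vector inside the likelihood silently produces an `(n, n)` array of terms, i.e. every prediction gets paired with every observation. The effect is easy to see in plain NumPy:

```python
import numpy as np

n = 100
y = np.zeros(n)            # observed targets, shape (n,)
mu_vec = np.zeros(n)       # network output when w_out has shape (prev_lyr,)   -> (n,)
mu_col = np.zeros((n, 1))  # network output when w_out has shape (prev_lyr, 1) -> (n, 1)

print((mu_vec - y).shape)  # (100,):      one residual per data point
print((mu_col - y).shape)  # (100, 100):  every prediction paired with every observation
```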


How could a theano/numpy broadcasting issue possibly cause such a difference in model output? This is legitimately fascinating.