Bayesian Neural Network - No convergence

Hello folks,
I have a problem setting up a basic Bayesian Neural Network. Let’s say I have a data set produced by:

import numpy as np
import theano
import theano.tensor as tt
import pymc3 as pm
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale

floatX = theano.config.floatX

np.random.seed(40)
n_samples = 100
X_train         = np.random.normal(size=(n_samples, 1)) 
y_train         = np.random.normal(np.cos(5.*X_train) / (np.abs(X_train) + 1.), 0.1).ravel()
X_train         = np.hstack((X_train, X_train**2, X_train**3))    
X_test          = np.atleast_2d(np.linspace(-3., 3., num=n_samples)).T
y_test          = np.random.normal(np.cos(5.*X_test) / (np.abs(X_test) + 1.), 0.1)
X_test          = np.hstack((X_test, X_test**2, X_test**3))

X_train = scale(X_train, axis=1)
X_train = X_train.astype(floatX)
y_train = y_train.astype(floatX)
X_test  = X_test.astype(floatX)
y_test  = y_test.astype(floatX)

I then define my Neural Network with distributions on the weights and the biases:

def createbayesianNN(inputs, outputs):
    basic_model = pm.Model()
    n_hidden = 100

    #Initialize random weights:
    init_1 = np.random.randn(3, n_hidden).astype(floatX)
    init_2 = np.random.randn(n_hidden, 1).astype(floatX)
    binit_1 = np.random.randn(n_hidden).astype(floatX)
    binit_2 = np.random.randn(1).astype(floatX)

    with basic_model:
        #Weights from input to hidden layer:
        weights_in_1 = pm.Normal('w_in_1', 0, sd = 1, shape=(3, n_hidden), testval = init_1)
    
        #Weights from hidden layer to outpout
        weights_1_out = pm.Normal('w_1_out', 0, sd = 1, shape=(n_hidden, 1), testval = init_2)
    
        #Bias from hidden layer:
        bias_a = pm.Normal('bias_a', 0, sd = 1, shape = n_hidden , testval = binit_1)
    
        #Bias from output layer:
        bias_b = pm.Normal('bias_b', 0, sd = 1, shape=(1), testval = binit_2)
    
        #Build NN structure:
        act_1    = tt.nnet.relu(pm.math.dot(inputs, weights_in_1) + bias_a)
        act_out  = pm.math.sigmoid(pm.math.dot(act_1, weights_1_out) + bias_b)
    
        #likelihood
        out = pm.Normal('out', mu = act_out, sd = 0.001, observed = outputs)
    
        return basic_model

I put the data into theano.shared variables:

bnn_in = theano.shared(X_train)
bnn_out = theano.shared(y_train)

And finally build the model and fit it with ADVI:

%%time
bayesianNN = createbayesianNN(bnn_in, bnn_out)   # build the model from the function above
with bayesianNN:
    inference = pm.ADVI()
    approx = pm.fit(n=100000, method=inference)

I plot the ELBO to see whether it has converged:

plt.plot(-inference.hist, label='ADVI', alpha=.3)
plt.legend()
plt.ylabel('ELBO')
plt.xlabel('iteration');

[figure: ELBO trace]

But unfortunately it is nowhere close to convergence.

My question: have I done something wrong in the specification?
Do I have to initialize the weights? I followed examples in the forum as well as on the website; some do, some do not.

I think this is more a property of the task than anything else. Convergence is fine (you can test this by putting a hyperprior on the observation sd). Ultimately I can’t get a network to learn anything but the mean:

N_ITER=50000
layer_sizes = [50] * 3
lsd = 1
bsd = 0.1
noise_sd = 1e-3
X_train_sub = X_train[:,0]
X_train_sub = (X_train_sub - np.mean(X_train_sub))/np.std(X_train_sub)
X_train_sub = X_train_sub.reshape((X_train_sub.shape[0],1))
X = theano.shared(X_train_sub)
minibatch_x = pm.Minibatch(X_train_sub, batch_size=50)
minibatch_y = pm.Minibatch(y_train, batch_size=50)
with pm.Model() as model:
    prev_lyr = X_train_sub.shape[1]
    layers = [minibatch_x]
    for j, n_hidden in enumerate(layer_sizes):
        winit = np.random.uniform(size=(prev_lyr, n_hidden))/np.sqrt(n_hidden)
        weights = pm.Normal('w_%d' % (j + 1), mu=0, sd=lsd, shape=(prev_lyr, n_hidden), testval=winit) # 1 x nhidden
        biases = pm.Normal('b_%d' % (j + 1), mu=0, sd=bsd, shape=(n_hidden,), testval=0)
        layer = tt.nnet.relu(tt.dot(layers[j], weights) + biases, alpha=0.25)
        prev_lyr = n_hidden
        layers.append(layer)
    # output layer
    winit = np.random.uniform(size=(prev_lyr, 1))
    weights = pm.Normal('w_out', mu=0, sd=lsd, shape=(prev_lyr, 1), testval=winit) # should be y_train.shape[1] but is a vector
    layer = tt.dot(layers[-1], weights)
    output = pm.Normal('output', mu=layer, sd=noise_sd, observed=minibatch_y, total_size=y_train.shape[0])
    advi_fit = pm.fit(n=N_ITER, method='advi', obj_optimizer=pm.adam(learning_rate=5e-3))

plt.plot(advi_fit.hist, label='ADVI', alpha=.3);
plt.yscale('log');
plt.figure()

with model:
    advi_tr = advi_fit.sample(2500)

X.set_value(X_test[:,0].reshape((X_test.shape[0], 1)))
with pm.Model() as model_full:
    prev_lyr = X_train_sub.shape[1]
    layers = [X]
    for j, n_hidden in enumerate(layer_sizes):
        weights = pm.Normal('w_%d' % (j + 1), mu=0, sd=lsd, shape=(prev_lyr, n_hidden), testval=lsd) # 1 x nhidden
        biases = pm.Normal('b_%d' % (j + 1), mu=0, sd=bsd, shape=(n_hidden,))
        layer = tt.nnet.relu(tt.dot(layers[j], weights) + biases, alpha=0.25)
        prev_lyr = n_hidden
        layers.append(layer)
    # output layer
    weights = pm.Normal('w_out', mu=0, sd=lsd, shape=(prev_lyr, 1), testval=lsd) # should be y_train.shape[1] but is a vector
    bias = pm.Normal('b_out', mu=0, sd=bsd, testval=0)
    layer = pm.Deterministic('output', tt.dot(layers[-1], weights) + bias)
    ppc = pm.sample_posterior_predictive(advi_tr, vars=[layer])
    prpc = pm.sample_prior_predictive(1000)

y_pred = ppc['output'][:,:,0]
odr = np.argsort(X_test[:,0])
plt.scatter(X_test[odr,0], y_test[odr])
y_mean = np.mean(y_pred, axis=0)
y_ul = np.percentile(y_pred, 97.5, axis=0)
y_ll = np.percentile(y_pred, 2.5, axis=0)
plt.plot(X_test[odr,0], y_mean[odr], color='red')
y_prpred = prpc['output'][:,:,0]
y_pr_mean = np.mean(y_prpred, axis=0)
plt.plot(X_test[odr,0], y_pr_mean[odr], color='orange')

[figures: ADVI loss history (log scale); test data with posterior-predictive mean (red) and prior-predictive mean (orange)]
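
For reference, the "hyperprior on the observation sd" mentioned above could look roughly like the following. This is a minimal sketch only: a plain linear predictor stands in for the network, and the HalfNormal scale is an arbitrary choice, not a recommendation.

import pymc3 as pm

# Sketch of a hyperprior on the observation noise: instead of hard-coding
# sd=1e-3 in the likelihood, let the model infer the noise level.
# A linear predictor stands in for the network; X_train / y_train are the
# arrays defined at the top of the thread.
with pm.Model() as noise_demo:
    w = pm.Normal('w', mu=0., sd=1., shape=X_train.shape[1])
    mu = pm.math.dot(X_train, w)
    noise_sd = pm.HalfNormal('noise_sd', sd=1.)   # hyperprior on the observation sd
    out = pm.Normal('out', mu=mu, sd=noise_sd, observed=y_train)
    approx = pm.fit(n=10000, method='advi')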


Actually, I take that back. I don’t think it’s the task: a simple (non-Bayesian) network converges easily:

# pure theano
theano.config.compute_test_value = 'ignore'
layer_sizes = [50, 50]
Wvars = list()
Wvals = [X_train_sub.reshape((X_train_sub.shape[0],1))]
Bvars = list()
Xv = tt.matrix('X')
Yv = tt.matrix('Y')
activity = [Xv]
activity_fx = lambda x: tt.nnet.relu(x, alpha=0.1) # tt.tanh(x) # tt.nnet.relu(x, alpha=0.1)
for j, size in enumerate(layer_sizes):
    ell = np.sqrt(6/(size + Wvals[-1].shape[1]))
    Winit = np.random.uniform(-ell, ell, size=(Wvals[-1].shape[1], size))
    print(Winit.shape)
    var = theano.shared(Winit, name='w_%d' % (j + 1))
    Wvars.append(var)
    Wvals.append(Winit)
    binit = np.random.normal(1e-3, 0.1, size=size)
    bvar = theano.shared(binit, name='b_%d' % (j+1))
    Bvars.append(bvar)
    act = activity_fx(tt.dot(activity[-1], var) + bvar)
    activity.append(act)

ell = np.sqrt(6/(Wvals[-1].shape[1]+1))
Winit = np.random.uniform(-ell, ell, size=(Wvals[-1].shape[1], 1))
var = theano.shared(Winit, name='w_out')
Wvars.append(var)
Wvals.append(Winit)
output = tt.dot(activity[-1], var)

test_ = Wvals[0]
for W in Wvals[1:]:
    test_ = np.dot(test_, W)
print(test_.shape)

alpha = 1e-6 * 500
cost = tt.sum(tt.sqr(output-Yv))
grad_updates = [(w, w - alpha * theano.grad(cost, w)) for w in Wvars]
grad_updates += [(b, b - alpha * theano.grad(cost, b)) for b in Bvars]
train = theano.function([Xv, Yv],cost,updates=grad_updates)
predict = theano.function([Xv], output)
def gen_mb(X, y, s):
    n_points = y.shape[0]
    m = int(n_points/s)*s
    r=int(m/s)
    while True:
        idx = np.random.choice(n_points, m)
        for i in range(s):
            start = i*r
            end=(i+1)*r
            yield X[idx[start:end],:], y[idx[start:end],:]

niter = 64*15000
bs = 64
minibatch_iter = gen_mb(X_train_sub.reshape((X_train_sub.shape[0],1)), 
                                             y_train.reshape((y_train.shape[0],1)), bs)
bcost_vals, cost_vals = list(), list()
q = int(niter/(bs*10))
ui = [0, 1, 10, 100, 500] + [q*t for t in range(1, 10)]
u = [bs*t for t in ui]
for it_ in range(niter):
    Xb, yb = next(minibatch_iter)
    costM = train(Xb, yb)
    if np.isnan(costM):
        print(predict(Xb))
        break
    cost_vals.append(costM)
    if it_ % bs == 0:
        bcost = np.mean(cost_vals)
        cost_vals = list()
        bcost_vals.append(bcost)
    if it_ in u:
        print('iter: %d mean mse: %.4f' % (it_, bcost), end='\n')
        y_pred = predict(X_train_sub.reshape((X_train_sub.shape[0],1)))
        plt.scatter(X_train_sub, y_train, alpha=0.5)
        plt.scatter(X_train_sub, y_pred[:,0], color='orange', alpha=0.5)
        plt.title('iter: %d cost: %f' % (it_, bcost))
        plt.figure()
        
y_pred = predict(X_train_sub.reshape((X_train_sub.shape[0],1)))
X_test_sub = X_test[:, 0]  # note: X_test_sub was not defined above; assuming the first (linear) column of X_test
y_pred_test = predict(X_test_sub.reshape(X_test_sub.shape[0], 1))

bcost_vals = np.array(bcost_vals)
plt.plot(bcost_vals)

theano.config.compute_test_value = 'raise'

[figure: the plain network's fit to the training data]

@twiecki It seems like “bayesianifying” a neural network is not necessarily straightforward. An architecture that works when fit to minimize MSE may not converge with a trivial “replace weights with N(0,1) variables” approach. Are there any notebooks or gists about such pitfalls?


Hey @chartl,
thanks for looking into it; it helped me a lot. I will play around with your code a bit more and maybe I will get it to converge somehow. If that is the case, I will come back here!

@bay

If you run your original code with the SGD or Nesterov optimizer, do you find that it instantly fails? See:

I noticed the same thing with the same regression task, but unfortunately I couldn’t figure out why it wouldn’t fit either. Understanding this / getting it to work would be very fruitful.
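
For concreteness, the optimizer swap asked about above would just replace the Adam call in pm.fit. A sketch, assuming pm.sgd / pm.nesterov_momentum are exposed the same way as the pm.adam used earlier, with an arbitrary learning rate:

# Sketch: the same ADVI fit as above, but with plain SGD (or Nesterov momentum)
# in place of Adam. The learning rate is an arbitrary choice.
with model:
    advi_sgd = pm.fit(n=N_ITER, method='advi',
                      obj_optimizer=pm.sgd(learning_rate=1e-3))
    # or: obj_optimizer=pm.nesterov_momentum(learning_rate=1e-3, momentum=0.9)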

I haven’t read it yet but this paper might have some clues: https://arxiv.org/abs/1906.02506

So this turns out to be (weirdly?!) a broadcasting issue. This block of code works fine:

N_FIT=35000
layer_sizes = [120]
lsd = 1.
bsd = 1.
noise_sd = 1e-2
with pm.Model() as model_full:
    prev_lyr = X_train.shape[1]
    layers = [X_train]
    for j, n_hidden in enumerate(layer_sizes):
        ell = np.sqrt(6/(prev_lyr + n_hidden))
        winit = np.random.uniform(-ell, ell, size=(prev_lyr, n_hidden))
        weights = pm.Normal('w_%d' % (j+1), 0, lsd, testval=winit, shape=winit.shape)
        biases = pm.Normal('b_%d' % (j+1), 0, bsd, shape=n_hidden)
        layer = tt.nnet.relu(tt.dot(layers[j], weights) + biases, alpha=0.1)
        prev_lyr = n_hidden
        layers.append(layer)
    # output layer
    ell = np.sqrt(6./(prev_lyr + 1))
    winit = np.random.uniform(-ell, ell, size=(prev_lyr,))
    wout = pm.Normal('w_out', 0, lsd, shape=(prev_lyr,), testval=winit)
    out_layer = pm.Deterministic('net_output', tt.dot(layers[-1], wout))
    output = pm.Normal('output', mu=out_layer, sd=noise_sd, observed=y_train, total_size=y_train.shape[0])
    layers.append(output)
    advi = pm.ADVI(beta=1.0)
    tracker = pm.callbacks.Tracker(
      mean=advi.approx.mean.eval,  # callable that returns mean
      std=advi.approx.std.eval  # callable that returns std
    )
    advi_fit = advi.fit(N_FIT, callbacks=[tracker], 
                       obj_optimizer=pm.adam(learning_rate=2e-3),
                       obj_n_mc=3, total_grad_norm_constraint=1000.)
    trace = advi_fit.sample(2500)
    ppc = pm.sample_posterior_predictive(trace, vars=[out_layer, output])
    prpc = pm.sample_prior_predictive(1000)

plt.plot(advi_fit.hist[::10], label='ADVI', alpha=.3);
plt.yscale('log');
plt.figure()
y_pred = ppc['output']
odr = np.argsort(X_train[:,0])
plt.scatter(X_train[odr,0], y_train[odr])
y_mean = np.mean(y_pred, axis=0)
plt.plot(X_train[odr,0], y_mean[odr], color='red')

[figures: ELBO history (log scale); training data with posterior-predictive mean (red)]

But this small change

    winit = np.random.uniform(-ell, ell, size=(prev_lyr,1))
    wout = pm.Normal('w_out', 0, lsd, shape=(prev_lyr,1), testval=winit)

breaks convergence. PyMC4 could definitely use some kind of warning, or better documentation, about broadcasting pitfalls…

[figures: ELBO history and posterior-predictive mean for the (prev_lyr, 1) version]
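
One plausible mechanism, stated as an assumption rather than a verified diagnosis: with w_out of shape (prev_lyr, 1) the network output has shape (N, 1) instead of (N,), and pairing an (N, 1) mu with an (N,) observed vector silently broadcasts the likelihood to N x N, so every prediction is scored against every observation. Plain NumPy shows the shape blow-up:

import numpy as np

N = 100
mu_vec = np.zeros(N)        # output built with w_out of shape (prev_lyr,)   -> shape (N,)
mu_col = np.zeros((N, 1))   # output built with w_out of shape (prev_lyr, 1) -> shape (N, 1)
y = np.zeros(N)             # observed targets, shape (N,)

print((mu_vec - y).shape)   # (100,)     one residual per observation
print((mu_col - y).shape)   # (100, 100) broadcast: every prediction paired with every observation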


How could a Theano/NumPy broadcasting issue possibly cause such a difference in model output? This is legitimately fascinating.