Help Converting a Keras Model to PyMC3 Syntax


I would like to learn how to better translate a Keras deep learning model into a Bayesian neural net. I just finished watching Eric J. Ma’s 2017 PyData talk, “Demystifying Bayesian Deep Learning”, which improved my conceptual understanding immensely.

Thanks for all the help thus far pymc3 community!

I was wondering if anyone can help me translate the Keras deep learning regression model below into PyMC3 syntax? Honestly, if anyone can translate the first three layers and the compile/optimizer step, then I can apply the same pattern to the rest of the model.

model = Sequential()
model.add(Dense(256, input_dim=12, kernel_initializer=‘normal’, activation=‘relu’
, kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(128, activation = ‘relu’, kernel_regularizer = regularizers.l2(0.01)))
model.add(Dense(128, activation = ‘relu’, kernel_regularizer = regularizers.l2(0.01)))
model.add(Dense(64, activation=‘relu’, kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(32, activation=‘relu’,kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(16, activation = ‘relu’, kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(8, activation = ‘relu’, kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(1, activation=‘linear’))
model.compile(loss=‘mse’, optimizer=‘adam’, metrics=[‘mse’,‘mae’])
history =, y_train, epochs=25, verbose=1, validation_split=0.2)

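The core of any neural network is just chaining weight matrices together. The L2 regularizer you have is spiritually similar to a zero-mean Normal prior on the weights (a larger L2 penalty corresponds to a smaller prior standard deviation), so a weight layer would look like this: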

weight_sd = 0.1 # could use a HalfNormal hyperprior if you really wanted
W1 = pm.Normal('weight_1', 0, weight_sd, shape=(output_dim, input_dim))

Dropout is a set of Bernoulli variables that mask the weights:

drop_rate = 0.5
D1 = pm.Bernoulli('dropout_1', drop_rate, shape=(output_dim, input_dim))
WD1 = pm.Deterministic('dropout_weights_1', W1 * D1)

With the weights constructed, each layer is just a matrix multiplication followed by a ReLU:

L2 = pm.Deterministic('Layer2', tt.nnet.relu(, L1)))

Bias could be added, if you want

B2 = pm.Normal('bias2', 0., 1.)
L2 = pm.Deterministic('Layer2', tt.nnet.relu(, L1)) + B2)

More generally, see

1 Like

I would remove the dropout part, because:

  • dropout is more of a model-training technique than a part of the model itself
  • discrete nodes are hard to train

If you really want something identical, you can try using Theano's raw random stream and indexing into it:

1 Like

Thank you @chartl. I thought I had it but I keep getting the same error. The code and error log are below. Does anything immediately jump out to you?


ann_input = tt.shared(np.asarray(X_train))
y_train_t = y_train.transpose()
ann_output = tt.shared(np.asarray(y_train_t))

n_hidden = 15

Initialize random weights between each layer

init_1 = np.random.randn(X_train.shape[1], n_hidden)
init_2 = np.random.randn(n_hidden, n_hidden)
init_3 = np.random.randn(n_hidden, n_hidden)
init_4 = np.random.randn(n_hidden, n_hidden)
init_5 = np.random.randn(n_hidden, n_hidden)
init_6 = np.random.randn(n_hidden, n_hidden)
init_7 = np.random.randn(n_hidden, n_hidden)
init_8 = np.random.randn(n_hidden, n_hidden)
init_9 = np.random.randn(n_hidden, n_hidden)
init_10 = np.random.randn(n_hidden, n_hidden)
init_11 = np.random.randn(n_hidden, n_hidden)
init_12 = np.random.randn(n_hidden, n_hidden)
init_13 = np.random.randn(n_hidden, n_hidden)
init_14 = np.random.randn(n_hidden, n_hidden)
init_15 = np.random.randn(n_hidden, n_hidden)
init_out = np.random.randn(n_hidden)

with pm.Model() as neual_network:
# Weights from input to hidden layer
weights_in_1 = pm.Normal(‘w_in_1’, 0, sd=1,
shape=(X_train.shape[1], n_hidden),

weights_1_2 = pm.Normal('w_1_2', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 
weights_2_3 = pm.Normal('_2_3', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 
weights_3_4 = pm.Normal('w_3_4', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 
weights_4_5 = pm.Normal('w_4_5', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 
weights_5_6 = pm.Normal('w_5_6', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 
weights_6_7 = pm.Normal('w_6_7', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 
weights_7_8 = pm.Normal('w_7_8', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 
weights_8_9 = pm.Normal('w_8_9', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 

weights_9_10 = pm.Normal('w_9_10', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 

weights_10_11 = pm.Normal('w_10_11', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 

weights_11_12 = pm.Normal('w_11_12', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 

weights_12_13 = pm.Normal('w_12_13', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 

weights_13_14 = pm.Normal('w_13_14', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 

weights_14_15 = pm.Normal('w_14_15', 0, sd=1, 
                        shape=(n_hidden, n_hidden), 

# Weights from hidden layer to output
weights_15_out = pm.Normal('w_15_out', 0, sd=1, 

# Build neural network using ReLU activation function

B2 = pm.Normal('bias2', 0., 1.)
L1 = pm.Deterministic('Layer1',, weights_in_1) + B2)
L2 = pm.Deterministic('Layer2', T.nnet.relu(, weights_1_2)+ B2))
L3 = pm.Deterministic('Layer3', T.nnet.relu(, weights_2_3)+ B2))
L4 = pm.Deterministic('Layer4', T.nnet.relu(, weights_3_4)+ B2))
L5 = pm.Deterministic('Layer5', T.nnet.relu(, weights_4_5)+ B2))
L6 = pm.Deterministic('Layer6', T.nnet.relu(, weights_5_6)+ B2))
L7 = pm.Deterministic('Layer7', T.nnet.relu(, weights_6_7)+ B2))
L8 = pm.Deterministic('Layer8', T.nnet.relu(, weights_7_8)+ B2))
L9 = pm.Deterministic('Layer9', T.nnet.relu(, weights_8_9)+ B2))
L10 = pm.Deterministic('Laye10', T.nnet.relu(, weights_10_11)+ B2))
L11 = pm.Deterministic('Layer11', T.nnet.relu(, weights_11_12)+ B2))
L12 = pm.Deterministic('Layer12', T.nnet.relu(, weights_12_13)+ B2))
L13 = pm.Deterministic('Layer13', T.nnet.relu(, weights_13_14)+ B2))
L14 = pm.Deterministic('Layer14', T.nnet.relu(, weights_14_15)+ B2))  
L15 = pm.Deterministic('Layer15', T.nnet.relu(, weights_15_out)+ B2))
act_out =, weights_15_out)

out = pm.Normal('out', mu = act_out, observed=ann_output, shape = y_train.shape)

inference = pm.ADVI()
approx =, method=inference)
trace = pm.sample(draws=5000, progress_bar = True)

error log:

ValueError Traceback (most recent call last)
119 L14 = pm.Deterministic(‘Layer14’, T.nnet.relu(, weights_14_15)+ B2))
120 L15 = pm.Deterministic(‘Layer15’, T.nnet.relu(, weights_15_out)+ B2))
–> 121 act_out =, weights_15_out)

~/anaconda3/lib/python3.7/site-packages/theano/tensor/ in dot(a, b)
6103 return tensordot(a, b, [[a.ndim - 1], [np.maximum(0, b.ndim - 2)]])
6104 else:
-> 6105 return _dot(a, b)

~/anaconda3/lib/python3.7/site-packages/theano/gof/ in call(self, *inputs, **kwargs)
672 thunk.outputs = [storage_map[v] for v in node.outputs]
–> 674 required = thunk()
675 assert not required # We provided all inputs

~/anaconda3/lib/python3.7/site-packages/theano/gof/ in rval(p, i, o, n)
890 # default arguments are stored in the closure of rval
891 def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
–> 892 r = p(n, [x[0] for x in i], o)
893 for o in node.outputs:
894 compute_map[o][0] = True

~/anaconda3/lib/python3.7/site-packages/theano/tensor/ in perform(self, node, inp, out)
5968 # gives a numpy float object but we need to return a 0d
5969 # ndarray
-> 5970 z[0] = np.asarray(, y))
5972 def grad(self, inp, grads):

ValueError: shapes (151437,) and (15,) not aligned: 151437 (dim 0) != 15 (dim 0)

Never mind, @chartl. I had one extra dimension.