Saving ADVI results and reloading

simpleton · April 12, 2018, 6:55pm

Hi,

I am trying to use ADVI to train a model on the GPU. I then want to use the fit results to generate the trace on the CPU for memory reasons. I followed the approach from the topic “How to save fitted ADVI Result?”, but it doesn’t appear to work.

# Fit ADVI on the first model and snapshot its variational parameters.
with model:
    approx=pm.ADVI()
    approx.fit()
# Each entry is (parameter name, current value), in the order of approx.approx.params.
saveparam = [(param.name, param.eval()) for param in approx.approx.params]

# Create a fresh, unfitted ADVI object on the second model instance.
with model2:
    approx2=pm.ADVI()
# Copy the saved values across by list position; the saved names are not checked.
for i, param in enumerate(saveparam):
    approx2.approx.params[i].set_value(param[1])

# Draw from both approximations so the two results can be compared.
trace = approx.approx.sample()
trace2 = approx2.approx.sample()

model and model2 are different instances of the same model. I expect trace and trace2 to be similar and produce similar predictions for a testing set. But it appears trace2 hasn’t ‘learnt’ anything. Do I need to save some other result from the approx.fit()?

junpenglao · April 13, 2018, 5:19am

I checked on a simple model and it still works. Are you sure model and model2 are identical?

simpleton · April 14, 2018, 1:42pm

Thanks for your reply. I have experimented with simple linear models and you are right that everything seems to work. I have attached a slightly more complicated example that is causing me some issues. Despite setting the random seed at the beginning, the final histogram of the trace of a single parameter is different each time I run the code. ADVI produces identical average ELBO each time I run, but the traces for the two models sometimes agree and sometimes catastrophically disagree. Any ideas what’s going on?

import numpy as np
np.random.seed(52)
import pymc3 as pm
import cPickle as pickle
import matplotlib.pyplot as plt
import theano
floatX = theano.config.floatX

# Synthetic regression problem: 100 samples, 4 inputs, 2 outputs.
ndata, ninputs, noutputs = 100, 4, 2
X = np.random.uniform(size=(ndata, ninputs))
# Both output columns are the row sums of X.
Y = np.array([np.sum(X, axis=1), np.sum(X, axis=1)]).T
Yerr = 0.2
# Add Gaussian noise scaled by Yerr to the targets.
Y += np.random.normal(size=(ndata, noutputs)) * Yerr
# Expand the scalar noise level to an array with the same shape as Y.
Yerr = np.ones_like(Y) * Yerr

# File that the script pickles the variational parameters to and reloads from.
pkl_fl = 'test_simple.pkl'

neuronsPerHiddenlayer = 12


def construct_neural_network(inputs, targets, errtargets,
                             neuronsPerHiddenlayer, ninputs, noutputs, ndata):
    """Build the PyMC3 model of a small Bayesian regression network.

    The network applies a sigmoid layer to ``inputs`` and then two linear
    maps plus an output bias. All weights and the bias get standard-normal
    priors; the three weight matrices start from fixed random test values.

    Args:
        inputs: Matrix multiplied by the first weight matrix.
        targets: Observed values for the ``yTrain`` likelihood.
        errtargets: Standard deviation(s) of the ``yTrain`` likelihood.
        neuronsPerHiddenlayer: Width of both hidden layers.
        ninputs: Number of input features.
        noutputs: Number of output columns.
        ndata: Number of rows, used only for ``total_size``.

    Returns:
        The constructed ``pm.Model``.

    Note:
        Calls ``np.random.seed(42)``, which resets NumPy's global RNG.
    """
    hidden = neuronsPerHiddenlayer

    # Fixed seed so every call produces the same starting weights. The
    # draws happen in the order input -> hidden -> output.
    np.random.seed(42)
    start_in, start_hidden, start_out = (
        np.random.randn(*dims).astype(floatX)
        for dims in ((ninputs, hidden), (hidden, hidden), (hidden, noutputs))
    )

    with pm.Model() as neural_network:
        # Priors are declared in a fixed order: input weights, hidden
        # weights, output weights, then the output bias.
        w_in = pm.Normal('w_in_1', mu=0, sd=1.,
                         shape=(ninputs, hidden), testval=start_in)
        w_hidden = pm.Normal('w_1_2', mu=0, sd=1.,
                             shape=(hidden, hidden), testval=start_hidden)
        w_out = pm.Normal('w_2_out', mu=0, sd=1.,
                          shape=(hidden, noutputs), testval=start_out)
        bias_out = pm.Normal('b_1_out', mu=0, sd=1, shape=noutputs)

        # Forward pass: only the first layer has a non-linearity.
        hidden_act = pm.math.sigmoid(pm.math.dot(inputs, w_in))
        projected = pm.math.dot(hidden_act, w_hidden)
        prediction = pm.math.dot(projected, w_out) + bias_out

        # Gaussian likelihood of the observed targets.
        pm.Normal('yTrain',
                  mu=prediction,
                  sd=errtargets,
                  observed=targets,
                  total_size=(ndata, noutputs))

    return neural_network


# Build the first model, fit ADVI, and save the variational parameters.
model = construct_neural_network(
  X, Y, Yerr, neuronsPerHiddenlayer, ninputs, noutputs, ndata)
with model:
  inference = pm.ADVI()
  inference.fit(150000)
  # Snapshot (name, value) pairs in the order of inference.approx.params.
  saveparam = [(param.name, param.eval())
               for param in inference.approx.params]
  with open(pkl_fl, 'wb') as pkl:
      pickle.dump(saveparam, pkl)
  # Reference draws from the fitted approximation.
  t = inference.approx.sample(100)

# Build a second instance of the same model and load the saved parameters.
model2 = construct_neural_network(
  X, Y, Yerr, neuronsPerHiddenlayer, ninputs, noutputs, ndata)
with model2:
  # Rebinds `inference` to a new, unfitted ADVI object.
  inference = pm.ADVI()
  with open(pkl_fl, 'rb') as pkl:
      saveparam = pickle.load(pkl)
  # Values are restored by list position; the saved names are not used.
  for i, param in enumerate(saveparam):
      inference.approx.params[i].set_value(param[1])
  t2 = inference.approx.sample(100)

# Overlay histograms of one weight element from the reloaded and original draws.
plt.hist(t2['w_2_out'][:, 0, 0])
plt.hist(t['w_2_out'][:, 0, 0])
plt.show(block=True)

junpenglao · April 14, 2018, 4:11pm

I see. It is possible that the mapping from vector to dict is different every time the model is instantiated, which means that the elements of each saved parameter (as it is a flat 1d vector) are not in the same order in the two models.

The safe way to do it is to save the parameters as a dictionary and map them back to a vector whenever a new model is created.

...
# save inference
# Convert each flat variational parameter into a dict using the first
# model's logp/dlogp function, so the saved values do not rely on the
# element order of the flat vector.
# (`model` and the fitted `inference` come from the elided code above.)
logp = model.logp_dlogp_function()
saveparam = {param.name: logp.array_to_dict(param.eval())
             for param in inference.approx.params}

# new model
model2 = construct_neural_network(
  X, Y, Yerr, neuronsPerHiddenlayer, ninputs, noutputs, ndata)
with model2:
  inference2 = pm.ADVI()

# load inference
# Map the dicts back to flat vectors with the NEW model's function (logp2),
# so the layout matches model2 rather than the original model.
# params[0]/params[1] are assumed to be 'mu' and 'rho' -- TODO confirm.
logp2 = model2.logp_dlogp_function()
inference2.approx.params[0].set_value(logp2.dict_to_array(saveparam['mu']))
inference2.approx.params[1].set_value(logp2.dict_to_array(saveparam['rho']))

simpleton · April 14, 2018, 5:29pm

Thanks. That does seem a safer way of doing it. However, when I implement it I still get different traces for the two models. I don’t believe I have made a mistake:

import numpy as np
np.random.seed(52)
import pymc3 as pm
import cPickle as pickle
import matplotlib.pyplot as plt
import theano
floatX = theano.config.floatX

# Synthetic regression problem: 100 samples, 4 inputs, 2 outputs.
ndata, ninputs, noutputs = 100, 4, 2
X = np.random.uniform(size=(ndata, ninputs))
# Both output columns are the row sums of X.
Y = np.array([np.sum(X, axis=1), np.sum(X, axis=1)]).T
Yerr = 0.2
# Add Gaussian noise scaled by Yerr to the targets.
Y += np.random.normal(size=(ndata, noutputs)) * Yerr
# Expand the scalar noise level to an array with the same shape as Y.
Yerr = np.ones_like(Y) * Yerr

# File that the script pickles the variational parameters to and reloads from.
pkl_fl = 'test_simple.pkl'

neuronsPerHiddenlayer = 12


def construct_neural_network(inputs, targets, errtargets,
                             neuronsPerHiddenlayer, ninputs, noutputs, ndata):
    """Build the PyMC3 model of a small Bayesian regression network.

    The network applies a sigmoid layer to ``inputs`` and then two linear
    maps plus an output bias. All weights and the bias get standard-normal
    priors; the three weight matrices start from fixed random test values.

    Args:
        inputs: Matrix multiplied by the first weight matrix.
        targets: Observed values for the ``yTrain`` likelihood.
        errtargets: Standard deviation(s) of the ``yTrain`` likelihood.
        neuronsPerHiddenlayer: Width of both hidden layers.
        ninputs: Number of input features.
        noutputs: Number of output columns.
        ndata: Number of rows, used only for ``total_size``.

    Returns:
        The constructed ``pm.Model``.

    Note:
        Calls ``np.random.seed(42)``, which resets NumPy's global RNG.
    """
    hidden = neuronsPerHiddenlayer

    # Fixed seed so every call produces the same starting weights. The
    # draws happen in the order input -> hidden -> output.
    np.random.seed(42)
    start_in, start_hidden, start_out = (
        np.random.randn(*dims).astype(floatX)
        for dims in ((ninputs, hidden), (hidden, hidden), (hidden, noutputs))
    )

    with pm.Model() as neural_network:
        # Priors are declared in a fixed order: input weights, hidden
        # weights, output weights, then the output bias.
        w_in = pm.Normal('w_in_1', mu=0, sd=1.,
                         shape=(ninputs, hidden), testval=start_in)
        w_hidden = pm.Normal('w_1_2', mu=0, sd=1.,
                             shape=(hidden, hidden), testval=start_hidden)
        w_out = pm.Normal('w_2_out', mu=0, sd=1.,
                          shape=(hidden, noutputs), testval=start_out)
        bias_out = pm.Normal('b_1_out', mu=0, sd=1, shape=noutputs)

        # Forward pass: only the first layer has a non-linearity.
        hidden_act = pm.math.sigmoid(pm.math.dot(inputs, w_in))
        projected = pm.math.dot(hidden_act, w_hidden)
        prediction = pm.math.dot(projected, w_out) + bias_out

        # Gaussian likelihood of the observed targets.
        pm.Normal('yTrain',
                  mu=prediction,
                  sd=errtargets,
                  observed=targets,
                  total_size=(ndata, noutputs))

    return neural_network


# Build the first model, fit ADVI, and keep reference draws from the fit.
model = construct_neural_network(
    X, Y, Yerr, neuronsPerHiddenlayer, ninputs, noutputs, ndata)
with model:
    inference = pm.ADVI()
    inference.fit(150000)
    t = inference.approx.sample(100)
# Convert each flat variational parameter to a dict (presumably keyed by
# variable name) via the model's logp/dlogp function, then pickle it.
logp=model.logp_dlogp_function() 
saveparam = {param.name: logp.array_to_dict( param.eval())
	 for param in inference.approx.params}
with open(pkl_fl, 'wb') as pkl:
    pickle.dump(saveparam, pkl)

# Build a second instance of the same model with a new, unfitted ADVI object.
model2 = construct_neural_network(
    X, Y, Yerr, neuronsPerHiddenlayer, ninputs, noutputs, ndata)
with model2:
    inference = pm.ADVI()
with open(pkl_fl, 'rb') as pkl:
    saveparam = pickle.load(pkl)
# Map the saved dicts back to flat arrays using model2's own function.
# params[0]/params[1] are assumed to be 'mu' and 'rho' -- TODO confirm.
logp=model2.logp_dlogp_function() 
inference.approx.params[0].set_value(logp.dict_to_array(saveparam['mu']))
inference.approx.params[1].set_value(logp.dict_to_array(saveparam['rho']))

t2 = inference.approx.sample(100)

# Overlay histograms of one weight element from the reloaded and original draws.
plt.hist(t2['w_2_out'][:, 0, 0])
plt.hist(t['w_2_out'][:, 0, 0])
plt.show(block=True)

junpenglao · April 14, 2018, 8:37pm

You are right… it seems the ordering of the mapping in logp_dlogp_function is not exactly the same as the one used in opvi.
Try this:

...
# save inference
# Take the bij object from the first group of the fitted approximation and
# use rmap to turn each flat parameter array into a dict (presumably keyed
# by variable name -- confirm against the opvi documentation).
bij = inference.approx.groups[0].bij
saveparam = {param.name: bij.rmap(param.eval())
	 for param in inference.approx.params}

# new model
model2 = construct_neural_network(
  X, Y, Yerr, neuronsPerHiddenlayer, ninputs, noutputs, ndata)
with model2:
  inference2 = pm.ADVI()

# load inference
# Use the new approximation's own bij to map each dict back to a flat array.
# params[0]/params[1] are assumed to be 'mu' and 'rho' -- TODO confirm.
bij2 = inference2.approx.groups[0].bij
inference2.approx.params[0].set_value(bij2.map(saveparam['mu']))
inference2.approx.params[1].set_value(bij2.map(saveparam['rho']))

simpleton · April 15, 2018, 10:41am

Great. This seems to work. Thanks for your help.

Topic		Replies	Views
How to save fitted ADVI Result? Questions	3	2098	August 6, 2018
Setting ADVI start state Questions	3	725	October 14, 2019
Unable to load trace after VI Questions variational_inferenc	14	1723	November 24, 2021
ADVI start with initialization Questions	7	2072	September 23, 2017
Minibatch for a large dataset ADVI Questions	2	1210	September 7, 2018

Saving ADVI results and reloading

Related topics