Saving ADVI results and reloading


#1

Hi,

I am trying to use ADVI to train a model on the GPU. I then want to use the fit results to generate the trace on the CPU for memory reasons. I followed the approach from the thread “How to save fitted ADVI Result?”, but it doesn’t appear to work.

# Fit ADVI on the first model and snapshot each variational parameter
# as a (name, value) pair.
with model:
    approx=pm.ADVI()
    approx.fit()
saveparam = [(param.name, param.eval()) for param in approx.approx.params]

# Create an unfitted ADVI object on the second model and overwrite its
# parameters, matching them to the saved ones by list position.
with model2:
    approx2=pm.ADVI()
for i, param in enumerate(saveparam):
    # param is a (name, value) tuple; only the value is used here.
    approx2.approx.params[i].set_value(param[1])

# Draw from both approximations for comparison.
trace = approx.approx.sample()
trace2 = approx2.approx.sample()

model and model2 are different instances of the same model. I expect trace and trace2 to be similar and produce similar predictions for a testing set. But it appears trace2 hasn’t ‘learnt’ anything. Do I need to save some other result from the approx.fit()?


Pickling trace object
How to save fitted ADVI Result?
#2

I checked on a simple model and it still works. Are you sure model and model2 are identical?


#3

Thanks for your reply. I have experimented with simple linear models and you are right that everything seems to work. I have attached a slightly more complicated example that is causing me some issues. Despite setting the random seed at the beginning, the final histogram of the trace of a single parameter is different each time I run the code. ADVI produces identical average ELBO each time I run, but the traces for the two models sometimes agree and sometimes catastrophically disagree. Any ideas what’s going on?

import numpy as np
np.random.seed(52)
import pymc3 as pm
import cPickle as pickle
import matplotlib.pyplot as plt
import theano
floatX = theano.config.floatX

# Synthetic regression problem: every target column is the row sum of the
# inputs, perturbed by Gaussian noise of known standard deviation.
ndata = 100
ninputs = 4
noutputs = 2

X = np.random.uniform(size=(ndata, ninputs))

noise_sd = 0.2
row_totals = np.sum(X, axis=1)
Y = np.array([row_totals, row_totals]).T
Y += np.random.normal(size=(ndata, noutputs)) * noise_sd
# Per-observation error array with the same shape as the targets.
Yerr = np.ones_like(Y) * noise_sd

# Pickle file name used for the saved variational parameters.
pkl_fl = 'test_simple.pkl'

neuronsPerHiddenlayer = 12


def construct_neural_network(inputs, targets, errtargets,
                             neuronsPerHiddenlayer, ninputs, noutputs, ndata):
    """Build the PyMC3 model for a small feed-forward regression network.

    The network applies a sigmoid to ``dot(inputs, w_in_1)``, follows it with
    two linear maps (``w_1_2`` then ``w_2_out``) and adds the bias
    ``b_1_out``. Weights and bias have Normal(0, 1) priors. The network
    output is the mean of a Normal likelihood named ``yTrain``, observed at
    ``targets`` with standard deviation ``errtargets``.

    Side effect: the global NumPy RNG is reseeded with 42 on every call, so
    the weight test values are the same for each model that is built.

    Returns the constructed ``pm.Model``.
    """
    np.random.seed(42)
    width = neuronsPerHiddenlayer
    # Test values are drawn in this order from the freshly seeded stream.
    start_in = np.random.randn(ninputs, width).astype(floatX)
    start_hidden = np.random.randn(width, width).astype(floatX)
    start_out = np.random.randn(width, noutputs).astype(floatX)

    with pm.Model() as neural_network:
        # Priors, registered in the order w_in_1, w_1_2, w_2_out, b_1_out.
        w_in = pm.Normal('w_in_1', mu=0, sd=1.0,
                         shape=(ninputs, width), testval=start_in)
        w_hidden = pm.Normal('w_1_2', mu=0, sd=1.0,
                             shape=(width, width), testval=start_hidden)
        w_out = pm.Normal('w_2_out', mu=0, sd=1.0,
                          shape=(width, noutputs), testval=start_out)
        bias_out = pm.Normal('b_1_out', 0, 1, shape=noutputs)

        # Forward pass: sigmoid layer, then two linear maps plus bias.
        layer_1 = pm.math.sigmoid(pm.math.dot(inputs, w_in))
        layer_2 = pm.math.dot(layer_1, w_hidden)
        prediction = pm.math.dot(layer_2, w_out) + bias_out

        # Likelihood over the observed targets.
        pm.Normal('yTrain', mu=prediction, sd=errtargets,
                  observed=targets, total_size=(ndata, noutputs))

    return neural_network


# Fit ADVI on the first model, pickle its variational parameters as a list
# of (name, value) pairs, and draw 100 samples from the fitted approximation.
model = construct_neural_network(
  X, Y, Yerr, neuronsPerHiddenlayer, ninputs, noutputs, ndata)
with model:
  inference = pm.ADVI()
  inference.fit(150000)
  saveparam = [(param.name, param.eval())
               for param in inference.approx.params]
  with open(pkl_fl, 'wb') as pkl:
      pickle.dump(saveparam, pkl)
  t = inference.approx.sample(100)

# Rebuild the same network as a fresh model, create an unfitted ADVI object
# and overwrite its parameters with the pickled values.
model2 = construct_neural_network(
  X, Y, Yerr, neuronsPerHiddenlayer, ninputs, noutputs, ndata)
with model2:
  inference = pm.ADVI()
  with open(pkl_fl, 'rb') as pkl:
      saveparam = pickle.load(pkl)
  # NOTE(review): values are matched to parameters by list position only;
  # the saved names are not checked, so this relies on both models laying
  # out their parameters identically.
  for i, param in enumerate(saveparam):
      inference.approx.params[i].set_value(param[1])
  t2 = inference.approx.sample(100)

# Overlay histograms of one element of w_2_out from the two traces.
plt.hist(t2['w_2_out'][:, 0, 0])
plt.hist(t['w_2_out'][:, 0, 0])
plt.show(block=True)

#4

I see. It is possible that the mapping from vector to dict is different every time the model is instantiated, which means the elements of each saved parameter in saveparam (each one is a 1-D vector) may not be in the same order in the two models.

The safe way to do it is to save the parameters as a dictionary, and map it back to a vector whenever the new model is created.

...
# save inference
# Each parameter value is passed through the first model's array_to_dict and
# stored under the parameter's name.
logp = model.logp_dlogp_function()
saveparam = {param.name: logp.array_to_dict(param.eval())
             for param in inference.approx.params}

# new model
model2 = construct_neural_network(
  X, Y, Yerr, neuronsPerHiddenlayer, ninputs, noutputs, ndata)
with model2:
  inference2 = pm.ADVI()

# load inference
# Map the dicts back to vectors with the NEW model's function (logp2), so the
# layout used is the one belonging to model2 rather than to the old model.
logp2 = model2.logp_dlogp_function()
inference2.approx.params[0].set_value(logp2.dict_to_array(saveparam['mu']))
inference2.approx.params[1].set_value(logp2.dict_to_array(saveparam['rho']))

#5

Thanks. That does seem a safer way of doing it. However, when I implement it I still get different traces for the two models. I don’t believe I have made a mistake:

import numpy as np
np.random.seed(52)
import pymc3 as pm
import cPickle as pickle
import matplotlib.pyplot as plt
import theano
floatX = theano.config.floatX

# Problem dimensions.
ndata = 100
ninputs = 4
noutputs = 2

# Inputs are uniform draws; both target columns start as the row sum of X.
X = np.random.uniform(size=(ndata, ninputs))
x_row_sum = np.sum(X, axis=1)
Y = np.array([x_row_sum, x_row_sum]).T

# Add Gaussian noise of standard deviation 0.2 to the targets, then expand
# that scalar into an error array shaped like Y.
sigma_y = 0.2
Y += np.random.normal(size=(ndata, noutputs)) * sigma_y
Yerr = np.ones_like(Y) * sigma_y

# Pickle file name used for the saved variational parameters.
pkl_fl = 'test_simple.pkl'

neuronsPerHiddenlayer = 12


def construct_neural_network(inputs, targets, errtargets,
                             neuronsPerHiddenlayer, ninputs, noutputs, ndata):
    """Return a PyMC3 model of a small feed-forward regression network.

    Forward pass: ``sigmoid(dot(inputs, w_in_1))`` followed by linear maps
    through ``w_1_2`` and ``w_2_out``, plus the bias ``b_1_out``. Weights and
    bias carry Normal(0, 1) priors. The network output is the mean of the
    ``yTrain`` Normal likelihood, observed at ``targets`` with standard
    deviation ``errtargets``.

    Side effect: reseeds the global NumPy RNG with 42 on each call, which
    makes the weight test values identical across model instances.
    """
    np.random.seed(42)
    n_hidden = neuronsPerHiddenlayer
    # Starting values, drawn in this order from the freshly seeded stream.
    testval_in = np.random.randn(ninputs, n_hidden).astype(floatX)
    testval_mid = np.random.randn(n_hidden, n_hidden).astype(floatX)
    testval_out = np.random.randn(n_hidden, noutputs).astype(floatX)

    with pm.Model() as neural_network:
        # Priors are created in the order w_in_1, w_1_2, w_2_out, b_1_out.
        first_weights = pm.Normal(
            'w_in_1', mu=0, sd=1.0,
            shape=(ninputs, n_hidden), testval=testval_in)
        middle_weights = pm.Normal(
            'w_1_2', mu=0, sd=1.0,
            shape=(n_hidden, n_hidden), testval=testval_mid)
        last_weights = pm.Normal(
            'w_2_out', mu=0, sd=1.0,
            shape=(n_hidden, noutputs), testval=testval_out)
        output_bias = pm.Normal('b_1_out', 0, 1, shape=noutputs)

        activated = pm.math.sigmoid(pm.math.dot(inputs, first_weights))
        projected = pm.math.dot(activated, middle_weights)
        network_out = pm.math.dot(projected, last_weights) + output_bias

        # Likelihood over the observed targets.
        pm.Normal(
            'yTrain', mu=network_out, sd=errtargets,
            observed=targets, total_size=(ndata, noutputs))

    return neural_network


# Fit the first model and sample from the fitted approximation.
model = construct_neural_network(X, Y, Yerr, neuronsPerHiddenlayer,
                                 ninputs, noutputs, ndata)
with model:
    inference = pm.ADVI()
    inference.fit(150000)
    t = inference.approx.sample(100)

# Pass every parameter value through the model's array_to_dict, key the
# result by parameter name, and pickle the resulting dictionary.
logp = model.logp_dlogp_function()
saveparam = {
    param.name: logp.array_to_dict(param.eval())
    for param in inference.approx.params
}
with open(pkl_fl, 'wb') as pkl:
    pickle.dump(saveparam, pkl)

# Build a second, unfitted model and load the pickled dictionary.
model2 = construct_neural_network(X, Y, Yerr, neuronsPerHiddenlayer,
                                  ninputs, noutputs, ndata)
with model2:
    inference = pm.ADVI()
with open(pkl_fl, 'rb') as pkl:
    saveparam = pickle.load(pkl)

# Map the saved dicts back to vectors with the second model's function and
# write them into parameter slots 0 ('mu') and 1 ('rho').
logp = model2.logp_dlogp_function()
for slot, key in enumerate(('mu', 'rho')):
    inference.approx.params[slot].set_value(
        logp.dict_to_array(saveparam[key]))

t2 = inference.approx.sample(100)

# Overlay histograms of one element of w_2_out from the two traces.
for trace in (t2, t):
    plt.hist(trace['w_2_out'][:, 0, 0])
plt.show(block=True)

#6

You are right… it seems the ordering of the mapping in logp_dlogp_function is not exactly the same as the one used in opvi.
Try this:

...
# save inference
# Use the bijection of the approximation's first group to turn each flat
# parameter vector into a dict, stored under the parameter's name.
bij = inference.approx.groups[0].bij
saveparam = {param.name: bij.rmap(param.eval())
             for param in inference.approx.params}

# new model
model2 = construct_neural_network(
  X, Y, Yerr, neuronsPerHiddenlayer, ninputs, noutputs, ndata)
with model2:
  inference2 = pm.ADVI()

# load inference
# Map each saved dict back to a vector with the NEW approximation's bijection.
# Parameters are looked up by name (the same key they were saved under), so
# the restore does not depend on the position of 'mu' and 'rho' in the list.
bij2 = inference2.approx.groups[0].bij
for param in inference2.approx.params:
    param.set_value(bij2.map(saveparam[param.name]))

Understanding ADVI approx.sample(n)
ADVI : Collecting Values for Specific Variables
#7

Great. This seems to work. Thanks for your help.