# Unique solution for probabilistic PCA

To be clear, I presume you refer to the VI methods described in the article by Moore?

I never got around to implementing the orthogonal symmetrisation method, however the signflip method requires but a small change to the MeanFieldApproximation.

From memory (should give you the gist, might even just work as is):

from pymc3.variational.opvi import Group, node_property
from pymc3.variational.approximations import MeanFieldGroup

@Group.register
class SymmetricMeanFieldGroup(MeanFieldGroup):
    """Mean-field group whose posterior is the sign-symmetric mixture
    q(z) = 0.5*N(mu, sigma) + 0.5*N(-mu, sigma), for parameters with
    sign-flip symmetry (e.g. PPCA loadings).

    NOTE(review): assumes ``import pymc3 as pm`` is in scope — TODO confirm.
    """

    # 'smu' is the unsigned mean; 'rho' parametrizes sigma (softplus in
    # MeanFieldGroup's std property).
    __param_spec__ = dict(smu=('d', ), rho=('d', ))
    short_name = 'sym_mean_field'
    alias_names = frozenset(['smf'])

    @node_property
    def mean(self):
        """Location of the positive mixture component."""
        return self.params_dict['smu']

    @node_property
    def symbolic_logq_not_scaled(self):
        """log q(z) under the two-component symmetric mixture, summed over
        all but the sample dimension."""
        z = self.symbolic_random
        # Fixed typo from the original paste: `self,std` -> `self.std`.
        logq = pm.NormalMixture.dist([.5, .5],
                                     [self.mean, -self.mean],
                                     [self.std, self.std]).logp(z)
        return logq.sum(range(1, logq.ndim))
4 Likes

That looks really neat! So happy to see that it is extensible and easy to implement! Groups API really help here as well as you do not have signflip symmetry in general.

I think this is missed in SMF implementation

• Variable creations are as well not correctly implemented, I see them hardcoded in mean field implementation
1 Like

Yes, that’s exactly what I was looking for! I appreciate your help!

I have been using pymc3 for quite a while but hadn’t heard of the “Group” API.
I’ll take a stab at it with what you’ve provided.

Ok, I did build a new example without the pymc3 import *, and that fixed the previous issue.
I had to modify the NormalMixture class to get rid of the call to get_tau_sd(), as it was causing an error, and it shouldn’t be necessary since I’m providing std, not tau.

Here’s the code:

class NormalMixture(pm.Mixture):
    """Mixture of normals taking `sd` directly.

    Replaces pymc3's built-in NormalMixture, whose call to get_tau_sd()
    raised an error when only `sd` (not `tau`) was provided.
    """

    def __init__(self, w, mu, comp_shape=(), *args, **kwargs):
        # `sd` is effectively required: popping a missing value yields None,
        # which fails inside tt.as_tensor_variable.
        sd = kwargs.pop('sd', None)
        self.mu = mu = tt.as_tensor_variable(mu)
        self.sd = sd = tt.as_tensor_variable(sd)

        super(NormalMixture, self).__init__(
            w, pm.Normal.dist(mu, sd=sd, shape=comp_shape),
            *args, **kwargs)

    def _repr_latex_(self, name=None, dist=None):
        """LaTeX representation used by pymc3's model pretty-printing."""
        if dist is None:
            dist = self
        mu = dist.mu
        w = dist.w
        sd = dist.sd
        name = r'\text{%s}' % name
        return r'${} \sim \text{{NormalMixture}}(\mathit{{w}}={},~\mathit{{mu}}={},~\mathit{{sigma}}={})$'.format(
            name,
            get_variable_name(w),
            get_variable_name(mu),
            get_variable_name(sd))

@Group.register
class SymmetricMeanFieldGroup(MeanFieldGroup):
    """Symmetric MeanField Group for Symmetrized VI"""

    # Same spec as MeanFieldGroup but the mean parameter is renamed 'smu'
    # so this family registers as a distinct variational group.
    __param_spec__ = dict(smu=('d', ), rho=('d', ))
    short_name = 'sym_mean_field'
    alias_names = frozenset(['smf'])

    @node_property
    def mean(self):
        """Location of the positive mixture component."""
        return self.params_dict['smu']

    def create_shared_params(self, start=None):
        """Create the shared theano parameters ('smu', 'rho').

        Mirrors MeanFieldGroup.create_shared_params (which hard-codes its
        own parameter names) with 'mu' renamed to 'smu'.
        """
        if start is None:
            start = self.model.test_point
        else:
            start_ = start.copy()
            update_start_vals(start_, self.model.test_point, self.model)
            start = start_
        if self.batched:
            start = start[self.group[0].name][0]
        else:
            start = self.bij.map(start)
        rho = np.zeros((self.ddim,))
        if self.batched:
            # Replicate the start values across the batch dimension.
            start = np.tile(start, (self.bdim, 1))
            rho = np.tile(rho, (self.bdim, 1))
        return {'smu': theano.shared(
                    pm.floatX(start), 'smu'),
                'rho': theano.shared(
                    pm.floatX(rho), 'rho')}

    @node_property
    def symbolic_logq_not_scaled(self):
        """log q(z) of 0.5*N(mean, std) + 0.5*N(-mean, std) at the current
        symbolic sample."""
        z = self.symbolic_random

        logq = NormalMixture.dist([.5, .5],
                                  mu=[self.mean, -self.mean],
                                  sd=[self.std, self.std],
                                  ).logp(z)
        return logq.sum(range(1, logq.ndim))

class SADVI(ADVI):
    """ADVI driver backed by the symmetric mean-field approximation."""

    def __init__(self, *args, **kwargs):
        # `super(ADVI, self)` deliberately skips ADVI.__init__ (which would
        # construct a plain MeanField approximation) and calls the
        # grandparent initializer with our symmetric approximation instead.
        super(ADVI, self).__init__(SymmetricMeanField(*args, **kwargs))

class SymmetricMeanField(SingleGroupApproximation):
    # Single-group wrapper so the symmetric family can be used directly as
    # an Approximation; docstring is assembled to match the group's docs.
    __doc__ = """**Single Group Mean Field Approximation**

""" + str(SymmetricMeanFieldGroup.__doc__)
    _group_class = SymmetricMeanFieldGroup


Now .fit() fails on the first iteration with the error below.
It seems like the Join operation produces a (2,510), while the Composite produces a (1,510).

To me it seems like the Join operation joins a (1,510) with a copy multiplied by -1? I don’t really know how to debug this…

ValueError: Input dimension mis-match. (input[0].shape[0] = 1, input[1].shape[0] = 2)
Apply node that caused the error: Elemwise{sub,no_inplace}(Elemwise{add,no_inplace}.0, Join.0)
Toposort index: 157
Inputs types: [TensorType(float64, matrix), TensorType(float64, matrix)]
Inputs shapes: [(1, 510), (2, 510)]
Inputs strides: [(4080, 8), (4080, 8)]
Inputs values: [‘not shown’, ‘not shown’]
Inputs type_num: [12, 12]
Outputs clients: [[Elemwise{pow,no_inplace}(Elemwise{sub,no_inplace}.0, InplaceDimShuffle{x,x}.0), Elemwise{pow}(Elemwise{sub,no_inplace}.0, Elemwise{sub}.0)]]

Backtrace when the node is created(use Theano flag traceback.limit=N to make it longer):
File “C:\Users\dycontri\AppData\Local\conda\conda\envs\analytics\lib\site-packages\theano\configparser.py”, line 117, in res
return f(*args, **kwargs)
File “C:\Users\dycontri\AppData\Local\conda\conda\envs\analytics\lib\site-packages\pymc3\variational\opvi.py”, line 1174, in symbolic_logq
return self.symbolic_logq_not_scaled
File “C:\Users\dycontri\AppData\Local\conda\conda\envs\analytics\lib\site-packages\pymc3\memoize.py”, line 31, in memoizer
cache[key] = obj(*args, **kwargs)
File “C:\Users\dycontri\AppData\Local\conda\conda\envs\analytics\lib\site-packages\theano\configparser.py”, line 117, in res
return f(*args, **kwargs)
File “”, line 42, in symbolic_logq_not_scaled
).logp(z)
File “C:\Users\dycontri\AppData\Local\conda\conda\envs\analytics\lib\site-packages\pymc3\distributions\mixture.py”, line 146, in logp
return bound(logsumexp(tt.log(w) + self._comp_logp(value), axis=-1),
File “C:\Users\dycontri\AppData\Local\conda\conda\envs\analytics\lib\site-packages\pymc3\distributions\mixture.py”, line 112, in comp_logp
return comp_dists.logp(value
)
File “C:\Users\dycontri\AppData\Local\conda\conda\envs\analytics\lib\site-packages\pymc3\distributions\continuous.py”, line 480, in logp
return bound((-tau * (value - mu)**2 + tt.log(tau / np.pi / 2.)) / 2.,

Hi,

trying to run the example. Running PPCA for the very first time :) Could anyone please clarify which variables can exhibit sign switching? That is not obvious at first glance.

1 Like

@ferrine
I generated some fake data

import numpy as np
import datetime
from pymc3.variational.inference import ADVI
from pymc3.variational.opvi import Group, node_property
from pymc3.variational.approximations import MeanFieldGroup, SingleGroupApproximation
import pymc3 as pm
import theano
# Generate fake data: three independent standard-normal sources.
N=500
d = np.random.normal(size=(3, N))
# d1: binary indicator from the first source's sign.
d1 = (d[0]>0).astype(int)
# d2..d4: noisy linear transforms of the sources / of each other.
d2 = -1*d[1]+np.random.normal(size=(1,N))/2
d3 = 2*d1+np.random.normal(size=(1,N))
d4=.5*d2+np.random.normal(size=(1,N))
# Outcome: linear combination of all indicators plus the third source.
yd = 4.*d1 +7.*d2 +5*d3+.3*d4+d[2]
#yd=.5*d1+d[2]
#d1[[1,5,25,14,32,11,125,157, 13,46,81,99,168,101,145,111]]=-999
d1True=d1.copy()
#d1[:450]=-999.
# Bare expression — displays d1 in a notebook cell; no effect in a script.
d1


Here is the model code:

import theano.tensor as tt

# Observed matrix: one column per indicator (d1..d4) plus the outcome yd.
obs = np.array([d1, d2[0], d3[0], d4[0], yd[0]]).T

with pm.Model() as model1:
    # Per-column observation noise scales.
    s = pm.HalfCauchy('s', 10., shape=5)
    latent_std = 10  # HalfCauchy('l_std',100.)
    # Factor loadings (5, 1) — the parameter that exhibits sign-switching.
    w = pm.Normal('w', 0., 10, shape=(5, 1))

    latent_ = pm.Normal('latent_', 0., 1., shape=(len(d1), 1))
    latent = pm.Deterministic('latent', latent_*latent_std)
    p = pm.Deterministic('p', tt.dot(latent, w.T))
    # Implied marginal covariance of the observed columns.
    cov = pm.Deterministic('cov', latent_std**2*tt.dot(w, w.T)+(s**2)*np.eye(5))
    sd = pm.Deterministic('sd', tt.sqrt(tt.diag(cov)))
    latent_like = pm.Normal('latent_like', mu=p, sd=s, observed=obs)

with model1:
    # Fit with the symmetrized ADVI defined above, then draw from q.
    approx = pm.fit(method=SADVI())
    trace = approx.sample()


I know for sure that the parameter w exhibits sign-switching, at least. Make sure to sample with NUTS 4 chains to reliably see the sign-switching occur. Or run ADVI to see parameters regularized to 0.

Just FYI, in this model, w is the matrix of factor loadings (5,1)

@ferrine

I got the VI to run, but the loss is always -inf or nan…

To get it to even run, I had to edit NormalMixture again to remove the pm.Normal(shape=) syntax, and instead create a list of 2 pm.Normal.dist() instances using the mu and -mu respectively. Otherwise, I got a “Normal object is not iterable” error. See below.

# NOTE(review): indentation was lost in this forum paste — the lines below
# belong inside the class/method bodies.
# Variant of the custom NormalMixture that passes an explicit list of two
# component distributions (indexing mu[0]/mu[1]) instead of one broadcast
# pm.Normal.dist; per the author, this avoids a "Normal object is not
# iterable" error raised inside pm.Mixture.
class NormalMixture(pm.Mixture):
def __init__(self, w, mu, comp_shape=(), *args, **kwargs):
# `sd` must be passed via kwargs; bypasses pymc3's get_tau_sd() path.
sd=kwargs.pop('sd',None)
self.mu = mu = tt.as_tensor_variable(mu)
self.sd = sd = tt.as_tensor_variable(sd)

super(NormalMixture, self).__init__(w, [pm.Normal.dist(mu[0], sd=sd[0]),pm.Normal.dist(mu[1], sd=sd[1])],
*args, **kwargs)

# LaTeX representation for pymc3 model pretty-printing.
def _repr_latex_(self, name=None, dist=None):
if dist is None:
dist = self
mu = dist.mu
w = dist.w
sd = dist.sd
name = r'\text{%s}' % name
return r'${} \sim \text{{NormalMixture}}(\mathit{{w}}={},~\mathit{{mu}}={},~\mathit{{sigma}}={})$'.format(name,
get_variable_name(w),
get_variable_name(mu),
get_variable_name(sd))

@Group.register
class SymmetricMeanFieldGroup(MeanFieldGroup):
    """Symmetric MeanField Group for Symmetrized VI"""

    __param_spec__ = dict(smu=('d', ), rho=('d', ))
    short_name = 'sym_mean_field'
    alias_names = frozenset(['smf'])

    @node_property
    def mean(self):
        """Location of the positive mixture component."""
        return self.params_dict['smu']

    def create_shared_params(self, start=None):
        """Create the shared theano parameters ('smu', 'rho'); mirrors
        MeanFieldGroup with the mean parameter renamed."""
        if start is None:
            start = self.model.test_point
        else:
            start_ = start.copy()
            update_start_vals(start_, self.model.test_point, self.model)
            start = start_
        if self.batched:
            start = start[self.group[0].name][0]
        else:
            start = self.bij.map(start)
        rho = np.zeros((self.ddim,))
        if self.batched:
            start = np.tile(start, (self.bdim, 1))
            rho = np.tile(rho, (self.bdim, 1))
        return {'smu': theano.shared(
                    pm.floatX(start), 'smu'),
                'rho': theano.shared(
                    pm.floatX(rho), 'rho')}

    @node_property
    def symbolic_logq_not_scaled(self):
        """log q(z) under the symmetric mixture.

        Fixes vs. the original paste:
          * mixture weights were hard-coded to one dataset's shape
            (np.ones((1, 510)) * .5); a symbolic stack([.5, .5]) works for
            any model dimensionality;
          * logp must be evaluated at z only — the mixture already contains
            both the +mean and -mean components, so logp([z, -z]) double
            counted the symmetry.
        """
        z = self.symbolic_random

        mu = theano.tensor.stack([self.mean, -self.mean])
        sd = theano.tensor.stack([self.std, self.std])

        logq = NormalMixture.dist(w=theano.tensor.stack([.5, .5]),
                                  mu=mu,
                                  sd=sd
                                  ).logp(z)

        return logq.sum(range(1, logq.ndim))

class SADVI(ADVI):
    """ADVI driver backed by the symmetric mean-field approximation."""

    def __init__(self, *args, **kwargs):
        # Skips ADVI.__init__ (plain MeanField) on purpose; goes straight
        # to the grandparent initializer with the symmetric approximation.
        super(ADVI, self).__init__(SymmetricMeanField(*args, **kwargs))

class SymmetricMeanField(SingleGroupApproximation):
    # Single-group wrapper exposing the symmetric family as an Approximation.
    __doc__ = """**Single Group Mean Field Approximation**

""" + str(SymmetricMeanFieldGroup.__doc__)
    _group_class = SymmetricMeanFieldGroup

Incidentally, I want to point out @gBokiau, @ferrine , while the avg. loss while running symmetrized ADVI is -inf, it looks like it’s correctly fixing the sign-switching, as the weight parameters are no longer regularized to 0.

Probably because you missed symbolic_random. Sampling from q(z) is still wrong

# Suggested override: sample z from the mixture itself by randomly flipping
# the sign of the mean.
# NOTE(review): this draws an independent sign PER ELEMENT
# (binomial over initial.shape); a later post in this thread stresses the
# sign should be a single scalar draw per sample when q is the
# two-component mixture — TODO confirm which is intended here.
    @node_property
def symbolic_random(self):
initial = self.symbolic_initial
sign = theano.tensor.as_tensor([-1, 1])[self._rng.binomial(initial.shape)]
sigma = self.std
mu = self.mean
return sigma * initial + mu * sign

1 Like

Moreover, you should specify variables that refer to a specific group (https://docs.pymc.io/api/inference.html#pymc3.variational.opvi.Group)

1 Like

Thank you very much for your help, @ferrine

I’m looking for how to specify which variables go with which group.
Do you mean that in the SADVI() class I need to add parameters to pass variable names to SymmetricMeanFieldGroup vs. MeanFieldGroup?

@ferrine - nvm. I found it!

1 Like

@ferrine

I implemented the symbolic_random as you suggested, but it actually reintroduced the sign-switching, regularizing behavior as with normal ADVI…

While before, the w parameter’s weights were correctly estimated, now they’re all set to 0:

without change to symbolic_random:
The weights match those found with NUTS, having a decent spread. I worry that the uncertainty of these parameters is underestimated, though…

AFTER change to symbolic_random:
The parameters are all located at 0

Interesting — I think I need some time to dig in tomorrow. (I have no time yet.)

Thing to try number one: a smart starting value for std/mean in SMF. Gradients may be too noisy, as a sample from the positive mode jumps over zero.

Thing to try number 2 (a bit advanced) https://arxiv.org/abs/1805.08498
In here there is a better reparametrization gradient for mixture distribution.

Another hotfix to the model may be just enforcing positive support for w by using a half-normal distribution as its prior.

Good night

1 Like

@ferrine

I got the model to report average loss properly by changing w in symbolic_logp_not_scaled from [.5,.5] to theano.tensor.stack([.5,.5]).

# Working version: stacking the weights/means/stds symbolically (instead of
# hard-coding array shapes) made the average loss report properly.
@node_property
def symbolic_logq_not_scaled(self):

z = self.symbolic_random

# Two components: +mean and -mean, sharing one std.
mu=theano.tensor.stack([self.mean, -self.mean])
sd=theano.tensor.stack([self.std, self.std])

# Equal-weight two-component mixture density evaluated at z.
logq = NormalMixture.dist(w=theano.tensor.stack([.5,.5]),
mu=mu,
sd=sd
).logp(z)

# Sum over all but the sample dimension.
return logq.sum(range(1, logq.ndim))

My SMF implementation that I used in my experiments
IMPORTANT!

# if we sample from mixture, we should not broadcast random sign!!!
# this works with 1 sample approximationm but it does not matter here
sign = pm.floatX(theano.tensor.as_tensor([-1, 1]))[(self._rng.uniform(()) > 0).astype('int8')]

from pymc3.variational.opvi import Group, node_property
from pymc3.variational.approximations import MeanFieldGroup

@Group.register
class SymmetricMeanFieldGroup(MeanFieldGroup):
    """Symmetric MeanField Group for Symmetrized VI.

    Approximates sign-symmetric posteriors with the mixture
    q(z) = 0.5*N(mu, sigma) + 0.5*N(-mu, sigma).
    """

    __param_spec__ = dict(smu=('d', ), rho=('d', ))
    short_name = 'sym_mean_field'
    alias_names = frozenset(['smf'])

    @node_property
    def mean(self):
        """Location of the positive mixture component."""
        return self.params_dict['smu']

    def create_shared_params(self, start=None):
        """Create the shared theano parameters ('smu', 'rho'); mirrors
        MeanFieldGroup with the mean parameter renamed."""
        if start is None:
            start = self.model.test_point
        else:
            start_ = start.copy()
            update_start_vals(start_, self.model.test_point, self.model)
            start = start_
        if self.batched:
            start = start[self.group[0].name][0]
        else:
            start = self.bij.map(start)
        rho = np.zeros((self.ddim,))
        if self.batched:
            start = np.tile(start, (self.bdim, 1))
            rho = np.tile(rho, (self.bdim, 1))
        return {'smu': theano.shared(
                    pm.floatX(start), 'smu'),
                'rho': theano.shared(
                    pm.floatX(rho), 'rho')}

    @node_property
    def symbolic_random(self):
        initial = self.symbolic_initial
        # One scalar sign per draw — if we sample from the mixture we should
        # not broadcast an independent random sign per element.
        # FIX: `uniform(()) > 0` is (almost surely) always True, so the sign
        # was never actually flipped; compare against 0.5 for a fair flip.
        sign = pm.floatX(theano.tensor.as_tensor([-1, 1]))[
            (self._rng.uniform(()) > 0.5).astype('int8')]
        sigma = self.std
        mu = self.mean
        return sigma * initial + mu * sign

    @node_property
    def symbolic_logq_not_scaled(self):
        z = self.symbolic_random
        sigma = self.std
        mu = self.mean
        lp_pos = pm.Normal.dist(mu=mu, sd=sigma).logp(z)
        lp_neg = pm.Normal.dist(mu=-mu, sd=sigma).logp(z)
        # FIX: the log-density of an equal mixture is
        # log(0.5*exp(lp_pos) + 0.5*exp(lp_neg)), not the arithmetic mean
        # 0.5*(lp_pos + lp_neg) of the component log-densities. Computed
        # with the log-sum-exp trick for numerical stability.
        m = theano.tensor.maximum(lp_pos, lp_neg)
        logq = m + theano.tensor.log(
            .5 * (theano.tensor.exp(lp_pos - m) + theano.tensor.exp(lp_neg - m)))
        return logq.sum(range(1, logq.ndim))


Code to setup inference

with model1:
    # Sign-symmetric family for the loadings w; plain mean-field elsewhere.
    symmetric_group = pm.Group([w], vfam='smf')
    # None means "all remaining free variables".
    other_group = pm.Group(None, vfam='mf')
    approx = pm.Approximation([symmetric_group, other_group])
    inference = pm.KLqp(approx)


optional starting point

# Optional smart starting point: start the means away from zero so a sample
# from the positive mode does not immediately jump across it.
symmetric_group.params_dict['smu'].set_value(np.ones(symmetric_group.ddim, dtype='float32'))
# rho (sigma=softplus(rho)) parametrization
symmetric_group.params_dict['rho'].set_value(np.ones(symmetric_group.ddim, dtype='float32'))


Running inference

# Run the optimization for 25k iterations.
inference.fit(25000)


Strange things happen — I have the loss going into undefined regions. Unfortunately, I don't have much time to investigate.

1 Like

@ferrine

The model is working correctly for me as described before, (without change to symbolic_random).

Now the only thing I’m working on is getting Minibatch to work.
Here are the weights traceplot from the full dataset

While the weights from the Minibatch dataset are very similar, their sds are higher, and the cov parameter is underestimated. (the resulting correlation matrix is almost all 0)

To clarify, here’s the code I’m working with:

# NOTE(review): indentation was lost in this forum paste — the lines below
# belong inside the class/method bodies. Same NormalMixture variant as the
# earlier post: two explicit component distributions, `sd` taken directly.
class NormalMixture(pm.Mixture):
def __init__(self, w, mu, comp_shape=(), *args, **kwargs):
# `sd` must be passed via kwargs; bypasses pymc3's get_tau_sd() path.
sd=kwargs.pop('sd',None)
self.mu = mu = tt.as_tensor_variable(mu)
self.sd = sd = tt.as_tensor_variable(sd)

super(NormalMixture, self).__init__(w, [pm.Normal.dist(mu[0], sd=sd[0]),pm.Normal.dist(mu[1], sd=sd[1])],
*args, **kwargs)

# LaTeX representation for pymc3 model pretty-printing.
def _repr_latex_(self, name=None, dist=None):
if dist is None:
dist = self
mu = dist.mu
w = dist.w
sd = dist.sd
name = r'\text{%s}' % name
return r'${} \sim \text{{NormalMixture}}(\mathit{{w}}={},~\mathit{{mu}}={},~\mathit{{sigma}}={})$'.format(name,
get_variable_name(w),
get_variable_name(mu),
get_variable_name(sd))

# NOTE(review): indentation was lost in this forum paste — the lines below
# belong inside the class/method bodies. This is the poster's final working
# configuration: mixture logq, default (non-overridden) symbolic_random.
@Group.register
class SymmetricMeanFieldGroup(MeanFieldGroup):
"""Symmetric MeanField Group for Symmetrized VI"""
__param_spec__ = dict(smu=('d', ), rho=('d', ))
short_name = 'sym_mean_field'
alias_names = frozenset(['smf'])

# Location of the positive mixture component.
@node_property
def mean(self):
return self.params_dict['smu']

# Creates the shared theano parameters ('smu', 'rho'); mirrors
# MeanFieldGroup.create_shared_params with the mean parameter renamed.
def create_shared_params(self, start=None):
if start is None:
start = self.model.test_point
else:
start_ = start.copy()
update_start_vals(start_, self.model.test_point, self.model)
start = start_
if self.batched:
start = start[self.group[0].name][0]
else:
start = self.bij.map(start)
rho = np.zeros((self.ddim,))
if self.batched:
start = np.tile(start, (self.bdim, 1))
rho = np.tile(rho, (self.bdim, 1))
return {'smu': theano.shared(
pm.floatX(start), 'smu'),
'rho': theano.shared(
pm.floatX(rho), 'rho')}

# The sign-flipping symbolic_random override is deliberately left
# commented out: the poster reports it reintroduced the regularize-to-0
# behavior, so sampling stays in a single mode (see discussion below).
#@node_property
#def symbolic_random(self):
#    initial = self.symbolic_initial
#    sigma = self.std
#    mu = self.mean
#    sign = theano.tensor.as_tensor([-1, 1])[self._rng.binomial(sigma.ones_like().shape)]
#
#    return sigma * initial + mu * sign

# log q(z) under the equal-weight symmetric mixture.
@node_property
def symbolic_logq_not_scaled(self):

z = self.symbolic_random

mu=theano.tensor.stack([self.mean, -self.mean])
sd=theano.tensor.stack([self.std, self.std])

logq = NormalMixture.dist(w=theano.tensor.stack([.5,.5]),
mu=mu,
sd=sd
).logp(z)

return logq.sum(range(1, logq.ndim))

I guess this effect can be explained in the following way:

• you say your posterior has 2 modes
• you actually sample from one mode
• in KL both modes are covered
• you do not care what mode you are sampling from

Considering the minibatch dataset, did you specify total_size for your observed variable?

Yes, I specified total_size.
The model code is like so:

import theano.tensor as tt

# Observed matrix: indicators d1..d4 plus the outcome yd, one per column.
obs = np.array([d1, d2[0], d3[0], d4[0], yd[0]]).T
data_shape = obs.shape
batch_size = 64
if batch_size:
    # Wrap observations in a Minibatch so each iteration sees a random
    # subset of `batch_size` rows.
    obs = pm.Minibatch(obs, batch_size)
else:
    batch_size = data_shape[0]

with pm.Model() as model1:
    s = pm.HalfCauchy('s', 1., shape=5)
    l_std = pm.HalfCauchy('l_std', 1.)
    # Loadings (5, 2); random testval breaks the symmetric initialization.
    w = pm.Normal('w', 0, 1., shape=(5, 2), testval=np.random.randn(5, 2))

    latent_ = pm.Normal('latent_', 0., 1., shape=(batch_size, 2),
                        testval=np.random.randn(batch_size, 2))
    latent = pm.Deterministic('latent', latent_*l_std)
    p = pm.Deterministic('p', tt.dot(latent, w.T))
    cov = pm.Deterministic('cov', l_std**2*tt.dot(w, w.T)+(s**2)*np.eye(5))
    sd = pm.Deterministic('sd', tt.sqrt(tt.diag(cov)))

    # total_size rescales the minibatch likelihood to the full dataset;
    # passing data_shape unconditionally would be equivalent here.
    latent_like = pm.Normal('latent_like', mu=p, sd=s, observed=obs,
                            total_size=data_shape if batch_size != data_shape[0] else None)

you can leave this as with data_shape only, this does not affect the results

data_shape if batch_size!=data_shape[0] else None -> data_shape


BTW
Did your loss went to -inf?

1 Like

My loss doesn’t go to inf. The loss seems similar to that of regular ADVI (which I think is expected, no?)

To your first statement about the data_shape thing, let me make sure I understand you.
it sounds like you’re saying I’m using the correct syntax here, and I don’t need to make a change, right?