Multi observed latent Gaussian Processes

Hi,

I have a generative model for a group of time series: essentially a single latent Gaussian Process summed with one piecewise linear function per time series. In practice I apply it to a larger dataset by summing a large number of latent GPs.

I am using gp.Latent to create the latent GPs (in this case just one) and summing them in the likelihood, indexing the time series that each GP should be added to. Is there a way to make it more performant, perhaps by using a different implementation from the GP API?

Here is an example of the generative process, with a plot of the series:

import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import theano
import theano.tensor as tt

class OutPiecewiseLinearChangepoints():
    # Piecewise linear function applied outside of the GPs and added in the likelihood parameter
    def __init__(self, 
                 k, 
                 m,
                 b,
                 changepoints,
                 n_series = 4):
        self.k = k
        self.m = m
        self.b = b
        self.changepoints = changepoints
        self.n_series = n_series

    def create_changepoints(self, X, changepoints):
        return (0.5 * (1.0 + tt.sgn(tt.tile(X.reshape((-1,1)), (1,len(changepoints))) - changepoints)))

    def build(self, X):

        X = theano.shared(X)

        A = self.create_changepoints(X, self.changepoints)
        
        growth = (self.k.reshape((1, -1)) + tt.dot(A, self.b))*tt.tile(X, (1, self.n_series))
        offset = (self.m.reshape((1,-1)) + tt.dot(A, (-self.changepoints.reshape((-1,1)) * self.b)))
        piecewise = growth + offset

        return piecewise

# assumed example value; changepoints_t is not defined in the original snippet
changepoints_t = np.linspace(0, 10, 4)

piece = OutPiecewiseLinearChangepoints(k=np.array([0.1]),
                                       m=np.array([0.1]),
                                       b=np.random.normal(0.1, 0.4, size=(4, 4)),
                                       changepoints=changepoints_t,
                                       ).build(np.arange(10).reshape((-1, 1))).eval()

lengthscale = 0.2
eta = 2.0
cov = eta ** 2 * pm.gp.cov.ExpQuad(1, lengthscale)

X = np.arange(10)[:, None]
K = cov(X).eval()

gpl = pm.MvNormal.dist(mu=np.zeros(K.shape[0]), cov=K).random(size=10).T

x_train = np.random.normal(pm.MvNormal.dist(mu=np.zeros(K.shape[0]), cov=K).random(size=4).T + piece, 0.1)
plt.figure(figsize=(14, 4))
plt.plot(X, x_train)
plt.title("time series generated")
plt.ylabel("y")
plt.xlabel("X");

I built the model as:

with pm.Model() as model:
    period = pm.Gamma('period', 40, 10)

    l_t = pm.InverseGamma('l_t', 4, 40)
    l_p = pm.HalfNormal('l_p', 0.5)
    Ρ_trend = pm.HalfNormal('Ρ_trend',1)
    σ  = pm.HalfNormal("σ",  sigma=0.1)

    mu_func = pm.gp.mean.Zero()

    # cov function for the GP 
    cov = (Ρ_trend**2 * pm.gp.cov.ExpQuad(input_dim=1, ls=l_t)
            + pm.gp.cov.WhiteNoise(σ))

    gp = pm.gp.Latent(mean_func=mu_func, cov_func=cov)
    f = gp.prior('f', X=X, reparameterize=True)

    k = pm.Normal("k", mu=0, sigma=0.01, shape = s)
    m = pm.Normal("m", mu=0, sigma=0.01, shape = s)
    delta = pm.Normal("delta", mu=0, sigma=0.01, shape=(len(changepoints_t),s))
    
    piece = OutPiecewiseLinearChangepoints(k = k,
                                          m = m,
                                          b = delta,
                                          changepoints = changepoints_t).build(X)

    # Likelihood
    y_pred = pm.Normal('y_pred', mu=f.reshape((-1,1)) + piece, sigma=σ, observed=x_train)

    # Fitting using ADVI
    trace_vi = pm.fit(100000,
                       method='advi',
                       obj_n_mc=1,
                       obj_optimizer=pm.adamax(),
                       callbacks=[pm.callbacks.CheckParametersConvergence(every=50, diff='absolute',tolerance=1e-3)],
                       progressbar = True)
    
    # Sampling
    trace_vi_samples = trace_vi.sample()
    pred_samples_fit = pm.sample_posterior_predictive(trace_vi_samples,
                                                           vars=[y_pred],
                                                           samples=n_samples,
                                                      progressbar = True)
    
    # Conditional of the GP on new points
    f_n = gp.conditional('f_n', Xnew=X_new)

    piece = OutPiecewiseLinearChangepoints(k = k,
                                          m = m,
                                          b = delta,
                                          changepoints = changepoints_t).build(X_new)

    
    y_pred_new = pm.Normal("y_pred_new",
                            mu=f_n.reshape((-1,1)) + piece,
                            sigma=σ,  # noise term; without it pm.Normal defaults to sigma=1
                            shape=(X_new.shape[0], s))
    
    # Sampling of predictions
    pred_samples_predict = pm.sample_posterior_predictive(trace_vi_samples, 
                                              vars=[y_pred_new], 
                                              samples=n_samples,
                                              progressbar = True)

Thanks,
Luis

Just to verify, is your model a set of time series with a single “draw” from a GP with the different OutPiecewiseLinearChangepoints added, or is each time series a separate “draw” from the GP?

Hi @bwengals it is the first, a set of time series with a single “draw” from a GP with the different OutPiecewiseLinearChangepoints added.

The idea is to extend it further by adding several of these latent GPs, indexed by the time series that belong to different groups. Example: GP1 applies to time series [0, 0, 1, 1] and GP2 applies to time series [1, 1, 1, 0]. This means that for time series 2 (index starting at 0) I would like to add both “latent” GPs: GP1 + GP2 + OutPiecewiseLinearChangepoints as the mean of the Gaussian likelihood, for instance.

I see, that makes sense. One last clarification, for your GP1 and GP2 etc, do they share the same covariance matrix, or do they each have different lengthscales, etas? Also, how many points are in your time series?

One thing: I think σ is being double counted – you likely want to remove the WhiteNoise component from your covariance.

For speed, the most important thing for GPs (other than the number of data points N) is the priors on the hyperparameters. I would recommend starting by fixing the values of the lengthscales and etas, using NUTS, and seeing how long it takes. Then gradually make these priors less informative. Fixing these values puts a ceiling on your sampling speed because the Cholesky of K is calculated only once. Very informative priors will also greatly speed up sampling, because the joint posterior of f, eta and the lengthscale can have “funnel-like” shapes that don’t usually influence the marginal posterior of f very much (which is probably what you care about most, rather than eta or the lengthscale per se).
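
To make that concrete, here is a minimal sketch of the fixed-hyperparameter starting point, reusing X, x_train, changepoints_t and OutPiecewiseLinearChangepoints from your code above (the fixed ls/eta values are just placeholders):

with pm.Model() as model_fixed:
    σ = pm.HalfNormal("σ", sigma=0.1)

    # hyperparameters fixed to plausible constants instead of priors;
    # the Cholesky of K is then computed only once, and the WhiteNoise term is
    # dropped because σ already enters the Normal likelihood
    cov = 2.0 ** 2 * pm.gp.cov.ExpQuad(input_dim=1, ls=0.2)

    gp = pm.gp.Latent(cov_func=cov)
    f = gp.prior("f", X=X, reparameterize=True)

    s = x_train.shape[1]
    k = pm.Normal("k", mu=0, sigma=0.01, shape=s)
    m = pm.Normal("m", mu=0, sigma=0.01, shape=s)
    delta = pm.Normal("delta", mu=0, sigma=0.01, shape=(len(changepoints_t), s))
    piece = OutPiecewiseLinearChangepoints(k=k, m=m, b=delta,
                                           changepoints=changepoints_t,
                                           n_series=s).build(X)

    y_obs = pm.Normal("y_obs", mu=f.reshape((-1, 1)) + piece, sigma=σ, observed=x_train)

    # NUTS as a timing baseline before loosening the hyperparameter priors
    trace = pm.sample(1000, tune=1000)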

You may be able to use Marginal or possibly MarginalSparse with OutPiecewiseLinearChangepoints as a custom mean function, since your likelihood is Gaussian, but I’m not sure how those implementations would play with the broadcasting here: mu=f.reshape((-1, 1)). Worst case, if using those implementations directly doesn’t work, I think it wouldn’t be too hard to copy and modify the Marginal implementation for your specific model.
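
Roughly the shape it would take (an untested sketch; one Marginal GP per series so the mean function only ever has to return a 1-D vector, which sidesteps the broadcasting question; the piecewise mean is left at the default Zero() here and would need to be wrapped as a pm.gp.mean.Mean subclass in practice):

with pm.Model() as sketch:
    σ = pm.HalfNormal("σ", sigma=0.1)
    eta = pm.HalfNormal("eta", sigma=1)
    ls = pm.InverseGamma("ls", 4, 40)
    cov = eta ** 2 * pm.gp.cov.ExpQuad(input_dim=1, ls=ls)

    for i in range(x_train.shape[1]):
        # mean_func would be your piecewise term for series i as a pm.gp.mean.Mean subclass
        gp_i = pm.gp.Marginal(cov_func=cov)
        gp_i.marginal_likelihood(f"y_{i}", X=X, y=x_train[:, i], noise=σ)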

They have different lengthscales, etas, etc. That is the way to learn different types of functions for each group. In terms of points, all time series have between 250 and 350 points, usually not much more than that. The problem is that I have datasets ranging from 300 time series up to 30,000.

Yes, agreed. When using it with a larger model I add a very small sigma just to help stabilize the covariance. I know a jitter is already added in the background, but I often get errors because the matrices are not positive definite. I will test again nevertheless.

I use very informative priors in the extended model, but everything still runs very slowly. I cannot really use NUTS to get starting values, because the parameters for each GP depend on the parameters learned by the other GPs, and so the models are really big. Remember that I am summing them in the likelihood. Let me show you what I was thinking:

    # GP1 with its own parameters
    cov1 = (Ρ_trend1**2 * pm.gp.cov.ExpQuad(input_dim=1, ls=l_t1)
            + pm.gp.cov.WhiteNoise(σ1))
    gp1 = pm.gp.Latent(mean_func=mu_func, cov_func=cov1)
    f1 = gp1.prior('f1', X=X, reparameterize=True)

    # GP2 with its own parameters
    cov2 = (Ρ_trend2**2 * pm.gp.cov.ExpQuad(input_dim=1, ls=l_t2)
            + pm.gp.cov.WhiteNoise(σ2))
    gp2 = pm.gp.Latent(mean_func=mu_func, cov_func=cov2)
    f2 = gp2.prior('f2', X=X, reparameterize=True)

    # Piecewise linear term for each time series
    piece = OutPiecewiseLinearChangepoints(k=k,
                                           m=m,
                                           b=delta,
                                           changepoints=changepoints_t).build(X)

    # Map which "latent" GP applies to each series (0/1 indicators)
    f1_m = f1.reshape((-1, 1)) * np.array([0, 0, 1, 1]).reshape((1, -1))
    f2_m = f2.reshape((-1, 1)) * np.array([1, 1, 1, 0]).reshape((1, -1))

    # GP1 and GP2 are summed in the likelihood
    y_pred = pm.Normal('y_pred', mu=f1_m + f2_m + piece, sigma=σ, observed=x_train)

Does this make sense? Would I be able to do such an implementation using the Marginal implementation, or is there another trick to help scale it? Can you please point me in the right direction? Thank you very much for the help.

Best,
Luis

They have different lengthscales, etas, etc. That is the way to learn different types of functions for each group. In terms of points, all time series have between 250 and 350 points, usually not much more than that. The problem is that I have datasets ranging from 300 time series up to 30,000.

Are you running multiple models in batches? Are all the time series within a model the same size?

I cannot really use NUTS to get starting values because the parameters for each GP depend on the parameters learned by the other GPs, and so the models are really big.

I don’t quite follow this: what do you mean by the parameters for each GP depending on the parameters learned by the other GPs?

Thanks for the additional code BTW, I think this does help it make sense. So, in order to use the Marginal implementation, which I think will help a lot here, you probably could re-express your problem by summing covariances instead of summing GPs. Because:

GP(\mu_1 + \mu_2, K_1 + K_2) = GP(\mu_1, K_1) + GP(\mu_2, K_2)

Would it be possible to apply your mixture

    f1_m = f1.reshape((-1,1)) * np.array([0, 0, 1, 1]).reshape((1, -1))
    f2_m = f2.reshape((-1,1)) * np.array([1, 1, 1, 0]).reshape((1, -1))

to the covariances instead of the latent GPs? Or possibly use the additive GP capabilities?
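
For concreteness, the mapping from your 0/1 masks to per-series covariances would look roughly like this (a sketch with made-up fixed hyperparameters, only to keep it self-contained; the point is that for a 0/1 weight w, Cov[w * f] = w**2 * K = w * K):

groups = {"gp1": np.array([0, 0, 1, 1]), "gp2": np.array([1, 1, 1, 0])}

# hypothetical fixed hyperparameters
cov_gp1 = 1.0 ** 2 * pm.gp.cov.ExpQuad(input_dim=1, ls=1.0)
cov_gp2 = 2.0 ** 2 * pm.gp.cov.ExpQuad(input_dim=1, ls=3.0)

# each series gets the sum of the kernels of the groups it belongs to
per_series_cov = []
for j in range(4):
    selected = [cov for cov, g in zip((cov_gp1, cov_gp2), ("gp1", "gp2")) if groups[g][j]]
    per_series_cov.append(sum(selected))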

Sounds like a really interesting problem by the way.

I am running a single model. What I did to try to scale training was to use minibatches. Assuming the input is of size (n x s), with n the number of time points and s the number of series, I could only implement minibatching over the dimension s, since minibatches are sampled with replacement, which does not work for GPs (over the dimension n). Nevertheless, the results were not good, so I had to drop the idea.

Let’s assume that they are all the same size.

Sorry, my choice of wording was not good, and now I am thinking that I could be wrong. Let me give you my perspective with the same scenario mentioned above: GP1 applies to time series [0, 0, 1, 1] and GP2 applies to time series [1, 1, 1, 0]. This means that for time series 2 (index starting at 0) both “latent” GPs and the piecewise function are summed: GP1 + GP2 + Piecewise. Can we assume that the parameters learned for GP1 and GP2 are completely independent? It makes sense since the GPs themselves are clearly independent, but a single series, e.g. time series 2, depends on the sum of both GPs; how is that dependence learned?

I already tried doing that in the past, but I cannot really wrap my head around a way to represent it without building a very large covariance matrix.

Below you can find my attempt compared with the one using the Latent class. The Latent approach still runs in less time, though… Any tips? This is an example where the observed data is (n x s), with n=100 and s=100. The number of GPs is 10.

Generate data
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import theano
import theano.tensor as tt

class OutPiecewiseLinearChangepoints():
    def __init__(self, 
                 k, 
                 m,
                 b,
                 changepoints,
                 n_series):
        self.k = k
        self.m = m
        self.b = b
        self.changepoints = changepoints
        self.n_series = n_series

    def create_changepoints(self, X, changepoints):
        return (0.5 * (1.0 + tt.sgn(tt.tile(X.reshape((-1,1)), (1,len(changepoints))) - changepoints)))

    def build(self, X):

        X = theano.shared(X)

        A = self.create_changepoints(X, self.changepoints)
        
        growth = (self.k.reshape((1, -1)) + tt.dot(A, self.b))*tt.tile(X, (1, self.n_series))
        offset = (self.m.reshape((1,-1)) + tt.dot(A, (-self.changepoints.reshape((-1,1)) * self.b)))
        piecewise = growth + offset

        return piecewise
    
    def __call__(self, X):
        # same piecewise term as build(), transposed so the instance can be used
        # directly as a GP mean function
        return self.build(X).T


s = 100
n = 100
X = np.arange(n)[:, None]
piece = OutPiecewiseLinearChangepoints(k=np.array([0.1]), 
                               m=np.array([0.1]), 
                               b=np.random.normal(0.1, 0.4, size=(4,s)), 
                               changepoints = np.linspace(0, n, 4),
                               n_series = s
                               ).build(X).eval()


n_gps = 10
params_gps = {}
gps = {}
groups = {}

# GPs
for i in range(n_gps):
    groups[f'g{i}'] = np.random.choice([0,1],s).reshape(1, -1)
    params_gps[f'lengthscale{i}'] = 0.2
    params_gps[f'eta{i}'] = 2.0
    params_gps[f'cov{i}'] = params_gps[f'eta{i}'] ** 2 * pm.gp.cov.ExpQuad(1, params_gps[f'lengthscale{i}'])
    params_gps[f'K{i}'] = params_gps[f'cov{i}'](X).eval()

    params_gps[f'gp{i}'] = pm.MvNormal.dist(mu=np.zeros(params_gps[f'K{i}'].shape[0]), cov=params_gps[f'K{i}']).random(size=s).T
    gps[f'gp{i}'] = params_gps[f'gp{i}'] * groups[f'g{i}']
    
gp = sum(gps.values())

x_train = np.random.normal(gp + piece, 0.1)
plt.figure(figsize=(14, 4))
for i in range(s):
    plt.plot(X, x_train[:,i], label=f'ts_{i}')
plt.title("time series generated")
plt.ylabel("y")
plt.xlabel("X");
With `Marginal`
priors_gps = {}
marginal_lk_gps = {}
with pm.Model() as model:

    k = pm.Normal("k", mu=0, sigma=0.01, shape = s)
    m = pm.Normal("m", mu=0, sigma=0.01, shape = s)
    delta = pm.Normal("delta", mu=0, sigma=0.01, shape=(4,s))
    noise = pm.HalfNormal('noise',0.1)
    
    piece = OutPiecewiseLinearChangepoints(k = k,
                                          m = m,
                                          b = delta,
                                          changepoints = np.linspace(0, n, 4),
                                          n_series=s)  # instance used as mean_func below, so .build(X) is not called
    for i in range(n_gps): 
        priors_gps[f'period{i}'] = pm.Gamma(f'period{i}', 40, 10)

        priors_gps[f'l_t{i}'] = pm.InverseGamma(f'l_t{i}', 4, 40)
        priors_gps[f'l_p{i}'] = pm.HalfNormal(f'l_p{i}', 0.5)
        priors_gps[f'Ρ_trend{i}'] = pm.HalfNormal(f'Ρ_trend{i}', 1)
        priors_gps[f'cov{i}'] = (priors_gps[f'Ρ_trend{i}']**2 
                                 * pm.gp.cov.ExpQuad(input_dim=1, ls=priors_gps[f'l_t{i}']))
        priors_gps[f'gp{i}'] = pm.gp.Marginal(piece, priors_gps[f'cov{i}'])
        marginal_lk_gps[f'f{i}'] = priors_gps[f'gp{i}'].marginal_likelihood(f'f{i}', X, x_train.T * groups[f'g{i}'].T, noise)

    # Fitting using ADVI
    trace_vi = pm.fit(10000,
                       method='advi',
                       obj_n_mc=1,
                       obj_optimizer=pm.adamax(),
                       callbacks=[pm.callbacks.CheckParametersConvergence(every=50, diff='absolute',tolerance=1e-3)],
                       progressbar = True)
With `Latent`
priors_gps = {}
gps = {}
with pm.Model() as model:

    k = pm.Normal("k", mu=0, sigma=0.01, shape = s)
    m = pm.Normal("m", mu=0, sigma=0.01, shape = s)
    delta = pm.Normal("delta", mu=0, sigma=0.01, shape=(4,s))
    noise = pm.HalfNormal('noise',0.1)
    
    mu_func = pm.gp.mean.Zero()
    
    piece = OutPiecewiseLinearChangepoints(k = k,
                                          m = m,
                                          b = delta,
                                          changepoints = np.linspace(0, n, 4),
                                          n_series =s).build(X)
    for i in range(n_gps): 
        priors_gps[f'period{i}'] = pm.Gamma(f'period{i}', 40, 10)

        priors_gps[f'l_t{i}'] = pm.InverseGamma(f'l_t{i}', 4, 40)
        priors_gps[f'l_p{i}'] = pm.HalfNormal(f'l_p{i}', 0.5)
        priors_gps[f'Ρ_trend{i}'] = pm.HalfNormal(f'Ρ_trend{i}', 1)
        priors_gps[f'cov{i}'] = (priors_gps[f'Ρ_trend{i}']**2 
                                 * pm.gp.cov.ExpQuad(input_dim=1, ls=priors_gps[f'l_t{i}']))
        
        priors_gps[f'gp{i}'] = pm.gp.Latent(mean_func=mu_func, cov_func=priors_gps[f'cov{i}'])
        priors_gps[f'f_{i}'] = priors_gps[f'gp{i}'].prior(f'f{i}', X=X, reparameterize=True)
        
        gps[f'f{i}'] = priors_gps[f'f_{i}'].reshape((-1,1))*groups[f'g{i}']
        
    
    y_pred = pm.Normal('y_pred', mu=sum(gps.values()) + piece, sigma=0.1, observed=x_train)


    # Fitting using ADVI
    trace_vi = pm.fit(10000,
                       method='advi',
                       obj_n_mc=1,
                       obj_optimizer=pm.adamax(),
                       callbacks=[pm.callbacks.CheckParametersConvergence(every=50, diff='absolute',tolerance=1e-3)],
                       progressbar = True)

Thanks for this and for all the help!

Hey @luisroque, thanks for the examples here. I got a bit mixed up with the sum here: x_train.T * groups[f'g{i}'].T, perhaps because s = n in your example. To clarify one last time (hopefully), I put together my own simplified example. I left out the OutPiecewiseLinearChangepoints just to keep it simpler for me; let’s add it back in once I’ve got the structure of the problem right. Would you mind reading the example below and seeing if it’s correct? I also tried adding the covariance functions as I described above.

## generate the data
# number of time series
n = 10

# length of each series
s = 50

# number of "basis" GPs
n_gps = 4

# domain
x = np.linspace(0, 10, s)

# noise standard deviation
sigma = 0.1

# each gp has a different covariance matrix
eta = [1, 2, 3, 4]
ell = [0.5, 1, 2, 3]
covs = []
for i in range(n_gps):
    covs.append(eta[i]**2 * pm.gp.cov.ExpQuad(1, ls=ell[i]))
    
# construct the four "true" gps
gps = []
for i in range(n_gps):
    gps.append(np.random.multivariate_normal(np.zeros(s), covs[i](x[:,None]).eval(), size=1).flatten())
    
# construct y_set, the set of n time series, each one a mixture of the four gps
y_set = []
f_set = []
known_mixtures = [] # Question, are the mixture weights known?
for i in range(n):
    mixture_weights = np.random.randint(0, 2, size=n_gps)
    
    # make sure its not all zeros...
    if sum(mixture_weights) == 0:
        ix = np.random.randint(n_gps) # 0, 1, 2 or 3
        mixture_weights[ix] = 1
        
    known_mixtures.append(mixture_weights)
        
    # define mixture 
    f = np.zeros(s)
    for i, w in enumerate(mixture_weights):
        f += gps[i] * w
   
    noise = sigma * np.random.randn(s)
    y = f + noise

    plt.plot(x, y)
    
    f_set.append(f)
    y_set.append(y)

[plot of the generated time series]

## set up the pymc model
with pm.Model() as model:
    
    covs = []
    for i in range(1, n_gps + 1):
        eta = pm.HalfNormal(f'eta_{i}', sd=5)
        ell = pm.Gamma(f'ell_{i}', alpha=2, beta=1)
        cov = eta**2 * pm.gp.cov.ExpQuad(1, ls=ell)
        covs.append(cov)
    
    # apply mixtures to covariances
    selected_covs = []
    mixed_covs = []
    for i in range(n):
        mixture_weights = known_mixtures[i]
        for w_ix in range(len(mixture_weights)):
            w = mixture_weights[w_ix]
            if w == 1.0:
                selected_covs.append(covs[w_ix])
        mixed_cov = sum(selected_covs) # because GP(cov1 + cov2) = GP(cov1) + GP(cov2)
        mixed_covs.append(mixed_cov) 
        selected_covs = [] # clear out cov list
     
    gps = []
    for i in range(n):
        gp = pm.gp.Marginal(cov_func=mixed_covs[i])
        gps.append(gp)
        
    
    noise = pm.HalfNormal('noise', sd=2)    
    for i in range(n):
        lik = gps[i].marginal_likelihood(f"lik_{i}", X=x[:, None], y=y_set[i], noise=noise)
   
    tr = pm.sample(draws=100, tune=500, chains=1, cores=1)

So, am I getting the structure of the model? And, if so, does applying the mixture to the covariances work?

Hi @bwengals, you are getting the structure right! The implementation makes a lot of sense(!) and it is learning the mixture correctly.

I took the liberty to add the piecewise linear function. Feel free to suggest another approach:

Helpers
class PiecewiseLinearMatrix():
    def __init__(self, 
                 k, 
                 m,
                 b,
                 changepoints,
                 n_series):
        self.k = k
        self.m = m
        self.b = b
        self.changepoints = changepoints
        self.n_series = n_series

    def create_changepoints(self, X, changepoints):
        return (0.5 * (1.0 + tt.sgn(tt.tile(X.reshape((-1,1)), (1,len(changepoints))) - changepoints)))

    def build(self, X):

        X = theano.shared(X)

        A = self.create_changepoints(X, self.changepoints)
        
        growth = (self.k.reshape((1, -1)) + tt.dot(A, self.b))*tt.tile(X, (1, self.n_series))
        offset = (self.m.reshape((1,-1)) + tt.dot(A, (-self.changepoints.reshape((-1,1)) * self.b)))
        piecewise = growth + offset

        return piecewise

class PiecewiseLinear(pm.gp.mean.Mean):
    def __init__(self, changepoints, k, m, delta):
        self.changepoints = changepoints
        self.k = k
        self.m = m
        self.delta = delta
        
    def dm_changepoints_theano(self, X, changepoints_t):
        return 0.5 * (1.0 + tt.sgn(tt.tile(X, (1, len(changepoints_t))) - changepoints_t))

    def __call__(self, X):
        A = self.dm_changepoints_theano(X, self.changepoints)
        return (self.k + tt.dot(A, self.delta)) * X.flatten() + (
            self.m + tt.dot(A, -self.changepoints * self.delta)
        )
Generate Data
## generate the data
# number of time series
n = 10

# length of each series
s = 50

# number of "basis" GPs
n_gps = 4

# domain
x = np.linspace(0, 10, s)

# noise standard deviation
sigma = 0.1

# each gp has a different covariance matrix
eta = [1, 2, 3, 4]
ell = [0.5, 1, 2, 3]
covs = []

piece = PiecewiseLinearMatrix(k=np.array([1]), 
                               m=np.array([1]), 
                               b=np.random.normal(0.1, 1, size=(4,n)), 
                               changepoints = np.linspace(0, 10, 4),
                               n_series = n
                               ).build(x[:, None]).eval()

for i in range(n_gps):
    covs.append(eta[i]**2 * pm.gp.cov.ExpQuad(1, ls=ell[i]))
    
# construct the four "true" gps
gps = []
for i in range(n_gps):
    gps.append(np.random.multivariate_normal(np.zeros(s), covs[i](x[:,None]).eval(), size=1).flatten())
    
# construct y_set, the set of n time series, each one a mixture of the four gps
y_set = []
f_set = []
known_mixtures = [] # Question, are the mixture weights known? Answer: No
for i in range(n):
    mixture_weights = np.random.randint(0, 2, size=n_gps)
    
    # make sure its not all zeros...
    if sum(mixture_weights) == 0:
        ix = np.random.randint(n_gps) # 0, 1, 2 or 3
        mixture_weights[ix] = 1
        
    known_mixtures.append(mixture_weights)
        
    # define mixture 
    f = np.zeros(s)
    for j, w in enumerate(mixture_weights):  # j, not i: avoid shadowing the series index
        f += gps[j] * w

    noise = sigma * np.random.randn(s)
    y = f + noise + piece[:, i]

    plt.plot(x, y)
    
    f_set.append(f)
    y_set.append(y)
Model
## set up the pymc model
with pm.Model() as model:
    
    k = pm.Normal("k", mu=0, sigma=1, shape = n)
    m = pm.Normal("m", mu=0, sigma=1, shape = n)
    delta = pm.Normal("delta", mu=0, sigma=2, shape=(4,n))

    covs = []
    for i in range(1, n_gps + 1):
        eta = pm.HalfNormal(f'eta_{i}', sd=5)
        ell = pm.Gamma(f'ell_{i}', alpha=2, beta=1)
        cov = eta**2 * pm.gp.cov.ExpQuad(1, ls=ell)
        covs.append(cov)
    
    # apply mixtures to covariances
    selected_covs = []
    mixed_covs = []
    for i in range(n):
        mixture_weights = known_mixtures[i]
        for w_ix in range(len(mixture_weights)):
            w = mixture_weights[w_ix]
            if w == 1.0:
                selected_covs.append(covs[w_ix])
        mixed_cov = sum(selected_covs) # because GP(cov1 + cov2) = GP(cov1) + GP(cov2)
        mixed_covs.append(mixed_cov) 
        selected_covs = [] # clear out cov list
     
    gps = []
    for i in range(n):
        piece = PiecewiseLinear(k = k[i],
                                  m = m[i],
                                  delta = delta[:,i],
                                  changepoints = np.linspace(0, 10, 4))  # same domain as x, matching the data generation
        gp = pm.gp.Marginal(mean_func = piece,
                            cov_func=mixed_covs[i])
        gps.append(gp)
        
    
    noise = pm.HalfNormal('noise', sd=2)    
    for i in range(n):
        lik = gps[i].marginal_likelihood(f"lik_{i}", X=x[:, None], y=y_set[i], noise=noise)
   
    tr = pm.sample(draws=1000, tune=500, chains=1, cores=1)
Predictions
# predict length of each series
s_new = 60

x_new = np.linspace(0, 12, s_new)[:,None]

f_new = []

with model:
    for i in range(n):
        f_new.append(gps[i].conditional(f"f_new_{i}", Xnew=x_new, pred_noise=True))

with model:
    y_pred = pm.sample_posterior_predictive(tr,
                                           vars=[*f_new],
                                           samples=500)

# plot the results
fig, ax = plt.subplots(2, 3, figsize=(12, 5))
ax = ax.ravel()

for i in range(6):
    plot_gp_dist(ax[i], y_pred[f"f_new_{i}"], x_new)

    # plot the data and the true latent function
    #plt.plot(X, f_true, "dodgerblue", lw=3, label="True f")
    ax[i].plot(x, y_set[i], "ok", ms=3, alpha=0.5, label="Observed data");

After getting your feedback, I will test it with larger datasets.

I will also try out the results using MarginalSparse.
I have two additional questions about improving scalability:

  1. Any concerns with using ADVI to scale it further?
  2. Can we use Minibatches with GPs, or does the fact that they are sampled randomly with replacement not allow it?

Feel free to share any other suggestions.

Thank you very much!

Great to hear! Yeah that looks good with the mean function added in, I hope that helps.

For your follow-up questions, I really don’t know, unfortunately. I honestly haven’t really tried ADVI with GP models – I try to stick to NUTS or, worst case, maximize the marginal likelihood (which, actually, you might try now, since you are not using a Latent implementation; this should be faster than ADVI too, and not sacrifice much in uncertainty quantification). And then, in theory, Minibatches should work with your model, as long as you are batching over GPs (not within a single GP). I don’t know how to implement this in PyMC though. Perhaps @ferrine or someone who knows the ADVI stuff really well might have some thoughts?
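
By “maximize the marginal likelihood” I just mean replacing pm.fit(...) with something like the following inside the Marginal model context (a minimal sketch):

with model:
    # point estimates of the hyperparameters by maximizing the marginal likelihood
    mp = pm.find_MAP()
    # predictions can then be drawn conditional on the MAP point, e.g.
    # pm.sample_posterior_predictive([mp], vars=[...], samples=500)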

Hi @bwengals, just a quick update. I’ve implemented the MarginalSparse version and it works fine. But if I try to increase the dummy data size (e.g. n=100, s=20, n_gps=10), it just hangs forever with one core of the CPU at 100%. I believe the model would take a reasonable time to run (MAP is pretty fast), but it won’t even start. The full code is below. What could be the cause?

Also, I am not able to run it with the latest PyMC3 version (3.11.2 from conda with python=3.9.7), are you? I actually have to go back as far as pymc3=3.10.0. Is this a bug, or am I missing something?

Full traceback with latest pymc3 version
You can find the C code in this temporary file: /tmp/theano_compilation_error_k5ixfvsi
Traceback (most recent call last):
  File "/home/mach1ne/WIP/multi_observed_GPs_with_mixture_in_cov.py", line 80, in <module>
    piece = PiecewiseLinearMatrix(k=np.array([0.2]), 
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/graph/basic.py", line 554, in eval
    self._fn_cache[inputs] = theano.function(inputs, self)
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/compile/function/__init__.py", line 337, in function
    fn = pfunc(
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/compile/function/pfunc.py", line 524, in pfunc
    return orig_function(
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/compile/function/types.py", line 1981, in orig_function
    fn = m.create(defaults)
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/compile/function/types.py", line 1836, in create
    _fn, _i, _o = self.linker.make_thunk(
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/link/basic.py", line 266, in make_thunk
    return self.make_all(
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/link/vm.py", line 1131, in make_all
    node.op.make_thunk(node, storage_map, compute_map, [], impl=impl)
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/graph/op.py", line 634, in make_thunk
    return self.make_c_thunk(node, storage_map, compute_map, no_recycling)
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/graph/op.py", line 600, in make_c_thunk
    outputs = cl.make_thunk(
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/link/c/basic.py", line 1203, in make_thunk
    cthunk, module, in_storage, out_storage, error_storage = self.__compile__(
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/link/c/basic.py", line 1138, in __compile__
    thunk, module = self.cthunk_factory(
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/link/c/basic.py", line 1634, in cthunk_factory
    module = get_module_cache().module_from_key(key=key, lnk=self)
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/link/c/cmodule.py", line 1191, in module_from_key
    module = lnk.compile_cmodule(location)
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/link/c/basic.py", line 1543, in compile_cmodule
    module = c_compiler.compile_str(
  File "/home/mach1ne/anaconda3/envs/pymc3_test/lib/python3.9/site-packages/theano/link/c/cmodule.py", line 2546, in compile_str
    raise Exception(
Exception: ('The following error happened while compiling the node', Elemwise{Composite{((i0 + i1) * i2)}}[(0, 1)](TensorConstant{(1, 1) of 0.2}, Dot22.0, Reshape{2}.0), '\n', 'Compilation failed (return status=1): /home/mach1ne/.theano/compiledir_Linux-5.4--generic-x86_64-with-glibc2.31-x86_64-3.9.7-64/tmpkmb9d2k0/mod.cpp: In member function ‘int {anonymous}::__struct_compiled_op_m0e832eaf80de42bf9790ad948edc032faf7da093be86d8ce155fb3c56a0d3a2d::run()’:. /home/mach1ne/.theano/compiledir_Linux-5.4--generic-x86_64-with-glibc2.31-x86_64-3.9.7-64/tmpkmb9d2k0/mod.cpp:589:60: error: expected primary-expression before ‘{’ token.   589 |                             #pragma omp parallel for if(n>={int(config.openmp_elemwise_minsize)}).       |                                                            ^. /home/mach1ne/.theano/compiledir_Linux-5.4--generic-x86_64-with-glibc2.31-x86_64-3.9.7-64/tmpkmb9d2k0/mod.cpp:627:27: warning: narrowing conversion of ‘V5_n0’ from ‘npy_intp’ {aka ‘long int’} to ‘int’ [-Wnarrowing].   627 |     int init_totals[2] = {V5_n0, V5_n1};.       |                           ^~~~~. /home/mach1ne/.theano/compiledir_Linux-5.4--generic-x86_64-with-glibc2.31-x86_64-3.9.7-64/tmpkmb9d2k0/mod.cpp:627:34: warning: narrowing conversion of ‘V5_n1’ from ‘npy_intp’ {aka ‘long int’} to ‘int’ [-Wnarrowing].   627 |     int init_totals[2] = {V5_n0, V5_n1};.       |                                  ^~~~~. /home/mach1ne/.theano/compiledir_Linux-5.4--generic-x86_64-with-glibc2.31-x86_64-3.9.7-64/tmpkmb9d2k0/mod.cpp:640:1: warning: narrowing conversion of ‘V5_stride0’ from ‘ssize_t’ {aka ‘long int’} to ‘int’ [-Wnarrowing].   640 | V5_stride0, V5_stride1,.       | ^~~~~~~~~~. /home/mach1ne/.theano/compiledir_Linux-5.4--generic-x86_64-with-glibc2.31-x86_64-3.9.7-64/tmpkmb9d2k0/mod.cpp:640:13: warning: narrowing conversion of ‘V5_stride1’ from ‘ssize_t’ {aka ‘long int’} to ‘int’ [-Wnarrowing].   640 | V5_stride0, V5_stride1,.       |             ^~~~~~~~~~. /home/mach1ne/.theano/compiledir_Linux-5.4--generic-x86_64-with-glibc2.31-x86_64-3.9.7-64/tmpkmb9d2k0/mod.cpp:641:1: warning: narrowing conversion of ‘V3_stride0’ from ‘ssize_t’ {aka ‘long int’} to ‘int’ [-Wnarrowing].   641 | V3_stride0, V3_stride1.       | ^~~~~~~~~~. /home/mach1ne/.theano/compiledir_Linux-5.4--generic-x86_64-with-glibc2.31-x86_64-3.9.7-64/tmpkmb9d2k0/mod.cpp:641:13: warning: narrowing conversion of ‘V3_stride1’ from ‘ssize_t’ {aka ‘long int’} to ‘int’ [-Wnarrowing].   641 | V3_stride0, V3_stride1.       |             ^~~~~~~~~~. At global scope:. cc1plus: warning: unrecognized command line option ‘-Wno-c++11-narrowing’. ', 'FunctionGraph(Elemwise{Composite{((i0 + i1) * i2)}}[(0, 1)](TensorConstant{(1, 1) of 0.2}, <TensorType(float64, matrix)>, <TensorType(int64, matrix)>))')

Full code with MarginalSparse:

import numpy as np
import pymc3 as pm
import matplotlib.pyplot as plt
from pymc3.gp.util import plot_gp_dist
import theano
import theano.tensor as tt
from scipy import stats
import pickle
import warnings
warnings.filterwarnings("ignore")


class PiecewiseLinearMatrix():
    def __init__(self, 
                 k, 
                 m,
                 b,
                 changepoints,
                 n_series):
        self.k = k
        self.m = m
        self.b = b
        self.changepoints = changepoints
        self.n_series = n_series

    def create_changepoints(self, X, changepoints):
        return (0.5 * (1.0 + tt.sgn(tt.tile(X.reshape((-1,1)), (1,len(changepoints))) - changepoints)))

    def build(self, X):

        X = theano.shared(X)

        A = self.create_changepoints(X, self.changepoints)
        
        growth = (self.k.reshape((1, -1)) + tt.dot(A, self.b))*tt.tile(X, (1, self.n_series))
        offset = (self.m.reshape((1,-1)) + tt.dot(A, (-self.changepoints.reshape((-1,1)) * self.b)))
        piecewise = growth + offset

        return piecewise


class PiecewiseLinear(pm.gp.mean.Mean):
    def __init__(self, changepoints, k, m, delta):
        self.changepoints = changepoints
        self.k = k
        self.m = m
        self.delta = delta
        
    def dm_changepoints_theano(self, X, changepoints_t):
        return 0.5 * (1.0 + tt.sgn(tt.tile(X, (1, len(changepoints_t))) - changepoints_t))

    def __call__(self, X):
        A = self.dm_changepoints_theano(X, self.changepoints)
        return (self.k + tt.dot(A, self.delta)) * X.flatten() + (
            self.m + tt.dot(A, -self.changepoints * self.delta)
        )


## generate the data
# number of time series
n = 100

# length of each series
s = 20

# number of "basis" GPs
n_gps = 10

# domain
x = np.arange(s)

# noise standard deviation
sigma = 0.1

# each gp has a different covariance matrix
eta = stats.halfnorm.rvs(2, size=n_gps)
ell = stats.halfnorm.rvs(2, size=n_gps)
covs = []

piece = PiecewiseLinearMatrix(k=np.array([0.2]), 
                               m=np.array([0.2]), 
                               b=np.random.normal(0.1, 0.3, size=(4,n)), 
                               changepoints = np.linspace(0, s, 4),
                               n_series = n
                               ).build(x[:, None]).eval()

for i in range(n_gps):
    covs.append(eta[i]**2 * pm.gp.cov.ExpQuad(1, ls=ell[i]))
    
# construct the n_gps "true" gps
gps = []
for i in range(n_gps):
    gps.append(np.random.multivariate_normal(np.zeros(s), covs[i](x[:,None]).eval(), size=1).flatten())
    
# construct y_set, the set of n time series, each one a mixture of the basis gps
y_set = []
f_set = []
known_mixtures = [] # Question, are the mixture weights known? Answer: No
for i in range(n):
    mixture_weights = np.random.randint(0, 2, size=n_gps)
    
    # make sure its not all zeros...
    if sum(mixture_weights) == 0:
        ix = np.random.randint(n_gps) # 0 .. n_gps-1
        mixture_weights[ix] = 1
        
    known_mixtures.append(mixture_weights)
        
    # define mixture 
    f = np.zeros(s)
    for j, w in enumerate(mixture_weights):  # j, not i: avoid shadowing the series index
        f += gps[j] * w

    noise = sigma * np.random.randn(s)
    y = f + noise + piece[:, i]
    
    f_set.append(f)
    y_set.append(y)

# With MarginalSparse

s_u = 10
Xu_init = np.linspace(0, s, s_u)  # inducing points spread over the range of x
## set up the pymc model
with pm.Model() as model:
    
    k = pm.Normal("k", mu=0, sigma=1, shape = n)
    m = pm.Normal("m", mu=0, sigma=1, shape = n)
    delta = pm.Normal("delta", mu=0, sigma=2, shape=(4,n))

    covs = []
    for i in range(1, n_gps + 1):
        eta = pm.HalfNormal(f'eta_{i}', sd=5)
        ell = pm.Gamma(f'ell_{i}', alpha=2, beta=1)
        cov = eta**2 * pm.gp.cov.ExpQuad(input_dim=1, ls=ell)
        covs.append(cov)
    
    # apply mixtures to covariances
    selected_covs = []
    mixed_covs = []
    for i in range(n):
        mixture_weights = known_mixtures[i]
        for w_ix in range(len(mixture_weights)):
            w = mixture_weights[w_ix]
            if w == 1.0:
                selected_covs.append(covs[w_ix])
        mixed_cov = sum(selected_covs) # because GP(cov1 + cov2) = GP(cov1) + GP(cov2)
        mixed_covs.append(mixed_cov) 
        selected_covs = [] # clear out cov list
        

    # set flat prior for Xu
    Xu = pm.Flat("Xu", shape=s_u, testval=Xu_init)
     
    gps = []
    for i in range(n):
        piece = PiecewiseLinear(k = k[i],
                                  m = m[i],
                                  delta = delta[:,i],
                                  changepoints = np.linspace(0, s, 4))
        gp = pm.gp.MarginalSparse(mean_func = piece,
                            cov_func=mixed_covs[i],
                            approx="VFE")
        gps.append(gp)
        
    
    noise = pm.HalfNormal('noise', sd=2)    
    for i in range(n):
        lik = gps[i].marginal_likelihood(f"lik_{i}", X=x[:, None], Xu=Xu[:, None], y=y_set[i], noise=noise)
   
    mp = pm.find_MAP()


# predict length of each series
s_new = 60

x_new = np.arange(s_new)[:,None]

f_new = []

with model:
    for i in range(n):
        f_new.append(gps[i].conditional(f"f_new_{i}", Xnew=x_new, pred_noise=True))

with model:
    y_pred = pm.sample_posterior_predictive([mp],
                                           vars=[*f_new],
                                           samples=500)

with open(f'results_gp_cov_{n_gps}.pickle', 'wb') as handle:
    pickle.dump(y_pred, handle, pickle.HIGHEST_PROTOCOL)

Thanks,
Luis

I don’t have enough experience with ADVI + GP. I usually try to make good priors and parameterizations so that NUTS works.
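
For example, something along these lines (the values are placeholders; the idea is just to keep the lengthscale away from zero and from values much larger than the span of X):

with pm.Model():
    # lengthscale prior concentrated on plausible scales relative to the range of X
    ell = pm.InverseGamma("ell", alpha=4, beta=40)
    eta = pm.HalfNormal("eta", sigma=1)
    cov = eta ** 2 * pm.gp.cov.ExpQuad(input_dim=1, ls=ell)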