Hidden Markov Model - Estimating Transition and Emission CPDs from multiple sequences - not working

Hi,
I’m trying to estimate the parameters of the Conditional Probability Distributions (CPDs) of a Hidden Markov Model from multiple short sequences (using PyMC3).

The CPD for the transition probabilities of the hidden states is a probability table.
The CPD for the emissions is a Beta distribution, where the parameter values depend on the states of the hidden variable.
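
In notation, with (a_i, b_i) the Beta parameters attached to hidden state i:

    X_1 ~ Categorical(p_init)
    X_t | X_{t-1} = i ~ Categorical(P[i, :])
    Y_t | X_t = i ~ Beta(a_i, b_i)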

I followed two previous posts:
(1) https://ericmjl.github.io/essays-on-data-science/machine-learning/markov-models/
(2) Sequence of Observed in a loop, how the log-likelihood are estimated? add them all?

By adapting the code from (1), I can successfully estimate all parameters from a single sequence (e.g. n = 100). Increasing the sequence length (e.g. from 100 to 300) makes the estimates noticeably more accurate and precise.
However, I can’t reproduce these results when using more than one sequence (here d = 3).

Subsequently, I tried the suggestion from (2), but wasn’t successful either.

Any ideas what I’m doing wrong?

Reproducible code:

# import libraries
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multinomial
from typing import List
import scipy.stats as stats
import pymc3 as pm
import theano.tensor as tt
import theano.tensor.slinalg as sla 
import theano
import warnings
import arviz as az
warnings.simplefilter(action="ignore", category=FutureWarning)
theano.config.gcc.cxxflags = "-Wno-c++11-narrowing"

# analytic solution to obtain equilibrium distribution
def equilibrium_distribution(p_transition):
    
    n_states = p_transition.shape[0]
    A = np.append(
        arr=p_transition.T - np.eye(n_states),
        values=np.ones(n_states).reshape(1, -1),
        axis=0
    )
    b = np.transpose(np.array([0] * n_states + [1]))
    p_eq = np.linalg.solve(
        a=np.transpose(A).dot(A),
        b=np.transpose(A).dot(b)
    )
    return p_eq
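
As a quick sanity check (hand-computed for the simulation matrix used below):

    # for P = [[0.9, 0.1], [0.01, 0.99]] the stationary distribution is
    # (0.01, 0.1) / 0.11, i.e. roughly [0.0909, 0.9091]
    print(equilibrium_distribution(np.array([[0.9, 0.1], [0.01, 0.99]])))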



# create/sample states 
def markov_sequence(p_init: np.array, p_transition: np.array, sequence_length: int) -> List[int]:
    """
    Generate a Markov sequence based on p_init and p_transition.
    """
    if p_init is None:
        p_init = equilibrium_distribution(p_transition)
    initial_state = list(multinomial.rvs(1, p_init)).index(1)

    states = [initial_state]
    for _ in range(sequence_length - 1):
        p_tr = p_transition[states[-1]]
        new_state = list(multinomial.rvs(1, p_tr)).index(1)
        states.append(new_state)
    return states   



def beta_emissions(states: List[int], As: List[float], Bs: List[float]) -> List[float]:
    emissions = []
    for state in states:
        a = As[state]
        b = Bs[state]
        e = stats.beta.rvs(a, b)  # scalar draw, so each column assignment stays 1-D
        emissions.append(e)
    return emissions




def generate(t, d, p_transition, a, b):
    p_init = np.array([1,0])
    
    # transition probability matrix
    # each row must sum to one
    assert np.isclose(p_transition[0, :].sum(), 1)
    assert np.isclose(p_transition[1, :].sum(), 1)
    

    #Generating a Markov Sequence
    states_np = np.full((t, d), np.nan)
    for i in range(d):
        states = markov_sequence(p_init, p_transition, sequence_length=t)
        states_np[:,i] = states
    
    
    
    # Generate Emissions: states and observables
    beta_ems_np = np.full((t,d), np.nan)
    for i in range(d):
        beta_ems = beta_emissions(states_np[:,i].astype(int), As = a, Bs = b)
        beta_ems_np[:,i] = beta_ems
        
    x=np.arange(0,t)
    plt.plot(x,beta_ems_np)
    plt.show()
    
    return states_np, beta_ems_np


class HMMStates(pm.Categorical):
    def __init__(self, p_transition, p_equilibrium, n_states, *args, **kwargs):
        super(pm.Categorical, self).__init__(*args, **kwargs)
        self.p_transition = p_transition
        self.p_equilibrium = p_equilibrium
        self.k = n_states
        self.mode = tt.cast(0,dtype='int64')

    def logp(self, x):
        p_eq = self.p_equilibrium
        # Broadcast out the transition probabilities,
        # so that we can broadcast the calculation
        # of log-likelihoods
        p_tr = self.p_transition[x[:-1]]

        # the logp of the initial state evaluated against the equilibrium probabilities
        initial_state_logp = pm.Categorical.dist(p_eq).logp(x[0])

        # the logp of the rest of the states.
        x_i = x[1:]
        ou_like = pm.Categorical.dist(p_tr).logp(x_i)
        transition_logp = tt.sum(ou_like)
        return initial_state_logp + transition_logp
  

def solve_equilibrium(n_states, p_transition):
    # same stationary-distribution trick in Theano (see the note below)
    A = tt.eye(n_states) - p_transition + tt.ones(shape=(n_states, n_states))
    p_equilibrium = pm.Deterministic("p_equilibrium", sla.solve(A.T, tt.ones(shape=(n_states))))
    return p_equilibrium
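
(solve_equilibrium uses a standard trick: the stationary distribution p satisfies p^T P = p^T and p^T 1 = 1, which combine into p^T (I - P + 1 1^T) = 1^T; transposing gives the linear system (I - P + 1 1^T)^T p = 1 that sla.solve handles above.)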


class HMMBetaEmissions(pm.Continuous):
    def __init__(self, states, a, b, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.states = states
        self.a = a
        self.b = b

    def logp(self, x):
        """
        x: observations
        """
        states = self.states
        a = self.a[states] #np.array([1,10]).astype(int)[states] 
        b = self.b[states] #np.array([2,1]).astype(int)[states]
        return tt.sum(pm.Beta.dist(alpha=a, beta=b).logp(x))



if __name__ == '__main__': 
    
    # ground truth parameters of the transition CPD
    p_transition_sim = np.array(
        [[0.9, 0.1],  
         [0.01, 0.99]] 
    )
    # ground truth parameters of the emission CPD (beta dist)
    a_sim = [1,10]
    b_sim = [2,1]

Model from (1): single sequence

    n = 100 # length of sequence
    d =  1  # number of sequences
    # Generate data
    states, ems = generate(n, d, p_transition_sim, a_sim, b_sim)
    
    n_states = 2
    model1 = pm.Model()
    with model1:
        # Priors for transition matrix
        p_transition = pm.Dirichlet("p_transition", a=tt.ones((n_states, n_states)), shape=(n_states, n_states))
    
        # Solve for the equilibrium state
        p_equilibrium = solve_equilibrium(n_states, p_transition)
    
        # HMM state
        hmm_states = HMMStates(
            "hmm_states",
            p_transition=p_transition,
            p_equilibrium=p_equilibrium,
            n_states=n_states,
            shape=(ems.shape[0],ems.shape[1]),
        )
    
        # Priors for a and b
        a = pm.Exponential("a", lam=2, shape=(n_states,))
        b = pm.Exponential("b", lam=2, shape=(n_states,))
        
    
        # Observed emission likelihood
        obs = HMMBetaEmissions(
            "emission",
            states=hmm_states,
            a=a,
            b=b,
            observed = ems
        )
        
    
    with model1:
        trace = pm.sample(2000, chains=2) 
    
    az.plot_trace(trace, var_names=["a"]);
    az.plot_trace(trace, var_names=["b"]);
    az.plot_trace(trace, var_names=["p_transition"]); 

Results: The parameters can be estimated. The fit isn’t perfect, but increasing the sequence length gives better results.

Model from (1): multiple sequences

    n = 100 # length of sequence
    d =  3  # number of sequences
    # generate data
    states, ems = generate(n,d, p_transition_sim, a_sim, b_sim)

    
    n_states = 2
    model1 = pm.Model()
    with model1:
        # Priors for transition matrix
        p_transition = pm.Dirichlet("p_transition", a=tt.ones((n_states, n_states)), shape=(n_states, n_states))
    
        # Solve for the equilibrium state
        p_equilibrium = solve_equilibrium(n_states, p_transition)
    
        # HMM state
        hmm_states = HMMStates(
            "hmm_states",
            p_transition=p_transition,
            p_equilibrium=p_equilibrium,
            n_states=n_states,
            shape=(ems.shape[0],ems.shape[1]),
        )
    
        # Priors for a and b
        a = pm.Exponential("a", lam=2, shape=(n_states,))
        b = pm.Exponential("b", lam=2, shape=(n_states,))
        
    
        # Observed emission likelihood
        obs = HMMBetaEmissions(
            "emission",
            states=hmm_states,
            a=a,
            b=b,
            observed = ems
        )
        
    
    with model1:
        trace = pm.sample(2000, chains=2)   
     
    az.plot_trace(trace, var_names=["a"]);
    az.plot_trace(trace, var_names=["b"]);
    az.plot_trace(trace, var_names=["p_transition"]); 

Results: Although more observations are available than in example 1, the parameter estimates are poorer. Even increasing the number of sequences to d = 1000, I’m not able to recover the parameters.

Model from (2): single sequence

    n = 100 # length of sequence
    d =  1  # number of sequences
    # generate data
    states, ems = generate(n,d, p_transition_sim, a_sim, b_sim)   

    
    model2 = pm.Model()
    with model2:
        # Priors for transition matrix
        p_transition = pm.Dirichlet("p_transition", a=tt.ones((n_states, n_states)), shape=(n_states, n_states))
        # Priors for a and b
        a = pm.Exponential("a", lam=2, shape=(n_states,))
        b = pm.Exponential("b", lam=2, shape=(n_states,))

        # Solve for the equilibrium state
        p_equilibrium = solve_equilibrium(n_states, p_transition)
    
        hmm_states_list = []
        emissions_list = []
        for i in range(d):
            # HMM state
            hmm_states_i = HMMStates(
                "hmm_states_" + str(i),
                p_transition=p_transition,
                p_equilibrium=p_equilibrium,
                n_states=n_states,
                shape=(ems.shape[0],1),
            )
    
    
            # Observed emission likelihood
            obs_i = HMMBetaEmissions(
                "emission_" + str(i),
                states=hmm_states_i,
                a = a,#np.array([1,10]).astype(int),
                b = b,#np.array([2,1]).astype(int),
                observed = ems[:,i]
            )
            
            
            hmm_states_list += [hmm_states_i]
            emissions_list += [obs_i]
            
      
    with model2:
        trace = pm.sample(2000, chains=2)     
    
    
    az.plot_trace(trace, var_names=["a"]);
    az.plot_trace(trace, var_names=["b"]);
    az.plot_trace(trace, var_names=["p_transition"]);

Results: The adaptation of (2) doesn’t seem to work either.

Model from (2): multiple sequences

    n = 100 # length of sequence
    d =  3  # number of sequences
    # generate data
    states, ems = generate(n,d, p_transition_sim, a_sim, b_sim)   
    
    model2 = pm.Model()
    with model2:
        # Priors for transition matrix
        p_transition = pm.Dirichlet("p_transition", a=tt.ones((n_states, n_states)), shape=(n_states, n_states))
        # Priors for a and b
        a = pm.Exponential("a", lam=2, shape=(n_states,))
        b = pm.Exponential("b", lam=2, shape=(n_states,))
    
        # Solve for the equilibrium state
        p_equilibrium = solve_equilibrium(n_states, p_transition)
    
        hmm_states_list = []
        emissions_list = []
        for i in range(d):
            # HMM state
            hmm_states_i = HMMStates(
                "hmm_states_" + str(i),
                p_transition=p_transition,
                p_equilibrium=p_equilibrium,
                n_states=n_states,
                shape=(ems.shape[0],1),
            )
    
    
            # Observed emission likelihood
            obs_i = HMMBetaEmissions(
                "emission_" + str(i),
                states=hmm_states_i,
                a = a,#np.array([1,10]).astype(int),
                b = b,#np.array([2,1]).astype(int),
                observed = ems[:,i]
            )
            
            
            hmm_states_list += [hmm_states_i]
            emissions_list += [obs_i]
            
            
    with model2:
        trace = pm.sample(2000, chains=2)     
    
    
    az.plot_trace(trace, var_names=["a"]);
    az.plot_trace(trace, var_names=["b"]);
    az.plot_trace(trace, var_names=["p_transition"]);

I strongly recommend trying a marginalized HMM (see the discussion in How to marginalized Hidden Markov Model with categorical?).

An HMM like the one in the main post uses a discrete latent variable to represent the state, which cannot be sampled with HMC/NUTS - this usually means you will get poor inference results. Marginalized HMMs have other problems, but a semi-supervised approach should give reasonable inference results.
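
To make the idea concrete, here is a minimal numpy sketch of the marginalized likelihood (the forward algorithm in log space) for the 2-state chain with Beta emissions from your post; the function name is just illustrative:

    import numpy as np
    from scipy.stats import beta
    from scipy.special import logsumexp

    def forward_loglik(y, p_init, p_transition, a, b):
        # log p(y) with the hidden states summed out
        log_emit = beta.logpdf(y[:, None], a, b)  # shape (T, n_states)
        log_alpha = np.log(p_init) + log_emit[0]
        for t in range(1, len(y)):
            # sum over the previous state in log space, then add the emission term
            log_alpha = logsumexp(log_alpha[:, None] + np.log(p_transition), axis=0) + log_emit[t]
        return logsumexp(log_alpha)

    # independent sequences simply add their log-likelihoods:
    # total_logp = sum(forward_loglik(ems[:, i], p_eq, P, a, b) for i in range(d))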


Hi Junpenglao,
Thanks for your answer. I will definitely consider marginalization.
However, I still don’t understand why it is possible to get better parameter estimates from less data.
In the provided example, I obtain better parameter estimates from a single sequence (length 100) than from anywhere between 2 and 1000 sequences (each of length 100).
Do you think this is also due to the fact that we can’t sample the discrete latent states, as you mentioned?

Most likely - I have not looked closely at the model or actually run the code, but you should check:
The MCMC traceplots of the discrete latent variables - if they are a straight line, they are stuck and your inference is not valid (see the sketch below).
The reason is usually that when you have more sequences, each with its own latent states, you are in a factorial discrete space that no current MCMC kernel can explore effectively.
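
Something along these lines, for example (a rough sketch, assuming the state variables are named as in your model2):

    import numpy as np

    samples = trace["hmm_states_0"]  # sampled states, shape (n_draws, T, 1)
    # number of distinct values each state variable visited across draws
    n_unique = np.apply_along_axis(lambda s: len(np.unique(s)), 0, samples)
    # fraction of state variables that never moved; close to 1 means stuck
    print((n_unique == 1).mean())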

I adapted your code from https://github.com/junpenglao/Planet_Sakaar_Data_Science/blob/master/PyMC3QnA/discourse_2230.ipynb for multiple sequences and it works really well!

However, I would like to change the emission CPD from a table to a continuous CPD. In my case, it would be a Beta distribution whose parameters a and b depend on the current hidden state.
I assume I need to change the onestep function (the log-likelihood), but I’m a bit uncertain how to do this. Could you provide any hints?

Also, what would be needed to make the code more general, so that the current state depends on more than just the previous state (e.g. P(X_t | X_{t-1}, X_{t-2}, ..., X_{t-n}))?
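
One direction I’m aware of for the higher-order case is to expand the state space so that a chain over state pairs is first-order again. A rough sketch (p2 is a hypothetical second-order CPD with p2[i, j, k] = P(X_t = k | X_{t-2} = i, X_{t-1} = j)):

    import numpy as np
    import itertools

    n = 2
    p2 = np.full((n, n, n), 0.5)  # placeholder second-order CPD
    pairs = list(itertools.product(range(n), repeat=2))
    # first-order chain over pairs: (i, j) can only move to (j, k)
    P_expanded = np.zeros((n * n, n * n))
    for (i, j) in pairs:
        for k in range(n):
            P_expanded[pairs.index((i, j)), pairs.index((j, k))] = p2[i, j, k]
    # P_expanded defines an ordinary first-order HMM with n**2 states

Here is my adapted code: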

import pymc3 as pm
import theano.tensor as tt
import numpy as np
import theano
import matplotlib.pyplot as plt


######### Generate Data
# simulation of data. Two state model for simplicity.
d_seqs = 30
N_seqs = 100
N_labels = 3
N_states = 2

# transition probability
P = np.array([[0.8, 0.2], [0.4, 0.6]])
# emission probability
Pe = np.array([
    [0.8, 0.1, 0.1],
    [0.3, 0.4, 0.3]
])
N_labels = Pe.shape[1]

AA = np.eye(N_states) - P + np.ones(shape=(N_states, N_states))
PA = np.linalg.solve(AA.T, np.ones(shape=(N_states)))

state_seq_d = np.full((N_seqs, d_seqs), np.nan)
label_seq_d = np.full((N_seqs, d_seqs), np.nan)

for d in range(d_seqs):
    state_seq = [np.random.choice(N_states, p=PA)]
    for i in range(1, N_seqs):
        state_seq += [np.random.choice(N_states, p=P[state_seq[-1]])]
    
    label_seq = [np.random.choice(N_labels, p=Pe[state_seq[i]])
                 for i in range(N_seqs)]

    state_seq_d[:, d] = state_seq
    label_seq_d[:, d] = label_seq


state_seq_d = state_seq_d.astype(int)
label_seq_d = label_seq_d.astype(int)

########## unsupervised model
observed_label = theano.shared(np.array(label_seq_d))

def onestep(obs, gamma_, theta, phi):
    i = tt.cast(obs, 'int32')
    # forward recursion in log space:
    # alpha[j] = logsumexp_i(gamma_[i] + log theta[i, j]) + log phi[j, obs]
    alpha = gamma_ + tt.log(theta) + \
        tt.tile(tt.log(phi[:, i]), (phi.shape[0], 1))
    return pm.math.logsumexp(alpha, axis=0).T


T = len(label_seq_d)

with pm.Model() as model:
    Pt = pm.Dirichlet('P_transition',
                      a=np.ones((N_states, N_states)),
                      shape=(N_states, N_states),
                      testval=P)

    Pem = pm.Dirichlet('P_emission',
                       a=np.ones((N_states, N_labels)),
                       shape=(N_states, N_labels),
                       testval=Pe)
    aux = 0
    for d in range(d_seqs):
        gamma = tt.log(Pem[:, label_seq_d[0,d]])
        gamma = tt.tile(gamma, (1, 1)).T
    
        result, updates = theano.scan(fn=onestep,  # function applied at each step
                                      outputs_info=gamma,  # initialization; updated automatically
                                      sequences=observed_label[:, d],  # input sequence
                                      non_sequences=[Pt, Pem],  # fixed variables passed to scan
                                      n_steps=T-1)  # number of iterations
        aux += pm.math.logsumexp(result[-1])
    obs_logp = pm.Potential('obs_logp', aux)

    trace = pm.sample(1000, tune=1000)


pm.traceplot(trace, combined=True) 


Check out pymc3-hmm if you want efficient samplers specifically for HMM (e.g. FFBS).

3 Likes

Hi, thanks for your reply. Unfortunately, I have some Theano issues with pymc3-hmm.

When running the provided example:

import numpy as np
import theano.tensor as tt
import pymc3 as pm    
from pymc3_hmm.utils import compute_steady_state
from pymc3_hmm.distributions import SwitchingProcess, DiscreteMarkovChain
from pymc3_hmm.step_methods import FFBSStep, TransMatConjugateStep


def create_poisson_zero_hmm(mu, observed, p_0_a=np.r_[5, 1], p_1_a=np.r_[3, 5]):

    p_0_rv = pm.Dirichlet("p_0", p_0_a)
    p_1_rv = pm.Dirichlet("p_1", p_1_a)

    P_tt = tt.stack([p_0_rv, p_1_rv])
    P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt))

    pi_0_tt = compute_steady_state(P_rv)

    S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt, shape=np.shape(observed)[-1])

    Y_rv = SwitchingProcess(
        "Y_t", [pm.Constant.dist(0), pm.Poisson.dist(mu)], S_rv, observed=observed
    )

    return Y_rv




np.random.seed(2032)

mu_true = 5000

with pm.Model() as sim_model:
    _ = create_poisson_zero_hmm(mu_true, np.zeros(10000))

sim_point = pm.sample_prior_predictive(samples=1, model=sim_model)
sim_point["Y_t"] = sim_point["Y_t"].squeeze()

y_t = sim_point["Y_t"]

I obtain the following error:

  File "//anaconda3/envs/pymc/lib/python3.7/site-packages/pymc3/__init__.py", line 5, in <module>
    from .distributions import *

  File "//anaconda3/envs/pymc/lib/python3.7/site-packages/pymc3/distributions/__init__.py", line 1, in <module>
    from . import timeseries

  File "//anaconda3/envs/pymc/lib/python3.7/site-packages/pymc3/distributions/timeseries.py", line 6, in <module>
    from .continuous import get_tau_sigma, Normal, Flat

  File "//anaconda3/envs/pymc/lib/python3.7/site-packages/pymc3/distributions/continuous.py", line 18, in <module>
    from .dist_math import (

  File "//anaconda3/envs/pymc/lib/python3.7/site-packages/pymc3/distributions/dist_math.py", line 12, in <module>
    from theano.scan_module import until

  File "//anaconda3/envs/pymc/lib/python3.7/site-packages/theano/scan_module/__init__.py", line 41, in <module>
    from theano.scan_module import scan_opt

  File "//anaconda3/envs/pymc/lib/python3.7/site-packages/theano/scan_module/scan_opt.py", line 71, in <module>
    from theano.scan_module import scan_op

  File "//anaconda3/envs/pymc/lib/python3.7/site-packages/theano/scan_module/scan_op.py", line 2882, in <module>
    @theano.compile.profiling.register_profiler_printer

AttributeError: module 'theano.compile' has no attribute 'profiling'

Any ideas?

After reinstalling pymc3 and pymc3-hmm, I obtain the following error when running the code from above:

with pm.Model() as sim_model:

  File "//anaconda3/envs/theano/lib/python3.7/site-packages/pymc3/model.py", line 365, in __call__
with instance:  # appends context

  File "//anaconda3/envs/theano/lib/python3.7/site-packages/pymc3/model.py", line 264, in __enter__
self._old_theano_config = set_theano_conf(self._theano_config)

  File "//anaconda3/envs/theano/lib/python3.7/site-packages/pymc3/theanof.py", line 458, in set_theano_conf
raise ValueError("Unknown theano config settings: %s" % unknown)

ValueError: Unknown theano config settings: {'compute_test_value'}

Ah, yes, we need to update pymc3-hmm to account for recent Theano-PyMC changes. One minute…

All right, just merged a PR that should fix these problems.

@brandonwillard thanks!, I will try your code.

In the meanwhile, I adapted @junpenglao 's code from https://github.com/junpenglao/Planet_Sakaar_Data_Science/blob/master/PyMC3QnA/discourse_2230.ipynb to my problem and changed the discrete emission CPD with a beta distribution.

Check out the following code:

####### Generate toy example
# transition CPD
theta = np.array([[0.8, 0.2], [0.4, 0.6]]) 
# parameters for emission CPD (beta dist)
A = [1, 10]   
B = [2, 1]


N_states = theta.shape[0]
AA = np.eye(N_states) - theta + np.ones(shape=(N_states, N_states))
PA = np.linalg.solve(AA.T, np.ones(shape=(N_states)))


T = 100  # length of sequence
ddim = 5  # number of sequences
emission_d = np.full((T,ddim), np.nan)
for d in range(ddim):
    state = np.random.choice(N_states, p=PA)  # initial state drawn from the equilibrium dist
    emission = [np.random.beta(A[state], B[state])]
    for i in range(T-1):
        state = np.random.choice(N_states, p=theta[state, :])
        emission = emission + [np.random.beta(A[state], B[state])]
    emission_d[:, d] = emission
plt.plot(emission_d)

############### Model in Pymc3
observed_label = theano.shared(np.array(emission_d))

def onestep(obs, gamma_, theta, A, B):
    i = obs
    alpha = gamma_ + tt.log(theta) + tt.tile(tt.log(pm.Beta.dist(alpha=A, beta=B).logp(i)), (theta.shape[0], 1))
    return pm.math.logsumexp(alpha, axis=0).T

T = len(emission_d)

with pm.Model() as model:
    Pt = pm.Dirichlet('P_transition',
                      a=np.ones((N_states, N_states)),
                      shape=(N_states, N_states),
                      testval=theta)


    
    A = pm.Exponential("A", lam=1, shape=(N_states,), testval=A)
    B = pm.Exponential("B", lam=1, shape=(N_states,), testval=B)
    
    
    aux = 0
    for d in range(ddim):
        gamma=tt.log(pm.Beta.dist(alpha=A, beta=B).logp(emission_d[0,d]))
        #gamma = tt.log(Pem[:, emission_d[0,d]])
        gamma = tt.tile(gamma, (1, 1)).T
    
        result, updates = theano.scan(fn=onestep,
                                      outputs_info=gamma,
                                      sequences=observed_label[:,d],
                                      non_sequences=[Pt, A, B],
                                      n_steps=T-1)
        aux += pm.math.logsumexp(result[-1])
    obs_logp = pm.Potential('obs_logp', aux)
    
    trace = pm.sample(1000, tune=1000)

The sampler fails immediately. Here is the output:

Auto-assigning NUTS sampler…
Initializing NUTS using jitter+adapt_diag…
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [B, A, P_transition]
Sampling 2 chains: 0%| | 0/4000 [00:00<?, ?draws/s]
Traceback (most recent call last):

  File "", line 35, in
    trace = pm.sample(1000, tune=1000)

  File "//anaconda3/lib/python3.7/site-packages/pymc3/sampling.py", line 449, in sample
    trace = _mp_sample(**sample_args)

  File "//anaconda3/lib/python3.7/site-packages/pymc3/sampling.py", line 999, in _mp_sample
    for draw in sampler:

  File "//anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py", line 305, in iter
    draw = ProcessAdapter.recv_draw(self._active)

  File "//anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py", line 223, in recv_draw
    six.raise_from(RuntimeError('Chain %s failed.' % proc.chain), old)

  File "", line 3, in raise_from

RuntimeError: Chain 0 failed.

Any ideas?


I found it. It was just a bug: I was taking the log of the emission likelihood twice.
It has to be:
pm.Beta.dist(alpha=A, beta=B).logp(i)
and not:
tt.log(pm.Beta.dist(alpha=A, beta=B).logp(i))
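
For reference, here is onestep with the fix applied (the gamma initialization inside the model presumably needs the same change, i.e. dropping its outer tt.log as well):

    def onestep(obs, gamma_, theta, A, B):
        # Beta.dist(...).logp() already returns the log-density,
        # so it must not be wrapped in tt.log again
        alpha = gamma_ + tt.log(theta) + tt.tile(pm.Beta.dist(alpha=A, beta=B).logp(obs), (theta.shape[0], 1))
        return pm.math.logsumexp(alpha, axis=0).T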
