 Hidden Markov Model - Estimating Transition and Emission CPDs from multiple sequences - not working

Hi,
I’m trying to estimate the parameters of the Conditional Probability Distributions (CPD) in of Hidden Markov Model from multiple short sequences (using Pymc3).

The CPD for the transition probabilities of the hidden states is a probability table.
The CPD for the emissions is a Beta distribution, where the parameter values depend on the states of the hidden variable.

By adapting the code of (1), I can successfully estimate all parameters when using a single sequence (eg n=100). If increasing the size of the sequence (eg. from 100 to 300), the estimates become obviously more accurate and precise.
However, I can’t reproduce the results when using more than 1 sequence (here d=3).

Subsequently, I tried the suggestion of (2) and I wasn’t successful either.

Any ideas what I’m doing wrong?

Reproducible code:

# import libraries
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multinomial
from typing import List
import scipy.stats as stats
import pymc3 as pm
import theano.tensor as tt
import theano.tensor.slinalg as sla
import theano
import warnings
import arviz as az
warnings.simplefilter(action="ignore", category=FutureWarning)
theano.config.gcc.cxxflags = "-Wno-c++11-narrowing"

# analytic solution to obtain equilibrium distribution
def equilibrium_distribution(p_transition):

n_states = p_transition.shape
A = np.append(
arr=p_transition.T - np.eye(n_states),
values=np.ones(n_states).reshape(1, -1),
axis=0
)
b = np.transpose(np.array( * n_states + ))
p_eq = np.linalg.solve(
a=np.transpose(A).dot(A),
b=np.transpose(A).dot(b)
)
return p_eq

# create/sample states
def markov_sequence(p_init: np.array, p_transition: np.array, sequence_length: int) -> List[int]:
"""
Generate a Markov sequence based on p_init and p_transition.
"""
if p_init is None:
p_init = equilibrium_distribution(p_transition)
initial_state = list(multinomial.rvs(1, p_init)).index(1)

states = [initial_state]
for _ in range(sequence_length - 1):
p_tr = p_transition[states[-1]]
new_state = list(multinomial.rvs(1, p_tr)).index(1)
states.append(new_state)
return states

def beta_emissions(states: List[int], As: List[float], Bs: List[float]) -> List[float]:
emissions = []
for state in states:
a = As[state]
b = Bs[state]
e = stats.beta.rvs(a, b, size=1)
emissions.append(e)
return emissions

def generate(t,d, p_transition, a, b):
p_init = np.array([1,0])

# transition probability matrix
# each row must sum to one
assert p_transition[0, :].sum() == 1
assert p_transition[1, :].sum() == 1

#Generating a Markov Sequence
states_np = np.full((t, d), np.nan)
for i in range(d):
states = markov_sequence(p_init, p_transition, sequence_length=t)
states_np[:,i] = states

# Generate Emissions: states and observables
beta_ems_np = np.full((t,d), np.nan)
for i in range(d):
beta_ems = beta_emissions(states_np[:,i].astype(int), As = a, Bs = b)
beta_ems_np[:,i] = beta_ems

x=np.arange(0,t)
plt.plot(x,beta_ems_np)
plt.show()

return states_np, beta_ems_np

class HMMStates(pm.Categorical):
def __init__(self, p_transition, p_equilibrium, n_states, *args, **kwargs):
super(pm.Categorical, self).__init__(*args, **kwargs)
self.p_transition = p_transition
self.p_equilibrium = p_equilibrium
self.k = n_states
self.mode = tt.cast(0,dtype='int64')

def logp(self, x):
p_eq = self.p_equilibrium
# Broadcast out the transition probabilities,
# so that we can broadcast the calculation
# of log-likelihoods
p_tr = self.p_transition[x[:-1]]

# the logp of the initial state evaluated against the equilibrium probabilities
initial_state_logp = pm.Categorical.dist(p_eq).logp(x)

# the logp of the rest of the states.
x_i = x[1:]
ou_like = pm.Categorical.dist(p_tr).logp(x_i)
transition_logp = tt.sum(ou_like)
return initial_state_logp + transition_logp

def solve_equilibrium(n_states, p_transition):
A = tt.dmatrix('A')
A = tt.eye(n_states) - p_transition + tt.ones(shape=(n_states, n_states))
p_equilibrium = pm.Deterministic("p_equilibrium", sla.solve(A.T, tt.ones(shape=(n_states))))
return p_equilibrium

class HMMBetaEmissions(pm.Continuous):
def __init__(self, states, a, b, *args, **kwargs):
super().__init__(*args, **kwargs)
self.states = states
self.a = a
self.b = b

def logp(self, x):
"""
x: observations
"""
states = self.states
a = self.a[states] #np.array([1,10]).astype(int)[states]
b = self.b[states] #np.array([2,1]).astype(int)[states]
return tt.sum(pm.Beta.dist(alpha=a, beta=b).logp(x))

if __name__ == '__main__':

# ground truth parameters of the transition CPD
p_transition_sim = np.array(
[[0.9, 0.1],
[0.01, 0.99]]
)
# ground truth parameters of the emission CPD (beta dist)
a_sim = [1,10]
b_sim = [2,1]

model from (1) - single sequence

n = 100 # lenght of sequence
d =  1  # number of sequences
# Generate data
states, ems = generate(n, d, p_transition_sim, a_sim, b_sim)

n_states = 2
model1 = pm.Model()
with model1:
# Priors for transition matrix
p_transition = pm.Dirichlet("p_transition", a=tt.ones((n_states, n_states)), shape=(n_states, n_states))

# Solve for the equilibrium state
p_equilibrium = solve_equilibrium(n_states, p_transition)

# HMM state
hmm_states = HMMStates(
"hmm_states",
p_transition=p_transition,
p_equilibrium=p_equilibrium,
n_states=n_states,
shape=(ems.shape,ems.shape),
)

# Prior for a and b
a=pm.Exponential("a", lam=2, shape=(n_states,))
b=pm.Exponential("b", lam=2, shape=(n_states,))

# Observed emission likelihood
obs = HMMBetaEmissions(
"emission",
states=hmm_states,
a=a,
b=b,
observed = ems
)

with model1:
trace = pm.sample(2000, chains=2)

az.plot_trace(trace, var_names=["a"]);
az.plot_trace(trace, var_names=["b"]);
az.plot_trace(trace, var_names=["p_transition"]);

Results: The parameters could be estimated. It’s not perfect, but when increasing the length of the sequence, I would get better results.

model from (1) multiple sequences

n = 100 # lenght of sequence
d =  3  # number of sequences
# generate data
states, ems = generate(n,d, p_transition_sim, a_sim, b_sim)

n_states = 2
model1 = pm.Model()
with model1:
# Priors for transition matrix
p_transition = pm.Dirichlet("p_transition", a=tt.ones((n_states, n_states)), shape=(n_states, n_states))

# Solve for the equilibrium state
p_equilibrium = solve_equilibrium(n_states, p_transition)

# HMM state
hmm_states = HMMStates(
"hmm_states",
p_transition=p_transition,
p_equilibrium=p_equilibrium,
n_states=n_states,
shape=(ems.shape,ems.shape),
)

# Prior for mu and sigma
a=pm.Exponential("a", lam=2, shape=(n_states,))
b=pm.Exponential("b", lam=2, shape=(n_states,))

# Observed emission likelihood
obs = HMMBetaEmissions(
"emission",
states=hmm_states,
a=a,
b=b,
observed = ems
)

with model1:
trace = pm.sample(2000, chains=2)

az.plot_trace(trace, var_names=["a"]);
az.plot_trace(trace, var_names=["b"]);
az.plot_trace(trace, var_names=["p_transition"]);

Results: Although I have more observations available than in example 1, I obtained poorer parameter estimations. Even if I increase the number of sequences to d=1000, Im not able to estimate the parameters.

model from (2) single sequence

n = 100 # lenght of sequence
d =  1  # number of sequences
# generate data
states, ems = generate(n,d, p_transition_sim, a_sim, b_sim)

model2 = pm.Model()
with model2:
# Priors for transition matrix
p_transition = pm.Dirichlet("p_transition", a=tt.ones((n_states, n_states)), shape=(n_states, n_states))
# Priors for a and b
a=pm.Exponential("a", lam=2, shape=(n_states,))
b=pm.Exponential("b", lam=2, shape=(n_states,))

# Solve for the equilibrium state
p_equilibrium = solve_equilibrium(n_states, p_transition)

hmm_states_list = []
emissions_list = []
for i in range(d):
# HMM state
hmm_states_i = HMMStates(
"hmm_states_" + str(i),
p_transition=p_transition,
p_equilibrium=p_equilibrium,
n_states=n_states,
shape=(ems.shape,1),
)

# Observed emission likelihood
obs_i = HMMBetaEmissions(
"emission_" + str(i),
states=hmm_states_i,
a = a,#np.array([1,10]).astype(int),
b = b,#np.array([2,1]).astype(int),
observed = ems[:,i]
)

hmm_states_list += [hmm_states_i]
emissions_list += [obs_i]

with model2:
trace = pm.sample(2000, chains=2)

az.plot_trace(trace, var_names=["a"]);
az.plot_trace(trace, var_names=["b"]);
az.plot_trace(trace, var_names=["p_transition"]);

Results: The code adaption of (2) doesn’t seem to work.

model from (2) multiple sequences

n = 100 # lenght of sequence
d =  3  # number of sequences
# generate data
states, ems = generate(n,d, p_transition_sim, a_sim, b_sim)

model2 = pm.Model()
with model2:
# Priors for transition matrix
p_transition = pm.Dirichlet("p_transition", a=tt.ones((n_states, n_states)), shape=(n_states, n_states))
# Priors for a and b
a=pm.Exponential("a", lam=2, shape=(n_states,))
b=pm.Exponential("b", lam=2, shape=(n_states,))

# Solve for the equilibrium state
p_equilibrium = solve_equilibrium(n_states, p_transition)

hmm_states_list = []
emissions_list = []
for i in range(d):
# HMM state
hmm_states_i = HMMStates(
"hmm_states_" + str(i),
p_transition=p_transition,
p_equilibrium=p_equilibrium,
n_states=n_states,
shape=(ems.shape,1),
)

# Observed emission likelihood
obs_i = HMMBetaEmissions(
"emission_" + str(i),
states=hmm_states_i,
a = a,#np.array([1,10]).astype(int),
b = b,#np.array([2,1]).astype(int),
observed = ems[:,i]
)

hmm_states_list += [hmm_states_i]
emissions_list += [obs_i]

with model2:
trace = pm.sample(2000, chains=2)

az.plot_trace(trace, var_names=["a"]);
az.plot_trace(trace, var_names=["b"]);
az.plot_trace(trace, var_names=["p_transition"]);

Strongly recommend you to try the marginalized HMM (see discussion in How to marginalized Hidden Markov Model with categorical?)

HMM model as in the main post use a discrete latent variable to represent the state, which cannot be sampled using HMC/NUTS - this usually means that you will get poor inference result. Marginalized HMM have other problems but a semi-supervised method should give reasonable inference result.

2 Likes

Hi Junpenglao,
However, I still don’t understand why it is possible to get better parameter estimates with fewer data.
In the provided example, I obtain better parameter estimates when using a single sequence (length 100) compare to 2 to 1000 sequences (length 100).
Do you think this also owes to the fact that we can’t sample discrete latent states, as you mentioned?

Most likely - I have not look closely into the model or actually run the code, but you should check:
The MCMC traceplot of the discrete latent variable - if they are a straight line, then they are stuck and your inference is not valid.
The reason for this is usually when you have more sequences, and each sequence have its own latent state, you are in a factorial discrete space that there is just no good MCMC kernel currently could effectively explore.

I adapted your code from https://github.com/junpenglao/Planet_Sakaar_Data_Science/blob/master/PyMC3QnA/discourse_2230.ipynb for multiple sequences and it works really well!

However, I would like to change the emission CPD table to a continuous CPD. In my case, it would be a beta distribution where the parameters a and b depend on the current hidden state.
I assume that I need to change the onestep function (log likelihood), but I’m a bit uncertain about how to do this. Could you provide any hints?

Also, what is necessary to make the code more general so that the current state depends on more than the previous state (eg. P(X_t | X_t-1, X_t-2, … X_t-n)?

import pymc3 as pm
import theano.tensor as tt
import numpy as np
import theano
import matplotlib.pyplot as plt

######### Generate Data
# simulation of data. Two state model for simplicity.
d_seqs = 30
N_seqs = 100
N_labels = 3
N_states = 2

# transition probability
P = np.array([[0.8, 0.2], [0.4, 0.6]])
# emission probabilityu
Pe = np.array([
[0.8, 0.1, 0.1],
[0.3, 0.4, 0.3]
])
N_labels = Pe.shape

AA = np.eye(N_states) - P + np.ones(shape=(N_states, N_states))
PA = np.linalg.solve(AA.T, np.ones(shape=(N_states)))

state_seq_d = np.full((N_seqs, d_seqs), np.nan)
label_seq_d = np.full((N_seqs, d_seqs), np.nan)

for d in range(d_seqs):
state_seq = [np.random.choice(N_states, p=PA)]
for i in range(1, N_seqs):
state_seq += [np.random.choice(N_states, p=P[state_seq[-1]])]

label_seq = [np.random.choice(N_labels, p=Pe[state_seq[i]])
for i in range(N_seqs)]

state_seq_d[:, d] = state_seq
label_seq_d[:, d] = label_seq

state_seq_d = state_seq_d.astype(int)
label_seq_d = label_seq_d.astype(int)

########## unsupervised model
observed_label = theano.shared(np.array(label_seq_d))

def onestep(obs, gamma_, theta, phi):
i = tt.cast(obs, 'int32')
alpha = gamma_ + tt.log(theta) + \
tt.tile(tt.log(phi[:, i]), (phi.shape, 1))
return pm.math.logsumexp(alpha, axis=0).T

T = len(label_seq_d)

with pm.Model() as model:
Pt = pm.Dirichlet('P_transition',
a=np.ones((N_states, N_states)),
shape=(N_states, N_states),
testval=P)

Pem = pm.Dirichlet('P_emission',
a=np.ones((N_states, N_labels)),
shape=(N_states, N_labels),
testval=Pe)
aux = 0
for d in range(d_seqs):
gamma = tt.log(Pem[:, label_seq_d[0,d]])
gamma = tt.tile(gamma, (1, 1)).T

result, updates = theano.scan(fn=onestep,   # the function the scan uses in the loop
outputs_info=gamma,   #initialization occus here, update happens automatically
sequences=observed_label[:,d], #input
non_sequences=[Pt, Pem],  #unchanging variables passed to scan
n_steps=T-1)  # number of iterations
aux += pm.math.logsumexp(result[-1])
obs_logp = pm.Potential('obs_logp', aux)
#obs_logp = pm.Potential('obs_logp', pm.math.logsumexp(result[-1]))

trace = pm.sample(1000, tune=1000)

pm.traceplot(trace, combined=True)

Check out pymc3-hmm if you want efficient samplers specifically for HMM (e.g. FFBS).

2 Likes

Hi, thanks for your reply. Unfortunately I have some Theano issues with pymc3-hmm.

when running the provided example:

import numpy as np
import theano.tensor as tt
import pymc3 as pm
from pymc3_hmm.distributions import SwitchingProcess, DiscreteMarkovChain
from pymc3_hmm.step_methods import FFBSStep, TransMatConjugateStep

def create_poisson_zero_hmm(mu, observed, p_0_a=np.r_[5, 1], p_1_a=np.r_[3, 5]):

p_0_rv = pm.Dirichlet("p_0", p_0_a)
p_1_rv = pm.Dirichlet("p_1", p_1_a)

P_tt = tt.stack([p_0_rv, p_1_rv])

S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt, shape=np.shape(observed)[-1])

Y_rv = SwitchingProcess(
"Y_t", [pm.Constant.dist(0), pm.Poisson.dist(mu)], S_rv, observed=observed
)

return Y_rv

np.random.seed(2032)

mu_true = 5000

with pm.Model() as sim_model:
_ = create_poisson_zero_hmm(mu_true, np.zeros(10000))

sim_point = pm.sample_prior_predictive(samples=1, model=sim_model)
sim_point["Y_t"] = sim_point["Y_t"].squeeze()

y_t = sim_point["Y_t"]

I obtain the following error :

File "//anaconda3/envs/pymc/lib/python3.7/site-packages/pymc3/__init__.py", line 5, in <module>
from .distributions import *

File "//anaconda3/envs/pymc/lib/python3.7/site-packages/pymc3/distributions/__init__.py", line 1, in <module>
from . import timeseries

File "//anaconda3/envs/pymc/lib/python3.7/site-packages/pymc3/distributions/timeseries.py", line 6, in <module>
from .continuous import get_tau_sigma, Normal, Flat

File "//anaconda3/envs/pymc/lib/python3.7/site-packages/pymc3/distributions/continuous.py", line 18, in <module>
from .dist_math import (

File "//anaconda3/envs/pymc/lib/python3.7/site-packages/pymc3/distributions/dist_math.py", line 12, in <module>
from theano.scan_module import until

File "//anaconda3/envs/pymc/lib/python3.7/site-packages/theano/scan_module/__init__.py", line 41, in <module>
from theano.scan_module import scan_opt

File "//anaconda3/envs/pymc/lib/python3.7/site-packages/theano/scan_module/scan_opt.py", line 71, in <module>
from theano.scan_module import scan_op

File "//anaconda3/envs/pymc/lib/python3.7/site-packages/theano/scan_module/scan_op.py", line 2882, in <module>
@theano.compile.profiling.register_profiler_printer

AttributeError: module 'theano.compile' has no attribute 'profiling'

Any ideas?

after reinstalling pymc3 and pymc3-hmm I obtain the following error when compiling the code from above:

with pm.Model() as sim_model:

File "//anaconda3/envs/theano/lib/python3.7/site-packages/pymc3/model.py", line 365, in __call__
with instance:  # appends context

File "//anaconda3/envs/theano/lib/python3.7/site-packages/pymc3/model.py", line 264, in __enter__
self._old_theano_config = set_theano_conf(self._theano_config)

File "//anaconda3/envs/theano/lib/python3.7/site-packages/pymc3/theanof.py", line 458, in set_theano_conf
raise ValueError("Unknown theano config settings: %s" % unknown)

ValueError: Unknown theano config settings: {'compute_test_value'}

Ah, yes, we need to update pymc3-hmm to account for recent Theano-PyMC changes. One minute…

All right, just merged a PR that should fix these problems.

@brandonwillard thanks!, I will try your code.

In the meanwhile, I adapted @junpenglao 's code from https://github.com/junpenglao/Planet_Sakaar_Data_Science/blob/master/PyMC3QnA/discourse_2230.ipynb to my problem and changed the discrete emission CPD with a beta distribution.

Check out the following code:

#######generate toy example
# transition CPD
theta = np.array([[0.8, 0.2], [0.4, 0.6]])
# parameters for emission CPD (beta dist)
A = [1, 10]
B = [2, 1]

N_states = theta.shape
AA = np.eye(N_states) - theta + np.ones(shape=(N_states, N_states))
PA = np.linalg.solve(AA.T, np.ones(shape=(N_states)))

T = 100 # lenght of sequence
ddim = 5 #number of sequences
emission_d = np.full((T,ddim), np.nan)
for d in range(ddim):
state = np.argmax(np.random.binomial(1, PA)) #start
emission=[np.random.beta(A[state], B[state], size=1)]
for i in range(T-1):
state=np.argmax(np.random.binomial(1, theta[state,:]))
emission=emission+[np.random.beta(A[state], B[state], size=1)]
emission_d[:,d] = emission
plt.plot(emission_d)

############### Model in Pymc3
observed_label = theano.shared(np.array(emission_d))

def onestep(obs, gamma_, theta, A, B):
i = obs
alpha = gamma_ + tt.log(theta) + tt.tile(tt.log(pm.Beta.dist(alpha=A, beta=B).logp(i)), (theta.shape, 1))
return pm.math.logsumexp(alpha, axis=0).T

T = len(emission_d)

with pm.Model() as model:
Pt = pm.Dirichlet('P_transition',
a=np.ones((N_states, N_states)),
shape=(N_states, N_states),
testval=theta)

A = pm.Exponential("A", lam=1, shape=(N_states,), testval = A)
B = pm.Exponential("B", lam=1, shape=(N_states,), testval = B)

aux = 0
for d in range(ddim):
gamma=tt.log(pm.Beta.dist(alpha=A, beta=B).logp(emission_d[0,d]))
#gamma = tt.log(Pem[:, emission_d[0,d]])
gamma = tt.tile(gamma, (1, 1)).T

outputs_info=gamma,
sequences=observed_label[:,d],
non_sequences=[Pt, A, B],
n_steps=T-1)
aux += pm.math.logsumexp(result[-1])
obs_logp = pm.Potential('obs_logp', aux)

trace = pm.sample(1000, tune=1000)

The sampler fails immediately:
Here is the output:

Auto-assigning NUTS sampler…
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [B, A, P_transition]
Sampling 2 chains: 0%| | 0/4000 [00:00<?, ?draws/s]
Traceback (most recent call last):

File “”, line 35, in
trace = pm.sample(1000, tune=1000)

File “//anaconda3/lib/python3.7/site-packages/pymc3/sampling.py”, line 449, in sample
trace = _mp_sample(**sample_args)

File “//anaconda3/lib/python3.7/site-packages/pymc3/sampling.py”, line 999, in _mp_sample
for draw in sampler:

File “//anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py”, line 305, in iter

File “//anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py”, line 223, in recv_draw
six.raise_from(RuntimeError(‘Chain %s failed.’ % proc.chain), old)

File “”, line 3, in raise_from

RuntimeError: Chain 0 failed.

Any ideas?

I found it. It was just a bug. I took 2 times the log of the emission likelihood.
It has to be:
pm.Beta.dist(alpha=A, beta=B).logp(i)
and not:
tt.log(pm.Beta.dist(alpha=A, beta=B).logp(i))