Theano.scan and pm.Model() are strangely incompatible

I really have no clue here.
Somehow, my function works fine when it is not inside the pm.Model() context.
But if I move it inside the pm.Model() context, I get an error.

import theano
import theano.tensor as tt
import numpy as np
import pymc3 as pm


def step(z_now, prev_logp):
    return z_now * prev_logp


def scan(B):
    initial_logp = tt.constant(
        0.0, dtype="float64",
        name="initial_logp")
    outputs_info = [initial_logp]

    sequences = B
    result, updates = theano.scan(
        fn=step,
        outputs_info=outputs_info,
        sequences=sequences,
        n_steps=B.size)
    return result[-1]


def make_func(emis_trj):
    output = scan(emis_trj)
    return theano.function([emis_trj], output)


def main():
    n_emis = 4
    z = np.random.choice(n_emis, size=10)
    z = z.astype(np.int32)

    # This is fine
    # z_tt = tt.ivector("z")
    # f = make_func(z_tt)
    # logp = f(z)

    # Error
    with pm.Model():
        z_tt = tt.ivector("z")
        f = make_func(z_tt)
        logp = f(z)


main()
Traceback (most recent call last):
  File "theano/gof/op.py", line 625, in __call__
    storage_map[ins] = [self._get_test_value(ins)]
  File "theano/gof/op.py", line 581, in _get_test_value
    raise AttributeError('%s has no test value %s' % (v, detailed_err_msg))
AttributeError: z has no test value  
Backtrace when that variable is created:

  File "check_scan6.py", line 48, in <module>
    main()
  File "check_scan6.py", line 43, in main
    z_tt = tt.ivector("z")


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "check_scan6.py", line 48, in <module>
    main()
  File "check_scan6.py", line 44, in main
    f = make_func(z_tt)
  File "check_scan6.py", line 27, in make_func
    output = scan(emis_trj)
  File "check_scan6.py", line 22, in scan
    n_steps=B.size)
  File "theano/tensor/var.py", line 277, in <lambda>
    size = property(lambda self: self.shape[0] if self.ndim == 1 else
  File "theano/tensor/var.py", line 275, in <lambda>
    shape = property(lambda self: theano.tensor.basic.shape(self))
  File "theano/gof/op.py", line 639, in __call__
    (i, ins, node, detailed_err_msg))
ValueError: Cannot compute test value: input 0 (z) of Op Shape(z) missing default value.  
Backtrace when that variable is created:

  File "check_scan6.py", line 48, in <module>
    main()
  File "check_scan6.py", line 43, in main
    z_tt = tt.ivector("z")

You don't need to compile it into a theano function using make_func; that is for calling it from python/numpy.
Within pm.Model everything is a theano tensor (some automatic casting happens if a variable is not a tensor). Since theano.scan already outputs a tensor, you can just do: logp = scan(z_tt)

I got another error. It says there is no “test value”.

import theano
import theano.tensor as tt
import pymc3 as pm


def step(z_now, prev_logp):
    return z_now * prev_logp


def scan(B):
    initial_logp = tt.constant(
        0.0, dtype="float64",
        name="initial_logp")
    outputs_info = [initial_logp]

    sequences = B
    result, updates = theano.scan(
        fn=step,
        outputs_info=outputs_info,
        sequences=sequences,
        n_steps=B.size)
    return result[-1]


def main():
    # Error
    with pm.Model():
        z_tt = tt.ivector("z_tt")
        logp = scan(z_tt)


main()
scan_fix.py
Traceback (most recent call last):
  File "theano/gof/op.py", line 625, in __call__
    storage_map[ins] = [self._get_test_value(ins)]
  File "theano/gof/op.py", line 581, in _get_test_value
    raise AttributeError('%s has no test value %s' % (v, detailed_err_msg))
AttributeError: z_tt has no test value  
Backtrace when that variable is created:

  File "scan_fix.py", line 32, in <module>
    main()
  File "scan_fix.py", line 28, in main
    z_tt = tt.ivector("z_tt")


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "scan_fix.py", line 32, in <module>
    main()
  File "scan_fix.py", line 29, in main
    logp = scan(z_tt)
  File "scan_fix.py", line 21, in scan
    n_steps=B.size)
  File "theano/tensor/var.py", line 277, in <lambda>
    size = property(lambda self: self.shape[0] if self.ndim == 1 else
  File "theano/tensor/var.py", line 275, in <lambda>
    shape = property(lambda self: theano.tensor.basic.shape(self))
  File "theano/gof/op.py", line 639, in __call__
    (i, ins, node, detailed_err_msg))
ValueError: Cannot compute test value: input 0 (z_tt) of Op Shape(z_tt) 
missing default value.  
Backtrace when that variable is created:

  File "scan_fix.py", line 32, in <module>
    main()
  File "scan_fix.py", line 28, in main
    z_tt = tt.ivector("z_tt")


Process finished with exit code 1

When I set z_tt = np.arange(150), I get a different error.

scan_fix.py
Traceback (most recent call last):
  File "scan_fix.py", line 33, in <module>
    main()
  File "scan_fix.py", line 30, in main
    logp = scan(z_tt)
  File "scan_fix.py", line 21, in scan
    n_steps=B.size)
  File "theano/scan_module/scan.py", line 586, in scan
    scan_seqs = [seq[:actual_n_steps] for seq in scan_seqs]
  File "theano/scan_module/scan.py", line 586, in <listcomp>
    scan_seqs = [seq[:actual_n_steps] for seq in scan_seqs]
TypeError: slice indices must be integers or None or have an __index__ method

Thanks for the pointer. I finally found a fix that “seems” to work.

From https://github.com/GeekLiB/Theano-Deep-learning/blob/master/theano/tests/test_ifelse.py

I guess theano needs a test value because it doesn’t know the shape of the tt.ivector?

The fix is to add a test value.
Hopefully this won't be a performance hit when the “test value” is a large array.

import theano
import theano.tensor as tt
import pymc3 as pm
import numpy as np


def step(z_now, prev_logp):
    return z_now * prev_logp


def scan(B):
    initial_logp = tt.constant(
        0.0, dtype="float64",
        name="initial_logp")
    outputs_info = [initial_logp]

    sequences = B
    result, updates = theano.scan(
        fn=step,
        outputs_info=outputs_info,
        sequences=sequences,
        n_steps=B.size)
    return result[-1]


def main():
    # Fixed: attach a test value so theano can evaluate shapes
    z_tt_test = np.arange(150).astype(np.int32)
    with pm.Model():
        z_tt = tt.ivector("z_tt")
        z_tt.tag.test_value = z_tt_test
        logp = scan(z_tt)


main()
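
As the tracebacks show, pm.Model() switches on theano's test-value machinery inside its context, so every op is evaluated once on the test inputs at graph-construction time; the cost of a large test value is paid once at build time, not during sampling. A minimal sketch of a cheaper variant (my own aside, reusing scan() from above and assuming nothing downstream constrains the shape of z_tt): the test value only has to match dtype and ndim, not the real data size.

import numpy as np
import theano.tensor as tt
import pymc3 as pm

with pm.Model():
    z_tt = tt.ivector("z_tt")
    # A tiny placeholder is enough: the test value is only used to
    # trace the graph once, so build time stays cheap for huge inputs.
    z_tt.tag.test_value = np.zeros(2, dtype=np.int32)
    logp = scan(z_tt)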

Hmm, if your input is a counter like 0, 1, …, there should be a better way to implement it, possibly without using scan. For example, in your snippet you can use a cumprod, as sketched below.
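
A minimal sketch of the cumprod idea (my reading of the suggestion, applied to the toy scan above):

import numpy as np
import theano.tensor as tt

# The toy scan computes result[i] = z[i] * result[i-1], so
# result[-1] = prod(z) * initial_logp; cumprod gives that directly.
z_tt = tt.ivector("z_tt")
z_tt.tag.test_value = np.arange(1, 5, dtype=np.int32)
initial_logp = tt.constant(0.0, dtype="float64", name="initial_logp")
logp = tt.cumprod(z_tt)[-1] * initial_logp  # == tt.prod(z_tt) * initial_logp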

The whole thing is a hidden Markov model with a categorical emission now.
I am scanning over the observations over time, which form a 1D vector.

It is very slow. I don't know what will happen if my data has 4 million data points…

I am going to implement multiple categorical emissions later. The observations over time will then form a 2D array.
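
(A note on the 2D case: theano.scan iterates over the leading axis of each sequence, so a (T, K) emission matrix would hand the step function one length-K row per time step. A toy sketch, names hypothetical:)

import numpy as np
import theano
import theano.tensor as tt

obs = tt.imatrix("obs")  # hypothetical (T, K) emission matrix
obs.tag.test_value = np.zeros((10, 4), dtype=np.int32)

def step_2d(obs_row, acc):
    # scan delivers one row of `obs` per step; toy accumulator
    return acc + obs_row.sum()

result, _ = theano.scan(fn=step_2d,
                        sequences=obs,
                        outputs_info=tt.constant(0, dtype="int64"))
total = result[-1]

Here is the current 1D model: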

import numpy as np
import theano
import theano.tensor as tt
import pymc3 as pm
import matplotlib.pyplot as plt


# ---------------- #
# -- Simulation -- #
# ---------------- #
def make_true_parameters(n_states, n_emis):
    trans_mat = np.random.uniform(0, 1, (n_states, n_states))
    trans_mat /= np.sum(trans_mat, axis=1)[:, np.newaxis]
    init_probs = np.random.uniform(0, 1, (n_states,))
    init_probs /= np.sum(init_probs)
    emis_mat = np.random.uniform(0, 1, (n_states, n_emis))
    emis_mat /= np.sum(emis_mat, axis=1)[:, np.newaxis]
    return init_probs, trans_mat, emis_mat


def simulate_state_trj(init_probs, trans_mat, size=1000):
    n_states = init_probs.size
    out = np.empty(size, dtype=np.int8)
    out[0] = np.random.choice(n_states, p=init_probs)
    for i in range(1, size):
        out[i] = np.random.choice(n_states, p=trans_mat[out[i - 1]])
    return out


def simulate_emis_trj(state_trj, emis_mat):
    n_states, n_emis = emis_mat.shape
    size = state_trj.size
    emis_trj = np.empty(size, dtype=np.int8)
    for i in range(size):
        emis_trj[i] = np.random.choice(
            n_emis, p=emis_mat[state_trj[i]])
    return emis_trj


# -------------------- #
# -- Log-likelihood -- #
# -------------------- #
def find_phi(emis_now, emis_mat):
    emis_now = tt.cast(emis_now, 'int64')
    return emis_mat[:, emis_now]


def forward_step(emis_now, alpha, partial_logp, trans_mat, emis_mat):
    # One step of the scaled forward algorithm: fold in the emission
    # probabilities, propagate through the transition matrix, then
    # normalize alpha and accumulate the log of the normalizer.
    alpha = alpha * find_phi(emis_now, emis_mat)
    alpha = pm.math.dot(alpha, trans_mat)
    c = pm.math.sum(alpha)
    alpha = alpha / c
    partial_logp = partial_logp + pm.math.log(c)
    return alpha, partial_logp


def forward_algorithm(emis_trj, init_probs, trans_mat, emis_mat):
    initial_logp = pm.math.constant(0.0, dtype="float64", name="initial_logp")
    outputs_info = [init_probs, initial_logp]

    sequences = emis_trj
    non_sequences = [trans_mat, emis_mat]

    result, updates = theano.scan(fn=forward_step,
                                  outputs_info=outputs_info,
                                  sequences=sequences,
                                  non_sequences=non_sequences,
                                  n_steps=emis_trj.size)
    alpha_arr, partial_logp_arr = result
    return partial_logp_arr[-1]


# ---------- #
# -- Main -- #
# ---------- #
def main():
    # Model size
    n_states = 3
    n_emis = 4

    # Make true parameters
    true_pi, true_A, true_B = make_true_parameters(n_states, n_emis)

    # Simulate trajectories
    state_trj = simulate_state_trj(true_pi, true_A, size=1000)
    true_z = simulate_emis_trj(state_trj, true_B).astype(np.int64)

    with pm.Model():
        pi = pm.Dirichlet("pi", np.ones(n_states), shape=(n_states,))
        A = pm.Dirichlet("A", np.ones(n_states), shape=(
            n_states, n_states))
        B = pm.Dirichlet("B", np.ones(n_emis), shape=(
            n_states, n_emis))
        pm.DensityDist('likelihood',
                       lambda z_, pi_, A_, B_: forward_algorithm(
                           z_, pi_, A_, B_),
                       observed={'z_': true_z, 'pi_': pi, 'A_': A, 'B_': B})
        trace = pm.sample()
        _ = pm.traceplot(trace)
        plt.show()


if __name__ == '__main__':
    main()
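
For reference, the scan in forward_algorithm implements the scaled forward recursion (my reading of the code above). Writing $\phi_t[s] = \texttt{emis\_mat}[s, z_t]$,

$$\hat\alpha_t = \frac{(\hat\alpha_{t-1} \circ \phi_t)\,A}{c_t}, \qquad c_t = \sum_s \big[(\hat\alpha_{t-1} \circ \phi_t)\,A\big]_s, \qquad \log p(z_{1:T}) = \sum_{t=1}^{T} \log c_t,$$

so the normalizers $c_t$ keep $\hat\alpha$ from underflowing while their logs accumulate the log-likelihood.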

4M data points… well, I would say you definitely need to either subsample your data or find some kind of approximation.
FYI, here is a post discussing implementations of HMMs: How to marginalized Markov Chain with categorical?

How do I know whether the sampler works well for my model? I can't use PPC directly, because a permutation of the matrix parameters won't change the likelihood.

Did anything go wrong?


At first, I tried to fit a simple hidden Markov model with one categorical emission with 4 possible values. The observation is 1000 steps x 1 emission per step. An adaptation of your notebook's unsupervised model took 47 min to fit the 1000-step observation with 500 tuning steps and 500 sampling steps.

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [B, A]
Sampling 4 chains: 100%|█████████████████| 4000/4000 [46:50<00:00,  1.84s/draws]
The number of effective samples is smaller than 25% for some parameters.


My implementation took 48 min to fit the 1000-step observation with 500 tuning steps and 500 sampling steps. I set target_accept to 0.85 after getting the “There was 1 divergence after tuning. Increase `target_accept` or reparameterize.” warning. The warning then went away.

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [B, A, pi]
Sampling 4 chains: 100%|█████████████████| 4000/4000 [47:10<00:00,  3.27s/draws]
The number of effective samples is smaller than 25% for some parameters.


I have just implemented a hidden Markov model with four categorical emissions at each time step. Each categorical emission has 5 possible values. The observation is 100 steps x 4 emissions per step.

Sampling 4 chains: 100%|██████████| 8000/8000 [12:54<00:00,  3.02draws/s]
The estimated number of effective samples is smaller than 200 for some parameters.

The plot for B seems a bit odd.


Here is my adaptation of your notebook:

import numpy as np
import theano
import theano.tensor as tt
import pymc3 as pm
import matplotlib.pyplot as plt


# ---------------- #
# -- Simulation -- #
# ---------------- #
def make_true_parameters(n_states, n_emis):
    trans_mat = np.random.uniform(0, 1, (n_states, n_states))
    trans_mat /= np.sum(trans_mat, axis=1)[:, np.newaxis]
    emis_mat = np.random.uniform(0, 1, (n_states, n_emis))
    emis_mat /= np.sum(emis_mat, axis=1)[:, np.newaxis]
    return trans_mat, emis_mat


def make_trj(A, B, n_steps):
    n_states = A.shape[0]
    n_emis = B.shape[1]
    # Stationary distribution of A: solve pi @ (I - A + J) = 1,
    # where J is the all-ones matrix.
    AA = np.eye(n_states) - A + np.ones(shape=(n_states, n_states))
    pi = np.linalg.solve(AA.T, np.ones(shape=(n_states)))

    state_trj = [np.random.choice(n_states, p=pi)]
    for i in range(1, n_steps):
        state_trj += [np.random.choice(n_states, p=A[state_trj[-1]])]

    emis_trj = [np.random.choice(n_emis, p=B[state_trj[i]])
                for i in range(n_steps)]
    return pi, emis_trj


# -------------------- #
# -- Log-likelihood -- #
# -------------------- #
def onestep(obs, gamma_, A, B):
    # Log-space forward step: gamma_ is the column of log-alphas;
    # logsumexp over axis 0 marginalizes out the previous state.
    i = tt.cast(obs, 'int32')
    n_states = B.shape[0]
    B_tiled = tt.tile(tt.log(B[:, i]), (n_states, 1))
    alpha = gamma_ + tt.log(A) + B_tiled
    return pm.math.logsumexp(alpha, axis=0).T


def forward_algorithm(emis_trj, A, B):
    observed_label = theano.shared(np.array(emis_trj))
    gamma = tt.log(B[:, emis_trj[0]])
    gamma = tt.tile(gamma, (1, 1)).T
    n_steps = len(emis_trj)
    result, updates = theano.scan(fn=onestep,
                                  outputs_info=gamma,
                                  sequences=observed_label,
                                  non_sequences=[A, B],
                                  n_steps=n_steps)
    return pm.math.logsumexp(result[-1])


# ---------- #
# -- Main -- #
# ---------- #
def main():
    n_states = 3
    n_emis = 4
    n_steps = 1000
    true_A, true_B = make_true_parameters(n_states, n_emis)
    true_pi, true_z = make_trj(true_A, true_B, n_steps)

    with pm.Model() as model:
        A = pm.Dirichlet('A',
                         a=np.ones((n_states, n_states)),
                         shape=(n_states, n_states),
                         testval=true_A)

        B = pm.Dirichlet('B',
                         a=np.ones((n_states, n_emis)),
                         shape=(n_states, n_emis),
                         testval=true_B)

        potential = forward_algorithm(true_z, A, B)
        obs_logp = pm.Potential('obs_logp', potential)

        trace = pm.sample()
        _ = pm.traceplot(trace)
        plt.savefig('trace2.png')


main()

Fix the trace to try to get rid of the permutation (label-switching) problem:

def fix_params(pi, A, B):
    # Relabel the hidden states into a canonical order (sorted by the
    # row sums of A) so draws from permuted modes line up.
    indices = np.argsort(A.sum(axis=1))
    A[:] = A[indices][:, indices]
    B[:] = B[indices]
    pi[:] = pi[indices]


def fix_trace(trace):
    for i in range(len(trace)):
        trace_now = trace[i]
        fix_params(trace_now['pi'], trace_now['A'], trace_now['B'])

Yeah, in my experience it is nearly impossible to get good samples from the unsupervised model. A semi-supervised model usually works fairly well.