Hidden Markov Models

Ali Taylan Cemgil, Bogazici University

The latex equations on the Github version do not render well. Please work on a clone.


In [5]:
%matplotlib inline
import networkx as nx
#import pygraphviz
import pyparsing
import numpy as np
import matplotlib.pylab as plt

from IPython.display import Math

np.set_printoptions(precision=5, suppress=True)

In [6]:
def makeDBN(inter, intra, T, labels):
    """Unfold a graph for T time slices"""
    N = max(max([i for i,j in inter]),max([j for i,j in inter]))+1

    G = np.zeros((N*T,N*T))
    pos = []
    all_labels = []
    for n in range(N):
        pos.append((0,-n))
        all_labels.append('$'+labels[n]+'_{'+str(0+1)+"}"+'$')
        
    for e in inter:
        s,d = e
        G[s,d] = 1

    for t in range(1,T):
        for n in range(N):
            pos.append((t,-n))
            all_labels.append('$'+labels[n]+'_{'+str(t+1)+"}"+'$')

        for e in inter:
            s,d = e
            s = s + N*t
            d = d + N*t
            G[s,d] = 1
        
        for e in intra:
            s,d = e
            s = s + N*(t-1)
            d = d + N*t
            G[s,d] = 1
    return G,pos,all_labels

#inter = [(0,1),(1,2),(2,3)]
#intra = [(0,0),(1,1),(0,1),(0,2)]
#variable_names = ["r","z","x", "y"] 
inter = [(0,1)]
intra = [(0,0)]
variable_names = ["x", "y"] 
T = 5

A, pos, label_list = makeDBN(inter, intra, T, variable_names)

G = nx.DiGraph(A)
labels = {i: s for i,s in enumerate(label_list)}
plt.figure(figsize=(12,2.5))
nx.draw(G, pos, node_color="white", node_size=2500, labels=labels, font_size=24, arrows=True)
#nx.draw_graphviz(G,node_size=500, labels=labels, font_size=24, arrows=True)
plt.show()


Forward Pass

\begin{eqnarray} p(y_{1:K}) & = & \sum_{x_{1:K}} p(y_{1:K}|x_{1:K}) p(x_{1:K}) \\ & = & \underbrace{\sum_{x_K} p(y_K | x_K ) \sum_{x_{K-1}} p(x_K|x_{K-1}) \dots \sum_{x_{2}} p(x_3|x_{2}) \underbrace{ p(y_{2}|x_{2}) \overbrace{ \sum_{x_{1}} p(x_2|x_{1}) \underbrace{ p(y_{1}|x_{1}) \overbrace{ p(x_1)}^{\alpha_{1|0}}}_{\alpha_{1|1}} }^{\alpha_{2|1}} }_{\alpha_{2|2}}}_{\alpha_{K|K}} \end{eqnarray}\begin{eqnarray} \alpha_{1|0} & \equiv & p(x_1) \end{eqnarray}\begin{eqnarray} \alpha_{k|k} & \equiv & p(y_{1:k}, x_k) \end{eqnarray}\begin{eqnarray} \alpha_{k|k-1} & \equiv & p(y_{1:k-1}, x_k) \end{eqnarray}

For $k=1, 2, \dots, K$

Predict

$k=1$:

\begin{eqnarray} \alpha_{1|0}(x_1) = p(x_1) \end{eqnarray}

$k>1$:

\begin{eqnarray} {\alpha_{k|k-1}(x_k)} & = & p(y_{1:k-1}, x_k) = \sum_{x_{k-1}} p(x_k| x_{k-1}) p(y_{1:k-1}, x_{k-1}) \\ & = & \sum_{x_{k-1}} p(x_k| x_{k-1}) { \alpha_{k-1|k-1}(x_{k-1}) } \end{eqnarray}

Update

\begin{eqnarray} {\alpha_{k|k}(x_k) } & = & p(y_{1:k}, x_k) = p(y_k | x_k) p(y_{1:k-1}, x_k) \\ & = & p(y_k | x_k) {\alpha_{k|k-1}(x_k)} \end{eqnarray}

Backward Pass

\begin{eqnarray} p(y_{1:K}) & = & \sum_{x_1} p(x_1) p(y_1 | x_1 ) %underbrace{\sum_{x_2} p(x_2|x_{1}) p(y_2 | x_2 )}_{\beta_1} \dots \underbrace{ \sum_{x_{K-1}} p(x_{K-1}|x_{K-2}) p(y_{K-1} | x_{K-1} ) \underbrace{ \sum_{x_K} p(x_K|x_{K-1}) p(y_K | x_K ) \underbrace{{\pmb 1}}_{\beta_{K|K+1}}}_{\beta_{K-1|K}}}_{\beta_{K-2|K-1}} \end{eqnarray}\begin{eqnarray} \beta_{k|k+1}(x_k) & \equiv & p(y_{k+1:K}| x_k) \\ \beta_{k|k}(x_k) & \equiv & p(y_{k:K}| x_k) \end{eqnarray}

For $k=K, K-1, \dots, 1$

'Postdict' : (Backward Prediction)

$k=K$

\begin{eqnarray} \beta_{K|K+1}(x_K) & = & \mathbf{1} \end{eqnarray}

$k<K$ \begin{eqnarray} \beta_{k|k+1}(x_k) & = & p(y_{k+1:K}| x_k) = \sum_{x_{k+1}} p(x_{k+1}| x_{k}) p(y_{k+1:K}| x_{k+1}) \\ & = & \sum_{x_{k+1}} p(x_{k+1}| x_{k}) \beta_{k+1|k+1}(x_{k+1}) \end{eqnarray}

Update \begin{eqnarray} \beta_{k|k}(x_k) & = & p(y_{k:K}| x_k) = p(y_k | x_k) p(y_{k+1:K}| x_k) \\ & = & p(y_k | x_k) {\beta_{k|k+1}(x_k)} \end{eqnarray}

Numerically Stable computation of $\log(\sum_i \exp (l_i ) ))$

Derivation

\begin{eqnarray} L & = & \log(\sum_i \exp (l_i) ) = \log(\sum_i \exp (l_i) \frac{\exp(l^*)}{\exp(l^*)} ) \\ & = & \log( \exp(l^*) \sum_i \exp (l_i - l^*) ) \\ & = & l^* + \log( \sum_i \exp (l_i - l^*) ) \end{eqnarray}

Choose $l^* = \max_i l_i$


In [7]:
import numpy as np

def log_sum_exp_naive(l):
    return np.log(np.sum(np.exp(l)))

def log_sum_exp(l, axis=0):
    l_star = np.max(l, axis=axis, keepdims=True)
    return l_star + np.log(np.sum(np.exp(l - l_star),axis=axis,keepdims=True)) 
    
    
l = np.array([-1000, -10000])

print('Naive evaluation  :', log_sum_exp_naive(l))
print('Numerically stable:', log_sum_exp(l))


Naive evaluation  : -inf
Numerically stable: [-1000.]
/Users/cemgil/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:4: RuntimeWarning: divide by zero encountered in log

An implementation of the forward-backward algorithm


In [8]:
# An implementation of the forward backward algorithm
# For numerical stability, we calculate everything in the log domain

def normalize_exp(log_P, axis=None):
    a = np.max(log_P, keepdims=True, axis=axis)
    P = normalize(np.exp(log_P - a), axis=axis)
    return P


def normalize(A, axis=None):
    Z = np.sum(A, axis=axis,keepdims=True)
    idx = np.where(Z == 0)
    Z[idx] = 1
    return A/Z

def randgen(pr, N=1): 
    L = len(pr)
    return np.random.choice(range(L), size=N, replace=True, p=pr)

def predict(A, lp):
    lstar = np.max(lp)
    return lstar + np.log(np.dot(A,np.exp(lp-lstar)))


def postdict(A, lp):
    lstar = np.max(lp)
    return lstar + np.log(np.dot(np.exp(lp-lstar), A))

def update(y, logB, lp):
    return logB[y,:] + lp

In [9]:
# Generate Parameter Structure
S = 3
R = 5
A = np.random.dirichlet(0.7*np.ones(S),S).T
B = np.random.dirichlet(0.7*np.ones(R),S).T
p = np.random.dirichlet(0.7*np.ones(S)).T

logA = np.log(A)
logB = np.log(B)

# Generate Data

# Number of steps
T = 100

x = np.zeros(T,int)
y = np.zeros(T,int)
for t in range(T):
    if t==0:
        x[t] = randgen(p)
    else:
        x[t] = randgen(A[:,x[t-1]])
    
    y[t] = randgen(B[:,x[t]])

In [10]:
# Forward Pass

# Python indices start from zero so
# log \alpha_{k|k} will be in log_alpha[:,k-1]
# log \alpha_{k|k-1} will be in log_alpha_pred[:,k-1]
log_alpha  = np.zeros((S, T))
log_alpha_pred = np.zeros((S, T))
for k in range(T):
    if k==0:
        log_alpha_pred[:,0] = np.log(p)
    else:
        log_alpha_pred[:,k] = predict(A, log_alpha[:,k-1])
    
    log_alpha[:,k] = update(y[k], logB, log_alpha_pred[:,k])
    
# Backward Pass
log_beta  = np.zeros((S, T))
log_beta_post = np.zeros((S, T))

for k in range(T-1,-1,-1):
    if k==T-1:
        log_beta_post[:,k] = np.zeros(S)
    else:
        log_beta_post[:,k] = postdict(A, log_beta[:,k+1])
    
    log_beta[:,k] = update(y[k], logB, log_beta_post[:,k])

Smoothing - Forward Backward Algorithm (Two filter formulation)

\begin{eqnarray} p(y_{1:K}, x_k) & = & p(y_{1:k}, x_k) p(y_{k+1:K} | x_k) \\ & = & {\alpha_{k|k}(x_k) } {\beta_{k|k+1}(x_k)} \\ & \equiv & \gamma_k(x_k) \end{eqnarray}

In [10]:
# Smoother check
# All numbers must be equal to the marginal likelihood p(y_{1:K})

log_gamma = log_alpha + log_beta_post
print(log_sum_exp(log_gamma))


[[-145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894 -145.16894 -145.16894
  -145.16894 -145.16894 -145.16894 -145.16894]]

Viterbi Algorithm

Finding the most likely state trajectory

$+ \rightarrow \max$, $\times \rightarrow +$,

\begin{eqnarray} x^*_{1:K} & = & \arg\max_{x_{1:K}} p(y_{1:K}|x_{1:K}) p(x_{1:K}) \\ & = & \arg\max_{x_{1:K}} \log p(y_{1:K}|x_{1:K}) + \log p(x_{1:K}) \\ & = & \arg\max_{x_{1:K}} L(y_{1:K}|x_{1:K}) + L(x_{1:K}) \\ \end{eqnarray}\begin{eqnarray} p(y_{1:K}|x^*_{1:K}) & = & \max_{x_{1:K}} p(y_{1:K}|x_{1:K}) p(x_{1:K}) \\ & = & \underbrace{\max_{x_K} L(y_T | x_K ) +\max_{x_{K-1}} L(x_K|x_{K-1})}_{\alpha_K} \dots +\max_{x_{2}} L(x_3|x_{2}) \underbrace{L(y_{2}|x_{2})\overbrace{ +\max_{x_{1}} L(x_2|x_{1})}^{\alpha_{2|1}} }_{\alpha_2} \underbrace{L(y_{1}|x_{1})+\overbrace{L(x_1)}^{\alpha_{1|0}}}_{\alpha_1} \end{eqnarray}\begin{eqnarray} p(y_{1:K}|x^*_{1:K}) = \max_{x_1} p(x_1) p(y_1 | x_1 ) %underbrace{\max_{x_2} p(x_2|x_{1}) p(y_2 | x_2 )}_{\beta_1} \dots \underbrace{ \max_{x_{K-1}} p(x_{K-1}|x_{K-2}) p(y_{K-1} | x_{K-1} )}_{\beta_{K-2}} \underbrace{ \max_{x_K} p(x_K|x_{K-1}) p(y_K | x_K )}_{\beta_{K-1}} \underbrace{{\pmb 1}}_{\beta_{K}} \end{eqnarray}

In [15]:
def predict_maxm(log_A, lp):
    return np.max(log_A + lp, axis=1)

def postdict_maxm(log_A, lp):
    return np.max(log_A.T + lp, axis=1)

# Forward Pass

# Python indices start from zero so
# log \alpha_{k|k} will be in log_alpha[:,k-1]
# log \alpha_{k|k-1} will be in log_alpha_pred[:,k-1]
log_alpha  = np.zeros((S, T))
log_alpha_pred = np.zeros((S, T))
for k in range(T):
    if k==0:
        log_alpha_pred[:,0] = np.log(p)
    else:
        log_alpha_pred[:,k] = predict_maxm(logA, log_alpha[:,k-1])
    
    log_alpha[:,k] = update(y[k], logB, log_alpha_pred[:,k])
    
# Backward Pass
log_beta  = np.zeros((S, T))
log_beta_post = np.zeros((S, T))

for k in range(T-1,-1,-1):
    if k==T-1:
        log_beta_post[:,k] = np.zeros(S)
    else:
        log_beta_post[:,k] = postdict_maxm(logA, log_beta[:,k+1])
    
    log_beta[:,k] = update(y[k], logB, log_beta_post[:,k])
    
log_delta = log_alpha + log_beta_post
print(np.argmax(log_delta,axis=0))
print(x)


[0 2 2 2 0 2 2 0 2 1 0 2 1 0 2 0 2 0 2 1 0 2 2 0 0 2 1 0 2 2 2 0 2 1 0 2 1
 0 2 0 2 0 2 1 0 2 2 1 0 2 1 0 2 0 2 1 0 0 2 0 2 1 0 2 1 0 2 1 0 2 0 0 2 0
 1 0 2 2 1 0 2 0 2 2 2 1 0 0 2 0 2 1 0 1 0 2 2 1 0 1]
[0 2 2 2 0 2 0 2 0 0 0 2 1 0 0 0 2 0 2 0 0 0 2 0 0 2 0 2 0 2 2 0 2 0 2 0 2
 1 0 0 0 2 2 2 2 2 1 0 2 0 0 2 1 0 2 0 0 1 0 0 2 1 0 2 0 0 2 1 0 2 2 0 0 1
 0 0 2 2 2 0 2 0 0 2 2 1 0 0 0 0 2 2 0 1 0 0 2 1 0 0]

In [20]:
print(log_alpha[:,-2])
log_beta_post[:,-2]


[-190.76641 -193.51502 -192.13658]
Out[20]:
array([-3.39781, -3.23542, -2.82864])

Smoothing (Forward filtering - Backward smoothing), The Correction Smoother

Suppose we have computed the filtered quantities $p(x_t| y_{1:t})$ via the forward pass. The forward-backward algorithm requires storing all the observations. In a batch settings, storing the observations and filtering densities may be OK however when observations are arriving indeed sequentially, this may be not desired.

We will derive a recursive algorithm to compute the marginals $p(x_t | y_{1:T})$.

Note that if we calculate instead the so-called pairwise marginal $p(x_t, x_{t+1} | y_{1:T} )$, we can get each marginal simply by: \begin{align} p(x_t | y_{1:T}) & = \sum_{x_{t+1}} p(x_t, x_{t+1} | y_{1:T} ) & \text{Definition} \end{align}

\begin{align} p(x_t, x_{t+1} | y_{1:T} ) & = p(x_{t} |x_{t+1}, y_{1:t},y_{t+1:T} ) p(x_{t+1}|y_{1:T} ) & \text{Factorize} \\ & = p(x_{t} |x_{t+1}, y_{1:t} ) p(x_{t+1}|y_{1:T} ) & \text{Conditional Independence}\\ & = \frac{p(x_{t}, x_{t+1}| y_{1:t} )}{p(x_{t+1}| y_{1:t} )} p(x_{t+1}|y_{1:T} ) & \text{Definition of Conditional} \end{align}

This update has the form: \begin{eqnarray} \text{New Pairwise Marginal}_{t,t+1} & = & \frac{\text{Old Pairwise Marginal}_{t,t+1}}{\text{Old Marginal}_{t+1}} \times {\text{New Marginal}_{t+1}} \\ \end{eqnarray}

The old pairwise marginal can be simply calculated from the filtered marginals as

\begin{align} p(x_{t}, x_{t+1}| y_{1:t} ) & = p(x_{t+1} | x_t, y_{1:t} ) p(x_t | y_{1:t}) & \text{Definition} \\ & = p(x_{t+1} | x_t ) p(x_t | y_{1:t}) & \text{Conditional Independence} \\ & = \text{Transition Model} \times \text{Filtering distribution} \end{align}

The correction smoother calculates a factorisation of the posterior of form

\begin{eqnarray} p(x_{1:T}|y_{1:T}) & = & \frac{\prod_{t=1}^{T-1} p(x_{t}, x_{t+1} | y_{1:T}) }{ \prod_{t=2}^{T-1} p(x_{t} | y_{1:T}) } \end{eqnarray}

In [11]:
# Correction Smoother
# For numerical stability, we calculate everything in the log domain
log_gamma_corr = np.zeros_like(log_alpha)
log_gamma_corr[:,T-1] = log_alpha[:,T-1]

for k in range(T-2,-1,-1):
    log_old_pairwise_marginal = log_alpha[:,k].reshape(1,S) + logA 
    log_old_marginal = predict(A, log_alpha[:,k])
    log_new_pairwise_marginal = log_old_pairwise_marginal + log_gamma_corr[:,k+1].reshape(S,1) - log_old_marginal.reshape(S,1)
    log_gamma_corr[:,k] = log_sum_exp(log_new_pairwise_marginal, axis=0).reshape(S)
    

# Verify that result coincide

gam = normalize_exp(log_gamma, axis=0)
gam_corr = normalize_exp(log_gamma_corr, axis=0)

plt.figure(figsize=(20,5))
plt.subplot(2,1,1)
plt.imshow(gam, interpolation='nearest')
plt.subplot(2,1,2)
plt.imshow(gam_corr, interpolation='nearest')

plt.show()
#print(log_gamma)
#print(log_gamma_corr)


An Implementation of the HMM in Python

We will integrate filtering, smoothing and training functionality into an HMM object.


In [12]:
%load HiddenMarkovModel.py

In [12]:
import numpy as np

from HiddenMarkovModel import *
    
hm = HMM.from_random_parameters()

y,x = hm.generate_sequence(300)

log_alpha, log_alpha_pred = hm.forward(y)
log_gamma = hm.forward_backward_smoother(y)
log_gamma_corr, C1_corr, C2_corr, C3_corr = hm.correction_smoother(y)
C1, C2, C3, ll, V = hm.forward_only_SS(y)

print(C2)
print(C2_corr)


[[ 88.82238   2.92167  75.60473]
 [  1.16289   3.77477  21.76026]
 [ 76.7071   20.2628    7.98339]]
[[ 88.82238   2.92167  75.60473]
 [  1.16289   3.77477  21.76026]
 [ 76.7071   20.2628    7.98339]]

In [39]:
l = list()

l.insert(0,3)
l.insert(0,2)
l.insert(0,13)
l


Out[39]:
[13, 2, 3]

In [17]:
LL = hm.train_EM(y, 100)

In [18]:
plt.plot(LL)
plt.show()


Toy Example 1

A Robot moving around of a circular corridor


In [13]:
# Number of states
S = 50

# Probability of staying on the same tile
ep = 0.7
# Probability of making an arbitrary jump
kidnap = 0.01
# Probability of correct observation
a = 0.9

# Set up the transition matrix
idx = [i for i in range(1,S)]+[0]
I = np.diag(np.ones(S))
A2 = (1-kidnap)*(ep*I + (1-ep)*I[:,idx]) + kidnap*np.ones((S,S))/S
kidnap = 0.0
A = (1-kidnap)*(ep*I + (1-ep)*I[:,idx]) + kidnap*np.ones((S,S))/S

# Set up the observation matrix
c = a*np.random.randint(0,2, S) + (1-a)*np.ones(S)/2
C = np.array([c,1-c])

# Prior
p0 = np.ones(S)/S
#p0 = np.random.rand(S)
p0 = p0/sum(p0)

hm2 = HMM(p0, A2, C)

T = 100
y,x = hm2.generate_sequence(T)
xs = list()
hm = HMM(p0, A, C)



log_alpha, log_alpha_pred = hm.forward(y)
log_gamma = hm.forward_backward_smoother(y)
xs = hm.viterbi(y)

alpha = normalize_exp(log_alpha, axis=0)
alpha_pred = normalize_exp(log_alpha_pred, axis=0)
gam = normalize_exp(log_gamma, axis=0)


plt.figure(figsize=(hm.S//5,1))
plt.imshow(C[1,:].reshape(1,hm.S), interpolation='nearest', cmap='gray_r')
ax = plt.gca()
ax.set_yticks([])
ax.invert_yaxis()
plt.title('Probability of observing a black tile at state x')
plt.show()


plt.figure(figsize=(20,5))
plt.imshow(y.reshape(1,len(y)), interpolation='nearest', cmap='gray_r')
ax = plt.gca()
ax.set_yticks([])
ax.invert_yaxis()
plt.title('Observations')
plt.xlabel('time')
plt.show()

plt.figure(figsize=(20,5))
plt.imshow(alpha, interpolation='nearest', cmap='gray_r')
ax = plt.gca()
ax.invert_yaxis()
plt.title('Filtered marginals')
plt.show()

plt.figure(figsize=(20,5))
plt.imshow(alpha_pred, interpolation='nearest', cmap='gray_r')
ax = plt.gca()
ax.invert_yaxis()
plt.title('Predicted marginals')
plt.show()

plt.figure(figsize=(20,5))
plt.imshow(gam, interpolation='nearest', cmap='gray_r')
plt.plot(xs,'wo')
plt.plot(x,'.')
ax = plt.gca()
ax.invert_yaxis()
ax.set_xlim((-0.5,T-0.5))
ax.set_ylim((-0.5,S-0.5))
plt.title('Smoothed marginals')

plt.show()


/Users/cemgil/src/ipynb/notes/HiddenMarkovModel.py:17: RuntimeWarning: divide by zero encountered in log
  self.logA = np.log(self.A)

Toy Example 2

Positioning on a lake with a depth sensor.

Learning Hidden Markov Models

Prelude: Estimating parameters of an homogeneous Markov chain

$\newcommand{\ind}[1]{\left[{#1}\right]}$

We have a Markov chain with transition probabilities $p(x_t = i| x_{t-1} = j) = A_{i,j}$ and initial state $p(x_1) = \pi_i$.

The distributions are \begin{eqnarray} p(x_1 |\pi)& = &\prod_{s=1}^{S} \pi_s^{\ind{s = x_1}} \\ p(x_t | x_{t-1}, A) &=& \prod_{j=1}^{S} \prod_{s=1}^{S} A_{s,j}^{{\ind{s = x_t}}\ind{j = x_{t-1}}} \\ \end{eqnarray}

The loglikelihood is \begin{eqnarray} {\cal L}(\pi, A) & = & \log \left( p(x_1 | \pi) \prod_{t=2}^T p(x_t | x_{t-1}, A) \right) \\ & = & \sum_{s=1}^{S} {\ind{s = x_1}} \log \pi_s + \sum_{t=2}^T \sum_{j=1}^{S}\sum_{s=1}^{S} {{\ind{s = x_t}}\ind{j = x_{t-1}}} \log A_{s,j} \end{eqnarray}

We have the constraints $\sum_s \pi_s = 1$ and $\sum_i A_{i,j} = 1$ for all $j=1 \dots S$ so we have $S+1$ constraints. We write the Lagrangian \begin{eqnarray} \Lambda(\pi, A, \lambda^\pi, \lambda^A) & = & \sum_{s=1}^{S} {\ind{s = x_1}} \log \pi_s + \sum_{t=2}^T \sum_{j=1}^{S} \sum_{s=1}^{S} {{\ind{s = x_t}}\ind{j = x_{t-1}}} \log A_{s,j} \\ & & + \lambda^\pi \left( 1 - \sum_s \pi_s \right) + \sum_j \lambda^A_j \left( 1 - \sum_s A_{s,j} \right) \end{eqnarray}

To find $\pi$ and $A$ we take the derivative of the Lagrangian \begin{eqnarray} \frac{\partial \Lambda(\pi, A,\lambda^\pi, \lambda^A)}{\partial \pi_i} & = & {\ind{i = x_1}} \frac{1}{\pi_i} - \lambda^\pi = 0\\ \frac{\partial \Lambda(\pi, A, \lambda^\pi, \lambda^A)}{\partial A_{i,j}} & = & \sum_{t=2}^T {{\ind{i = x_t}}\ind{j = x_{t-1}}} \frac{1}{A_{i,j}} - \lambda^A_j = 0 \end{eqnarray}

Substitute the constraints $\sum_s \pi_s = 1$ and $\sum_s A_{s,j} = 1, \; j=1\dots S$.

\begin{eqnarray} \pi_i & = & {\ind{i = x_1}} \frac{1}{\lambda^\pi} \\ \sum_i \pi_i & = & \frac{1}{\lambda^\pi} \sum_i {\ind{i = x_1}} = 1\\ \lambda^\pi & = & 1\\ \pi_i & = & {\ind{i = x_1}} \end{eqnarray}

As we have effectively only a single observation for $x_1$, we have a crisp estimate.

\begin{eqnarray} A_{i,j} & = & \sum_{t=2}^T {{\ind{i = x_t}}\ind{j = x_{t-1}}} \frac{1}{\lambda^A_j} \\ \sum_i A_{i,j} & = & \sum_i \sum_{t=2}^T {{\ind{i = x_t}}\ind{j = x_{t-1}}} \frac{1}{\lambda^A_j} = 1 \\ \lambda^A_j & = & \sum_{t=2}^T \sum_i {{\ind{i = x_t}}\ind{j = x_{t-1}}} \\ A_{i,j} & = & \frac{\sum_{t=2}^T {{\ind{i = x_t}}\ind{j = x_{t-1}}}}{\sum_{t=2}^T \sum_i {{\ind{i = x_t}}\ind{j = x_{t-1}}}}\\ & = & \frac{\sum_{t=2}^T {{\ind{i = x_t}}\ind{j = x_{t-1}}}}{\sum_{t=2}^T \ind{j = x_{t-1}}} \end{eqnarray}

The result is intuitive. The denominator counts the number of times the chain visited state $j$ in the previous state. The numerator counts the number of times we visit $i$ given we were at $j$ the previous time.

Estimating parameters of an homogeneous Markov chain when several sequences are observed

Suppose we have observed several sequences \begin{eqnarray} X = \{x_1^{(n)}, x_2^{(n)}, \dots, x_{T_n}^{(n)} \} \end{eqnarray} for $n = 1\dots N$. Here $T_n$ is the length of the $n$'th sequence.

The notation becomes slightly more complicated but conceptully the derivation is similar.

The joint probability of all sequences is \begin{eqnarray} p(X | \pi, A) & = & \prod_n \left( p(x_1^{(n)}) \prod_{t=2}^{T_n} p(x_t^{(n)}| x_{t-1}^{(n)} ) \right) \end{eqnarray} The loglikelihood is \begin{eqnarray} {\cal L}(\pi, A) & = & \sum_n \left( \log p(x_1^{(n)}) + \sum_{t=2}^{T_n} \log p(x_t^{(n)}| x_{t-1}^{(n)} ) \right) \\ & = & \sum_n \left( \sum_{s=1}^{S} {\ind{s = x_1^{(n)}}} \log \pi_s + \sum_{t=2}^{T_n}\sum_{j=1}^{S} \sum_{s=1}^{S} {\ind{s = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}} \log A_{s,j} \right) \\ & = & \sum_{s=1}^{S} \sum_n {\ind{s = x_1^{(n)}}} \log \pi_s + \sum_n \sum_{t=2}^{T_n} \sum_{j=1}^{S} \sum_{s=1}^{S} {\ind{s = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}} \log A_{s,j} \end{eqnarray}

We write the Lagrangian

\begin{eqnarray} \Lambda(\pi, A, \lambda^\pi, \lambda^A) & = & \sum_{s=1}^{S} \sum_n {\ind{s = x_1^{(n)}}} \log \pi_s + \sum_n \sum_{t=2}^{T_n} \sum_{j=1}^{S} \sum_{s=1}^{S} {\ind{s = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}} \log A_{s,j} \\ & & + \lambda^\pi \left( 1 - \sum_s \pi_s \right) + \sum_j \lambda^A_j \left( 1 - \sum_s A_{s,j} \right) \end{eqnarray}

To find $\pi$ and $A$ we take the derivative of the Lagrangian \begin{eqnarray} \frac{\partial \Lambda(\pi, A,\lambda^\pi, \lambda^A)}{\partial \pi_i} & = & \sum_n {\ind{i = x_1^{(n)}}} \frac{1}{\pi_i} - \lambda^\pi = 0\\ \frac{\partial \Lambda(\pi, A, \lambda^\pi, \lambda^A)}{\partial A_{i,j}} & = & \sum_n \sum_{t=2}^{T_n} {{\ind{i = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}}} \frac{1}{A_{i,j}} - \lambda^A_j = 0 \end{eqnarray}

Prior

\begin{eqnarray} \pi_i & = & \sum_n {\ind{i = x_1^{(n)}}} \frac{1}{\lambda^\pi}\\ \sum \pi_i & = & \frac{1}{\lambda^\pi} \sum_i \sum_n {\ind{i = x_1^{(n)}}} = 1 \\ \lambda^\pi & = & \sum_i \sum_n {\ind{i = x_1^{(n)}}} = N \\ \pi_i & = & \frac{1}{N} \sum_n {\ind{i = x_1^{(n)}}} \end{eqnarray}

Transition Matrix

\begin{eqnarray} A_{i,j} & = & \sum_n \sum_{t=2}^{T_n} {{\ind{i = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}}} \frac{1}{\lambda^A_j} \\ \sum_i A_{i,j} & = & \sum_i \sum_n \sum_{t=2}^{T_n} {{\ind{i = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}}} \frac{1}{\lambda^A_j} = 1 \\ \lambda^A_j & = & \sum_i \sum_n \sum_{t=2}^{T_n} {{\ind{i = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}}} \\ & = & \sum_n \sum_{t=2}^{T_n} \ind{j = x_{t-1}^{(n)}} \\ A_{i,j} & = & \frac{\sum_n \sum_{t=2}^{T_n} {{\ind{i = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}}}}{\sum_n \sum_{t=2}^{T_n} {\ind{j = x_{t-1}^{(n)}}}} \end{eqnarray}

The EM Algorithm

$\newcommand{\E}[1]{\left\langle#1\right\rangle}$

The EM algorithm is a standart approach for ML estimation, when we have hidden variables. The canonical model is $p(y, x| \theta)$ where we observe only $y$.

The observed data loglikelihood is \begin{eqnarray} {\cal L}(\theta) & = & \log p(y| \theta) = \log \sum_x p(y, x| \theta) \end{eqnarray}

The key to the EM algorithm is the Jensen's inequality, that states for a concave function $f$ we have for $0 \leq \lambda \leq 1$

\begin{eqnarray} f(\lambda x_1 + (1 - \lambda) x_2) \geq \lambda f( x_1) + (1 - \lambda) f(x_2) \end{eqnarray}

In words the value of a function evaluated at the convex combination (lhs) is always equal and larger then the convex combination of the function values. As mathematical expectation \begin{eqnarray} \E{f(x)} & = & \sum_x p(x) f(x) \\ \sum_x p(x) & = & 1 \end{eqnarray}

As $\log(x)$ is a concave function, we have

\begin{eqnarray} f(\E{x}) & \geq & \E{f(x)} \end{eqnarray}

The key idea of the EM algorithm is to lower bound the observed data likelihood an maximize the bound with respect to the parameters. We take any distribution $q(x)$ \begin{eqnarray} {\cal L}(\theta) & = & \log \sum_x p(y, x| \theta) \\ & = & \log \sum_x p(y, x| \theta) \frac{q(x)}{q(x)} \\ & = & \log \E{\frac{p(y, x| \theta)}{q(x)}}_{q(x)}\\ & \geq & \E{\log {p(y, x| \theta)}}_{q(x)} -\E{\log{q(x)} }_{q(x)}\\ \end{eqnarray} For any $q(x)$, we have a lower bound. The natural strategy here is to choose the $q(x)$ that will maximise the lower bound. This is an optimisation problem. To make the notation more familiar, we let $q(x = i) = q_i$. Then, we arrive at the Lagrangian \begin{eqnarray} \Lambda(q, \lambda) & = & \sum_i q_i \log p(y, x=i| \theta) - \sum_i q_i \log q_i \\ & & + \lambda (1 - \sum_i q_i) \end{eqnarray} We take the derivative with respect to $q_i$ \begin{eqnarray} \frac{\partial \Lambda(q, \lambda)}{\partial q_k} & = & \log p(y, x=k| \theta) - (\log q_k + 1) - \lambda = 0\\ \log q_k & = & \log p(y, x=k| \theta) -1 - \lambda \\ q_k & = & p(y, x=k| \theta) \exp(-1 - \lambda) \\ \sum_k q_k & = & \exp(-1 - \lambda) \sum_k p(y, x=k| \theta) = 1\\ \exp(1 + \lambda) & = & p(y | \theta) \\ \exp(-1 - \lambda) & = & 1/p(y | \theta) \\ \end{eqnarray} hence we have \begin{eqnarray} q_k & = & p(y, x=k| \theta)/p(y | \theta) = p(x=k| y \theta) \end{eqnarray} This result shows that the best we can do is to choose the posterior distribution \begin{eqnarray} q(x) & = & p(x| y, \theta) \end{eqnarray} The EM algorithm is an iterative algorithm that exploits this bound result. Given a particular parameter setting $\theta^{(\tau)}$ at iteration $\tau$, we can compute a lower bound of the true likelihood function. \begin{eqnarray} {\cal L}(\theta) & \geq & \E{\log {p(y, x| \theta)}}_{p(x| y, \theta^{(\tau)})} -\E{\log p(x| y, \theta^{(\tau)}) }_{p(x| y, \theta^{(\tau)})}\\ & \equiv & {\cal F}[\theta; \theta^{(\tau)}] + H[p(x| y, \theta^{(\tau)})] \end{eqnarray} We need to show that \begin{eqnarray} {\cal L}(\theta^{(\tau)}) & = & \E{\log {p(y, x| \theta)}}_{p(x| y, \theta^{(\tau)})} -\E{\log p(x| y, \theta^{(\tau)}) }_{p(x| y, \theta^{(\tau)})}\\ & = & {\cal F}[\theta^{(\tau)}; \theta^{(\tau)}] + H[p(x| y, \theta^{(\tau)})] \end{eqnarray} In other words, the bound is tight at $\theta^{(\tau)}$, hence maximizing the bound guarantees maximization of the true loglikelihoood.

In most cases, where the EM algorithm can be applied, the joint distribution is from an {\it exponential family}, i.e., it has the generic algebraic form \begin{eqnarray} p(y, x| \theta) & = & b(y, x)\exp( \sum_l \phi_l(y, x) \psi(\theta_l) - A(\theta) ) \end{eqnarray} where $\phi_l$ are the sufficient statistics and $\psi(\theta_l)$ are the {\it canonical} parameters. The canonical parameters are in one to one relation with a conventional parametrisation. We will give several explicit examples when we cover the HMM's of the next section.

In the case when the complete data likelihood is an exponential family, the computation of the bound requires the expectation \begin{eqnarray} \E{\log p(y, x| \theta)} & = & \E{\log b(x, y)} + \sum_l \E{\phi_l(y, x)} \psi(\theta_l) - A(\theta) \end{eqnarray} where the expectation is taken with respect to the posterior $p(x|y, \theta^{(\tau)})$. In other words, we need to compute expectations of form $\E{\phi_l(y, x)}$. Once these are available, we have effectively an expression for ${\cal F}(\theta; \theta^{(\tau)})$. By maximisation of ${\cal F}$ with respect to $\theta$, and arrive at $\theta^{(\tau + 1)}$ and complete the iteration.

In a rather abstract sense, the EM algorithm proceeds as follows:

The Expectation/Maximization (EM) algorithm.
\begin{algorithmic} \STATE Initialise $\theta^{(0)}$ \FOR{ $\text{epoch} = 1 \dots $ MAXITER} \STATE E-step. Compute the sufficient statistics of the complete data likelihood \STATE M-step. Maximize with respect to the parameters $\theta$ to find $\theta^{(\tau + 1)}$ \ENDFOR \end{algorithmic}

Learning HMM parameters by EM

Suppose we have observed several sequences \begin{eqnarray} Y = \{y_1^{(n)}, y_2^{(n)}, \dots, y_{T_n}^{(n)} \} \end{eqnarray} for $n = 1\dots N$. Here $T_n$ is the length of the $n$'th sequence. Let $Y \in \{1,\dots, R\}$ and $X \in \{1,\dots, S \}$.

The discrete observation, discrete state space HMM has the following factors: \begin{eqnarray} p(x_1 = i) & = & \pi_i \\ p(y_k = r| x_k = i) & = & B_{r,i} \\ p(x_k = i| x_{k-1} = j) & = & A_{i,j} \end{eqnarray}

The joint probability of all observed sequences and corresponding hidden sequences are \begin{eqnarray} p(Y, X | \pi, A, B) & = & \prod_n \left( p(x_1^{(n)}) p(y_1^{(n)} | x_1^{(n)}) \prod_{t=2}^{T_n} p(y_t^{(n)} | x_t^{(n)}) p(x_t^{(n)}| x_{t-1}^{(n)} ) \right) \end{eqnarray} The expected complete data loglikelihood is

\begin{eqnarray} {\cal L}(\pi, A, B) & = & \E{\sum_n \left( \log p(x_1^{(n)}) + \sum_{t=1}^{T_n} \log p(x_t^{(n)}| x_{t-1}^{(n)} ) + \sum_{t=2}^{T_n} \log p(y_t^{(n)} | x_t^{(n)}) \right)} \\ & = & \E{\sum_n \left( \sum_{s=1}^{S} {\ind{s = x_1^{(n)}}} \log \pi_s + \log \pi_s \sum_{t=2}^{T_n}\sum_{j=1}^{S} \sum_{s=1}^{S} {\ind{s = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}} \log A_{s,j} + \sum_{t=1}^{T_n}\sum_{r=1}^{R} \sum_{i=1}^{S} {\ind{r = y_t^{(n)}}}\ind{i = x_{t}^{(n)}} \log B_{r,i}\right)} \\ & = & \sum_n \sum_{s=1}^{S} \E{{\ind{s = x_1^{(n)}}}} \log \pi_s + \sum_n \sum_{t=2}^{T_n} \sum_{j=1}^{S} \sum_{s=1}^{S} \E{{\ind{s = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}}} \log A_{s,j} + \sum_n \sum_{t=1}^{T_n}\sum_{r=1}^{R} \sum_{i=1}^{S} \E{{\ind{r = y_t^{(n)}}}\ind{i = x_{t}^{(n)}}} \log B_{r,i} \\ & = & \sum_{s=1}^{S} \underbrace{\left( \sum_n \E{ {\ind{s = x_1^{(n)}}}} \right)}_{\equiv C_{\pi}} \log \pi_s + \sum_{j=1}^{S} \sum_{s=1}^{S} \underbrace{\left( \sum_n \sum_{t=2}^{T_n} \E{{\ind{s = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}}} \right)}_{\equiv C_{A}} \log A_{s,j} + \sum_{r=1}^{R} \sum_{i=1}^{S} \underbrace{\left( \sum_n \sum_{t=1}^{T_n} \E{{\ind{r = y_t^{(n)}}}\ind{i = x_{t}^{(n)}}} \right)}_{\equiv C_{B}} \log B_{r,i} \end{eqnarray}

The M-Step

We write the Lagrangian to ensure that the columns of $A$ and $B$ are positive and normalized

\begin{eqnarray} \Lambda(\pi, A, \lambda^\pi, \lambda^A, \lambda^B) & = & \sum_{s=1}^{S} C_{\pi}(s) \log \pi_s + \sum_{j=1}^{S} \sum_{s=1}^{S} C_{A}(s,j) \log A_{s,j} + \sum_{r=1}^{R} \sum_{i=1}^{S} C_{B}(r,i) \log B_{r,i} \\ & & + \lambda^\pi \left( 1 - \sum_s \pi_s \right) + \sum_j \lambda^A_j \left( 1 - \sum_s A_{s,j} \right) + \sum_i \lambda^B_i \left( 1 - \sum_r B_{r,i} \right) \end{eqnarray}

To find $\pi$, $A$ and $B$ we take the derivative of the Lagrangian \begin{eqnarray} \frac{\partial \Lambda(\pi, A,\lambda^\pi, \lambda^A, \lambda^B)}{\partial \pi_i} & = & C_{\pi}(i) \frac{1}{\pi_i} - \lambda^\pi = 0\\ \frac{\partial \Lambda(\pi, A, \lambda^\pi, \lambda^A, \lambda^B)}{\partial A_{i,j}} & = & C_{A}(i,j) \frac{1}{A_{i,j}} - \lambda^A_j = 0 \\ \frac{\partial \Lambda(\pi, B, \lambda^\pi, \lambda^A, \lambda^B)}{\partial B_{k,i}} & = & C_{B}(k,i) \frac{1}{B_{k,i}} - \lambda^B_i = 0 \\ \end{eqnarray}

Prior

We set the derivative to zero and solve for $\pi$ \begin{eqnarray} \pi_i & = & C_{\pi}(i) \frac{1}{\lambda^\pi} \end{eqnarray} As we have the normalization constraint for $\pi$, we also have the following equality from which we can solve for the Lagrange multiplier: \begin{eqnarray} \sum_i \pi_i & = & \frac{1}{\lambda^\pi} \sum_i C_{\pi}(i) = 1 \\ \lambda^\pi & = & \sum_i C_{\pi}(i) = N \end{eqnarray} Substituting, we obtain the intuitive answer: \begin{eqnarray} \pi_i & = & \frac{1}{N} C_{\pi}(i) \end{eqnarray}

Transition Matrix

\begin{eqnarray} A_{i,j} & = & C_A(i,j) \frac{1}{\lambda^A_j} \\ \sum_i A_{i,j} & = & \sum_i C_A(i,j) \frac{1}{\lambda^A_j} = 1 \\ \lambda^A_j & = & \sum_i C_A(i,j) \\ A_{i,j} & = & \frac{C_A(i,j)}{\sum_i C_A(i,j)} \end{eqnarray}

Observation Matrix

\begin{eqnarray} B_{k,i} & = & C_B(k,i) \frac{1}{\lambda^B_i} \\ \sum_k B_{k,i} & = & \sum_k C_B(k,i) \frac{1}{\lambda^B_i} = 1 \\ \lambda^B_i & = & \sum_k C_B(k,i) \\ B_{k,i} & = & \frac{C_B(k,i)}{\sum_k C_B(k,i)} \end{eqnarray}

In [12]:
S = 3
R = 5
A = np.random.dirichlet(0.7*np.ones(S),S).T
B = np.random.dirichlet(0.7*np.ones(R),S).T
p = np.random.dirichlet(0.7*np.ones(S)).T

y = np.array([0, 1, 3, 2, 4])

hm = HMM(p, A, B)

Forward smoothers

The EM algorithm requires obtaining the sufficient statistics of an HMM.

The key observation is that the sufficient statistics are additive: \begin{eqnarray} C_t & = & \int \left(\sum_{k=2}^t s_k(x_{k-1}, x_{k}) \right) p(x_{1:t}|y_{1:t}) dx_{1:t} \end{eqnarray}

The derivation depends on the following decomposition. By the chain rule of probabilities, we write \begin{eqnarray} p(x_{1:t}|y_{1:t}) & = & p(x_{1}|x_{2:t}, y_{1:t}) \cdots p(x_{t-2}|x_{t-1:t}, y_{1:t}) p(x_{t-1}|x_t, y_{1:t}) p(x_{t}|y_{1:t}) \end{eqnarray} The key observation is that this expression admits computation in the forward direction. By conditional independence \begin{eqnarray} p(x_{1:t}|y_{1:t}) & = & p(x_{1}|x_{2}, y_{1}) \cdots p(x_{t-2}|x_{t-1}, y_{1:t-2}) p(x_{t-1}|x_t, y_{1:t-1}) p(x_{t}|y_{1:t}) \end{eqnarray}

We will use this observation as the basis of a forward recursion. First we decompose the posterior as a product of the filtering density at time $t$ and a conditional quantity familiar from the correction smoother \begin{eqnarray} C_t & = & \int \int \left(\sum_{k=2}^t s_k(x_{k-1}, x_{k})\right) p(x_{1:t-1}|y_{1:t},x_t) p(x_{t}|y_{1:t}) dx_{1:t-1} dx_t \\ & = & \int \underbrace{\left( \int \left(\sum_{k=2}^t s_k(x_{k-1}, x_{k})\right) p(x_{1:t-1}|y_{1:t-1},x_t) dx_{1:t-1} \right)}_{=V_t(x_t)} p(x_{t}|y_{1:t}) dx_t \end{eqnarray}

Due to additivity, we can decompose further \begin{eqnarray} V_t(x_t) & = & \int \left( s_t(x_{t-1}, x_{t}) + \sum_{k=2}^{t-1} s_k(x_{k-1}, x_{k}) \right) p(x_{1:t-1}|y_{1:t-1}, x_t) dx_{1:t-1} \\ & = & \int \int \left( s_t(x_{t-1}, x_{t}) + \sum_{k=2}^{t-1} s_k(x_{k-1}, x_{k}) \right) p(x_{1:t-2}|y_{1:t-2}, x_{t-1}) p(x_{t-1}|y_{1:t-1}, x_t) dx_{1:t-2} dx_{t-1} \\ & = & \int \left( s_t(x_{t-1}, x_{t}) + \int \sum_{k=2}^{t-1} s_k(x_{k-1}, x_{k}) p(x_{1:t-2}|y_{1:t-2}, x_{t-1}) dx_{1:t-2} \right) p(x_{t-1}|y_{1:t-1}, x_t) dx_{t-1} \\ & = & \int \left( s_t(x_{t-1}, x_{t}) + V_{t-1}(x_{t-1}) \right) p(x_{t-1}|y_{1:t-1}, x_t) dx_{t-1} \end{eqnarray}

\begin{eqnarray} V_1(x_1) & = & 0 \\ V_2(x_2) & = & \int s_2(x_{1}, x_{2}) p(x_{1}|y_{1}, x_2) dx_{1} \\ V_3(x_3) & = & \int \left( s_3(x_{2}, x_{3}) + V_{2}(x_{2}) \right) p(x_{2}|y_{1:2}, x_3) dx_{2} \end{eqnarray}

Example

Suppose only $y_{1:4}$ are observed. The above recursion, when applied to the sufficient statistics of an HMM has the following specific form. Recall that, $C_{\pi}, C_A, C_B$ for stand for the sufficient statistics for the estimation of the prior, state transitions and observations, respectively. Here, we will denote the expected sufficient statistics explicitely as $C_{A,t}$ where $t$ is the number of observations.

The state transition statistics are \begin{eqnarray} C_{A,4}(a,b) &=& \sum_{x_4} \sum_{x_3} \sum_{x_2} \sum_{x_1} \left( {\ind{a = x_2}}\ind{b = x_{1}} + {\ind{a = x_3}}\ind{b = x_2} + {\ind{a = x_4}}\ind{b = x_3} \right) p(x_{1:4}|y_{1,4})\\ \end{eqnarray} By introducing the forward decomposition \begin{eqnarray} C{A,4}(a,b) &=& \sum{x4} \sum{x3} \sum{x2} \left(\sum{x_1} {\ind{a = x2}}\ind{b = x{1}} p(x_1|y_1, x_2) + {\ind{a = x_3}}\ind{b = x_2} + {\ind{a = x_4}}\ind{b = x_3} \right) \ & & p(x2|y{1,2}, x_3) p(x3|y{1,3}, x_4) p(x4|y{1,4})\ &=& \sum_{x4}\sum{x3} \sum{x_2} \left( \underbrace{{\ind{a = x_2}} p(x_1=b|y_1, x2)}{V_{A,2}(x_2)} + {\ind{a = x_3}}\ind{b = x_2} + {\ind{a = x_4}}\ind{b = x_3} \right) \ & & p(x2|y{1,2}, x_3) p(x3|y{1,3}, x_4) p(x4|y{1,4}) \ &=& \sum_{x4}\sum{x_3} \left( \underbrace{p(x_1=b|y_1, x_2=a) p(x2=a|y{1,2}, x_3) + {\ind{a = x_3}} p(x2=b|y{1,2}, x3)}{V_{A,3}(x_3)} + {\ind{a = x_4}}\ind{b = x_3} \right) \ & & p(x3|y{1,3}, x_4) p(x4|y{1,4}) \ &=& \sum_{x_4} \left(p(x_1=b|y_1, x2=a)\sum{x_3} p(x2=a|y{1,2}, x_3) p(x3|y{1,3}, x_4) + p(x2=b|y{1,2}, x_3=a) p(x3=a|y{1,3}, x_4) \

  • {\ind{a = x_4}} p(x3=b |y{1,3}, x_4) \right) p(x4|y{1,4}) \ &=& p(x_1=b|y_1, x2=a)\sum{x_3} p(x2=a|y{1,2}, x3) \sum{x_4} p(x3|y{1,3}, x_4) p(x4|y{1,4}) \ & & + p(x2=b|y{1,2}, x3=a) \sum{x_4} p(x3=a|y{1,3}, x_4)p(x4|y{1,4}) \ & & + p(x3=b |y{1,3}, x_4=a) p(x4=a|y{1,4})
    \end{eqnarray}

One can verify, that the last line is indeed equal to the required sufficient statistics given below.

\begin{eqnarray} C_{A,T}(a,b) & = & \sum_{x_{1:T}} \sum_{t=2}^T \ind{a = x_t}\ind{b = x_{t-1}} \left( \prod_{t=2:T} p(x_{t-1}|y_{1:t-1},x_{t}) \right) p(x_T|y_{1:T}) \\ & = & \sum_{x_{2:T}} \left(\sum_{x_{1}} \ind{a = x_2}\ind{b = x_{1}} p(x_{1}|y_{1},x_{2}) + \sum_{t=3}^T \ind{a = x_t}\ind{b = x_{t-1}} \right) \left( \prod_{t=3:T} p(x_{t-1}|y_{1:t-1},x_{t}) \right) p(x_T|y_{1:T}) \\ V_{A,2}(a, b, x_2) & = & \ind{a = x_2} p(x_{1} = b |y_{1},x_{2}) \\ C_{A,T}(a,b) & = & \sum_{x_{3:T}} \left( \sum_{x_2} V_2(a, b, x_2) p(x_{2}|y_{1:2},x_{3}) + \sum_{x_2} \ind{a = x_3}\ind{b = x_{2}} p(x_{2}|y_{1:2},x_{3}) + \sum_{t=4}^T \ind{a = x_t}\ind{b = x_{t-1}} \right) \left( \prod_{t=4:T} p(x_{t-1}|y_{1:t-1},x_{t}) \right) p(x_T|y_{1:T}) \\ V_{A,3}(a, b, x_3) & = & \sum_{x_2} V_2(a, b, x_2) p(x_{2}|y_{1:2},x_{3}) + \sum_{x_2} \ind{a = x_3}\ind{b = x_{2}} p(x_{2}|y_{1:2},x_{3}) \\ & = & \sum_{x_2} V_2(a, b, x_2) p(x_{2}|y_{1:2},x_{3}) + \ind{a = x_3} p(x_{2}=b|y_{1:2},x_{3}) \end{eqnarray}

For $t$, the update rule is

\begin{eqnarray} V_{A,t}(a, b, x_t) & = & \sum_{x_{t-1}} V_{A,t-1}(a, b, x_{t-1}) p(x_{t-1}|y_{1:{t-1}},x_{t}) + \sum_{x_{t-1}} \ind{a = x_t}\ind{b = x_{t-1}} p(x_{t-1}|y_{1:t-1},x_{t}) \\ & = & \sum_{x_{t-1}} V_{A,t-1}(a, b, x_{t-1}) p(x_{t-1}|y_{1:{t-1}},x_{t}) + \ind{a = x_t} p(x_{t-1}=b|y_{1:t-1},x_{t}) \end{eqnarray}

The advantage of this algorithm is that it is entirely forward and has attractive space properties, requiring only $S^3 + 2S$ space. The other statistics are derived below

\begin{eqnarray} C_{B,T}(a,b) & = & \sum_{x_{1:T}} \sum_{t=1}^T \ind{a = y_t}\ind{b = x_{t}} \left( \prod_{t=2:T} p(x_{t-1}|y_{1:t-1},x_{t}) \right) p(x_T|y_{1:T}) \\ & = & \sum_{x_{2:T}} \left(\ind{a = y_1} p(x_{1}=b|y_{1},x_{2}) + \sum_{t=2}^T \ind{a = y_t}\ind{b = x_{t}} \right) \left( \prod_{t=3:T} p(x_{t-1}|y_{1:t-1},x_{t}) \right) p(x_T|y_{1:T}) \\ V_{B,2}(a, b, x_2) & = & \ind{a = y_1} p(x_{1}=b|y_{1},x_{2}) \\ C_{B,T}(a,b) & = & \sum_{x_{3:T}} \left(\sum_{x_2} V_{B,2}(a, b, x_2) p(x_{2}|y_{1:2},x_{3}) + \ind{a = y_2} p(x_{2}=b|y_{1:2},x_{3}) + \sum_{t=3}^T \ind{a = y_t}\ind{b = x_{t}} \right) \left( \prod_{t=4:T} p(x_{t-1}|y_{1:t-1},x_{t}) \right) p(x_T|y_{1:T}) \\ V_{B,3}(a, b, x_3) & = & \sum_{x_2} V_{B,2}(a, b, x_2) p(x_{2}|y_{1:2},x_{3}) + \ind{a = y_2} p(x_{2}=b|y_{1:2},x_{3}) \end{eqnarray}

For $t$, the update rule is

\begin{eqnarray} V_{B,t}(a, b, x_t) & = & \sum_{x_{t-1}} V_{B,t-1}(a, b, x_{t-1}) p(x_{t-1}|y_{1:{t-1}},x_{t}) + \ind{a = y_t} p(x_{t-1}=b|y_{1:t-1},x_{t}) \end{eqnarray}
\begin{eqnarray} C_{\pi,T}(a) & = & \sum_{x_{1:T}} \ind{a = x_{1}} \left( \prod_{t=2:T} p(x_{t-1}|y_{1:t-1},x_{t}) \right) p(x_T|y_{1:T}) \\ & = & \sum_{x_{2:T}} p(x_{1}=a|y_{1},x_{2}) \left( \prod_{t=3:T} p(x_{t-1}|y_{1:t-1},x_{t}) \right) p(x_T|y_{1:T}) \\ V_{\pi,2}(a, x_2) & = & p(x_{1}=a|y_{1},x_{2}) \\ C_{\pi,T}(a) & = & \sum_{x_{3:T}} \sum_{x_2} V_{1,2}(a, x_2) p(x_{2}|y_{1:2},x_{3}) \left( \prod_{t=4:T} p(x_{t-1}|y_{1:t-1},x_{t}) \right) p(x_T|y_{1:T}) \\ V_{\pi,3}(a, x_3) & = & \sum_{x_2} V_{\pi,2}(a, x_2) p(x_{2}|y_{1:2},x_{3}) \end{eqnarray}

For $t$, the update equation is \begin{eqnarray} V_{\pi,t}(a, x_t) & = & \sum_{x_{t-1}} V_{\pi,t-1}(a, x_{t-1}) p(x_{t-1}|y_{1:{t-1}},x_{t}) \end{eqnarray}

An implementation is given below.


In [13]:
S = 3
R = 5
A = np.random.dirichlet(0.7*np.ones(S),S).T
B = np.random.dirichlet(0.7*np.ones(R),S).T
p = np.random.dirichlet(0.7*np.ones(S)).T

logA = np.log(A)
logB = np.log(B)

hm = HMM(p, A, B)

y = np.array([1, 1, 1, 3, 2, 0, 3, 2, 1, 0, 0, 3, 2, 4])
T = y.shape[0]

# Forward only estimation of sufficient statistics
V_pi  = np.eye((S))
V_A  = np.zeros((S,S,S))
V_B  = np.zeros((R,S,S))
I_S1S = np.eye(S).reshape((S,1,S))
I_RR = np.eye(R)

for k in range(T):
    if k==0:
        log_alpha_pred = np.log(p)
    else:
        log_alpha_pred = predict(A, log_alpha)
    
    if k>0:
        # Calculate p(x_{k-1}|y_{1:k-1}, x_k) 
        lp = np.log(normalize_exp(log_alpha)).reshape(S,1) + logA.T    
        P = normalize_exp(lp, axis=0)
        
        # Update
        V_pi = np.dot(V_pi, P)             
        V_A = np.dot(V_A, P) + I_S1S*P.reshape((1,S,S))    
        V_B = np.dot(V_B, P) + I_RR[:,y[k-1]].reshape((R,1,1))*P.reshape((1,S,S))    
        
    log_alpha = update(y[k], logB, log_alpha_pred)    
    p_xT = normalize_exp(log_alpha)    
    
C1 = np.dot(V_pi, p_xT.reshape(S,1))
C2 = np.dot(V_A, p_xT.reshape(1,S,1)).reshape((S,S))
C3 = np.dot(V_B, p_xT.reshape(1,S,1)).reshape((R,S))
C3[y[-1],:] +=  p_xT
    
print("Results with the Forward Smoother")
print(C1)
print(np.sum(C1))

print(C2)
print(np.sum(C2))

print(C3)
print(np.sum(C3))

print("Results with the Correction Smoother")
lg, C1_corr, C2_corr, C3_corr = hm.correction_smoother(y)

print(C1_corr)
print(np.sum(C1_corr))

print(C2_corr)
print(np.sum(C2_corr))

print(C3_corr)
print(np.sum(C3_corr))


Results with the Forward Smoother
[[ 0.02937541]
 [ 0.01479263]
 [ 0.95583195]]
1.0
[[ 0.10147144  2.25321693  1.3688555 ]
 [ 3.29750268  2.2347144   1.2879098 ]
 [ 0.01597626  2.12380948  0.3165435 ]]
13.0
[[ 0.77975177  1.80508346  0.41516477]
 [ 1.22828121  1.31928887  1.45242992]
 [ 0.10490948  2.66139344  0.23369708]
 [ 1.30200792  0.82597504  0.87201704]
 [ 0.33796891  0.2231787   0.4388524 ]]
14.0
Results with the Correction Smoother
[ 0.02937541  0.01479263  0.95583195]
1.0
[[ 0.10147144  2.25321693  1.3688555 ]
 [ 3.29750268  2.2347144   1.2879098 ]
 [ 0.01597626  2.12380948  0.3165435 ]]
13.0
[[ 0.77975177  1.80508346  0.41516477]
 [ 1.22828121  1.31928887  1.45242992]
 [ 0.10490948  2.66139344  0.23369708]
 [ 1.30200792  0.82597504  0.87201704]
 [ 0.33796891  0.2231787   0.4388524 ]]
14.0

In [14]:
%connect_info


{
  "stdin_port": 51865, 
  "ip": "127.0.0.1", 
  "control_port": 51866, 
  "hb_port": 51867, 
  "signature_scheme": "hmac-sha256", 
  "key": "859ed0bd-25dd-44c3-8963-504c654b53f5", 
  "kernel_name": "", 
  "shell_port": 51863, 
  "transport": "tcp", 
  "iopub_port": 51864
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-41ae066c-2372-47bc-b4f9-9e82ca8021b6.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.