In [5]:
%matplotlib inline
import networkx as nx
#import pygraphviz
import pyparsing
import numpy as np
import matplotlib.pylab as plt
from IPython.display import Math
np.set_printoptions(precision=5, suppress=True)
In [6]:
def makeDBN(inter, intra, T, labels):
    """Unfold a graph for T time slices"""
    N = max(max([i for i,j in inter]), max([j for i,j in inter])) + 1
    G = np.zeros((N*T, N*T))
    pos = []
    all_labels = []
    for n in range(N):
        pos.append((0, -n))
        all_labels.append('$'+labels[n]+'_{'+str(0+1)+"}"+'$')
    for e in inter:
        s, d = e
        G[s, d] = 1
    for t in range(1, T):
        for n in range(N):
            pos.append((t, -n))
            all_labels.append('$'+labels[n]+'_{'+str(t+1)+"}"+'$')
        for e in inter:
            s, d = e
            s = s + N*t
            d = d + N*t
            G[s, d] = 1
        for e in intra:
            s, d = e
            s = s + N*(t-1)
            d = d + N*t
            G[s, d] = 1
    return G, pos, all_labels
#inter = [(0,1),(1,2),(2,3)]
#intra = [(0,0),(1,1),(0,1),(0,2)]
#variable_names = ["r","z","x", "y"]
inter = [(0,1)]
intra = [(0,0)]
variable_names = ["x", "y"]
T = 5
A, pos, label_list = makeDBN(inter, intra, T, variable_names)
G = nx.DiGraph(A)
labels = {i: s for i,s in enumerate(label_list)}
plt.figure(figsize=(12,2.5))
nx.draw(G, pos, node_color="white", node_size=2500, labels=labels, font_size=24, arrows=True)
#nx.draw_graphviz(G,node_size=500, labels=labels, font_size=24, arrows=True)
plt.show()
For $k=1, 2, \dots, K$
Predict
$k=1$:
\begin{eqnarray} \alpha_{1|0}(x_1) = p(x_1) \end{eqnarray}
$k>1$:
\begin{eqnarray} {\alpha_{k|k-1}(x_k)} & = & p(y_{1:k-1}, x_k) = \sum_{x_{k-1}} p(x_k| x_{k-1}) p(y_{1:k-1}, x_{k-1}) \\ & = & \sum_{x_{k-1}} p(x_k| x_{k-1}) { \alpha_{k-1|k-1}(x_{k-1}) } \end{eqnarray}
Update
\begin{eqnarray} {\alpha_{k|k}(x_k) } & = & p(y_{1:k}, x_k) = p(y_k | x_k) p(y_{1:k-1}, x_k) \\ & = & p(y_k | x_k) {\alpha_{k|k-1}(x_k)} \end{eqnarray}
For $k=K, K-1, \dots, 1$
'Postdict' (backward prediction)
$k=K$
\begin{eqnarray} \beta_{K|K+1}(x_K) & = & \mathbf{1} \end{eqnarray}
$k<K$:
\begin{eqnarray} \beta_{k|k+1}(x_k) & = & p(y_{k+1:K}| x_k) = \sum_{x_{k+1}} p(x_{k+1}| x_{k}) p(y_{k+1:K}| x_{k+1}) \\ & = & \sum_{x_{k+1}} p(x_{k+1}| x_{k}) \beta_{k+1|k+1}(x_{k+1}) \end{eqnarray}
Update
\begin{eqnarray} \beta_{k|k}(x_k) & = & p(y_{k:K}| x_k) = p(y_k | x_k) p(y_{k+1:K}| x_k) \\ & = & p(y_k | x_k) {\beta_{k|k+1}(x_k)} \end{eqnarray}
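Before the numerically stable log-domain implementation developed below, here is a minimal sketch of these recursions directly in the probability domain. It assumes a transition matrix `A` with `A[i,j] = p(x_k=i|x_{k-1}=j)`, an observation matrix `B` with `B[r,i] = p(y_k=r|x_k=i)` and a prior `p`, as generated in later cells, and it underflows for long sequences.
In [ ]:
# Plain-probability forward-backward sketch (illustrative; unstable for long T)
def forward_backward_naive(A, B, p, y):
    S, T = A.shape[0], len(y)
    alpha = np.zeros((S, T))       # alpha_{k|k}(x_k)   = p(y_{1:k}, x_k)
    beta_post = np.zeros((S, T))   # beta_{k|k+1}(x_k)  = p(y_{k+1:K} | x_k)
    alpha[:, 0] = p * B[y[0], :]
    for k in range(1, T):
        alpha[:, k] = B[y[k], :] * (A @ alpha[:, k-1])              # update(predict)
    beta_post[:, -1] = 1.0
    for k in range(T-2, -1, -1):
        beta_post[:, k] = A.T @ (B[y[k+1], :] * beta_post[:, k+1])  # postdict(update)
    return alpha, beta_post        # alpha * beta_post is proportional to the smoothed marginal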
Derivation
\begin{eqnarray} L & = & \log(\sum_i \exp (l_i) ) = \log(\sum_i \exp (l_i) \frac{\exp(l^*)}{\exp(l^*)} ) \\ & = & \log( \exp(l^*) \sum_i \exp (l_i - l^*) ) \\ & = & l^* + \log( \sum_i \exp (l_i - l^*) ) \end{eqnarray}
Choose $l^* = \max_i l_i$.
In [7]:
import numpy as np
def log_sum_exp_naive(l):
    return np.log(np.sum(np.exp(l)))

def log_sum_exp(l, axis=0):
    l_star = np.max(l, axis=axis, keepdims=True)
    return l_star + np.log(np.sum(np.exp(l - l_star), axis=axis, keepdims=True))
l = np.array([-1000, -10000])
print('Naive evaluation :', log_sum_exp_naive(l))
print('Numerically stable:', log_sum_exp(l))
In [8]:
# An implementation of the forward backward algorithm
# For numerical stability, we calculate everything in the log domain
def normalize_exp(log_P, axis=None):
    a = np.max(log_P, keepdims=True, axis=axis)
    P = normalize(np.exp(log_P - a), axis=axis)
    return P

def normalize(A, axis=None):
    Z = np.sum(A, axis=axis, keepdims=True)
    idx = np.where(Z == 0)
    Z[idx] = 1
    return A/Z

def randgen(pr, N=1):
    L = len(pr)
    return np.random.choice(range(L), size=N, replace=True, p=pr)

def predict(A, lp):
    lstar = np.max(lp)
    return lstar + np.log(np.dot(A, np.exp(lp-lstar)))

def postdict(A, lp):
    lstar = np.max(lp)
    return lstar + np.log(np.dot(np.exp(lp-lstar), A))

def update(y, logB, lp):
    return logB[y,:] + lp
In [9]:
# Generate Parameter Structure
S = 3
R = 5
A = np.random.dirichlet(0.7*np.ones(S),S).T
B = np.random.dirichlet(0.7*np.ones(R),S).T
p = np.random.dirichlet(0.7*np.ones(S)).T
logA = np.log(A)
logB = np.log(B)
# Generate Data
# Number of steps
T = 100
x = np.zeros(T,int)
y = np.zeros(T,int)
for t in range(T):
    if t==0:
        x[t] = randgen(p)
    else:
        x[t] = randgen(A[:,x[t-1]])
    y[t] = randgen(B[:,x[t]])
In [10]:
# Forward Pass
# Python indices start from zero so
# log \alpha_{k|k} will be in log_alpha[:,k-1]
# log \alpha_{k|k-1} will be in log_alpha_pred[:,k-1]
log_alpha = np.zeros((S, T))
log_alpha_pred = np.zeros((S, T))
for k in range(T):
    if k==0:
        log_alpha_pred[:,0] = np.log(p)
    else:
        log_alpha_pred[:,k] = predict(A, log_alpha[:,k-1])
    log_alpha[:,k] = update(y[k], logB, log_alpha_pred[:,k])
# Backward Pass
log_beta = np.zeros((S, T))
log_beta_post = np.zeros((S, T))
for k in range(T-1,-1,-1):
    if k==T-1:
        log_beta_post[:,k] = np.zeros(S)
    else:
        log_beta_post[:,k] = postdict(A, log_beta[:,k+1])
    log_beta[:,k] = update(y[k], logB, log_beta_post[:,k])
In [10]:
# Smoother check
# All columns must give the same value: the log marginal likelihood log p(y_{1:K})
log_gamma = log_alpha + log_beta_post
print(log_sum_exp(log_gamma))
Finding the most likely state trajectory
Working in the log domain, we replace sums with maxima and products with sums: $+ \rightarrow \max$, $\times \rightarrow +$.
\begin{eqnarray} x^*_{1:K} & = & \arg\max_{x_{1:K}} p(y_{1:K}|x_{1:K}) p(x_{1:K}) \\ & = & \arg\max_{x_{1:K}} \log p(y_{1:K}|x_{1:K}) + \log p(x_{1:K}) \\ & = & \arg\max_{x_{1:K}} L(y_{1:K}|x_{1:K}) + L(x_{1:K}) \end{eqnarray}
The maximum can be computed by a forward sweep, analogous to the forward pass of the smoother:
\begin{eqnarray} L(y_{1:K}, x^*_{1:K}) & = & \max_{x_{1:K}} L(y_{1:K}|x_{1:K}) + L(x_{1:K}) \\ & = & \underbrace{\max_{x_K} L(y_K | x_K ) +\max_{x_{K-1}} L(x_K|x_{K-1})}_{\alpha_K} \dots +\max_{x_{2}} L(x_3|x_{2}) \underbrace{L(y_{2}|x_{2})\overbrace{ +\max_{x_{1}} L(x_2|x_{1})}^{\alpha_{2|1}} }_{\alpha_2} \underbrace{L(y_{1}|x_{1})+\overbrace{L(x_1)}^{\alpha_{1|0}}}_{\alpha_1} \end{eqnarray}
Equivalently, by a backward sweep:
\begin{eqnarray} p(y_{1:K}, x^*_{1:K}) & = & \max_{x_1} p(x_1) p(y_1 | x_1 ) \underbrace{\max_{x_2} p(x_2|x_{1}) p(y_2 | x_2 )}_{\beta_1} \dots \underbrace{ \max_{x_{K-1}} p(x_{K-1}|x_{K-2}) p(y_{K-1} | x_{K-1} )}_{\beta_{K-2}} \underbrace{ \max_{x_K} p(x_K|x_{K-1}) p(y_K | x_K )}_{\beta_{K-1}} \underbrace{{\pmb 1}}_{\beta_{K}} \end{eqnarray}
In [15]:
def predict_maxm(log_A, lp):
    return np.max(log_A + lp, axis=1)

def postdict_maxm(log_A, lp):
    return np.max(log_A.T + lp, axis=1)
# Forward Pass
# Python indices start from zero so
# log \alpha_{k|k} will be in log_alpha[:,k-1]
# log \alpha_{k|k-1} will be in log_alpha_pred[:,k-1]
log_alpha = np.zeros((S, T))
log_alpha_pred = np.zeros((S, T))
for k in range(T):
if k==0:
log_alpha_pred[:,0] = np.log(p)
else:
log_alpha_pred[:,k] = predict_maxm(logA, log_alpha[:,k-1])
log_alpha[:,k] = update(y[k], logB, log_alpha_pred[:,k])
# Backward Pass
log_beta = np.zeros((S, T))
log_beta_post = np.zeros((S, T))
for k in range(T-1,-1,-1):
    if k==T-1:
        log_beta_post[:,k] = np.zeros(S)
    else:
        log_beta_post[:,k] = postdict_maxm(logA, log_beta[:,k+1])
    log_beta[:,k] = update(y[k], logB, log_beta_post[:,k])
log_delta = log_alpha + log_beta_post
print(np.argmax(log_delta,axis=0))
print(x)
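The max-marginals printed above give the most likely state at each time individually; recovering the jointly most likely trajectory additionally needs backpointers and a backtracking pass (this is what `hm.viterbi` does later in this notebook). A minimal sketch, reusing `logA`, `logB`, `p` and `y` from the cells above:
In [ ]:
# Max-product forward pass with backpointers, then backtracking (illustrative sketch)
def viterbi_sketch(logA, logB, logp, y):
    S, T = logA.shape[0], len(y)
    delta = np.zeros((S, T))       # best log-score of any path ending in each state
    psi = np.zeros((S, T), int)    # backpointers
    delta[:, 0] = logp + logB[y[0], :]
    for k in range(1, T):
        scores = logA + delta[:, k-1]       # scores[i, j]: best path entering i from j
        psi[:, k] = np.argmax(scores, axis=1)
        delta[:, k] = np.max(scores, axis=1) + logB[y[k], :]
    xs = np.zeros(T, int)
    xs[-1] = np.argmax(delta[:, -1])
    for k in range(T-2, -1, -1):
        xs[k] = psi[xs[k+1], k+1]
    return xs

print(viterbi_sketch(logA, logB, np.log(p), y))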
In [20]:
print(log_alpha[:,-2])
log_beta_post[:,-2]
Out[20]:
Suppose we have computed the filtered quantities $p(x_t| y_{1:t})$ via the forward pass. The forward-backward algorithm requires storing all the observations. In a batch setting, storing the observations and the filtering densities may be acceptable; however, when observations truly arrive sequentially, this may not be desirable.
We will derive a recursive algorithm to compute the marginals $p(x_t | y_{1:T})$.
Note that if we calculate instead the so-called pairwise marginal $p(x_t, x_{t+1} | y_{1:T} )$, we can get each marginal simply by: \begin{align} p(x_t | y_{1:T}) & = \sum_{x_{t+1}} p(x_t, x_{t+1} | y_{1:T} ) & \text{Definition} \end{align}
\begin{align} p(x_t, x_{t+1} | y_{1:T} ) & = p(x_{t} |x_{t+1}, y_{1:t},y_{t+1:T} ) p(x_{t+1}|y_{1:T} ) & \text{Factorize} \\ & = p(x_{t} |x_{t+1}, y_{1:t} ) p(x_{t+1}|y_{1:T} ) & \text{Conditional Independence}\\ & = \frac{p(x_{t}, x_{t+1}| y_{1:t} )}{p(x_{t+1}| y_{1:t} )} p(x_{t+1}|y_{1:T} ) & \text{Definition of Conditional} \end{align}This update has the form: \begin{eqnarray} \text{New Pairwise Marginal}_{t,t+1} & = & \frac{\text{Old Pairwise Marginal}_{t,t+1}}{\text{Old Marginal}_{t+1}} \times {\text{New Marginal}_{t+1}} \\ \end{eqnarray}
The old pairwise marginal can be simply calculated from the filtered marginals as
\begin{align} p(x_{t}, x_{t+1}| y_{1:t} ) & = p(x_{t+1} | x_t, y_{1:t} ) p(x_t | y_{1:t}) & \text{Definition} \\ & = p(x_{t+1} | x_t ) p(x_t | y_{1:t}) & \text{Conditional Independence} \\ & = \text{Transition Model} \times \text{Filtering distribution} \end{align}The correction smoother calculates a factorisation of the posterior of form
\begin{eqnarray} p(x_{1:T}|y_{1:T}) & = & \frac{\prod_{t=1}^{T-1} p(x_{t}, x_{t+1} | y_{1:T}) }{ \prod_{t=2}^{T-1} p(x_{t} | y_{1:T}) } \end{eqnarray}
In [11]:
# Correction Smoother
# For numerical stability, we calculate everything in the log domain
log_gamma_corr = np.zeros_like(log_alpha)
log_gamma_corr[:,T-1] = log_alpha[:,T-1]
for k in range(T-2,-1,-1):
    log_old_pairwise_marginal = log_alpha[:,k].reshape(1,S) + logA
    log_old_marginal = predict(A, log_alpha[:,k])
    log_new_pairwise_marginal = log_old_pairwise_marginal + log_gamma_corr[:,k+1].reshape(S,1) - log_old_marginal.reshape(S,1)
    log_gamma_corr[:,k] = log_sum_exp(log_new_pairwise_marginal, axis=0).reshape(S)
# Verify that the results coincide
gam = normalize_exp(log_gamma, axis=0)
gam_corr = normalize_exp(log_gamma_corr, axis=0)
plt.figure(figsize=(20,5))
plt.subplot(2,1,1)
plt.imshow(gam, interpolation='nearest')
plt.subplot(2,1,2)
plt.imshow(gam_corr, interpolation='nearest')
plt.show()
#print(log_gamma)
#print(log_gamma_corr)
In [12]:
%load HiddenMarkovModel.py
In [12]:
import numpy as np
from HiddenMarkovModel import *
hm = HMM.from_random_parameters()
y,x = hm.generate_sequence(300)
log_alpha, log_alpha_pred = hm.forward(y)
log_gamma = hm.forward_backward_smoother(y)
log_gamma_corr, C1_corr, C2_corr, C3_corr = hm.correction_smoother(y)
C1, C2, C3, ll, V = hm.forward_only_SS(y)
print(C2)
print(C2_corr)
In [39]:
l = list()
l.insert(0,3)
l.insert(0,2)
l.insert(0,13)
l
Out[39]:
In [17]:
LL = hm.train_EM(y, 100)
In [18]:
plt.plot(LL)
plt.show()
In [13]:
# Number of states
S = 50
# Probability of staying on the same tile
ep = 0.7
# Probability of making an arbitrary jump
kidnap = 0.01
# Probability of correct observation
a = 0.9
# Set up the transition matrix
idx = [i for i in range(1,S)]+[0]
I = np.diag(np.ones(S))
A2 = (1-kidnap)*(ep*I + (1-ep)*I[:,idx]) + kidnap*np.ones((S,S))/S
kidnap = 0.0
A = (1-kidnap)*(ep*I + (1-ep)*I[:,idx]) + kidnap*np.ones((S,S))/S
# Set up the observation matrix
c = a*np.random.randint(0,2, S) + (1-a)*np.ones(S)/2
C = np.array([c,1-c])
# Prior
p0 = np.ones(S)/S
#p0 = np.random.rand(S)
p0 = p0/sum(p0)
hm2 = HMM(p0, A2, C)
T = 100
y,x = hm2.generate_sequence(T)
xs = list()
hm = HMM(p0, A, C)
log_alpha, log_alpha_pred = hm.forward(y)
log_gamma = hm.forward_backward_smoother(y)
xs = hm.viterbi(y)
alpha = normalize_exp(log_alpha, axis=0)
alpha_pred = normalize_exp(log_alpha_pred, axis=0)
gam = normalize_exp(log_gamma, axis=0)
plt.figure(figsize=(hm.S//5,1))
plt.imshow(C[1,:].reshape(1,hm.S), interpolation='nearest', cmap='gray_r')
ax = plt.gca()
ax.set_yticks([])
ax.invert_yaxis()
plt.title('Probability of observing a black tile at state x')
plt.show()
plt.figure(figsize=(20,5))
plt.imshow(y.reshape(1,len(y)), interpolation='nearest', cmap='gray_r')
ax = plt.gca()
ax.set_yticks([])
ax.invert_yaxis()
plt.title('Observations')
plt.xlabel('time')
plt.show()
plt.figure(figsize=(20,5))
plt.imshow(alpha, interpolation='nearest', cmap='gray_r')
ax = plt.gca()
ax.invert_yaxis()
plt.title('Filtered marginals')
plt.show()
plt.figure(figsize=(20,5))
plt.imshow(alpha_pred, interpolation='nearest', cmap='gray_r')
ax = plt.gca()
ax.invert_yaxis()
plt.title('Predicted marginals')
plt.show()
plt.figure(figsize=(20,5))
plt.imshow(gam, interpolation='nearest', cmap='gray_r')
plt.plot(xs,'wo')
plt.plot(x,'.')
ax = plt.gca()
ax.invert_yaxis()
ax.set_xlim((-0.5,T-0.5))
ax.set_ylim((-0.5,S-0.5))
plt.title('Smoothed marginals')
plt.show()
$\newcommand{\ind}[1]{\left[{#1}\right]}$
We have a Markov chain with transition probabilities $p(x_t = i| x_{t-1} = j) = A_{i,j}$ and initial state distribution $p(x_1 = i) = \pi_i$.
The distributions are \begin{eqnarray} p(x_1 |\pi)& = &\prod_{s=1}^{S} \pi_s^{\ind{s = x_1}} \\ p(x_t | x_{t-1}, A) &=& \prod_{j=1}^{S} \prod_{s=1}^{S} A_{s,j}^{{\ind{s = x_t}}\ind{j = x_{t-1}}} \\ \end{eqnarray}
The loglikelihood is \begin{eqnarray} {\cal L}(\pi, A) & = & \log \left( p(x_1 | \pi) \prod_{t=2}^T p(x_t | x_{t-1}, A) \right) \\ & = & \sum_{s=1}^{S} {\ind{s = x_1}} \log \pi_s + \sum_{t=2}^T \sum_{j=1}^{S}\sum_{s=1}^{S} {{\ind{s = x_t}}\ind{j = x_{t-1}}} \log A_{s,j} \end{eqnarray}
We have the constraints $\sum_s \pi_s = 1$ and $\sum_i A_{i,j} = 1$ for all $j=1 \dots S$ so we have $S+1$ constraints. We write the Lagrangian \begin{eqnarray} \Lambda(\pi, A, \lambda^\pi, \lambda^A) & = & \sum_{s=1}^{S} {\ind{s = x_1}} \log \pi_s + \sum_{t=2}^T \sum_{j=1}^{S} \sum_{s=1}^{S} {{\ind{s = x_t}}\ind{j = x_{t-1}}} \log A_{s,j} \\ & & + \lambda^\pi \left( 1 - \sum_s \pi_s \right) + \sum_j \lambda^A_j \left( 1 - \sum_s A_{s,j} \right) \end{eqnarray}
To find $\pi$ and $A$ we take the derivative of the Lagrangian \begin{eqnarray} \frac{\partial \Lambda(\pi, A,\lambda^\pi, \lambda^A)}{\partial \pi_i} & = & {\ind{i = x_1}} \frac{1}{\pi_i} - \lambda^\pi = 0\\ \frac{\partial \Lambda(\pi, A, \lambda^\pi, \lambda^A)}{\partial A_{i,j}} & = & \sum_{t=2}^T {{\ind{i = x_t}}\ind{j = x_{t-1}}} \frac{1}{A_{i,j}} - \lambda^A_j = 0 \end{eqnarray}
Substitute the constraints $\sum_s \pi_s = 1$ and $\sum_s A_{s,j} = 1, \; j=1\dots S$.
\begin{eqnarray} \pi_i & = & {\ind{i = x_1}} \frac{1}{\lambda^\pi} \\ \sum_i \pi_i & = & \frac{1}{\lambda^\pi} \sum_i {\ind{i = x_1}} = 1\\ \lambda^\pi & = & 1\\ \pi_i & = & {\ind{i = x_1}} \end{eqnarray}As we have effectively only a single observation for $x_1$, we have a crisp estimate.
\begin{eqnarray} A_{i,j} & = & \sum_{t=2}^T {{\ind{i = x_t}}\ind{j = x_{t-1}}} \frac{1}{\lambda^A_j} \\ \sum_i A_{i,j} & = & \sum_i \sum_{t=2}^T {{\ind{i = x_t}}\ind{j = x_{t-1}}} \frac{1}{\lambda^A_j} = 1 \\ \lambda^A_j & = & \sum_{t=2}^T \sum_i {{\ind{i = x_t}}\ind{j = x_{t-1}}} \\ A_{i,j} & = & \frac{\sum_{t=2}^T {{\ind{i = x_t}}\ind{j = x_{t-1}}}}{\sum_{t=2}^T \sum_i {{\ind{i = x_t}}\ind{j = x_{t-1}}}}\\ & = & \frac{\sum_{t=2}^T {{\ind{i = x_t}}\ind{j = x_{t-1}}}}{\sum_{t=2}^T \ind{j = x_{t-1}}} \end{eqnarray}The result is intuitive: the denominator counts the number of times the chain was in state $j$ at the previous time step, and the numerator counts how many of those times it then moved to state $i$.
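As a quick numerical check of these count-based formulas, the sketch below reuses the simulated state sequence `x` and the number of states `S` from the cells above, together with the `normalize` helper defined earlier; the names `pi_hat` and `A_hat` are just illustrative.
In [ ]:
# Count-based ML estimates of pi and A from one observed state sequence x
pi_hat = np.zeros(S)
pi_hat[x[0]] = 1.0                    # crisp estimate from the single observation of x_1
counts = np.zeros((S, S))
for t in range(1, len(x)):
    counts[x[t], x[t-1]] += 1         # transition counts j -> i
A_hat = normalize(counts, axis=0)     # column-normalize: A_hat[i,j] estimates p(i | j)
print(A_hat)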
Suppose we have observed several sequences \begin{eqnarray} X = \{x_1^{(n)}, x_2^{(n)}, \dots, x_{T_n}^{(n)} \} \end{eqnarray} for $n = 1\dots N$. Here $T_n$ is the length of the $n$'th sequence.
The notation becomes slightly more complicated but conceptually the derivation is similar.
The joint probability of all sequences is \begin{eqnarray} p(X | \pi, A) & = & \prod_n \left( p(x_1^{(n)}) \prod_{t=2}^{T_n} p(x_t^{(n)}| x_{t-1}^{(n)} ) \right) \end{eqnarray} The loglikelihood is \begin{eqnarray} {\cal L}(\pi, A) & = & \sum_n \left( \log p(x_1^{(n)}) + \sum_{t=2}^{T_n} \log p(x_t^{(n)}| x_{t-1}^{(n)} ) \right) \\ & = & \sum_n \left( \sum_{s=1}^{S} {\ind{s = x_1^{(n)}}} \log \pi_s + \sum_{t=2}^{T_n}\sum_{j=1}^{S} \sum_{s=1}^{S} {\ind{s = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}} \log A_{s,j} \right) \\ & = & \sum_{s=1}^{S} \sum_n {\ind{s = x_1^{(n)}}} \log \pi_s + \sum_n \sum_{t=2}^{T_n} \sum_{j=1}^{S} \sum_{s=1}^{S} {\ind{s = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}} \log A_{s,j} \end{eqnarray}
We write the Lagrangian
\begin{eqnarray} \Lambda(\pi, A, \lambda^\pi, \lambda^A) & = & \sum_{s=1}^{S} \sum_n {\ind{s = x_1^{(n)}}} \log \pi_s + \sum_n \sum_{t=2}^{T_n} \sum_{j=1}^{S} \sum_{s=1}^{S} {\ind{s = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}} \log A_{s,j} \\ & & + \lambda^\pi \left( 1 - \sum_s \pi_s \right) + \sum_j \lambda^A_j \left( 1 - \sum_s A_{s,j} \right) \end{eqnarray}To find $\pi$ and $A$ we take the derivative of the Lagrangian \begin{eqnarray} \frac{\partial \Lambda(\pi, A,\lambda^\pi, \lambda^A)}{\partial \pi_i} & = & \sum_n {\ind{i = x_1^{(n)}}} \frac{1}{\pi_i} - \lambda^\pi = 0\\ \frac{\partial \Lambda(\pi, A, \lambda^\pi, \lambda^A)}{\partial A_{i,j}} & = & \sum_n \sum_{t=2}^{T_n} {{\ind{i = x_t^{(n)}}}\ind{j = x_{t-1}^{(n)}}} \frac{1}{A_{i,j}} - \lambda^A_j = 0 \end{eqnarray}
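Setting these derivatives to zero and substituting the constraints again yields pooled counts over all sequences. A minimal sketch, assuming the state sequences are collected in a Python list `X_seqs` of integer arrays (an illustrative name, not defined elsewhere in this notebook); it reuses the `normalize` helper defined earlier.
In [ ]:
# Pooled ML estimates of pi and A from several fully observed state sequences
def fit_markov_chain(X_seqs, S):
    pi_counts = np.zeros(S)
    A_counts = np.zeros((S, S))
    for xs in X_seqs:
        pi_counts[xs[0]] += 1               # initial-state counts
        for t in range(1, len(xs)):
            A_counts[xs[t], xs[t-1]] += 1   # transition counts j -> i
    return pi_counts/len(X_seqs), normalize(A_counts, axis=0)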
$\newcommand{\E}[1]{\left\langle#1\right\rangle}$
The EM algorithm is a standard approach to ML estimation when we have hidden variables. The canonical model is $p(y, x| \theta)$ where we observe only $y$.
The observed data loglikelihood is \begin{eqnarray} {\cal L}(\theta) & = & \log p(y| \theta) = \log \sum_x p(y, x| \theta) \end{eqnarray}
The key to the EM algorithm is Jensen's inequality, which states that for a concave function $f$ and $0 \leq \lambda \leq 1$
\begin{eqnarray} f(\lambda x_1 + (1 - \lambda) x_2) \geq \lambda f( x_1) + (1 - \lambda) f(x_2) \end{eqnarray}In words, the value of the function evaluated at a convex combination (lhs) is always greater than or equal to the convex combination of the function values (rhs). A mathematical expectation is exactly such a convex combination, since \begin{eqnarray} \E{f(x)} & = & \sum_x p(x) f(x) \\ \sum_x p(x) & = & 1 \end{eqnarray}
As $\log(x)$ is a concave function, we have
\begin{eqnarray} f(\E{x}) & \geq & \E{f(x)} \end{eqnarray}The key idea of the EM algorithm is to lower bound the observed data likelihood and maximize the bound with respect to the parameters. We take any distribution $q(x)$ \begin{eqnarray} {\cal L}(\theta) & = & \log \sum_x p(y, x| \theta) \\ & = & \log \sum_x p(y, x| \theta) \frac{q(x)}{q(x)} \\ & = & \log \E{\frac{p(y, x| \theta)}{q(x)}}_{q(x)}\\ & \geq & \E{\log {p(y, x| \theta)}}_{q(x)} -\E{\log{q(x)} }_{q(x)}\\ \end{eqnarray} For any $q(x)$, we have a lower bound. The natural strategy is to choose the $q(x)$ that maximises the lower bound. This is an optimisation problem. To make the notation more familiar, we let $q(x = i) = q_i$. Then we arrive at the Lagrangian \begin{eqnarray} \Lambda(q, \lambda) & = & \sum_i q_i \log p(y, x=i| \theta) - \sum_i q_i \log q_i \\ & & + \lambda (1 - \sum_i q_i) \end{eqnarray} We take the derivative with respect to $q_k$ \begin{eqnarray} \frac{\partial \Lambda(q, \lambda)}{\partial q_k} & = & \log p(y, x=k| \theta) - (\log q_k + 1) - \lambda = 0\\ \log q_k & = & \log p(y, x=k| \theta) -1 - \lambda \\ q_k & = & p(y, x=k| \theta) \exp(-1 - \lambda) \\ \sum_k q_k & = & \exp(-1 - \lambda) \sum_k p(y, x=k| \theta) = 1\\ \exp(1 + \lambda) & = & p(y | \theta) \\ \exp(-1 - \lambda) & = & 1/p(y | \theta) \\ \end{eqnarray} hence we have \begin{eqnarray} q_k & = & p(y, x=k| \theta)/p(y | \theta) = p(x=k| y, \theta) \end{eqnarray} This result shows that the best we can do is to choose the posterior distribution \begin{eqnarray} q(x) & = & p(x| y, \theta) \end{eqnarray} The EM algorithm is an iterative algorithm that exploits this bound. Given a particular parameter setting $\theta^{(\tau)}$ at iteration $\tau$, we can compute a lower bound of the true likelihood function: \begin{eqnarray} {\cal L}(\theta) & \geq & \E{\log {p(y, x| \theta)}}_{p(x| y, \theta^{(\tau)})} -\E{\log p(x| y, \theta^{(\tau)}) }_{p(x| y, \theta^{(\tau)})}\\ & \equiv & {\cal F}[\theta; \theta^{(\tau)}] + H[p(x| y, \theta^{(\tau)})] \end{eqnarray} We need to show that \begin{eqnarray} {\cal L}(\theta^{(\tau)}) & = & \E{\log {p(y, x| \theta^{(\tau)})}}_{p(x| y, \theta^{(\tau)})} -\E{\log p(x| y, \theta^{(\tau)}) }_{p(x| y, \theta^{(\tau)})}\\ & = & {\cal F}[\theta^{(\tau)}; \theta^{(\tau)}] + H[p(x| y, \theta^{(\tau)})] \end{eqnarray} In other words, the bound is tight at $\theta^{(\tau)}$, hence maximizing the bound with respect to $\theta$ cannot decrease the true loglikelihood.
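A tiny numerical illustration of the inequality $\log \E{x} \geq \E{\log x}$ that underlies the bound (purely illustrative, with randomly chosen weights and values):
In [ ]:
# Jensen's inequality for the concave log: log(E[x]) >= E[log(x)]
w = np.random.dirichlet(np.ones(4))   # weights of a discrete distribution q(x)
v = np.random.rand(4) + 0.1           # positive values of x
print(np.log(np.dot(w, v)), '>=', np.dot(w, np.log(v)))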
In most cases where the EM algorithm can be applied, the joint distribution belongs to an {\it exponential family}, i.e., it has the generic algebraic form \begin{eqnarray} p(y, x| \theta) & = & b(y, x)\exp( \sum_l \phi_l(y, x) \psi(\theta_l) - A(\theta) ) \end{eqnarray} where the $\phi_l$ are the sufficient statistics and the $\psi(\theta_l)$ are the {\it canonical} parameters. The canonical parameters are in one-to-one correspondence with a conventional parametrisation. We give several explicit examples when we cover HMMs in the next section.
In the case when the complete data likelihood is an exponential family, the computation of the bound requires the expectation \begin{eqnarray} \E{\log p(y, x| \theta)} & = & \E{\log b(x, y)} + \sum_l \E{\phi_l(y, x)} \psi(\theta_l) - A(\theta) \end{eqnarray} where the expectation is taken with respect to the posterior $p(x|y, \theta^{(\tau)})$. In other words, we need to compute expectations of the form $\E{\phi_l(y, x)}$. Once these are available, we effectively have an expression for ${\cal F}(\theta; \theta^{(\tau)})$. Maximising ${\cal F}$ with respect to $\theta$, we arrive at $\theta^{(\tau + 1)}$ and complete the iteration.
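As a preview, the fully observed Markov chain of the previous section already has this form; the sketch below, using the notation introduced above, writes its likelihood as an exponential family:
\begin{eqnarray} p(x_{1:T} | \pi, A) & = & \exp\left( \sum_{s=1}^{S} \ind{s = x_1} \log \pi_s + \sum_{j=1}^{S}\sum_{s=1}^{S} \Big( \sum_{t=2}^T \ind{s = x_t}\ind{j = x_{t-1}} \Big) \log A_{s,j} \right) \end{eqnarray}
so the sufficient statistics are the indicator counts $\phi_s = \ind{s = x_1}$ and $\phi_{s,j} = \sum_{t=2}^T \ind{s = x_t}\ind{j = x_{t-1}}$, and the corresponding canonical parameters are $\log \pi_s$ and $\log A_{s,j}$.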
In a rather abstract sense, the EM algorithm proceeds as follows: in the E-step, compute the posterior $p(x| y, \theta^{(\tau)})$ and the expected sufficient statistics $\E{\phi_l(y, x)}$; in the M-step, maximise ${\cal F}[\theta; \theta^{(\tau)}]$ with respect to $\theta$ to obtain $\theta^{(\tau+1)}$; then iterate until convergence.
Suppose we have observed several sequences \begin{eqnarray} Y = \{y_1^{(n)}, y_2^{(n)}, \dots, y_{T_n}^{(n)} \} \end{eqnarray} for $n = 1\dots N$. Here $T_n$ is the length of the $n$'th sequence. Let $Y \in \{1,\dots, R\}$ and $X \in \{1,\dots, S \}$.
The discrete observation, discrete state space HMM has the following factors: \begin{eqnarray} p(x_1 = i) & = & \pi_i \\ p(y_k = r| x_k = i) & = & B_{r,i} \\ p(x_k = i| x_{k-1} = j) & = & A_{i,j} \end{eqnarray}
The joint probability of all observed sequences and the corresponding hidden sequences is \begin{eqnarray} p(Y, X | \pi, A, B) & = & \prod_n \left( p(x_1^{(n)}) p(y_1^{(n)} | x_1^{(n)}) \prod_{t=2}^{T_n} p(y_t^{(n)} | x_t^{(n)}) p(x_t^{(n)}| x_{t-1}^{(n)} ) \right) \end{eqnarray} The expected complete data loglikelihood is \begin{eqnarray} \E{\log p(Y, X | \pi, A, B)} & = & \sum_{s=1}^{S} C_{\pi}(s) \log \pi_s + \sum_{j=1}^{S} \sum_{s=1}^{S} C_{A}(s,j) \log A_{s,j} + \sum_{r=1}^{R} \sum_{i=1}^{S} C_{B}(r,i) \log B_{r,i} \end{eqnarray} where $C_{\pi}(s) = \sum_n \E{\ind{s = x_1^{(n)}}}$, $C_{A}(s,j) = \sum_n \sum_{t=2}^{T_n} \E{\ind{s = x_t^{(n)}}\ind{j = x_{t-1}^{(n)}}}$ and $C_{B}(r,i) = \sum_n \sum_{t=1}^{T_n} \E{\ind{r = y_t^{(n)}}\ind{i = x_t^{(n)}}}$ are expected counts, with expectations taken under the posterior over the hidden states.
We write the Lagrangian to ensure that the columns of $A$ and $B$ are positive and normalized
\begin{eqnarray} \Lambda(\pi, A, \lambda^\pi, \lambda^A, \lambda^B) & = & \sum_{s=1}^{S} C_{\pi}(s) \log \pi_s + \sum_{j=1}^{S} \sum_{s=1}^{S} C_{A}(s,j) \log A_{s,j} + \sum_{r=1}^{R} \sum_{i=1}^{S} C_{B}(r,i) \log B_{r,i} \\ & & + \lambda^\pi \left( 1 - \sum_s \pi_s \right) + \sum_j \lambda^A_j \left( 1 - \sum_s A_{s,j} \right) + \sum_i \lambda^B_i \left( 1 - \sum_r B_{r,i} \right) \end{eqnarray}To find $\pi$, $A$ and $B$ we take the derivative of the Lagrangian \begin{eqnarray} \frac{\partial \Lambda(\pi, A,\lambda^\pi, \lambda^A, \lambda^B)}{\partial \pi_i} & = & C_{\pi}(i) \frac{1}{\pi_i} - \lambda^\pi = 0\\ \frac{\partial \Lambda(\pi, A, \lambda^\pi, \lambda^A, \lambda^B)}{\partial A_{i,j}} & = & C_{A}(i,j) \frac{1}{A_{i,j}} - \lambda^A_j = 0 \\ \frac{\partial \Lambda(\pi, B, \lambda^\pi, \lambda^A, \lambda^B)}{\partial B_{k,i}} & = & C_{B}(k,i) \frac{1}{B_{k,i}} - \lambda^B_i = 0 \\ \end{eqnarray}
We set the derivative to zero and solve for $\pi$ \begin{eqnarray} \pi_i & = & C_{\pi}(i) \frac{1}{\lambda^\pi} \end{eqnarray} As we have the normalization constraint for $\pi$, we also have the following equality from which we can solve for the Lagrange multiplier: \begin{eqnarray} \sum_i \pi_i & = & \frac{1}{\lambda^\pi} \sum_i C_{\pi}(i) = 1 \\ \lambda^\pi & = & \sum_i C_{\pi}(i) = N \end{eqnarray} Substituting, we obtain the intuitive answer: \begin{eqnarray} \pi_i & = & \frac{1}{N} C_{\pi}(i) \end{eqnarray}
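The derivations for $A$ and $B$ are analogous and again give normalized expected counts. In code, the M-step then amounts to column-normalising the expected counts; a minimal sketch reusing the `normalize` helper defined earlier, and assuming expected counts `C_pi` (length $S$), `C_A` ($S \times S$) and `C_B` ($R \times S$) as produced by a smoother later in this notebook:
In [ ]:
# M-step sketch: turn expected counts into new parameter estimates
def m_step(C_pi, C_A, C_B):
    pi_new = normalize(C_pi)         # pi_i proportional to C_pi(i)
    A_new = normalize(C_A, axis=0)   # each column of A sums to one
    B_new = normalize(C_B, axis=0)   # each column of B sums to one
    return pi_new, A_new, B_new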
In [12]:
S = 3
R = 5
A = np.random.dirichlet(0.7*np.ones(S),S).T
B = np.random.dirichlet(0.7*np.ones(R),S).T
p = np.random.dirichlet(0.7*np.ones(S)).T
y = np.array([0, 1, 3, 2, 4])
hm = HMM(p, A, B)
The EM algorithm requires obtaining the sufficient statistics of an HMM.
The key observation is that the sufficient statistics are additive: \begin{eqnarray} C_t & = & \int \left(\sum_{k=2}^t s_k(x_{k-1}, x_{k}) \right) p(x_{1:t}|y_{1:t}) dx_{1:t} \end{eqnarray}
The derivation depends on the following decomposition. By the chain rule of probabilities, we write \begin{eqnarray} p(x_{1:t}|y_{1:t}) & = & p(x_{1}|x_{2:t}, y_{1:t}) \cdots p(x_{t-2}|x_{t-1:t}, y_{1:t}) p(x_{t-1}|x_t, y_{1:t}) p(x_{t}|y_{1:t}) \end{eqnarray} The key observation is that this expression admits computation in the forward direction. By conditional independence \begin{eqnarray} p(x_{1:t}|y_{1:t}) & = & p(x_{1}|x_{2}, y_{1}) \cdots p(x_{t-2}|x_{t-1}, y_{1:t-2}) p(x_{t-1}|x_t, y_{1:t-1}) p(x_{t}|y_{1:t}) \end{eqnarray}
We will use this observation as the basis of a forward recursion. First we decompose the posterior as a product of the filtering density at time $t$ and a conditional quantity familiar from the correction smoother \begin{eqnarray} C_t & = & \int \int \left(\sum_{k=2}^t s_k(x_{k-1}, x_{k})\right) p(x_{1:t-1}|y_{1:t},x_t) p(x_{t}|y_{1:t}) dx_{1:t-1} dx_t \\ & = & \int \underbrace{\left( \int \left(\sum_{k=2}^t s_k(x_{k-1}, x_{k})\right) p(x_{1:t-1}|y_{1:t-1},x_t) dx_{1:t-1} \right)}_{=V_t(x_t)} p(x_{t}|y_{1:t}) dx_t \end{eqnarray}
Due to additivity, we can decompose further \begin{eqnarray} V_t(x_t) & = & \int \left( s_t(x_{t-1}, x_{t}) + \sum_{k=2}^{t-1} s_k(x_{k-1}, x_{k}) \right) p(x_{1:t-1}|y_{1:t-1}, x_t) dx_{1:t-1} \\ & = & \int \int \left( s_t(x_{t-1}, x_{t}) + \sum_{k=2}^{t-1} s_k(x_{k-1}, x_{k}) \right) p(x_{1:t-2}|y_{1:t-2}, x_{t-1}) p(x_{t-1}|y_{1:t-1}, x_t) dx_{1:t-2} dx_{t-1} \\ & = & \int \left( s_t(x_{t-1}, x_{t}) + \int \sum_{k=2}^{t-1} s_k(x_{k-1}, x_{k}) p(x_{1:t-2}|y_{1:t-2}, x_{t-1}) dx_{1:t-2} \right) p(x_{t-1}|y_{1:t-1}, x_t) dx_{t-1} \\ & = & \int \left( s_t(x_{t-1}, x_{t}) + V_{t-1}(x_{t-1}) \right) p(x_{t-1}|y_{1:t-1}, x_t) dx_{t-1} \end{eqnarray}
Suppose only $y_{1:4}$ are observed. The above recursion, when applied to the sufficient statistics of an HMM, takes the following specific form. Recall that $C_{\pi}, C_A, C_B$ stand for the sufficient statistics for the estimation of the prior, the state transitions and the observations, respectively. Here, we denote the expected sufficient statistics explicitly as $C_{A,t}$, where $t$ is the number of observations.
The state transition statistics are \begin{eqnarray} C_{A,4}(a,b) &=& \sum_{x_4} \sum_{x_3} \sum_{x_2} \sum_{x_1} \left( {\ind{a = x_2}}\ind{b = x_{1}} + {\ind{a = x_3}}\ind{b = x_2} + {\ind{a = x_4}}\ind{b = x_3} \right) p(x_{1:4}|y_{1:4}) \end{eqnarray} By introducing the forward decomposition \begin{eqnarray} C_{A,4}(a,b) &=& \sum_{x_4} \sum_{x_3} \sum_{x_2} \left(\sum_{x_1} {\ind{a = x_2}}\ind{b = x_{1}} p(x_1|y_1, x_2) + {\ind{a = x_3}}\ind{b = x_2} + {\ind{a = x_4}}\ind{b = x_3} \right) \\ & & \quad p(x_2|y_{1:2}, x_3) p(x_3|y_{1:3}, x_4) p(x_4|y_{1:4}) \\ &=& \sum_{x_4}\sum_{x_3} \sum_{x_2} \left( \underbrace{{\ind{a = x_2}} p(x_1=b|y_1, x_2)}_{V_{A,2}(x_2)} + {\ind{a = x_3}}\ind{b = x_2} + {\ind{a = x_4}}\ind{b = x_3} \right) \\ & & \quad p(x_2|y_{1:2}, x_3) p(x_3|y_{1:3}, x_4) p(x_4|y_{1:4}) \\ &=& \sum_{x_4}\sum_{x_3} \left( \underbrace{p(x_1=b|y_1, x_2=a) p(x_2=a|y_{1:2}, x_3) + {\ind{a = x_3}} p(x_2=b|y_{1:2}, x_3)}_{V_{A,3}(x_3)} + {\ind{a = x_4}}\ind{b = x_3} \right) \\ & & \quad p(x_3|y_{1:3}, x_4) p(x_4|y_{1:4}) \\ &=& \sum_{x_4} \left( p(x_1=b|y_1, x_2=a)\sum_{x_3} p(x_2=a|y_{1:2}, x_3) p(x_3|y_{1:3}, x_4) + p(x_2=b|y_{1:2}, x_3=a) p(x_3=a|y_{1:3}, x_4) \right. \\ & & \quad \left. + {\ind{a = x_4}} p(x_3=b|y_{1:3}, x_4) \right) p(x_4|y_{1:4}) \end{eqnarray}
One can verify that the last line is indeed equal to the required sufficient statistics; the bracketed term is $V_{A,4}(x_4)$, as computed by the update rule given below.
For $t$, the update rule is
\begin{eqnarray} V_{A,t}(a, b, x_t) & = & \sum_{x_{t-1}} V_{A,t-1}(a, b, x_{t-1}) p(x_{t-1}|y_{1:{t-1}},x_{t}) + \sum_{x_{t-1}} \ind{a = x_t}\ind{b = x_{t-1}} p(x_{t-1}|y_{1:t-1},x_{t}) \\ & = & \sum_{x_{t-1}} V_{A,t-1}(a, b, x_{t-1}) p(x_{t-1}|y_{1:{t-1}},x_{t}) + \ind{a = x_t} p(x_{t-1}=b|y_{1:t-1},x_{t}) \end{eqnarray}The advantage of this algorithm is that it is entirely forward and has attractive space properties, requiring only $S^3 + 2S$ space. The other statistics are derived below
For $t$, the update rule is
\begin{eqnarray} V_{B,t}(a, b, x_t) & = & \sum_{x_{t-1}} V_{B,t-1}(a, b, x_{t-1}) p(x_{t-1}|y_{1:{t-1}},x_{t}) + \ind{a = y_t} p(x_{t-1}=b|y_{1:t-1},x_{t}) \end{eqnarray}For $t$, the update equation is \begin{eqnarray} V_{\pi,t}(a, x_t) & = & \sum_{x_{t-1}} V_{\pi,t-1}(a, x_{t-1}) p(x_{t-1}|y_{1:{t-1}},x_{t}) \end{eqnarray}
An implementation is given below.
In [13]:
S = 3
R = 5
A = np.random.dirichlet(0.7*np.ones(S),S).T
B = np.random.dirichlet(0.7*np.ones(R),S).T
p = np.random.dirichlet(0.7*np.ones(S)).T
logA = np.log(A)
logB = np.log(B)
hm = HMM(p, A, B)
y = np.array([1, 1, 1, 3, 2, 0, 3, 2, 1, 0, 0, 3, 2, 4])
T = y.shape[0]
# Forward only estimation of sufficient statistics
V_pi = np.eye((S))
V_A = np.zeros((S,S,S))
V_B = np.zeros((R,S,S))
I_S1S = np.eye(S).reshape((S,1,S))
I_RR = np.eye(R)
for k in range(T):
    if k==0:
        log_alpha_pred = np.log(p)
    else:
        log_alpha_pred = predict(A, log_alpha)
    if k>0:
        # Calculate p(x_{k-1}|y_{1:k-1}, x_k)
        lp = np.log(normalize_exp(log_alpha)).reshape(S,1) + logA.T
        P = normalize_exp(lp, axis=0)
        # Update
        V_pi = np.dot(V_pi, P)
        V_A = np.dot(V_A, P) + I_S1S*P.reshape((1,S,S))
        V_B = np.dot(V_B, P) + I_RR[:,y[k-1]].reshape((R,1,1))*P.reshape((1,S,S))
    log_alpha = update(y[k], logB, log_alpha_pred)
p_xT = normalize_exp(log_alpha)
C1 = np.dot(V_pi, p_xT.reshape(S,1))
C2 = np.dot(V_A, p_xT.reshape(1,S,1)).reshape((S,S))
C3 = np.dot(V_B, p_xT.reshape(1,S,1)).reshape((R,S))
C3[y[-1],:] += p_xT
print("Results with the Forward Smoother")
print(C1)
print(np.sum(C1))
print(C2)
print(np.sum(C2))
print(C3)
print(np.sum(C3))
print("Results with the Correction Smoother")
lg, C1_corr, C2_corr, C3_corr = hm.correction_smoother(y)
print(C1_corr)
print(np.sum(C1_corr))
print(C2_corr)
print(np.sum(C2_corr))
print(C3_corr)
print(np.sum(C3_corr))
In [14]:
%connect_info