• For continuous observations, use KDE (are there better Bayesian nonparametric methods for this purpose?)
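
A minimal sketch of what that might look like, using scipy.stats.gaussian_kde to fit one emission density per hidden state (the states and samples below are made up purely for illustration):

In [ ]:
import numpy as np
import numpy.random as npr
from scipy.stats import gaussian_kde

# hypothetical continuous observations, grouped by (known) hidden state
obs_by_state = {0: npr.randn(500), 1: 2.0 + 0.5 * npr.randn(500)}

# one KDE per state; kde(x) then plays the role of the emission likelihood Pr[y | state]
kdes = {s: gaussian_kde(samples) for s, samples in obs_by_state.items()}
print(kdes[0](np.array([0.0, 2.0]))) # density of state-0 emissions at two test points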

In [1]:
# spectral learning for HMM

In [2]:
import numpy as np
import numpy.random as npr
import pylab as pl
from scipy import stats
import cython

%load_ext cython
%matplotlib inline

In [3]:
# Walker alias method implementation
# adapted from https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/

def alias_setup(probs):
    K       = len(probs)
    q       = np.zeros(K)
    J       = np.zeros(K, dtype=int)
 
    # Sort the data into the outcomes with probabilities
    # that are larger and smaller than 1/K.
    smaller = []
    larger  = []
    for kk, prob in enumerate(probs):
        q[kk] = K*prob
        if q[kk] < 1.0:
            smaller.append(kk)
        else:
            larger.append(kk)
 
    # Loop through and create little binary mixtures that
    # appropriately allocate the larger outcomes over the
    # overall uniform mixture.
    while len(smaller) > 0 and len(larger) > 0:
        small = smaller.pop()
        large = larger.pop()
 
        J[small] = large
        q[large] = q[large] - (1.0 - q[small])
 
        if q[large] < 1.0:
            smaller.append(large)
        else:
            larger.append(large)
 
    return (J, q)

In [4]:
%%cython
def c_alias_draw(J, q, K, rand1, rand2):
 
    # Draw from the overall uniform mixture.
    kk = int(rand1*K)
 
    # Draw from the binary mixture, either keeping the
    # small one, or choosing the associated larger one.
    if rand2 < q[kk]:
        return kk
    else:
        return J[kk]

In [5]:
class Alias():
    def __init__(self,probs):
        self.probs = probs
        self.J,self.q = alias_setup(probs)
        self.K = len(probs)
    
    def draw(self):
        return c_alias_draw(self.J,self.q,self.K,rand1=npr.rand(),rand2=npr.rand())
    
    def __repr__(self):
        return "Alias(" + str(self.probs / sum(self.probs)) + ")"

In [6]:
class MSM():
    def __init__(self,T=None,size=None):
        if T is None:
            T = self.gen_stoch_matrix(size)
        else:
            assert(T.shape[0] == T.shape[1])
        self.T = T
        self.sampler = np.empty(len(T),dtype=object)
        for i in range(len(T)):
            self.sampler[i] = Alias(T[i])
    
    def draw(self,num_samples=1000):
        samples = np.zeros(num_samples,dtype=int)
        samples[0] = npr.randint(len(self.T))
        for i in range(1,num_samples):
            samples[i] = self.sampler[samples[i-1]].draw()
        return samples
    
    def gen_stoch_matrix(self,size=5,num_iter=20):
        " Generates a random row-stochastic matrix "
        T = npr.rand(size,size)
        for i in range(size):
            T[i] /= sum(T[i])
        #note: doesn't satisfy detailed balance
        return T
    
    def compute_stationary_prob(self):
        " Stationary distribution: left eigenvector of T with eigenvalue 1 "
        evals, evecs = np.linalg.eig(self.T.T)
        stationary = np.real(evecs[:, np.argmax(np.real(evals))])
        return stationary / stationary.sum()
    
    def __repr__(self):
        return str(len(self.T)) + "-state MSM"

In [7]:
# from eq. 5.38 of MSM book
example_T = np.array([[0.86207, 0.12931, 0.00862],[0.15625, 0.83333, 0.01041],[0.00199, 0.00199, 0.99602]])

In [8]:
m = MSM(example_T)
observed_sequence = m.draw(1000000)

In [9]:
# random symmetric matrix
n = 100
sym_T = npr.rand(n,n)
for i in xrange(n):
    for j in xrange(i):
        sym_T[j,i] = sym_T[i,j]

In [10]:
def observations(states,obs_size=10):
    " Deterministic emission: map each state to the largest i < obs_size that divides it (state 0 maps to 0) "
    obs = np.array(states)
    for i in range(1,obs_size):
        obs[states % i == 0] = i
    obs[states == 0] = 0
    return obs

In [11]:
m = MSM(sym_T)
hidden_sequence = m.draw(10000)

In [12]:
observed_sequence = observations(hidden_sequence)
print(len(observed_sequence))
set(observed_sequence)


10000
Out[12]:
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

In [13]:
def transition_count_matrix(sequence):
    n = len(set(sequence)) # number of states
    C = np.zeros((n,n),dtype=int)
    for t in range(1,len(sequence)):
        i,j = sequence[t-1:t+1]
        C[i,j] += 1
    return C

In [14]:
observed_counts = transition_count_matrix(observed_sequence)
observed_counts.shape


Out[14]:
(10, 10)

In [15]:
def triplet_count_matrix(sequence):
    n = len(set(sequence)) # number of states
    C = np.zeros((n,n,n),dtype=int)
    for t in range(2,len(sequence)):
        i,y,j = sequence[t-2:t+1]
        C[y,i,j] += 1
    return C

In [16]:
observed_triplets = transition_count_matrix(observed_sequence) # note: this uses the pairwise counter, so the result below is 2-D

In [17]:
observed_triplets.shape


Out[17]:
(10, 10)

In [18]:
observed_triplets


Out[18]:
array([[  1,  24,   3,   4,   8,  14,   9,   7,  12,  12],
       [ 22, 469, 270, 176, 136, 235, 139, 263, 242, 240],
       [ 10, 254, 135,  95,  67, 125,  61, 161, 135, 150],
       [  5, 193, 103,  56,  51,  91,  52,  95,  71,  74],
       [  8, 138,  84,  43,  23,  80,  24,  77,  65,  62],
       [ 15, 229, 137,  77,  75, 135,  63, 145, 108, 135],
       [  7, 136,  69,  60,  46,  60,  29,  59,  69,  65],
       [ 13, 277, 133,  99,  72, 151,  67, 156, 131, 133],
       [  8, 232, 129,  97,  68, 106,  73, 130, 123, 109],
       [  5, 240, 130,  84,  59, 122,  82, 139, 119, 119]])

In [19]:
#spectral method:

In [20]:
np.linalg.svd?

In [21]:
svd = np.linalg.svd(observed_counts)

In [22]:
svd[0].shape


Out[22]:
(10, 10)

Problem set-up:

  • Observe a sequence $Y = [y_1,y_2,y_3,\dots,y_\tau]$
  • Assume a hidden sequence $X = [x_1,x_2,x_3,\dots,x_\tau]$ that explains the observations
    • Hidden variable is discrete and Markovian

To learn an HMM from data, we can use EM or a spectral method:

  • Expectation-Maximization (Baum-Welch)
    • MLE
    • Local optima
    • Statistically and computationally inefficient for large hidden state space
  • Spectral method
    • Closed-form
    • Consistent, finite sample bounds
    • No local optima
    • Small loss in statistical efficiency

Goals in Siddiqi et al.:

  • Generalize spectral learning algorithm to larger class of models
  • Supply tighter finite sample bounds
  • Apply algorithm to high-dimensional data

Reduced-rank HMMs (RR-HMMs) are a special case of predictive state representations (PSRs); full-rank HMMs are in turn a special case of RR-HMMs

HMM Definition

States

  • $m$: number of discrete states
  • $n$: number of discrete observations

Transitions

  • $T$: $m\times m$ column-stochastic transition matrix
  • $T_{i,j} = \text{Pr}[x_{t+1} = i \mid x_t = j]$
  • $T_{i,j}$ is the probability of transitioning from hidden state $j$ to $i$

Emissions

  • $O$: $n\times m$ column-stochastic observation matrix
  • $O_{i,j} = \text{Pr}[y_t = i \mid x_t = j]$
  • $O_{i,j}$ is the probability of observing $i$ when the hidden state is $j$

Prior

  • $\pi$: $m \times 1$ prior distribution over states, or equivalently, the initial distribution
  • $\pi_i = \text{Pr}[x_1 = i]$

Observable operators

  • For each $y \in \{ 1,\dots,n\}$ define an $m \times m$ matrix $A_y = T \text{diag}(O_{y,\cdot})$ $$ [A_y]_{i,j} \equiv \text{Pr} [x_{t+1} = i \wedge y_t = y \mid x_t = j] $$
  • Entrywise, $A_y$ factors as $[A_y]_{i,j} = \underbrace{\text{Pr}[x_{t+1} = i \mid x_t = j]}_{\substack{\text{transition}\\ \text{probability}}} \, \underbrace{\text{Pr}[y_t = y \mid x_t = j]}_{\substack{\text{observation}\\ \text{likelihood}}}$

Inference in HMMs: the probability of an observation sequence can be computed either by summing over hidden paths or, more compactly, as a product of observable operators $$\text{Pr}[y_1,y_2,\dots,y_\tau] = \sum_{x_{\tau+1}} \sum_{x_\tau} \text{Pr}[x_{\tau+1} \mid x_\tau]\, \text{Pr}[y_\tau \mid x_\tau] \cdots \sum_{x_1} \text{Pr}[x_2 \mid x_1]\, \text{Pr}[y_1 \mid x_1]\, \pi_{x_1} = \mathbf{1}^{\text{T}} A_{y_\tau} \cdots A_{y_1} \pi$$
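
As a quick self-contained check of that identity (a sketch with a toy $T$, $O$, $\pi$, not the data used elsewhere in this notebook), the probability of a short observation sequence computed by brute-force summation over hidden paths should match the observable-operator product:

In [ ]:
import numpy as np
import numpy.random as npr
from itertools import product

m, n = 3, 4 # hidden states, observation symbols
npr.seed(0)
T = npr.rand(m, m); T /= T.sum(axis=0) # column-stochastic: T[i,j] = Pr[x_{t+1}=i | x_t=j]
O = npr.rand(n, m); O /= O.sum(axis=0) # column-stochastic: O[i,j] = Pr[y_t=i | x_t=j]
pi = npr.rand(m); pi /= pi.sum()       # initial state distribution

A = [T.dot(np.diag(O[y])) for y in range(n)] # observable operators A_y = T diag(O_{y,.})

ys = [2, 0, 3, 1] # an arbitrary observation sequence

# operator form: 1^T A_{y_tau} ... A_{y_1} pi
vec = pi.copy()
for y in ys:
    vec = A[y].dot(vec)
p_operator = vec.sum()

# brute force: sum the joint probability over all hidden paths
p_brute = 0.0
for path in product(range(m), repeat=len(ys)):
    p = pi[path[0]]
    for t in range(len(ys)):
        p *= O[ys[t], path[t]]
        if t + 1 < len(ys):
            p *= T[path[t + 1], path[t]]
    p_brute += p

print(np.allclose(p_operator, p_brute)) # True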

Spectral learning for HMM parameters

Goal:

Recover observable HMM parameters from probabilities of doubles and triples of observations

  • Define $ [P_{2,1}]_{i,j} \triangleq \Pr[y_2 = i, y_1 = j]$
    • Define $ [P_{3,y,1}]_{i,j} \triangleq \Pr[y_3 = i, y_2 = y, y_1 = j]$
  • Factor into HMM parameters: $$\newcommand{\diag}{\text{diag}} \newcommand{\T}{\text{T}} \begin{align} P_{2,1} &= OT \diag (\pi) O^\T\\ P_{3,y,1} &= O A_y T \diag (\pi) O^\T \text{ where } A_y \triangleq T \diag (O_{y,\cdot}) \end{align}$$
  • Pick $U$ such that $(U^\T O)$ is invertible
  • Define $B_y \triangleq (U^\T P_{3,y,1}) (U^\T P_{2,1})^\dagger = (U^\T O) A_y (U^\T O)^{-1}$. $B_y$ is the true observable operator $A_y$ up to a similarity transform by $(U^\T O)$ (see the sanity check after the algorithm below)

Algorithm:

  • Use triples of observations $\langle y_1, y_2, y_3 \rangle$ to estimate frequencies $\hat{P}_{2,1}$ and $\hat{P}_{3,y,1}$
  • Compute singular value decomposition (SVD) of $\hat{P}_{2,1}$ to find a matrix of the top $m$ singular vectors $\hat{U}$
  • Find observable operators $\hat{B}_y = (\hat{U}^\T \hat{P}_{3,y,1})(\hat{U}^\T \hat{P}_{2,1})^\dagger$
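
As a sanity check on the similarity-transform claim above, here is a sketch using exact population moments from a toy HMM (rather than empirical frequencies), verifying that $B_y = (U^\T O) A_y (U^\T O)^{-1}$:

In [ ]:
import numpy as np
import numpy.random as npr

m, n = 3, 5
npr.seed(0)
T = npr.rand(m, m); T /= T.sum(axis=0) # column-stochastic transitions
O = npr.rand(n, m); O /= O.sum(axis=0) # column-stochastic observations
pi = npr.rand(m); pi /= pi.sum()       # initial state distribution

A = [T.dot(np.diag(O[y])) for y in range(n)] # true observable operators
P_21 = O.dot(T).dot(np.diag(pi)).dot(O.T)    # exact Pr[y_2 = i, y_1 = j]
P_3y1 = [O.dot(A[y]).dot(T).dot(np.diag(pi)).dot(O.T) for y in range(n)]

U = np.linalg.svd(P_21)[0][:, :m] # top-m left singular vectors
for y in range(n):
    B_y = U.T.dot(P_3y1[y]).dot(np.linalg.pinv(U.T.dot(P_21)))
    print(np.allclose(B_y, U.T.dot(O).dot(A[y]).dot(np.linalg.inv(U.T.dot(O))))) # True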

For RR-HMMs

  • The rank of $P_{2,1}$ and $P_{3,y,1}$ depends on the factors $R$ ($m \times k$) and $S$ ($k \times m$) of the low-rank transition matrix $T = RS$, since $$ P_{2,1} = OT \diag (\pi) O^\T = O R S \diag (\pi) O^\T$$ has rank at most $k$
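
A quick numerical illustration of that claim, assuming the usual RR-HMM factorization $T = RS$ (a sketch with an arbitrary rank-$k$ transition matrix, not the notebook's data):

In [ ]:
import numpy as np
import numpy.random as npr

m, n, k = 10, 15, 3
npr.seed(1)
R = npr.rand(m, k); R /= R.sum(axis=0) # columns of R are distributions over hidden states
S = npr.rand(k, m); S /= S.sum(axis=0) # columns of S are distributions over the k factors
T = R.dot(S)                           # column-stochastic and rank <= k
O = npr.rand(n, m); O /= O.sum(axis=0)
pi = npr.rand(m); pi /= pi.sum()

P_21 = O.dot(T).dot(np.diag(pi)).dot(O.T)
print(np.linalg.matrix_rank(T))    # k
print(np.linalg.matrix_rank(P_21)) # also k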

Algorithm: Learn-RR-HMM($k,N$):

  1. Compute empirical estimates of $\hat{P}_1, \hat{P}_{2,1}, \hat{P}_{3,x,1}$ (for $x = 1,\dots,n$)
  2. Compute SVD of $\hat{P}_{2,1}$
  3. $\hat{U} \leftarrow $ matrix of left singular vectors corresponding to the $k$ largest singular values
  4. Compute model parameter estimates
    • $\hat{b}_1 = \hat{U}^\T \hat{P}_1$
    • $\hat{b}_\infty = (\hat{P}_{2,1}^\T \hat{U})^\dagger \hat{P}_1$
    • $\hat{B}_x = \hat{U}^\T \hat{P}_{3,x,1} (\hat{U}^\T \hat{P}_{2,1})^\dagger$ (for $x = 1,\dots,n$)

In [23]:
def transition_count_matrix(sequence):
    n = np.max(sequence)+1 # number of states
    C = np.zeros((n,n))
    for t in xrange(1,len(sequence)):
        i,j = sequence[t-1:t+1]
        C[i,j] += 1
    return C

In [24]:
def covariance(sequence):
    # row-normalized transition counts: entry (i,j) estimates Pr[y_2 = j | y_1 = i]
    P_21 = transition_count_matrix(sequence)
    for i in xrange(len(P_21)):
        P_21[i] /= sum(P_21[i])
    return P_21

In [25]:
def trivariance(sequence):
    n = np.max(sequence)+1
    P_31 = np.zeros((n,n,n))
    for t in xrange(2,len(sequence)):
        i,x,j = sequence[t-2:t+1]
        P_31[x,i,j] += 1
    #for x in xrange(n):
    #    P_31[x] /= np.sum(P_31[x],axis=1)
    return P_31

In [26]:
def learn_RR_HMM(sequence,k):
    n = np.max(sequence)+1
    
    # compute empirical estimate of P_1
    P_1 = np.zeros(n)
    for i in xrange(len(sequence)):
        P_1[sequence[i]] += 1
    P_1 /= sum(P_1)
    
    # compute empirical estimate of P_21
    P_21 = covariance(sequence)
    
    # compute empirical estimates of P_3x1
    P_31 = trivariance(sequence)
    
    # compute SVD of P_21
    svd = np.linalg.svd(P_21)
    U = svd[0][:,:k]
    UT = U.T
    
    # compute model parameters
    b_1 = UT.dot(P_1)
    b_infty = np.linalg.pinv((P_21.T.dot(U))).dot(P_1)
    
    B = np.zeros((n,k,k))
    UTP21 = np.linalg.pinv(UT.dot(P_21))
    for x in xrange(n):
        B[x] = (UT.dot(P_31[x])).dot(UTP21)
    return b_1,b_infty,B

In [27]:
sequence = npr.randint(0,10,10000)
b_1,b_infty,B = learn_RR_HMM(sequence,5)

In [28]:
# k-nearest neighbor random walk
sequence = np.zeros(100000)
n = 100
jump=5
sequence[0] = npr.randint(n)
for i in range(1,len(sequence)):
    j=npr.randint(jump*2 + 1)-jump
    sequence[i] = min(abs(sequence[i-1] + j),n)
sequence = np.array(sequence,dtype=int)

In [29]:
# random walk with hidden state
length = 1000
hidden_states=5
observed_states=100
sequence = np.zeros(length)
h = np.zeros(length)
jump_prob = 0.1
h[0] = npr.randint(0,hidden_states)
for i in range(1,length):
    if npr.rand() < jump_prob:
        h[i] = (h[i-1] + npr.randint(2)*2-1) % hidden_states
    else:
        h[i] = h[i-1] # hidden state persists when there is no jump
sequence[0] = round(observed_states/2)
mid_h = round(hidden_states/2)
for i in range(1,length):
    sequence[i] = (sequence[i-1] + round(npr.randn()*(h[i]+1)))
    if sequence[i] > observed_states or sequence[i] < 0:
        sequence[i] = sequence[i-1]
sequence = np.array(sequence,dtype=int)

In [30]:
sequences = []
for i in range(100):
    n_sequence = np.zeros(length)
    n_sequence[0] = npr.randint(0,observed_states)
    mid_h = npr.randint(0,hidden_states)
    for i in range(1,length):
        n_sequence[i] = (n_sequence[i-1] + round(npr.randn()*(h[i]+1)))
        if n_sequence[i] > observed_states or n_sequence[i] < 0:
            n_sequence[i] = n_sequence[i-1]
    n_sequence = np.array(n_sequence,dtype=int)
    
    sequences.append(n_sequence)

In [31]:
big_seq = np.hstack(sequences)
big_seq.shape


Out[31]:
(100000,)

In [32]:
sequence[1]


Out[32]:
50

In [33]:
pl.plot(h)


Out[33]:
[<matplotlib.lines.Line2D at 0x110ebd250>]

In [34]:
pl.plot(sequence)


Out[34]:
[<matplotlib.lines.Line2D at 0x11280d850>]

In [35]:
b_1,b_infty,B = learn_RR_HMM(big_seq,5)

In [36]:
%timeit learn_RR_HMM(sequence,5)


100 loops, best of 3: 6.31 ms per loop

In [37]:
B[0].shape


Out[37]:
(5, 5)

In [38]:
b_infty.shape


Out[38]:
(5,)

In [39]:
b_infty.T.dot(B[0]).dot(b_1)


Out[39]:
0.15271026611143881

In [40]:
b_infty[-len(b_infty)] == b_infty[0]


Out[40]:
True

In [41]:
def score(sequence,b_1,b_infty,B):
    # applies the observable operators for the last len(B)-1 symbols of the sequence
    # (len(B) is the number of observation symbols, not the sequence length)
    ans = b_infty.T
    for i in range(1,len(B)):
        ans = ans.dot(B[sequence[-i]])
    return ans.dot(b_1)

In [42]:
score(sequence, b_1, b_infty, B)


Out[42]:
6.8754842705539447e+164

In [43]:
score(npr.randint(0,n,len(sequence)),b_1, b_infty, B)


Out[43]:
7.0091990675043781e+88
$$\begin{align} \newcommand{\bti}{\hat{b}^T_\infty} \newcommand{\hB}{\hat{B}} \newcommand{\hb}{\hat{b}} \hat{\text{Pr}}[x_1,\dots,x_t] & = \bti \hB_{x_t} \dots \hB_{x_1}\hb_1\\ \hb_{t+1} & = \frac{\hB_{x_t} \hb_t }{\bti \hB_{x_t} \hb_t}\\ \hat{\text{Pr}}[x_t \mid x_{1:t-1}] & = \frac{\bti \hB_{x_t} \hb_t}{\sum_x \bti \hB_x \hb_t} \end{align}$$

In [44]:
def next_b(x_t,b_t,b_infty,B):
    return B[x_t].dot(b_t)/(b_infty.T.dot(B[x_t]).dot(b_t))

In [45]:
b_1


Out[45]:
array([-0.09904614,  0.01768298,  0.00090685, -0.00131447, -0.00370897])

In [46]:
next_b(sequence[0],b_1,b_infty,B)


Out[46]:
array([ -9.27558865,   1.57588227,  11.99321289,  -0.78687568, -11.19151555])

In [47]:
B.shape


Out[47]:
(101, 5, 5)

In [48]:
def cond_prob(b_infty,B,b):
    ans = np.array([b_infty.T.dot(B[x]).dot(b) for x in xrange(len(B))])
    return ans / sum(ans)

In [49]:
sequence[0]


Out[49]:
50

In [50]:
bees = np.zeros((len(sequence),len(b_1)))
bees[0] = b_1
for i in range(1,len(bees)):
    bees[i] = next_b(sequence[i-1],bees[i-1],b_infty,B)

In [51]:
pl.plot(b_1)


Out[51]:
[<matplotlib.lines.Line2D at 0x114841dd0>]

In [52]:
i = 1
y = cond_prob(b_infty,B,bees[i])[:-1]
y = np.log(y-np.min(y)+0.1)
pl.plot(y)
#pl.plot([np.mean(y)]*len(y))
np.mean(y)
#pl.vlines(sequence[0],np.min(y),np.max(y),linestyles='--')
x = range(sequence[i]-jump,sequence[i]+jump)
pl.plot(x,[np.mean(y)]*len(x))


Out[52]:
[<matplotlib.lines.Line2D at 0x1148459d0>]

In [53]:
T = transition_count_matrix(big_seq)
T= (T.T / np.sum(T,1)).T
sum(T[0])


Out[53]:
1.0000000000000002

In [54]:
cond_probs_msm = np.array([T[x] for x in sequence])

In [55]:
pl.plot(cond_probs_msm[0])


Out[55]:
[<matplotlib.lines.Line2D at 0x114913450>]

In [56]:
for i in range(10):
    pl.plot(cond_probs_msm[i*10])



In [57]:
for i in range(1,10):
    pl.plot(cond_prob(b_infty,B,bees[i*10]))



In [58]:
for i in range(1,100):
    pl.plot(cond_prob(b_infty,B,bees[i]))



In [59]:
for i in range(1,500):
    pl.plot(cond_prob(b_infty,B,bees[i]))



In [60]:
for i in range(1,500):
    pl.plot(cond_prob(b_infty,B,bees[i])[:90],linewidth=5)



In [61]:
cond_probs = np.array([cond_prob(b_infty,B,b) for b in bees])
cond_probs -= np.min(cond_probs)
cond_probs/= np.max(cond_probs)

In [62]:
np.min(cond_probs)


Out[62]:
0.0

In [63]:
np.max(cond_probs)


Out[63]:
1.0

In [64]:
from scipy.stats import entropy

In [65]:
cond_prob_entropies = np.array([entropy(cp) for cp in cond_probs])

In [66]:
print(cond_prob_entropies.shape)


(1000,)

In [67]:
print(cond_prob_entropies.min())


4.36272696201

In [68]:
print(cond_prob_entropies.max())


4.60761179674

In [69]:
pl.plot(cond_prob_entropies)


Out[69]:
[<matplotlib.lines.Line2D at 0x1149ac190>]

In [70]:
pl.hist(cond_prob_entropies,bins=50);



In [71]:
P_21 = covariance(sequence)

In [72]:
svd = np.linalg.svd(P_21)

In [73]:
pl.plot(svd[1]) # n=300, jump = 50?


Out[73]:
[<matplotlib.lines.Line2D at 0x114c689d0>]

In [74]:
P_31 = trivariance(sequence)

In [77]:
for i in [1]:
    svd = np.linalg.svd(P_31[i])
    pl.plot(range(1,n+1),svd[1])
#pl.xlim((0,100))


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-77-9df7565ccfcd> in <module>()
      1 for i in [1]:
      2     svd = np.linalg.svd(P_31[i])
----> 3     pl.plot(range(1,n+1),svd[1])
      4 #pl.xlim((0,100))

ValueError: x and y must have same first dimension

In [76]:
P_31[0]/np.sum(P_31,axis=0)


-c:1: RuntimeWarning: invalid value encountered in divide
Out[76]:
array([[ 0.85714286,  0.625     ,         nan, ...,         nan,
                nan,         nan],
       [ 0.66666667,  0.        ,  0.        , ...,         nan,
                nan,         nan],
       [ 0.5       ,  0.        ,  0.        , ...,         nan,
                nan,         nan],
       ..., 
       [        nan,         nan,         nan, ...,         nan,
                nan,  0.        ],
       [        nan,         nan,         nan, ...,  0.        ,
         0.        ,         nan],
       [        nan,         nan,         nan, ...,  0.        ,
                nan,         nan]])

In [278]:
x = npr.randn(10,2)
np.sum(x,axis=1)


Out[278]:
array([-0.18352912,  1.23447102,  1.2849898 ,  1.54009921, -1.22542266,
       -0.48128584,  2.42779155, -0.61657985, -1.84252485,  0.29169065])

In [ ]:
k = 10

Notes:

  • slices of the "trivariance" matrix have very-low-rank structure (quick numerical check after this list)
  • the covariance matrix has low-rank structure when the underlying process is fast-mixing: I would have expected the opposite
  • the spectral learning algorithm requires $m\leq n$, but in the paper it looks like that isn't the case for RR-HMMs; not sure what I'm missing, but it seems to be a constraint of RR-HMMs as well
  • The rank of $B_x$ seems to depend on the mixing speed of the underlying chain as well: slow-mixing processes are lower-rank
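
A quick way to make the first two notes concrete (a sketch that reuses the P_21 and P_31 computed a few cells above; matrix_rank uses an SVD tolerance, so treat the numbers as rough):

In [ ]:
# numerical rank of the pair matrix and of a few slices of the triple matrix
print(np.linalg.matrix_rank(P_21))
for x in [1, 10, 50]: # a few observation symbols
    print(np.linalg.matrix_rank(P_31[x]))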

In [101]:
U = svd[0][:,:k]

In [95]:
U.shape


Out[95]:
(200, 10)

In [100]:
U,s,V = np.linalg.svd(npr.randn(100,100))
U.shape,s.shape,V.shape


Out[100]:
((100, 100), (100,), (100, 100))

In [102]:
P_1 = np.random.randn(200)

In [104]:
U.T.dot(P_1)


Out[104]:
array([ 1.79811368, -1.23668019, -0.56738238, -0.08762806, -1.13696033,
        1.15565864,  0.00633177, -0.02479007,  0.35536339,  0.24238011])
  • Next: use learned parameters for prediction and scoring

In [ ]:
# baum-welch learning for HMM

Next steps:

  • Hilbert-space embeddings of HMMs
  • Regularization scheme?
  • How to impose reversibility of $T$? (one simple option sketched below)
    • Ooh, that leads to a sweet acronym: $R^3$-HMM (reversible, reduced-rank hidden Markov model)
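
On the reversibility question, one simple (if crude) option borrowed from standard MSM estimation is to symmetrize the transition counts before row-normalizing; the resulting transition matrix satisfies detailed balance with respect to the empirical state frequencies. A sketch (reusing transition_count_matrix and big_seq from above; assumes every state was visited at least once):

In [ ]:
def reversible_T(sequence):
    C = transition_count_matrix(sequence)    # raw transition counts
    C_sym = 0.5 * (C + C.T)                  # symmetrized counts
    pi_hat = C_sym.sum(axis=1) / C_sym.sum() # stationary distribution of the reversible estimate
    T_rev = (C_sym.T / C_sym.sum(axis=1)).T  # row-stochastic transition matrix
    return T_rev, pi_hat

T_rev, pi_hat = reversible_T(big_seq)
# detailed balance check: pi_i T[i,j] == pi_j T[j,i]
print(np.allclose(pi_hat[:, None] * T_rev, (pi_hat[:, None] * T_rev).T))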

In [78]:
# next: apply to actual data!

In [79]:
from __future__ import print_function
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm

from msmbuilder.decomposition import tICA
from msmbuilder.example_datasets import fetch_met_enkephalin
from msmbuilder.featurizer import AtomPairsFeaturizer
from sklearn.pipeline import Pipeline

%matplotlib inline

In [80]:
dataset = fetch_met_enkephalin()
print(dataset.DESCR)


The dataset consists of ten ~50 ns molecular dynamics (MD) simulation
trajectories of the 5 residue Met-enkaphalin peptide. The aggregate
sampling is 499.58 ns. Simulations were performed starting from the 1st
model in the 1PLX PDB file, solvated with 832 TIP3P water molecules using
OpenMM 6.0. The coordinates (protein only -- the water was stripped)
are saved every 5 picoseconds. Each of the ten trajectories is roughly
50 ns long and contains about 10,000 snapshots.

Forcefield: amber99sb-ildn; water: tip3p; nonbonded method: PME; cutoffs:
1nm; bonds to hydrogen were constrained; integrator: langevin dynamics;
temperature: 300K; friction coefficient: 1.0/ps; pressure control: Monte
Carlo barostat (interval of 25 steps); timestep 2 fs.

The dataset is available on figshare at

http://dx.doi.org/10.6084/m9.figshare.1026324


In [81]:
from itertools import combinations
heavy_atoms = dataset.trajectories[0].topology.select_atom_indices('heavy')
heavy_pairs = list(combinations(heavy_atoms, 2))

In [82]:
traj = dataset.trajectories

In [83]:
feat = AtomPairsFeaturizer(heavy_pairs)
featurized = feat.fit_transform(traj)

In [84]:
feat_stack = np.vstack(featurized)

In [88]:
import msmbuilder as mb
from msmbuilder.msm import MarkovStateModel

In [94]:
pipeline1 = Pipeline([
    ('feat', AtomPairsFeaturizer(heavy_pairs)),
    ('tica', tICA(gamma=0, n_components=2)),
    ('msm', MarkovStateModel())
])

In [96]:
# gah don't want to fix this at the moment
#pipeline1.fit_transform(traj)

In [98]:
dataset = mb.example_datasets.AlanineDipeptide().get()

In [99]:
traj = dataset.trajectories

In [102]:
topology = traj[0].topology

In [105]:
# from http://msmbuilder.org/latest/examples/hmm-and-msm.html
indices = [atom.index for atom in topology.atoms if atom.element.symbol in ['C', 'O', 'N']]
featurizer = mb.featurizer.SuperposeFeaturizer(indices, traj[0][0])
sequences = featurizer.transform(traj)

In [108]:
from msmbuilder import cluster

In [121]:
n = 10
assignments = cluster.KCenters(n_clusters=n).fit_predict([np.vstack(sequences)])

In [123]:
assignments[0].shape


Out[123]:
(99999,)

In [125]:
params = learn_RR_HMM(assignments[0],5)

In [131]:
sequence = assignments[0]

In [126]:
len(params)


Out[126]:
3

In [133]:
import rr_hmm

In [136]:
b_1,b_infty,B = rr_hmm.learn_RR_HMM(sequence,5)

In [139]:
rr_hmm = 1

In [149]:
reload(rr_hmm)


Out[149]:
<module 'rr_hmm' from 'rr_hmm.py'>

In [143]:
model = rr_hmm.RR_HMM(sequence,5)

In [146]:
model.B


Out[146]:
array([[[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -4.52600000e+03,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -3.10000000e+01,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -3.24400000e+03,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -1.60000000e+01,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -9.20000000e+01,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -8.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -2.70000000e+01,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]]])

In [ ]: