• For continuous observations, use KDE (are there better Bayesian nonparametric methods for this purpose?)
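
A minimal sketch of what that might look like, using scipy.stats.gaussian_kde to fit one emission density per hidden state (the states and samples below are made up purely for illustration):

In [ ]:
import numpy as np
import numpy.random as npr
from scipy.stats import gaussian_kde

# hypothetical continuous observations, grouped by (known) hidden state
obs_by_state = {0: npr.randn(500), 1: 2.0 + 0.5 * npr.randn(500)}

# one KDE per state; kde(x) then plays the role of the emission likelihood Pr[y | state]
kdes = {s: gaussian_kde(samples) for s, samples in obs_by_state.items()}
print(kdes[0](np.array([0.0, 2.0]))) # density of state-0 emissions at two test points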

In [1]:
# spectral learning for HMM

In [2]:
import numpy as np
import numpy.random as npr
import pylab as pl
from scipy import stats
import cython

%load_ext cython
%matplotlib inline

In [3]:
# Walker alias method implementation
# adapted from https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/

def alias_setup(probs):
    K       = len(probs)
    q       = np.zeros(K)
    J       = np.zeros(K, dtype=int)
 
    # Sort the data into the outcomes with probabilities
    # that are larger and smaller than 1/K.
    smaller = []
    larger  = []
    for kk, prob in enumerate(probs):
        q[kk] = K*prob
        if q[kk] < 1.0:
            smaller.append(kk)
        else:
            larger.append(kk)
 
    # Loop through and create little binary mixtures that
    # appropriately allocate the larger outcomes over the
    # overall uniform mixture.
    while len(smaller) > 0 and len(larger) > 0:
        small = smaller.pop()
        large = larger.pop()
 
        J[small] = large
        q[large] = q[large] - (1.0 - q[small])
 
        if q[large] < 1.0:
            smaller.append(large)
        else:
            larger.append(large)
 
    return (J, q)

In [4]:
%%cython
def c_alias_draw(J, q, K, rand1, rand2):
 
    # Draw from the overall uniform mixture.
    kk = int(rand1*K)
 
    # Draw from the binary mixture, either keeping the
    # small one, or choosing the associated larger one.
    if rand2 < q[kk]:
        return kk
    else:
        return J[kk]

In [5]:
class Alias():
    def __init__(self,probs):
        self.probs = probs
        self.J,self.q = alias_setup(probs)
        self.K = len(probs)
    
    def draw(self):
        return c_alias_draw(self.J,self.q,self.K,rand1=npr.rand(),rand2=npr.rand())
    
    def __repr__(self):
        return "Alias(" + str(self.probs / sum(self.probs)) + ")"

In [6]:
class MSM():
    def __init__(self,T=None,size=None):
        if T is None:
            T = self.gen_stoch_matrix(size)
        else:
            assert(T.shape[0] == T.shape[1])
        self.T = T
        self.sampler = np.empty(len(T),dtype=object)
        for i in range(len(T)):
            self.sampler[i] = Alias(T[i])
    
    def draw(self,num_samples=1000):
        samples = np.zeros(num_samples,dtype=int)
        samples[0] = npr.randint(len(self.T))
        for i in range(1,num_samples):
            samples[i] = self.sampler[samples[i-1]].draw()
        return samples
    
    def gen_stoch_matrix(self,size=5,num_iter=20):
        " Generates a random row-stochastic matrix "
        T = npr.rand(size,size)
        for i in range(size):
            T[i] /= sum(T[i])
        #note: doesn't satisfy detailed balance
        return T
    
    def compute_stationary_prob(self):
        " Stationary distribution: left eigenvector of T with eigenvalue 1 "
        evals, evecs = np.linalg.eig(self.T.T)
        stationary = np.real(evecs[:, np.argmax(np.real(evals))])
        return stationary / stationary.sum()
    
    def __repr__(self):
        return str(len(self.T)) + "-state MSM"

In [7]:
# from eq. 5.38 of MSM book
example_T = np.array([[0.86207, 0.12931, 0.00862],[0.15625, 0.83333, 0.01041],[0.00199, 0.00199, 0.99602]])

In [8]:
m = MSM(example_T)
observed_sequence = m.draw(1000000)

In [9]:
# random symmetric matrix
n = 100
sym_T = npr.rand(n,n)
for i in xrange(n):
    for j in xrange(i):
        sym_T[j,i] = sym_T[i,j]

In [10]:
def observations(states,obs_size=10):
    " Deterministic emission: map each state to the largest i < obs_size that divides it (state 0 maps to 0) "
    obs = np.array(states)
    for i in range(1,obs_size):
        obs[states % i == 0] = i
    obs[states == 0] = 0
    return obs

In [11]:
m = MSM(sym_T)
hidden_sequence = m.draw(10000)

In [12]:
observed_sequence = observations(hidden_sequence)
print(len(observed_sequence))
set(observed_sequence)


10000
Out[12]:
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

In [13]:
def transition_count_matrix(sequence):
    n = len(set(sequence)) # number of states
    C = np.zeros((n,n),dtype=int)
    for t in range(1,len(sequence)):
        i,j = sequence[t-1:t+1]
        C[i,j] += 1
    return C

In [14]:
observed_counts = transition_count_matrix(observed_sequence)
observed_counts.shape


Out[14]:
(10, 10)

In [15]:
def triplet_count_matrix(sequence):
    n = len(set(sequence)) # number of states
    C = np.zeros((n,n,n),dtype=int)
    for t in range(2,len(sequence)):
        i,y,j = sequence[t-2:t+1]
        C[y,i,j] += 1
    return C

In [16]:
observed_triplets = transition_count_matrix(observed_sequence) # note: this uses the pairwise counter, so the result below is 2-D

In [17]:
observed_triplets.shape


Out[17]:
(10, 10)

In [18]:
observed_triplets


Out[18]:
array([[  1,  24,   3,   4,   8,  14,   9,   7,  12,  12],
       [ 22, 469, 270, 176, 136, 235, 139, 263, 242, 240],
       [ 10, 254, 135,  95,  67, 125,  61, 161, 135, 150],
       [  5, 193, 103,  56,  51,  91,  52,  95,  71,  74],
       [  8, 138,  84,  43,  23,  80,  24,  77,  65,  62],
       [ 15, 229, 137,  77,  75, 135,  63, 145, 108, 135],
       [  7, 136,  69,  60,  46,  60,  29,  59,  69,  65],
       [ 13, 277, 133,  99,  72, 151,  67, 156, 131, 133],
       [  8, 232, 129,  97,  68, 106,  73, 130, 123, 109],
       [  5, 240, 130,  84,  59, 122,  82, 139, 119, 119]])

In [19]:
#spectral method:

In [20]:
np.linalg.svd?

In [21]:
svd = np.linalg.svd(observed_counts)

In [22]:
svd[0].shape


Out[22]:
(10, 10)

Problem set-up:

  • Observe a sequence $Y = [y_1,y_2,y_3,\dots,y_\tau]$
  • Assume a hidden sequence $X = [x_1,x_2,x_3,\dots,x_\tau]$ that explains the observations
    • Hidden variable is discrete and Markovian

To learn an HMM from data, we can use EM or a spectral method:

  • Expectation-Maximization (Baum-Welch)
    • MLE
    • Local optima
    • Statistically and computationally inefficient for large hidden state space
  • Spectral method
    • Closed-form
    • Consistent, finite sample bounds
    • No local optima
    • Small loss in statistical efficiency

Goals in Siddiqi et al.:

  • Generalize spectral learning algorithm to larger class of models
  • Supply tighter finite sample bounds
  • Apply algorithm to high-dimensional data

Reduced-rank HMMs (RR-HMMs) are a special case of predictive state representations (PSRs); full-rank HMMs are in turn a special case of RR-HMMs

HMM Definition

States

  • $m$: number of discrete states
  • $n$: number of discrete observations

Transitions

  • $T$: $m\times m$ column-stochastic transition matrix
  • $T_{i,j} = \text{Pr}[x_{t+1} = i \mid x_t = j]$
  • $T_{i,j}$ is the probability of transitioning from hidden state $j$ to $i$

Emissions

  • $O$: $n\times m$ column-stochastic observation matrix
  • $O_{i,j} = \text{Pr}[y_t = i \mid x_t = j]$
  • $O_{i,j}$ is the probability of observing $i$ when the hidden state is $j$

Prior

  • $\pi$: $m \times 1$ prior distribution over states, or equivalently, the initial distribution
  • $\pi_i = \text{Pr}[x_1 = i]$

Observable operators

  • For each $y \in \{ 1,\dots,n\}$ define an $m \times m$ matrix $A_y = T \text{diag}(O_{y,\cdot})$ $$ [A_y]_{i,j} \equiv \text{Pr} [x_{t+1} = i \wedge y_t = y \mid x_t = j] $$
  • Entrywise, $A_y$ factors as $[A_y]_{i,j} = \underbrace{\text{Pr}[x_{t+1} = i \mid x_t = j]}_{\substack{\text{transition}\\ \text{probability}}} \, \underbrace{\text{Pr}[y_t = y \mid x_t = j]}_{\substack{\text{observation}\\ \text{likelihood}}}$

Inference in HMMs: the probability of an observation sequence can be computed either by summing over hidden paths or, more compactly, as a product of observable operators $$\text{Pr}[y_1,y_2,\dots,y_\tau] = \sum_{x_{\tau+1}} \sum_{x_\tau} \text{Pr}[x_{\tau+1} \mid x_\tau]\, \text{Pr}[y_\tau \mid x_\tau] \cdots \sum_{x_1} \text{Pr}[x_2 \mid x_1]\, \text{Pr}[y_1 \mid x_1]\, \pi_{x_1} = \mathbf{1}^{\text{T}} A_{y_\tau} \cdots A_{y_1} \pi$$
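
As a quick self-contained check of that identity (a sketch with a toy $T$, $O$, $\pi$, not the data used elsewhere in this notebook), the probability of a short observation sequence computed by brute-force summation over hidden paths should match the observable-operator product:

In [ ]:
import numpy as np
import numpy.random as npr
from itertools import product

m, n = 3, 4 # hidden states, observation symbols
npr.seed(0)
T = npr.rand(m, m); T /= T.sum(axis=0) # column-stochastic: T[i,j] = Pr[x_{t+1}=i | x_t=j]
O = npr.rand(n, m); O /= O.sum(axis=0) # column-stochastic: O[i,j] = Pr[y_t=i | x_t=j]
pi = npr.rand(m); pi /= pi.sum()       # initial state distribution

A = [T.dot(np.diag(O[y])) for y in range(n)] # observable operators A_y = T diag(O_{y,.})

ys = [2, 0, 3, 1] # an arbitrary observation sequence

# operator form: 1^T A_{y_tau} ... A_{y_1} pi
vec = pi.copy()
for y in ys:
    vec = A[y].dot(vec)
p_operator = vec.sum()

# brute force: sum the joint probability over all hidden paths
p_brute = 0.0
for path in product(range(m), repeat=len(ys)):
    p = pi[path[0]]
    for t in range(len(ys)):
        p *= O[ys[t], path[t]]
        if t + 1 < len(ys):
            p *= T[path[t + 1], path[t]]
    p_brute += p

print(np.allclose(p_operator, p_brute)) # True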

Spectral learning for HMM parameters

Goal:

Recover observable HMM parameters from probabilities of doubles and triples of observations

  • Define $ [P_{2,1}]_{i,j} \triangleq \Pr[y_2 = i, y_1 = j]$
    • Define $ [P_{3,y,1}]_{i,j} \triangleq \Pr[y_3 = i, y_2 = y, y_1 = j]$
  • Factor into HMM parameters: $$\newcommand{\diag}{\text{diag}} \newcommand{\T}{\text{T}} \begin{align} P_{2,1} &= OT \diag (\pi) O^\T\\ P_{3,y,1} &= O A_y T \diag (\pi) O^\T \text{ where } A_y \triangleq T \diag (O_{y,\cdot}) \end{align}$$
  • Pick $U$ such that $(U^\T O)$ is invertible
  • Define $B_y \triangleq (U^\T P_{3,y,1}) (U^\T P_{2,1})^\dagger = (U^\T O) A_y (U^\T O)^{-1}$. $B_y$ is the true observable operator $A_y$ up to a similarity transform by $(U^\T O)$ (see the sanity check after the algorithm below)

Algorithm:

  • Use triples of observations $\langle y_1, y_2, y_3 \rangle$ to estimate frequencies $\hat{P}_{2,1}$ and $\hat{P}_{3,y,1}$
  • Compute singular value decomposition (SVD) of $\hat{P}_{2,1}$ to find a matrix of the top $m$ singular vectors $\hat{U}$
  • Find observable operators $\hat{B}_y = (\hat{U}^\T \hat{P}_{3,y,1})(\hat{U}^\T \hat{P}_{2,1})^\dagger$
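
As a sanity check on the similarity-transform claim above, here is a sketch using exact population moments from a toy HMM (rather than empirical frequencies), verifying that $B_y = (U^\T O) A_y (U^\T O)^{-1}$:

In [ ]:
import numpy as np
import numpy.random as npr

m, n = 3, 5
npr.seed(0)
T = npr.rand(m, m); T /= T.sum(axis=0) # column-stochastic transitions
O = npr.rand(n, m); O /= O.sum(axis=0) # column-stochastic observations
pi = npr.rand(m); pi /= pi.sum()       # initial state distribution

A = [T.dot(np.diag(O[y])) for y in range(n)] # true observable operators
P_21 = O.dot(T).dot(np.diag(pi)).dot(O.T)    # exact Pr[y_2 = i, y_1 = j]
P_3y1 = [O.dot(A[y]).dot(T).dot(np.diag(pi)).dot(O.T) for y in range(n)]

U = np.linalg.svd(P_21)[0][:, :m] # top-m left singular vectors
for y in range(n):
    B_y = U.T.dot(P_3y1[y]).dot(np.linalg.pinv(U.T.dot(P_21)))
    print(np.allclose(B_y, U.T.dot(O).dot(A[y]).dot(np.linalg.inv(U.T.dot(O))))) # True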

For RR-HMMs

  • The rank of $P_{2,1}$ and $P_{3,y,1}$ depends on the factors $R$ ($m \times k$) and $S$ ($k \times m$) of the low-rank transition matrix $T = RS$, since $$ P_{2,1} = OT \diag (\pi) O^\T = O R S \diag (\pi) O^\T$$ has rank at most $k$
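
A quick numerical illustration of that claim, assuming the usual RR-HMM factorization $T = RS$ (a sketch with an arbitrary rank-$k$ transition matrix, not the notebook's data):

In [ ]:
import numpy as np
import numpy.random as npr

m, n, k = 10, 15, 3
npr.seed(1)
R = npr.rand(m, k); R /= R.sum(axis=0) # columns of R are distributions over hidden states
S = npr.rand(k, m); S /= S.sum(axis=0) # columns of S are distributions over the k factors
T = R.dot(S)                           # column-stochastic and rank <= k
O = npr.rand(n, m); O /= O.sum(axis=0)
pi = npr.rand(m); pi /= pi.sum()

P_21 = O.dot(T).dot(np.diag(pi)).dot(O.T)
print(np.linalg.matrix_rank(T))    # k
print(np.linalg.matrix_rank(P_21)) # also k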

Algorithm: Learn-RR-HMM($k,N$):

  1. Compute empirical estimates of $\hat{P}_1, \hat{P}_{2,1}, \hat{P}_{3,x,1}$ (for $x = 1,\dots,n$)
  2. Compute SVD of $\hat{P}_{2,1}$
  3. $\hat{U} \leftarrow $ matrix of left singular vectors corresponding to the $k$ largest singular values
  4. Compute model parameter estimates
    • $\hat{b}_1 = \hat{U}^\T \hat{P}_1$
    • $\hat{b}_\infty = (\hat{P}_{2,1}^\T \hat{U})^\dagger \hat{P}_1$
    • $\hat{B}_x = \hat{U}^\T \hat{P}_{3,x,1} (\hat{U}^\T \hat{P}_{2,1})^\dagger$ (for $x = 1,\dots,n$)

In [23]:
def transition_count_matrix(sequence):
    n = np.max(sequence)+1 # number of states
    C = np.zeros((n,n))
    for t in xrange(1,len(sequence)):
        i,j = sequence[t-1:t+1]
        C[i,j] += 1
    return C

In [24]:
def covariance(sequence):
    # row-normalized transition counts: entry (i,j) estimates Pr[y_2 = j | y_1 = i]
    P_21 = transition_count_matrix(sequence)
    for i in xrange(len(P_21)):
        P_21[i] /= sum(P_21[i])
    return P_21

In [25]:
def trivariance(sequence):
    n = np.max(sequence)+1
    P_31 = np.zeros((n,n,n))
    for t in xrange(2,len(sequence)):
        i,x,j = sequence[t-2:t+1]
        P_31[x,i,j] += 1
    #for x in xrange(n):
    #    P_31[x] /= np.sum(P_31[x],axis=1)
    return P_31

In [26]:
def learn_RR_HMM(sequence,k):
    n = np.max(sequence)+1
    
    # compute empirical estimate of P_1
    P_1 = np.zeros(n)
    for i in xrange(len(sequence)):
        P_1[sequence[i]] += 1
    P_1 /= sum(P_1)
    
    # compute empirical estimate of P_21
    P_21 = covariance(sequence)
    
    # compute empirical estimates of P_3x1
    P_31 = trivariance(sequence)
    
    # compute SVD of P_21
    svd = np.linalg.svd(P_21)
    U = svd[0][:,:k]
    UT = U.T
    
    # compute model parameters
    b_1 = UT.dot(P_1)
    b_infty = np.linalg.pinv((P_21.T.dot(U))).dot(P_1)
    
    B = np.zeros((n,k,k))
    UTP21 = np.linalg.pinv(UT.dot(P_21))
    for x in xrange(n):
        B[x] = (UT.dot(P_31[x])).dot(UTP21)
    return b_1,b_infty,B

In [27]:
sequence = npr.randint(0,10,10000)
b_1,b_infty,B = learn_RR_HMM(sequence,5)

In [28]:
# k-nearest neighbor random walk
sequence = np.zeros(100000)
n = 100
jump=5
sequence[0] = npr.randint(n)
for i in range(1,len(sequence)):
    j=npr.randint(jump*2 + 1)-jump
    sequence[i] = min(abs(sequence[i-1] + j),n)
sequence = np.array(sequence,dtype=int)

In [29]:
# random walk with hidden state
length = 1000
hidden_states=5
observed_states=100
sequence = np.zeros(length)
h = np.zeros(length)
jump_prob = 0.1
h[0] = npr.randint(0,hidden_states)
for i in range(1,length):
    if npr.rand() < jump_prob:
        h[i] = (h[i-1] + npr.randint(2)*2-1) % hidden_states
    else:
        h[i] = h[i-1] # hidden state persists when there is no jump
sequence[0] = round(observed_states/2)
mid_h = round(hidden_states/2)
for i in range(1,length):
    sequence[i] = (sequence[i-1] + round(npr.randn()*(h[i]+1)))
    if sequence[i] > observed_states or sequence[i] < 0:
        sequence[i] = sequence[i-1]
sequence = np.array(sequence,dtype=int)

In [30]:
sequences = []
for i in range(100):
    n_sequence = np.zeros(length)
    n_sequence[0] = npr.randint(0,observed_states)
    mid_h = npr.randint(0,hidden_states)
    for i in range(1,length):
        n_sequence[i] = (n_sequence[i-1] + round(npr.randn()*(h[i]+1)))
        if n_sequence[i] > observed_states or n_sequence[i] < 0:
            n_sequence[i] = n_sequence[i-1]
    n_sequence = np.array(n_sequence,dtype=int)
    
    sequences.append(n_sequence)

In [31]:
big_seq = np.hstack(sequences)
big_seq.shape


Out[31]:
(100000,)

In [32]:
sequence[1]


Out[32]:
50

In [33]:
pl.plot(h)


Out[33]:
[<matplotlib.lines.Line2D at 0x110ebd250>]

In [34]:
pl.plot(sequence)


Out[34]:
[<matplotlib.lines.Line2D at 0x11280d850>]

In [35]:
b_1,b_infty,B = learn_RR_HMM(big_seq,5)

In [36]:
%timeit learn_RR_HMM(sequence,5)


100 loops, best of 3: 6.31 ms per loop

In [37]:
B[0].shape


Out[37]:
(5, 5)

In [38]:
b_infty.shape


Out[38]:
(5,)

In [39]:
b_infty.T.dot(B[0]).dot(b_1)


Out[39]:
0.15271026611143881

In [40]:
b_infty[-len(b_infty)] == b_infty[0]


Out[40]:
True

In [41]:
def score(sequence,b_1,b_infty,B):
    # applies the observable operators for the last len(B)-1 symbols of the sequence
    # (len(B) is the number of observation symbols, not the sequence length)
    ans = b_infty.T
    for i in range(1,len(B)):
        ans = ans.dot(B[sequence[-i]])
    return ans.dot(b_1)

In [42]:
score(sequence, b_1, b_infty, B)


Out[42]:
6.8754842705539447e+164

In [43]:
score(npr.randint(0,n,len(sequence)),b_1, b_infty, B)


Out[43]:
7.0091990675043781e+88
$$\begin{align} \newcommand{\bti}{\hat{b}^T_\infty} \newcommand{\hB}{\hat{B}} \newcommand{\hb}{\hat{b}} \hat{\text{Pr}}[x_1,\dots,x_t] & = \bti \hB_{x_t} \dots \hB_{x_1}\hb_1\\ \hb_{t+1} & = \frac{\hB_{x_t} \hb_t }{\bti \hB_{x_t} \hb_t}\\ \hat{\text{Pr}}[x_t \mid x_{1:t-1}] & = \frac{\bti \hB_{x_t} \hb_t}{\sum_x \bti \hB_x \hb_t} \end{align}$$

In [44]:
def next_b(x_t,b_t,b_infty,B):
    return B[x_t].dot(b_t)/(b_infty.T.dot(B[x_t]).dot(b_t))

In [45]:
b_1


Out[45]:
array([-0.09904614,  0.01768298,  0.00090685, -0.00131447, -0.00370897])

In [46]:
next_b(sequence[0],b_1,b_infty,B)


Out[46]:
array([ -9.27558865,   1.57588227,  11.99321289,  -0.78687568, -11.19151555])

In [47]:
B.shape


Out[47]:
(101, 5, 5)

In [48]:
def cond_prob(b_infty,B,b):
    ans = np.array([b_infty.T.dot(B[x]).dot(b) for x in xrange(len(B))])
    return ans / sum(ans)

In [49]:
sequence[0]


Out[49]:
50

In [50]:
bees = np.zeros((len(sequence),len(b_1)))
bees[0] = b_1
for i in range(1,len(bees)):
    bees[i] = next_b(sequence[i-1],bees[i-1],b_infty,B)

In [51]:
pl.plot(b_1)


Out[51]:
[<matplotlib.lines.Line2D at 0x114841dd0>]

In [52]:
i = 1
y = cond_prob(b_infty,B,bees[i])[:-1]
y = np.log(y-np.min(y)+0.1)
pl.plot(y)
#pl.plot([np.mean(y)]*len(y))
np.mean(y)
#pl.vlines(sequence[0],np.min(y),np.max(y),linestyles='--')
x = range(sequence[i]-jump,sequence[i]+jump)
pl.plot(x,[np.mean(y)]*len(x))


Out[52]:
[<matplotlib.lines.Line2D at 0x1148459d0>]

In [53]:
T = transition_count_matrix(big_seq)
T= (T.T / np.sum(T,1)).T
sum(T[0])


Out[53]:
1.0000000000000002

In [54]:
cond_probs_msm = np.array([T[x] for x in sequence])

In [55]:
pl.plot(cond_probs_msm[0])


Out[55]:
[<matplotlib.lines.Line2D at 0x114913450>]

In [56]:
for i in range(10):
    pl.plot(cond_probs_msm[i*10])



In [57]:
for i in range(1,10):
    pl.plot(cond_prob(b_infty,B,bees[i*10]))



In [58]:
for i in range(1,100):
    pl.plot(cond_prob(b_infty,B,bees[i]))



In [59]:
for i in range(1,500):
    pl.plot(cond_prob(b_infty,B,bees[i]))



In [60]:
for i in range(1,500):
    pl.plot(cond_prob(b_infty,B,bees[i])[:90],linewidth=5)



In [61]:
cond_probs = np.array([cond_prob(b_infty,B,b) for b in bees])
cond_probs -= np.min(cond_probs)
cond_probs/= np.max(cond_probs)

In [62]:
np.min(cond_probs)


Out[62]:
0.0

In [63]:
np.max(cond_probs)


Out[63]:
1.0

In [64]:
from scipy.stats import entropy

In [65]:
cond_prob_entropies = np.array([entropy(cp) for cp in cond_probs])

In [66]:
print(cond_prob_entropies.shape)


(1000,)

In [67]:
print(cond_prob_entropies.min())


4.36272696201

In [68]:
print(cond_prob_entropies.max())


4.60761179674

In [69]:
pl.plot(cond_prob_entropies)


Out[69]:
[<matplotlib.lines.Line2D at 0x1149ac190>]

In [70]:
pl.hist(cond_prob_entropies,bins=50);



In [71]:
P_21 = covariance(sequence)

In [72]:
svd = np.linalg.svd(P_21)

In [73]:
pl.plot(svd[1]) # n=300, jump = 50?


Out[73]:
[<matplotlib.lines.Line2D at 0x114c689d0>]

In [74]:
P_31 = trivariance(sequence)

In [77]:
for i in [1]:
    svd = np.linalg.svd(P_31[i])
    pl.plot(range(1,n+1),svd[1])
#pl.xlim((0,100))


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-77-9df7565ccfcd> in <module>()
      1 for i in [1]:
      2     svd = np.linalg.svd(P_31[i])
----> 3     pl.plot(range(1,n+1),svd[1])
      4 #pl.xlim((0,100))

ValueError: x and y must have same first dimension

In [76]:
P_31[0]/np.sum(P_31,axis=0)


-c:1: RuntimeWarning: invalid value encountered in divide
Out[76]:
array([[ 0.85714286,  0.625     ,         nan, ...,         nan,
                nan,         nan],
       [ 0.66666667,  0.        ,  0.        , ...,         nan,
                nan,         nan],
       [ 0.5       ,  0.        ,  0.        , ...,         nan,
                nan,         nan],
       ..., 
       [        nan,         nan,         nan, ...,         nan,
                nan,  0.        ],
       [        nan,         nan,         nan, ...,  0.        ,
         0.        ,         nan],
       [        nan,         nan,         nan, ...,  0.        ,
                nan,         nan]])

In [278]:
x = npr.randn(10,2)
np.sum(x,axis=1)


Out[278]:
array([-0.18352912,  1.23447102,  1.2849898 ,  1.54009921, -1.22542266,
       -0.48128584,  2.42779155, -0.61657985, -1.84252485,  0.29169065])

In [ ]:
k = 10

Notes:

  • slices of the "trivariance" matrix have very-low-rank structure (quick numerical check after this list)
  • the covariance matrix has low-rank structure when the underlying process is fast-mixing: I would have expected the opposite
  • the spectral learning algorithm requires $m\leq n$, but in the paper it looks like that isn't the case for RR-HMMs; not sure what I'm missing, but it seems to be a constraint of RR-HMMs as well
  • The rank of $B_x$ seems to depend on the mixing speed of the underlying chain as well: slow-mixing processes are lower-rank
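
A quick way to make the first two notes concrete (a sketch that reuses the P_21 and P_31 computed a few cells above; matrix_rank uses an SVD tolerance, so treat the numbers as rough):

In [ ]:
# numerical rank of the pair matrix and of a few slices of the triple matrix
print(np.linalg.matrix_rank(P_21))
for x in [1, 10, 50]: # a few observation symbols
    print(np.linalg.matrix_rank(P_31[x]))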

In [101]:
U = svd[0][:,:k]

In [95]:
U.shape


Out[95]:
(200, 10)

In [100]:
U,s,V = np.linalg.svd(npr.randn(100,100))
U.shape,s.shape,V.shape


Out[100]:
((100, 100), (100,), (100, 100))

In [102]:
P_1 = np.random.randn(200)

In [104]:
U.T.dot(P_1)


Out[104]:
array([ 1.79811368, -1.23668019, -0.56738238, -0.08762806, -1.13696033,
        1.15565864,  0.00633177, -0.02479007,  0.35536339,  0.24238011])
  • Next: use learned parameters for prediction and scoring

In [ ]:
# baum-welch learning for HMM

Next steps:

  • Hilbert-space embeddings of HMMs
  • Regularization scheme?
  • How to impose reversibility of $T$? (one simple option sketched below)
    • Ooh, that leads to a sweet acronym: $R^3$-HMM (reversible, reduced-rank hidden Markov model)
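
On the reversibility question, one simple (if crude) option borrowed from standard MSM estimation is to symmetrize the transition counts before row-normalizing; the resulting transition matrix satisfies detailed balance with respect to the empirical state frequencies. A sketch (reusing transition_count_matrix and big_seq from above; assumes every state was visited at least once):

In [ ]:
def reversible_T(sequence):
    C = transition_count_matrix(sequence)    # raw transition counts
    C_sym = 0.5 * (C + C.T)                  # symmetrized counts
    pi_hat = C_sym.sum(axis=1) / C_sym.sum() # stationary distribution of the reversible estimate
    T_rev = (C_sym.T / C_sym.sum(axis=1)).T  # row-stochastic transition matrix
    return T_rev, pi_hat

T_rev, pi_hat = reversible_T(big_seq)
# detailed balance check: pi_i T[i,j] == pi_j T[j,i]
print(np.allclose(pi_hat[:, None] * T_rev, (pi_hat[:, None] * T_rev).T))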

In [78]:
# next: apply to actual data!

In [79]:
from __future__ import print_function
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm

from msmbuilder.decomposition import tICA
from msmbuilder.example_datasets import fetch_met_enkephalin
from msmbuilder.featurizer import AtomPairsFeaturizer
from sklearn.pipeline import Pipeline

%matplotlib inline

In [80]:
dataset = fetch_met_enkephalin()
print(dataset.DESCR)


The dataset consists of ten ~50 ns molecular dynamics (MD) simulation
trajectories of the 5 residue Met-enkaphalin peptide. The aggregate
sampling is 499.58 ns. Simulations were performed starting from the 1st
model in the 1PLX PDB file, solvated with 832 TIP3P water molecules using
OpenMM 6.0. The coordinates (protein only -- the water was stripped)
are saved every 5 picoseconds. Each of the ten trajectories is roughly
50 ns long and contains about 10,000 snapshots.

Forcefield: amber99sb-ildn; water: tip3p; nonbonded method: PME; cutoffs:
1nm; bonds to hydrogen were constrained; integrator: langevin dynamics;
temperature: 300K; friction coefficient: 1.0/ps; pressure control: Monte
Carlo barostat (interval of 25 steps); timestep 2 fs.

The dataset is available on figshare at

http://dx.doi.org/10.6084/m9.figshare.1026324


In [81]:
from itertools import combinations
heavy_atoms = dataset.trajectories[0].topology.select_atom_indices('heavy')
heavy_pairs = list(combinations(heavy_atoms, 2))

In [82]:
traj = dataset.trajectories

In [83]:
feat = AtomPairsFeaturizer(heavy_pairs)
featurized = feat.fit_transform(traj)

In [84]:
feat_stack = np.vstack(featurized)

In [88]:
import msmbuilder as mb
from msmbuilder.msm import MarkovStateModel

In [94]:
pipeline1 = Pipeline([
    ('feat', AtomPairsFeaturizer(heavy_pairs)),
    ('tica', tICA(gamma=0, n_components=2)),
    ('msm', MarkovStateModel())
])

In [96]:
# gah don't want to fix this at the moment
#pipeline1.fit_transform(traj)

In [98]:
dataset = mb.example_datasets.AlanineDipeptide().get()

In [99]:
traj = dataset.trajectories

In [102]:
topology = traj[0].topology

In [105]:
# from http://msmbuilder.org/latest/examples/hmm-and-msm.html
indices = [atom.index for atom in topology.atoms if atom.element.symbol in ['C', 'O', 'N']]
featurizer = mb.featurizer.SuperposeFeaturizer(indices, traj[0][0])
sequences = featurizer.transform(traj)

In [108]:
from msmbuilder import cluster

In [121]:
n = 10
assignments = cluster.KCenters(n_clusters=n).fit_predict([np.vstack(sequences)])

In [123]:
assignments[0].shape


Out[123]:
(99999,)

In [125]:
params = learn_RR_HMM(assignments[0],5)

In [131]:
sequence = assignments[0]

In [126]:
len(params)


Out[126]:
3

In [133]:
import rr_hmm

In [136]:
b_1,b_infty,B = rr_hmm.learn_RR_HMM(sequence,5)

In [139]:
rr_hmm = 1

In [149]:
reload(rr_hmm)


Out[149]:
<module 'rr_hmm' from 'rr_hmm.py'>

In [143]:
model = rr_hmm.RR_HMM(sequence,5)

In [146]:
model.B


Out[146]:
array([[[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -4.52600000e+03,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -3.10000000e+01,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -3.24400000e+03,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -1.60000000e+01,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -9.20000000e+01,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -8.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [ -2.70000000e+01,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00]]])

In [ ]: