In [1]:
# spectral learning for HMM
In [2]:
import numpy as np
import numpy.random as npr
import pylab as pl
from scipy import stats
import cython
%load_ext cython
%matplotlib inline
In [3]:
# Walker alias method implementation
# adapted from https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
def alias_setup(probs):
    K = len(probs)
    q = np.zeros(K)
    J = np.zeros(K, dtype=int)

    # Sort the data into the outcomes with probabilities
    # that are larger and smaller than 1/K.
    smaller = []
    larger = []
    for kk, prob in enumerate(probs):
        q[kk] = K * prob
        if q[kk] < 1.0:
            smaller.append(kk)
        else:
            larger.append(kk)

    # Loop through and create little binary mixtures that
    # appropriately allocate the larger outcomes over the
    # overall uniform mixture.
    while len(smaller) > 0 and len(larger) > 0:
        small = smaller.pop()
        large = larger.pop()

        J[small] = large
        q[large] = q[large] - (1.0 - q[small])

        if q[large] < 1.0:
            smaller.append(large)
        else:
            larger.append(large)

    return J, q
In [4]:
%%cython
def c_alias_draw(J, q, K, rand1, rand2):
    # Draw from the overall uniform mixture.
    kk = int(rand1 * K)

    # Draw from the binary mixture, either keeping the
    # small one, or choosing the associated larger one.
    if rand2 < q[kk]:
        return kk
    else:
        return J[kk]
In [5]:
class Alias():
    def __init__(self, probs):
        self.probs = probs
        self.J, self.q = alias_setup(probs)
        self.K = len(probs)
    def draw(self):
        return c_alias_draw(self.J, self.q, self.K, rand1=npr.rand(), rand2=npr.rand())
    def __repr__(self):
        return "Alias(" + str(self.probs / sum(self.probs)) + ")"
In [6]:
class MSM():
    def __init__(self, T=None, size=None):
        if T is None:
            T = self.gen_stoch_matrix(size)
        else:
            assert(T.shape[0] == T.shape[1])
        self.T = T
        self.sampler = np.empty(len(T), dtype=object)
        for i in range(len(T)):
            self.sampler[i] = Alias(T[i])
    def draw(self, num_samples=1000):
        samples = np.zeros(num_samples, dtype=int)
        samples[0] = npr.randint(len(self.T))
        for i in range(1, num_samples):
            samples[i] = self.sampler[samples[i-1]].draw()
        return samples
    def gen_stoch_matrix(self, size=5, num_iter=20):
        " Generates a random row-stochastic matrix "
        T = npr.rand(size, size)
        for i in range(size):
            T[i] /= sum(T[i])
        # note: doesn't satisfy detailed balance
        return T
    def compute_stationary_prob(self):
        " Stationary distribution: left eigenvector of T with eigenvalue 1 "
        evals, evecs = np.linalg.eig(self.T.T)
        pi = np.real(evecs[:, np.argmax(np.real(evals))])
        return pi / pi.sum()
    def __repr__(self):
        return str(len(self.T)) + "-state MSM"
In [7]:
# from eq. 5.38 of MSM book
example_T = np.array([[0.86207, 0.12931, 0.00862],
                      [0.15625, 0.83333, 0.01041],
                      [0.00199, 0.00199, 0.99602]])
In [8]:
m = MSM(example_T)
observed_sequence = m.draw(1000000)
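As a rough consistency check (an added sketch, relying on the `compute_stationary_prob` above), the empirical state occupancies of the sampled chain should approximate the stationary distribution of `example_T`.
In [ ]:
# added check: empirical occupancies vs. stationary distribution
pi = m.compute_stationary_prob()
empirical = np.bincount(observed_sequence, minlength=len(example_T)) / float(len(observed_sequence))
print(pi)
print(empirical)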
In [9]:
# random symmetric matrix, row-normalized so each row is a valid probability vector
# (the resulting chain is reversible with respect to the row sums)
n = 100
sym_T = npr.rand(n, n)
for i in range(n):
    for j in range(i):
        sym_T[j, i] = sym_T[i, j]
sym_T /= sym_T.sum(axis=1)[:, None]  # Alias sampling needs rows that sum to 1
In [10]:
def observations(states, obs_size=10):
    # deterministic many-to-one emission: map each hidden state to its
    # largest divisor in 1..obs_size-1 (state 0 maps to observation 0)
    obs = np.array(states)
    for i in range(1, obs_size):
        obs[states % i == 0] = i
    obs[states == 0] = 0
    return obs
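A quick worked example (added): 12 maps to 6 (its largest divisor in 1..9), 7 maps to 7, and 0 maps to 0.
In [ ]:
# added check on the emission map
observations(np.array([6, 7, 12, 0]))  # expected: array([6, 7, 6, 0])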
In [11]:
m = MSM(sym_T)
hidden_sequence = m.draw(10000)
In [12]:
observed_sequence = observations(hidden_sequence)
print(len(observed_sequence))
set(observed_sequence)
Out[12]:
In [13]:
def transition_count_matrix(sequence):
    n = len(set(sequence))  # number of states
    C = np.zeros((n, n), dtype=int)
    for t in range(1, len(sequence)):
        i, j = sequence[t-1:t+1]
        C[i, j] += 1
    return C
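A tiny worked example (added): the sequence 0, 1, 1, 0, 2 contributes one count each to the pairs (0,1), (1,1), (1,0) and (0,2).
In [ ]:
# added check: pair counts for a short hand-worked sequence
transition_count_matrix(np.array([0, 1, 1, 0, 2]))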
In [14]:
observed_counts = transition_count_matrix(observed_sequence)
observed_counts.shape
Out[14]:
In [15]:
def triplet_count_matrix(sequence):
    n = len(set(sequence))  # number of states
    C = np.zeros((n, n, n), dtype=int)
    for t in range(2, len(sequence)):
        i, y, j = sequence[t-2:t+1]
        C[y, i, j] += 1
    return C
In [16]:
observed_triplets = triplet_count_matrix(observed_sequence)
In [17]:
observed_triplets.shape
Out[17]:
In [18]:
observed_triplets
Out[18]:
In [19]:
#spectral method:
In [20]:
np.linalg.svd?
In [21]:
svd = np.linalg.svd(observed_counts)
In [22]:
svd[0].shape
Out[22]:
Problem set-up:
To learn an HMM from data, we can use EM (Baum-Welch) or a spectral method.
Goals in Siddiqi et al.:
Reduced-rank HMMs (RR-HMMs) are a special case of predictive state representations, and ordinary full-rank HMMs are in turn a special case of RR-HMMs.
An HMM is specified by its states, transitions, emissions, prior, and, in the spectral view, its observable operators.
Inference in HMMs [to-do]:
$$\Pr[y_1,y_2,\dots,y_\tau] = \sum_{x_1} \Pr[x_1]\,\Pr[y_1 \mid x_1]\,\sum_{x_2} \Pr[x_2 \mid x_1]\,\Pr[y_2 \mid x_2] \cdots \sum_{x_\tau} \Pr[x_\tau \mid x_{\tau-1}]\,\Pr[y_\tau \mid x_\tau]$$
The spectral method recovers observable HMM parameters from the probabilities of single observations, pairs, and triples of observations.
**Algorithm:** Learn-RR-HMM($k,N$):
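For reference, the quantities estimated by the code below (a paraphrase of the standard observable-operator construction used in the spectral HMM literature, e.g. Hsu, Kakade & Zhang and Siddiqi et al., rather than a quote; hats denote empirical estimates): with $\hat U$ holding the top-$k$ left singular vectors of $\hat P_{2,1}$,
$$\hat b_1 = \hat U^\top \hat P_1, \qquad \hat b_\infty = (\hat P_{2,1}^\top \hat U)^{+}\,\hat P_1, \qquad \hat B_x = \hat U^\top \hat P_{3,x,1}\,(\hat U^\top \hat P_{2,1})^{+},$$
and the joint probability of an observation sequence is approximated by
$$\widehat{\Pr}[y_1,\dots,y_\tau] \approx \hat b_\infty^\top\, \hat B_{y_\tau} \cdots \hat B_{y_1}\, \hat b_1.$$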
In [23]:
def transition_count_matrix(sequence):
    n = np.max(sequence) + 1  # number of states
    C = np.zeros((n, n))
    for t in range(1, len(sequence)):
        i, j = sequence[t-1:t+1]
        C[i, j] += 1
    return C
In [24]:
def covariance(sequence):
    # row-normalized transition counts: P_21[i, j] ~ Pr[x_{t+1} = j | x_t = i]
    P_21 = transition_count_matrix(sequence)
    for i in range(len(P_21)):
        P_21[i] /= sum(P_21[i])
    return P_21
In [25]:
def trivariance(sequence):
    n = np.max(sequence) + 1
    P_31 = np.zeros((n, n, n))
    for t in range(2, len(sequence)):
        i, x, j = sequence[t-2:t+1]
        P_31[x, i, j] += 1
    #for x in range(n):
    #    P_31[x] /= np.sum(P_31[x], axis=1)
    return P_31
In [26]:
def learn_RR_HMM(sequence, k):
    n = np.max(sequence) + 1

    # compute empirical estimate of P_1
    P_1 = np.zeros(n)
    for i in range(len(sequence)):
        P_1[sequence[i]] += 1
    P_1 /= sum(P_1)

    # compute empirical estimate of P_21
    P_21 = covariance(sequence)

    # compute empirical estimates of P_3x1
    P_31 = trivariance(sequence)

    # compute SVD of P_21
    svd = np.linalg.svd(P_21)
    U = svd[0][:, :k]
    UT = U.T

    # compute model parameters
    b_1 = UT.dot(P_1)
    b_infty = np.linalg.pinv(P_21.T.dot(U)).dot(P_1)
    B = np.zeros((n, k, k))
    UTP21 = np.linalg.pinv(UT.dot(P_21))
    for x in range(n):
        B[x] = (UT.dot(P_31[x])).dot(UTP21)
    return b_1, b_infty, B
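The returned parameters can be used directly to score a short observation window via $\hat b_\infty^\top \hat B_{y_\tau}\cdots\hat B_{y_1}\hat b_1$; a minimal sketch, where `window_prob` is a helper added here rather than part of the original notebook:
In [ ]:
# added helper (not in the original notebook): score a short observation window
def window_prob(window, b_1, b_infty, B):
    b = b_1
    for x in window:
        b = B[x].dot(b)      # apply one observable operator per observation
    return b_infty.T.dot(b)  # contract with the normalizer b_infty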
In [27]:
sequence = npr.randint(0,10,10000)
b_1,b_infty,B = learn_RR_HMM(sequence,5)
In [28]:
# k-nearest neighbor random walk
sequence = np.zeros(100000)
n = 100
jump = 5
sequence[0] = npr.randint(n)
for i in range(1, len(sequence)):
    j = npr.randint(jump*2 + 1) - jump
    sequence[i] = min(abs(sequence[i-1] + j), n - 1)  # reflect at 0, clip at n-1
sequence = np.array(sequence, dtype=int)
In [29]:
# random walk with hidden state
length = 1000
hidden_states = 5
observed_states = 100
sequence = np.zeros(length)
h = np.zeros(length)
jump_prob = 0.1
h[0] = npr.randint(0, hidden_states)
for i in range(1, length):
    h[i] = h[i-1]  # hidden state persists unless a jump occurs
    if npr.rand() < jump_prob:
        h[i] = (h[i-1] + npr.randint(2)*2 - 1) % hidden_states
sequence[0] = round(observed_states / 2)
mid_h = round(hidden_states / 2)
for i in range(1, length):
    sequence[i] = sequence[i-1] + round(npr.randn() * (h[i] + 1))
    if sequence[i] >= observed_states or sequence[i] < 0:
        sequence[i] = sequence[i-1]
sequence = np.array(sequence, dtype=int)
In [30]:
sequences = []
for i in range(100):
    n_sequence = np.zeros(length)
    n_sequence[0] = npr.randint(0, observed_states)
    mid_h = npr.randint(0, hidden_states)
    for t in range(1, length):
        n_sequence[t] = n_sequence[t-1] + round(npr.randn() * (h[t] + 1))
        if n_sequence[t] >= observed_states or n_sequence[t] < 0:
            n_sequence[t] = n_sequence[t-1]
    n_sequence = np.array(n_sequence, dtype=int)
    sequences.append(n_sequence)
In [31]:
big_seq = np.hstack(sequences)
big_seq.shape
Out[31]:
In [32]:
sequence[1]
Out[32]:
In [33]:
pl.plot(h)
Out[33]:
In [34]:
pl.plot(sequence)
Out[34]:
In [35]:
b_1,b_infty,B = learn_RR_HMM(big_seq,5)
In [36]:
%timeit learn_RR_HMM(sequence,5)
In [37]:
B[0].shape
Out[37]:
In [38]:
b_infty.shape
Out[38]:
In [39]:
b_infty.T.dot(B[0]).dot(b_1)
Out[39]:
In [40]:
b_infty[-len(b_infty)] == b_infty[0]
Out[40]:
In [41]:
def score(sequence, b_1, b_infty, B):
    # un-normalized score b_infty^T B_{x_t} ... B_{x_1} b_1,
    # applied here over the last len(B)-1 observations of `sequence`
    ans = b_infty.T
    for i in range(1, len(B)):
        ans = ans.dot(B[sequence[-i]])
    return ans.dot(b_1)
In [42]:
score(sequence, b_1, b_infty, B)
Out[42]:
In [43]:
score(npr.randint(0,n,len(sequence)),b_1, b_infty, B)
Out[43]:
In [44]:
def next_b(x_t, b_t, b_infty, B):
    # belief (filtering) update: b_{t+1} = B_{x_t} b_t / (b_infty^T B_{x_t} b_t)
    return B[x_t].dot(b_t) / (b_infty.T.dot(B[x_t]).dot(b_t))
In [45]:
b_1
Out[45]:
In [46]:
next_b(sequence[0],b_1,b_infty,B)
Out[46]:
In [47]:
B.shape
Out[47]:
In [48]:
def cond_prob(b_infty, B, b):
    # predictive distribution over the next observation: Pr[x | history] proportional to b_infty^T B_x b
    ans = np.array([b_infty.T.dot(B[x]).dot(b) for x in range(len(B))])
    return ans / sum(ans)
In [49]:
sequence[0]
Out[49]:
In [50]:
bees = np.zeros((len(sequence), len(b_1)))
bees[0] = b_1
for i in range(1, len(bees)):
    bees[i] = next_b(sequence[i-1], bees[i-1], b_infty, B)
In [51]:
pl.plot(b_1)
Out[51]:
In [52]:
i = 1
y = cond_prob(b_infty,B,bees[i])[:-1]
y = np.log(y-np.min(y)+0.1)
pl.plot(y)
#pl.plot([np.mean(y)]*len(y))
np.mean(y)
#pl.vlines(sequence[0],np.min(y),np.max(y),linestyles='--')
x = range(sequence[i]-jump,sequence[i]+jump)
pl.plot(x,[np.mean(y)]*len(x))
Out[52]:
In [53]:
T = transition_count_matrix(big_seq)
T= (T.T / np.sum(T,1)).T
sum(T[0])
Out[53]:
In [54]:
cond_probs_msm = np.array([T[x] for x in sequence])
In [55]:
pl.plot(cond_probs_msm[0])
Out[55]:
In [56]:
for i in range(10):
    pl.plot(cond_probs_msm[i*10])
In [57]:
for i in range(1,10):
    pl.plot(cond_prob(b_infty,B,bees[i*10]))
In [58]:
for i in range(1,100):
    pl.plot(cond_prob(b_infty,B,bees[i]))
In [59]:
for i in range(1,500):
    pl.plot(cond_prob(b_infty,B,bees[i]))
In [60]:
for i in range(1,500):
    pl.plot(cond_prob(b_infty,B,bees[i])[:90],linewidth=5)
In [61]:
cond_probs = np.array([cond_prob(b_infty,B,b) for b in bees])
cond_probs -= np.min(cond_probs)
cond_probs/= np.max(cond_probs)
In [62]:
np.min(cond_probs)
Out[62]:
In [63]:
np.max(cond_probs)
Out[63]:
In [64]:
from scipy.stats import entropy
In [65]:
cond_prob_entropies = np.array([entropy(cp) for cp in cond_probs])
In [66]:
print(cond_prob_entropies.shape)
In [67]:
print(cond_prob_entropies.min())
In [68]:
print(cond_prob_entropies.max())
In [69]:
pl.plot(cond_prob_entropies)
Out[69]:
In [70]:
pl.hist(cond_prob_entropies,bins=50);
In [71]:
P_21 = covariance(sequence)
In [72]:
svd = np.linalg.svd(P_21)
In [73]:
pl.plot(svd[1]) # n=300, jump = 50?
Out[73]:
In [74]:
P_31 = trivariance(sequence)
In [77]:
for i in [1]:
    svd = np.linalg.svd(P_31[i])
    pl.plot(range(1,n+1),svd[1])
    #pl.xlim((0,100))
In [76]:
P_31[0]/np.sum(P_31,axis=0)
Out[76]:
In [278]:
x = npr.randn(10,2)
np.sum(x,axis=1)
Out[278]:
In [ ]:
k = 10
Notes:
In [101]:
U = svd[0][:,:k]
In [95]:
U.shape
Out[95]:
In [100]:
U,s,V = np.linalg.svd(npr.randn(100,100))
U.shape,s.shape,V.shape
Out[100]:
In [102]:
P_1 = np.random.randn(200)
In [104]:
U.T.dot(P_1)
Out[104]:
In [ ]:
# baum-welch learning for HMM
Next steps:
In [78]:
# next: apply to actual data!
In [79]:
from __future__ import print_function
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm
from msmbuilder.decomposition import tICA
from msmbuilder.example_datasets import fetch_met_enkephalin
from msmbuilder.featurizer import AtomPairsFeaturizer
from sklearn.pipeline import Pipeline
%matplotlib inline
In [80]:
dataset = fetch_met_enkephalin()
print(dataset.DESCR)
In [81]:
from itertools import combinations
heavy_atoms = dataset.trajectories[0].topology.select_atom_indices('heavy')
heavy_pairs = list(combinations(heavy_atoms, 2))
In [82]:
traj = dataset.trajectories
In [83]:
feat = AtomPairsFeaturizer(heavy_pairs)
featurized = feat.fit_transform(traj)
In [84]:
feat_stack = np.vstack(featurized)
In [88]:
import msmbuilder as mb
from msmbuilder.msm import MarkovStateModel
In [94]:
pipeline1 = Pipeline([
    ('feat', AtomPairsFeaturizer(heavy_pairs)),
    ('tica', tICA(gamma=0, n_components=2)),
    ('msm', MarkovStateModel())
])
In [96]:
# gah don't want to fix this at the moment
#pipeline1.fit_transform(traj)
In [98]:
dataset = mb.example_datasets.AlanineDipeptide().get()
In [99]:
traj = dataset.trajectories
In [102]:
topology = traj[0].topology
In [105]:
# from http://msmbuilder.org/latest/examples/hmm-and-msm.html
indices = [atom.index for atom in topology.atoms if atom.element.symbol in ['C', 'O', 'N']]
featurizer = mb.featurizer.SuperposeFeaturizer(indices, traj[0][0])
sequences = featurizer.transform(traj)
In [108]:
from msmbuilder import cluster
In [121]:
n = 10
assignments = cluster.KCenters(n_clusters=n).fit_predict([np.vstack(sequences)])
In [123]:
assignments[0].shape
Out[123]:
In [125]:
params = learn_RR_HMM(assignments[0],5)
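For comparison, a plain MSM can also be fit to the same cluster assignments (an added sketch; it assumes msmbuilder's `MarkovStateModel.fit` accepts a list of discrete trajectories and exposes a `transmat_` attribute, as used in the msmbuilder examples):
In [ ]:
# added sketch: fit a conventional MSM to the cluster assignments for comparison
msm = MarkovStateModel()
msm.fit(assignments)
print(msm.transmat_.shape)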
In [131]:
sequence = assignments[0]
In [126]:
len(params)
Out[126]:
In [133]:
import rr_hmm
In [136]:
b_1,b_infty,B = rr_hmm.learn_RR_HMM(sequence,5)
In [139]:
rr_hmm = 1
In [149]:
reload(rr_hmm)
Out[149]:
In [143]:
model = rr_hmm.RR_HMM(sequence,5)
In [146]:
model.B
Out[146]:
In [ ]: