In [1]:
import mdtraj as md
import numpy as np
from matplotlib import pyplot as plt
from mdtraj.utils import timing
import msmbuilder as mb
from msmbuilder.example_datasets import load_doublewell
from msmbuilder.cluster import NDGrid
from msmbuilder.msm import BayesianMarkovStateModel, MarkovStateModel

%matplotlib inline

In [2]:
trjs = load_doublewell(random_state=0)['trajectories']
plt.hist(np.concatenate(trjs), bins=50, log=True)
plt.ylabel('Frequency')
plt.show()


loading "/Users/joshuafass/msmbuilder_data/doublewell/version-1_random-state-0.pkl"...

In [3]:
from __future__ import print_function
import os
from matplotlib.pyplot import *
from msmbuilder.featurizer import SuperposeFeaturizer
from msmbuilder.example_datasets import AlanineDipeptide
from msmbuilder.hmm import GaussianFusionHMM
from msmbuilder.cluster import KCenters
from msmbuilder.msm import MarkovStateModel

In [4]:
print(AlanineDipeptide.description())

dataset = AlanineDipeptide().get()
trajectories = dataset.trajectories
topology = trajectories[0].topology

indices = [atom.index for atom in topology.atoms if atom.element.symbol in ['C', 'O', 'N']]
featurizer = SuperposeFeaturizer(indices, trajectories[0][0])
sequences = featurizer.transform(trajectories)


The dataset consists of ten 10ns trajectories of alanine dipeptide,
simulated using OpenMM 6.0.1 (CUDA platform, NVIDIA GTX660) with the
AMBER99SB-ILDN force field at 300K (Langevin dynamics, friction coefficient
of 91/ps, timestep of 2fs) with GBSA implicit solvent. The coordinates are
saved every 1ps. Each trajectory contains 9,999 snapshots.

The dataset, including the script used to generate it,
is available on figshare at

http://dx.doi.org/10.6084/m9.figshare.1026131
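
A quick, illustrative check of what the featurizer produces: each entry of
sequences is an array with one row per frame and one column per selected
heavy atom (its distance to the chosen reference frame after superposition).

print(len(indices))        # number of C/O/N atoms selected
print(sequences[0].shape)  # (n_frames, n_selected_atoms) for the first trajectory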


In [5]:
len(sequences)


Out[5]:
10

In [6]:
lag_times = [1, 10, 20, 30, 40]
hmm_ts0 = {}
hmm_ts1 = {}
n_states = [3, 5]

for n in n_states:
    hmm_ts0[n] = []
    hmm_ts1[n] = []
    for lag_time in lag_times:
        # Subsample each trajectory at the lag time, keeping every possible
        # starting offset, so one HMM transition corresponds to one lag time.
        strided_data = [s[i::lag_time] for s in sequences for i in range(lag_time)]
        hmm = GaussianFusionHMM(n_states=n, n_features=sequences[0].shape[1], n_init=1).fit(strided_data)
        timescales = hmm.timescales_ * lag_time
        hmm_ts0[n].append(timescales[0])
        hmm_ts1[n].append(timescales[1])
        print('n_states=%d\tlag_time=%d\ttimescales=%s' % (n, lag_time, timescales))
    print()


n_states=3	lag_time=1	timescales=[ 125.83473969    3.76069617]
n_states=3	lag_time=10	timescales=[ 208.22064209    5.93715   ]
n_states=3	lag_time=20	timescales=[ 221.70774841    6.29119635]
n_states=3	lag_time=30	timescales=[ 227.06152344    7.7659955 ]
n_states=3	lag_time=40	timescales=[ 230.05775452    8.96201706]

n_states=5	lag_time=1	timescales=[ 124.05364227    4.28548574    1.66884148    0.95070887]
n_states=5	lag_time=10	timescales=[ 210.96549988    6.21665716    2.39654016    1.35008681]
n_states=5	lag_time=20	timescales=[ 223.22265625    6.2946701     4.25229549    3.35900426]
n_states=5	lag_time=30	timescales=[ 227.30316162    7.20709753    6.2593503 ]
n_states=5	lag_time=40	timescales=[ 227.23243713    8.77617931    8.41687107]

/Users/joshuafass/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/msmbuilder-3.0.0_beta-py2.7-macosx-10.6-x86_64.egg/msmbuilder/hmm/ghmm.py:390: RuntimeWarning: divide by zero encountered in log
  self._impl.transmat_ = value
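
The strided_data construction above subsamples each trajectory at every
lag_time-th frame, once per starting offset, so that a single HMM step spans
one lag time. A toy illustration of the slicing:

s = np.arange(10)
print([s[i::3] for i in range(3)])
# [array([0, 3, 6, 9]), array([1, 4, 7]), array([2, 5, 8])]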

In [7]:
figure(figsize=(14,3))

for i, n in enumerate(n_states):
    subplot(1,len(n_states),1+i)
    plot(lag_times, hmm_ts0[n])
    plot(lag_times, hmm_ts1[n])
    if i == 0:
        ylabel('Relaxation Timescale')
    xlabel('Lag Time')
    title('%d states' % n)

show()



In [8]:
hmm.score(sequences)


Out[8]:
2969326.0
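
This score is the log-likelihood of the data under the fitted model. A sketch
of using it to compare model sizes on held-out trajectories (the train/test
split here is purely illustrative):

for n in n_states:
    model = GaussianFusionHMM(n_states=n, n_features=sequences[0].shape[1], n_init=1)
    model.fit(sequences[:5])               # fit on the first five trajectories
    print(n, model.score(sequences[5:]))   # evaluate on the remaining five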

In [9]:
me_data = mb.example_datasets.MetEnkephalin().get()

In [10]:
fs_data = mb.example_datasets.FsPeptide().get()


loading trajectory_1.xtc...
loading trajectory_10.xtc...
loading trajectory_11.xtc...
loading trajectory_12.xtc...
loading trajectory_13.xtc...
loading trajectory_14.xtc...
loading trajectory_15.xtc...
loading trajectory_16.xtc...
loading trajectory_17.xtc...
loading trajectory_18.xtc...
loading trajectory_19.xtc...
loading trajectory_2.xtc...
loading trajectory_20.xtc...
loading trajectory_21.xtc...
loading trajectory_22.xtc...
loading trajectory_23.xtc...
loading trajectory_24.xtc...
loading trajectory_25.xtc...
loading trajectory_26.xtc...
loading trajectory_27.xtc...
loading trajectory_28.xtc...
loading trajectory_3.xtc...
loading trajectory_4.xtc...
loading trajectory_5.xtc...
loading trajectory_6.xtc...
loading trajectory_7.xtc...
loading trajectory_8.xtc...
loading trajectory_9.xtc...

In [11]:
dataset = fs_data

In [12]:
trajectories = dataset.trajectories
topology = trajectories[0].topology

indices = [atom.index for atom in topology.atoms if atom.element.symbol in ['C', 'O', 'N']]
featurizer = SuperposeFeaturizer(indices, trajectories[0][0])
sequences = featurizer.transform(trajectories)

In [13]:
len(sequences)


Out[13]:
28

In [14]:
sum([len(traj) for traj in sequences])


Out[14]:
280000

In [15]:
sequences[0].shape


Out[15]:
(10000, 128)

In [17]:
hmm


Out[17]:
GaussianFusionHMM(fusion_prior=0.01, init_algo='kmeans', init_params='tmv',
         n_em_iter=10, n_features=10, n_hotstart='all', n_init=1, n_jobs=1,
         n_lqa_iter=10, n_states=5, params='tmv', precision=None,
         random_state=None, reversible_type='mle', thresh=0.01,
         timing=False, transmat_prior=1.0, vars_prior=0.001,
         vars_weight=1.0)

In [18]:
hmm.params


Out[18]:
'tmv'

In [19]:
hmm.vars_.shape


Out[19]:
(5, 10)

In [20]:
hmm.vars_


Out[20]:
array([[  1.91488434e-04,   5.20601352e-05,   9.68532171e-04,
          5.73767356e-05,   1.18788179e-04,   3.34413315e-04,
          2.08521597e-05,   3.06292495e-04,   9.14890552e-05,
          2.13697858e-04],
       [  1.25953040e-04,   4.19898279e-05,   5.76663821e-04,
          7.08183215e-05,   3.56524870e-05,   1.40600023e-04,
          1.87604855e-05,   4.91073733e-05,   4.62977187e-05,
          2.03883697e-04],
       [  2.34200284e-04,   2.66525713e-05,   6.02752028e-04,
          6.69366709e-05,   5.58121246e-05,   1.22097845e-04,
          3.06284091e-05,   5.00308524e-04,   1.64567828e-04,
          1.46769351e-04],
       [  1.16374758e-04,   3.59268415e-05,   5.92628086e-04,
          7.59668910e-05,   3.21573134e-05,   6.84226688e-05,
          4.11394722e-05,   1.69776124e-03,   3.46563233e-04,
          2.74006801e-04],
       [  3.38291196e-04,   7.51507396e-05,   1.15648727e-03,
          1.73907625e-04,   7.92533465e-05,   1.47518134e-04,
          4.37595154e-05,   6.00454339e-04,   3.76172742e-04,
          5.79788873e-04]], dtype=float32)

In [21]:
hmm.transmat_


Out[21]:
array([[ 0.29276803,  0.51149726,  0.03518528,  0.12032859,  0.04022082],
       [ 0.28584546,  0.52791041,  0.02907804,  0.12267954,  0.03448655],
       [ 0.02091923,  0.03093583,  0.48235598,  0.02223061,  0.44355837],
       [ 0.25485855,  0.46495891,  0.07919489,  0.11022133,  0.09076632],
       [ 0.02516027,  0.03860345,  0.46669218,  0.02680765,  0.44273645]], dtype=float32)

In [22]:
import pylab as pl
%matplotlib inline
pl.imshow(hmm.transmat_, interpolation='none')


Out[22]:
<matplotlib.image.AxesImage at 0x113318310>
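
The relaxation timescales of the fitted HMM are related to the eigenvalues of
this transition matrix. A sketch of recovering them directly; the timescales
come out in units of the model's step (the stride used during fitting), and
the absolute value is taken only to keep the example robust to tiny negative
eigenvalues:

eigvals = np.linalg.eigvals(hmm.transmat_)
eigvals = np.sort(np.abs(eigvals))[::-1]   # leading eigenvalue ~1 is the stationary mode
print(-1.0 / np.log(eigvals[1:]))          # compare with hmm.timescales_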

In [27]:
c = mb.cluster.KMedoids(n_clusters=100)

In [ ]:
c.fit(sequences)
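
A natural next step, sketched rather than run here (the lag time is
illustrative): discretize the featurized trajectories with the fitted
clusterer and estimate an MSM from the resulting state labels.

labels = c.predict(sequences)         # one integer label sequence per trajectory
msm = MarkovStateModel(lag_time=10)   # lag time in frames; value chosen for illustration
msm.fit(labels)
print(msm.timescales_[:5])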

In [ ]: