In [1]:
import numpy as np
import numpy.random as npr

In [2]:
path = '../../../Downloads/abl_snapshot/abl_snapshot/*.h5'

In [3]:
from glob import glob
fnames = glob(path)[:-1]
fnames


Out[3]:
['../../../Downloads/abl_snapshot/abl_snapshot/run0-clone0.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone1.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone10.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone11.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone12.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone13.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone14.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone15.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone16.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone17.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone18.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone19.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone2.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone3.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone4.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone5.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone6.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone7.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone8.h5',
 '../../../Downloads/abl_snapshot/abl_snapshot/run0-clone9.h5']

In [4]:
import mdtraj as md
trajs = [md.load(fname) for fname in fnames]
traj = trajs[0]

In [5]:
trajs


Out[5]:
[<mdtraj.Trajectory with 2600 frames, 4062 atoms, 252 residues, and unitcells at 0x114d92650>,
 <mdtraj.Trajectory with 2440 frames, 4062 atoms, 252 residues, and unitcells at 0x11ccb9e90>,
 <mdtraj.Trajectory with 5997 frames, 4062 atoms, 252 residues, and unitcells at 0x11cea5710>,
 <mdtraj.Trajectory with 4119 frames, 4062 atoms, 252 residues, and unitcells at 0x12558cd10>,
 <mdtraj.Trajectory with 6036 frames, 4062 atoms, 252 residues, and unitcells at 0x125776650>,
 <mdtraj.Trajectory with 8198 frames, 4062 atoms, 252 residues, and unitcells at 0x12d2c9e90>,
 <mdtraj.Trajectory with 5760 frames, 4062 atoms, 252 residues, and unitcells at 0x12d7c8710>,
 <mdtraj.Trajectory with 6720 frames, 4062 atoms, 252 residues, and unitcells at 0x12e1ade90>,
 <mdtraj.Trajectory with 7717 frames, 4062 atoms, 252 residues, and unitcells at 0x12e39a7d0>,
 <mdtraj.Trajectory with 3760 frames, 4062 atoms, 252 residues, and unitcells at 0x12e57ffd0>,
 <mdtraj.Trajectory with 8395 frames, 4062 atoms, 252 residues, and unitcells at 0x12e76a890>,
 <mdtraj.Trajectory with 4000 frames, 4062 atoms, 252 residues, and unitcells at 0x12e956110>,
 <mdtraj.Trajectory with 8197 frames, 4062 atoms, 252 residues, and unitcells at 0x12ec3c890>,
 <mdtraj.Trajectory with 6959 frames, 4062 atoms, 252 residues, and unitcells at 0x12ef271d0>,
 <mdtraj.Trajectory with 4520 frames, 4062 atoms, 252 residues, and unitcells at 0x12f10da10>,
 <mdtraj.Trajectory with 4000 frames, 4062 atoms, 252 residues, and unitcells at 0x12f2f9290>,
 <mdtraj.Trajectory with 4000 frames, 4062 atoms, 252 residues, and unitcells at 0x12f4dfa10>,
 <mdtraj.Trajectory with 3399 frames, 4062 atoms, 252 residues, and unitcells at 0x12f6ca350>,
 <mdtraj.Trajectory with 4327 frames, 4062 atoms, 252 residues, and unitcells at 0x12f8afb90>,
 <mdtraj.Trajectory with 7477 frames, 4062 atoms, 252 residues, and unitcells at 0x12fa9c410>]

In [10]:
n_atoms = traj.n_atoms
n_atoms


Out[10]:
4062

In [6]:
import pyemma
import matplotlib.pyplot as plt
%matplotlib inline

In [70]:
from msmbuilder.featurizer import DihedralFeaturizer
dih = DihedralFeaturizer(['phi', 'psi', 'omega', 'chi1', 'chi2', 'chi3', 'chi4'])
X_dih = dih.fit_transform(trajs)

In [76]:
X_dih[0].shape[1] / 2,X_dih[0].shape[1]


Out[76]:
(1239, 2478)

In [45]:
from msmbuilder.featurizer import AlphaAngleFeaturizer
aaf = AlphaAngleFeaturizer()
X_aaf = aaf.fit_transform(trajs)

In [46]:
X_aaf[0].shape


Out[46]:
(2600, 498)

In [47]:
from msmbuilder.featurizer import AtomPairsFeaturizer

all_pairs = []
for i in range(n_atoms):
    for j in range(i):
        all_pairs.append((i,j))
print(len(all_pairs))


8247891

In [48]:
all_pairs[:10]


Out[48]:
[(1, 0),
 (2, 0),
 (2, 1),
 (3, 0),
 (3, 1),
 (3, 2),
 (4, 0),
 (4, 1),
 (4, 2),
 (4, 3)]

In [49]:
npr.seed(0)
npr.shuffle(all_pairs)

In [50]:
all_pairs[:10]


Out[50]:
[(2953, 2694),
 (3853, 1418),
 (3497, 1081),
 (1095, 1036),
 (2690, 640),
 (3925, 3286),
 (1864, 615),
 (3540, 1555),
 (2215, 515),
 (1722, 1431)]

In [61]:
some_pairs = all_pairs[:3000]

In [62]:
apf = AtomPairsFeaturizer(some_pairs)
X_apf = apf.fit_transform(trajs)

In [53]:
import pyemma
tica = pyemma.coordinates.tica(X_aaf)
X_tica = tica.get_output()


getting output of TICA: 100% (30/30) [#############################] eta 00:01 -

In [63]:
tica_p = pyemma.coordinates.tica(X_apf)
X_tica_p = tica.get_output()


getting output of TICA: 100% (30/30) [#############################] eta 00:01 /

In [71]:
tica_dih = pyemma.coordinates.tica(X_dih)
X_tica_dih = tica.get_output()


getting output of TICA: 100% (30/30) [#############################] eta 00:00 \

In [64]:
Xs = [X_apf,X_aaf]
X_feat = [np.hstack([x[i] for x in Xs]) for i in range(len(trajs))]
tica_tot = pyemma.coordinates.tica(X_feat)


calculate covariances: 100% (30/30) [##############################] eta 00:01 /

In [ ]:
# also throw in dihedrals?

In [56]:
X_tot = tica_tot.get_output()


getting output of TICA: 100% (30/30) [#############################] eta 00:01 -

In [66]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(np.cumsum(tica.eigenvalues**2))
plt.plot(np.cumsum(tica_p.eigenvalues**2))
plt.plot(np.cumsum(tica_tot.eigenvalues**2))
plt.plot(range(180),range(180),'--')
#plt.ylim(0,180)
#plt.xlim(0,500)


Out[66]:
[<matplotlib.lines.Line2D at 0x2c65f8290>]

In [79]:
from triangle import corner
n_tics = 6
corner(np.vstack(X_tot)[:,:n_tics],
           plot_contours=False,
           labels=['tIC{0}'.format(i+1) for i in range(n_tics)]
          )


Out[79]:

In [62]:
tica_tot.dimension()


Out[62]:
480

In [68]:
kmeans = pyemma.coordinates.cluster_mini_batch_kmeans(X_tot,k=100,max_iter=100)


kmeans iterations: 100% (100/100) [################################] eta 00:01 \07-05-16 23:48:35 pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering[5] INFO     Algorithm did not reach convergence criterion of 1e-05 in 100 iterations. Consider increasing max_iter.
kmeans iterations: 100% (100/100) [################################] eta 00:00 /

In [69]:
dtrajs = [dtraj.flatten() for dtraj in kmeans.get_output()]


getting output of MiniBatchKmeansClustering: 100% (30/30) [########] eta 00:01 \

In [70]:
nits = 20
its = pyemma.msm.its(dtrajs,lags=range(1,101),nits=nits)
pyemma.plots.plot_implied_timescales(its)


estimating MaximumLikelihoodMSM: 100% (100/100) [##################] eta 00:01 |
/Users/joshuafass/anaconda/envs/py27/lib/python2.7/site-packages/msmtools/estimation/api.py:941: NotConvergedWarning: Reversible transition matrix estimation didn't converge.
  T = sparse.mle_trev.mle_trev(C, **kwargs)
Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x15a89df50>

In [9]:
from math import factorial
factorial(n_atoms) / (factorial(n_atoms - 3) * factorial(3))
# The full Grassmann vector has 11 billion entries!


Out[9]:
11162145820L

In [10]:
n_residues = 252
factorial(n_residues) / (factorial(n_residues - 3) * factorial(3))
# The Grassmann vector using only alpha-carbons has >2.5 million entries


Out[10]:
2635500L

In [33]:
def select_random_features(n_atoms,n_features=1000,seed=0):
    npr.seed(0)
    all_indices = np.arange(n_atoms)
    
    feature_indices = []
    for i in range(n_features):
        npr.shuffle(all_indices)
        feature_ind = np.zeros(n_atoms,dtype=bool)
        feature_ind[all_indices[:3]] = True
        feature_indices.append(feature_ind)
    return feature_indices

def grassmann_vector(X,indices):
    return np.array([np.linalg.det(X[s]) for s in indices])

def grassmann_featurize(trajs,indices):
    return [np.array([grassmann_vector(x,indices) for x in traj.xyz]) for traj in trajs]

def parallel_grassmann_featurize(trajs,indices,n_jobs=8):
    from joblib import Parallel,delayed
    def f(traj):
        return np.array([grassmann_vector(x,indices) for x in traj.xyz])
    return Parallel(n_jobs=n_jobs,backend='threading')(delayed(f)(traj) for traj in trajs)

In [36]:
feature_indices = select_random_features(n_atoms,n_features=3000)

In [37]:
%%time
X_g = grassmann_featurize(trajs,feature_indices)


CPU times: user 1h 55min 59s, sys: 3.54 s, total: 1h 56min 2s
Wall time: 1h 56min 2s

In [42]:
np.save('X_grassmann_abl_run_0.npy',X_g)

In [35]:
X_g_ = parallel_grassmann_featurize(trajs,feature_indices)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-35-d2006b9ad195> in <module>()
----> 1 X_g_ = parallel_grassmann_featurize(trajs,feature_indices)

<ipython-input-33-445140bed9a8> in parallel_grassmann_featurize(trajs, indices, n_jobs)
     21     def f(traj):
     22         return np.array([grassmann_vector(x,indices) for x in traj.xyz])
---> 23     return Parallel(n_jobs=n_jobs,backend='threading')(delayed(f)(traj) for traj in trajs)

/Users/joshuafass/anaconda/envs/py27/lib/python2.7/site-packages/joblib/parallel.pyc in __call__(self, iterable)
    798             # was dispatched. In particular this covers the edge
    799             # case of Parallel used with an exhausted iterator.
--> 800             while self.dispatch_one_batch(iterator):
    801                 self._iterating = True
    802             else:

/Users/joshuafass/anaconda/envs/py27/lib/python2.7/site-packages/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    651             batch_size = self.batch_size
    652         with self._lock:
--> 653             tasks = BatchedCalls(itertools.islice(iterator, batch_size))
    654             if not tasks:
    655                 # No more tasks available in the iterator: tell caller to stop.

/Users/joshuafass/anaconda/envs/py27/lib/python2.7/site-packages/joblib/parallel.pyc in __init__(self, iterator_slice)
     66 
     67     def __init__(self, iterator_slice):
---> 68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 

<ipython-input-33-445140bed9a8> in <genexpr>((traj,))
     21     def f(traj):
     22         return np.array([grassmann_vector(x,indices) for x in traj.xyz])
---> 23     return Parallel(n_jobs=n_jobs,backend='threading')(delayed(f)(traj) for traj in trajs)

/Users/joshuafass/anaconda/envs/py27/lib/python2.7/site-packages/joblib/parallel.pyc in delayed(function, check_pickle)
    156     # using with multiprocessing:
    157     if check_pickle:
--> 158         pickle.dumps(function)
    159 
    160     def delayed_function(*args, **kwargs):

/Users/joshuafass/anaconda/envs/py27/lib/python2.7/copy_reg.pyc in _reduce_ex(self, proto)
     68     else:
     69         if base is self.__class__:
---> 70             raise TypeError, "can't pickle %s objects" % base.__name__
     71         state = base(self)
     72     args = (self.__class__, base, state)

TypeError: can't pickle function objects

In [38]:
tica_g = pyemma.coordinates.tica(X_g)
X_tica_g = tica_g.get_output()

n_tics = 6
from triangle import corner
corner(np.vstack(X_tica_g)[:,:n_tics],
           plot_contours=False,
           labels=['tIC{0}'.format(i+1) for i in range(n_tics)]
          )


getting output of TICA: 100% (30/30) [#############################] eta 00:01 /
Out[38]:

In [43]:
plt.plot(np.cumsum(tica_g.eigenvalues**2),label='Grassmann')
plt.plot(range(300),range(300))


Out[43]:
[<matplotlib.lines.Line2D at 0x1aafca990>]

In [81]:



Out[81]:
(2600, 3000)

In [77]:
plt.plot(np.cumsum(tica.eigenvalues**2),label = '500 alpha angles')
plt.plot(np.cumsum(tica_dih.eigenvalues**2), label = '1300 (x 2) dihedral angles (sin, cos)')
plt.plot(np.cumsum(tica_p.eigenvalues**2),label = '3000 atom-pairs')
plt.plot(np.cumsum(tica_tot.eigenvalues**2), label = '3500 alpha angles + atom-pairs')
plt.plot(np.cumsum(tica_g.eigenvalues**2), linewidth = 3, label = '3000 Grassmann features')
plt.plot(range(300),range(300),'--')
plt.legend(loc='best')


Out[77]:
<matplotlib.legend.Legend at 0x3680ca6d0>

In [82]:
X_apf_g = [np.hstack([x[i] for x in [X_g,X_apf]]) for i in range(len(trajs))]

In [83]:
tica_apf_g = pyemma.coordinates.tica(X_apf_g)


calculate covariances: 100% (30/30) [##############################] eta 00:02 \

In [89]:
plt.plot(np.cumsum(tica.eigenvalues**2),label = '500 alpha angles')
plt.plot(np.cumsum(tica_dih.eigenvalues**2), label = '1300 (x 2) dihedral angles (sin, cos)')
plt.plot(np.cumsum(tica_p.eigenvalues**2),label = '3000 atom-pairs')
plt.plot(np.cumsum(tica_tot.eigenvalues**2), label = '3500 alpha angles + atom-pairs')
plt.plot(np.cumsum(tica_g.eigenvalues**2), linewidth = 3, label = '3000 Grassmann features')
plt.plot(np.cumsum(tica_apf_g.eigenvalues**2), '--', linewidth = 3, label = '6000 Grassmann + pairs')
plt.plot(range(600),range(600),'--')
plt.legend(loc=(0.5,0))


Out[89]:
<matplotlib.legend.Legend at 0x115217810>

In [90]:
X_tica_apf_g = tica_apf_g.get_output()


getting output of TICA: 100% (30/30) [#############################] eta 00:01 /

In [92]:
del(X_apf_g)
del(Xs)

In [ ]:
# next-- draw a few samples of size 1000 or 3000

In [44]:
nits = 20
k = 100

kmeans = pyemma.coordinates.cluster_mini_batch_kmeans(X_tica_g,k=k,max_iter=100)
dtrajs_g = [dtraj.flatten() for dtraj in kmeans.get_output()]

its = pyemma.msm.its(dtrajs_g,lags=range(1,101)+range(100,1000)[::10],nits=nits)
pyemma.plots.plot_implied_timescales(its)


kmeans iterations: 100% (100/100) [################################] eta 00:01 /08-05-16 09:24:27 pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering[8] INFO     Algorithm did not reach convergence criterion of 1e-05 in 100 iterations. Consider increasing max_iter.
estimating MaximumLikelihoodMSM: 100% (190/190) [##################] eta 00:01 |
Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x1daf5e790>

In [59]:
its = pyemma.msm.its(dtrajs_g,lags=range(1,101)+range(100,500)[::10],nits=nits)
pyemma.plots.plot_implied_timescales(its)


estimating MaximumLikelihoodMSM: 100% (140/140) [##################] eta 00:00 \
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x367dac690>

In [ ]:
its = pyemma.msm.its(dtrajs_g,lags=range(1,101)+range(100,500)[::10],nits=nits)
pyemma.plots.plot_implied_timescales(its)

In [41]:
k = 500

kmeans = pyemma.coordinates.cluster_mini_batch_kmeans(X_tica_g,k=k,max_iter=100)
dtrajs_g = [dtraj.flatten() for dtraj in kmeans.get_output()]

its = pyemma.msm.its(dtrajs_g,lags=range(1,101),nits=nits)
pyemma.plots.plot_implied_timescales(its)


kmeans iterations:  13% ( 13/100) [####                            ] eta 33:38 \08-05-16 03:07:28 pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering[6] INFO     Cluster centers converged after 14 steps.
estimating MaximumLikelihoodMSM: 100% (100/100) [##################] eta 00:00 /08-05-16 03:09:19 pyemma.msm.estimators.implied_timescales.ImpliedTimescales[7] WARNING  Estimation has failed at lagtimes: [  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26
  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44
  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62
  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80
  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98
  99 100]. Run single-lag estimation at these lags to track down the error.
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x11baaee50>

In [95]:
X_tica_apf_g[0].shape


Out[95]:
(2600, 2878)

In [96]:
k = 100

kmeans = pyemma.coordinates.cluster_mini_batch_kmeans(X_tica_apf_g,k=k,max_iter=100)
dtrajs_apf_g = [dtraj.flatten() for dtraj in kmeans.get_output()]

its = pyemma.msm.its(dtrajs_apf_g,lags=range(1,101)+range(100,1000)[::10],nits=nits)
pyemma.plots.plot_implied_timescales(its)


kmeans iterations:  16% ( 16/100) [#####                           ] eta 16:07 /08-05-16 10:03:55 pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering[16] INFO     Cluster centers converged after 17 steps.
estimating MaximumLikelihoodMSM: 100% (190/190) [##################] eta 00:00 /
Out[96]:
<matplotlib.axes._subplots.AxesSubplot at 0x286f13b50>

In [97]:
its = pyemma.msm.its(dtrajs_apf_g,lags=range(1,101)+range(100,500)[::10],nits=nits)
pyemma.plots.plot_implied_timescales(its)


estimating MaximumLikelihoodMSM: 100% (140/140) [##################] eta 00:01 -
Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x367f73310>

In [98]:
n_tics = 6
X_comb = np.vstack(X_tica_apf_g)[:,:n_tics]

In [99]:
from triangle import corner

corner(X_comb,
           plot_contours=False,
           labels=['tIC{0}'.format(i+1) for i in range(n_tics)]
          )


Out[99]:

In [ ]: