Apply msmbuilder API to WT ff14SB cTN

MD datasets are usually quite large. It doesn't make sense to load everything into memory at once. The dataset object lazily-loads trajectories as they are needed. Below, we create a dataset out of all the trajectories we have at the moment.


In [1]:
from msmbuilder.dataset import dataset
import numpy as np
import os
from mdtraj.utils import timing
from msmbuilder.featurizer import DihedralFeaturizer
import seaborn as sns; sns.set_style("white"); sns.set_palette("Blues")

In [2]:
with timing("Loading data as dataset object"):
    wt_xyz = dataset("/Users/je714/wt_data/*/05*nc", topology="/Users/je714/wt_data/test.pdb")


Loading data as dataset object: 0.210 seconds

In [3]:
with timing("Loading data as dataset object"):
    S1P_xyz = dataset("/Users/je714/p_data/run*/S1P/05*nc", topology="/Users/je714/p_data/S1P_ff14SB_newclean.prmtop")


Loading data as dataset object: 0.201 seconds

In [4]:
with timing("Loading data as dataset object"):
    SEP_xyz = dataset("/Users/je714/p_data/run*/SEP/05*nc", topology="/Users/je714/p_data/SEP_ff14SB_newclean.prmtop")


Loading data as dataset object: 0.195 seconds

Featurization

The raw (x, y, z) coordinates from the simulation do not respect the translational and rotational symmetry of our problem. A Featurizer transforms cartesian coordinates into other representations.

Dihedrals

Here we use the DihedralFeaturizer to turn our data into phi and psi dihedral angles. Observe that the 6812*3-dimensional space is reduced substantially.


In [ ]:
wt_featurizer = DihedralFeaturizer(types=['phi', 'psi'])
if os.path.isfile('/Users/je714/wt_data/wt_diheds_phi-psi.tgz'):
    with timing("Loading dihedrals from file..."):
        wt_diheds = np.loadtxt('/Users/je714/wt_data/wt_diheds_phi-psi.tgz')
else:
    with timing("Featurizing trajectory into dihedrals..."):
        wt_diheds = wt_featurizer.fit_transform(wt_xyz)
        np.savetxt('/Users/je714/wt_data/wt_diheds_phi-psi.tgz', np.concatenate(wt_diheds))

In [ ]:
S1P_featurizer = DihedralFeaturizer(types=['phi', 'psi'])
if os.path.isfile('/Users/je714/p_data/S1P_diheds_phi-psi.tgz'):
    with timing("Loading dihedrals from file..."):
        S1P_diheds = np.loadtxt('/Users/je714/p_data/S1P_diheds_phi-psi.tgz')
else:
    with timing("Featurizing trajectory into dihedrals..."):
        S1P_diheds = S1P_featurizer.fit_transform(S1P_xyz)
        np.savetxt('/Users/je714/p_data/S1P_diheds_phi-psi.tgz', np.concatenate(S1P_diheds))

In [ ]:
SEP_featurizer = DihedralFeaturizer(types=['phi', 'psi'])
if os.path.isfile('/Users/je714/p_data/SEP_diheds_phi-psi.tgz'):
    with timing("Loading dihedrals from file..."):
        SEP_diheds = np.loadtxt('/Users/je714/p_data/SEP_diheds_phi-psi.tgz')
else:
    with timing("Featurizing trajectory into dihedrals..."):
        SEP_diheds = SEP_featurizer.fit_transform(SEP_xyz)
        np.savetxt('/Users/je714/p_data/SEP_diheds_phi-psi.tgz', np.concatenate(SEP_diheds))

Contact Featurizer

Featurizer based on residue-residue distances

This featurizer transforms a dataset containing MD trajectories into a vector dataset by representing each frame in each of the MD trajectories by a vector of the distances between pairs of amino-acid residues.

The exact method for computing the the distance between two residues is configurable with the scheme parameter. In this case we use "ca" to determine the distance between two residues as the distance between their alpha carbons.


In [ ]:
# from msmbuilder.featurizer import ContactFeaturizer
# featurizer_contact = ContactFeaturizer("all", scheme="ca")
# contacts = featurizer_contact.fit_transform(xyz)
# print(xyz[0].xyz.shape)
# print(contacts[0].shape)

Intermediate kinetic model: tICA

tICA is similar to PCA. Note the reduction to just 4 dimensions.


In [ ]:
wt_xyz[0][0].topology

In [ ]:
wt_diheds.shape

In [5]:
from sklearn.pipeline import Pipeline
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel

In [ ]:
DihedralFeaturizer?

In [6]:
model = Pipeline([
        ('featurizer', DihedralFeaturizer(types=['phi', 'psi'])),
        ('tica', tICA(n_components=10, lag_time=20)),
        ('cluster', MiniBatchKMeans(n_clusters=1000)),
        ('msm', MarkovStateModel(lag_time=50))
    ])

In [7]:
model.fit(wt_xyz)


/Users/je714/anaconda3/lib/python3.5/site-packages/sklearn/cluster/k_means_.py:1300: RuntimeWarning: init_size=300 should be larger than k=1000. Setting it to 3*k
  init_size=init_size)
MSM contains 32 strongly connected components above weight=0.02. Component 31 selected, with population 16.716890%
Out[7]:
Pipeline(steps=[('featurizer', DihedralFeaturizer(sincos=True, types=['phi', 'psi'])), ('tica', tICA(kinetic_mapping=False, lag_time=20, n_components=10, shrinkage=None,
   weighted_transform=False)), ('cluster', MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, m...=None,
         prior_counts=0, reversible_type='mle', sliding_window=True,
         verbose=True))])

In [8]:
for step in model.steps:
    print(step[0])


featurizer
tica
cluster
msm

In [9]:
diheds = model.steps[0][1]
tica_obj = model.steps[1][1]
clusterer = model.steps[2][1]
msm = model.steps[3][1]

In [ ]:
tica_trajs = tica_obj.transform(diheds.transform(wt_xyz))

In [ ]:
np.concatenate(tica_trajs).shape

tICA Heatmap

We can histogram our data projecting along the two first tICS (the two slowest DOFs found by tICA).


In [ ]:
plt.plot(np.concatenate(tica_trajs)[::,0])

In [ ]:
def plot_ticaTrajs(tica_trajs):
    txx = np.concatenate(tica_trajs)
    plt.figure(figsize=(10.5,5))
    cmap=sns.cubehelix_palette(8, start=.5, rot=-.75, as_cmap=True)
    plt.subplot(1, 2, 1)
    plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap=cmap)
    plt.xlabel('tIC 1')
    plt.ylabel('tIC 2')
    cb = plt.colorbar()
    cb.set_label('log10(N)')
    plt.subplot(1, 2, 2)
    plt.hexbin(txx[:,2], txx[:,3], bins='log', mincnt=1, cmap=cmap)
    plt.xlabel('tIC 3')
    plt.ylabel('tIC 4')
    cb = plt.colorbar()
    cb.set_label('log10(N)')

In [ ]:
plot_ticaTrajs(tica_trajs)

Clustering

Conformations need to be clustered into states (sometimes written as microstates). We cluster based on the tICA projections to group conformations that interconvert rapidly. Note that we transform our trajectories from the 4-dimensional tICA space into a 1-dimensional cluster index.


In [ ]:
clusterer.cluster_centers_.shape

In [ ]:
def plot_clusterCenters(clusterer_object, tica_trajs):
    txx = np.concatenate(tica_trajs)
    plt.figure(figsize=(10.5,5))
    plt.subplot(1, 2, 1)
    cmap=sns.cubehelix_palette(8, start=.5, rot=-.75, as_cmap=True)
    plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap=cmap)
    cb = plt.colorbar()
    cb.set_label('log10(N)')
    plt.scatter(clusterer.cluster_centers_[:,0],
                clusterer.cluster_centers_[:,1], 
                s=4, c='black')
    plt.xlabel('tIC 1')
    plt.ylabel('tIC 2')

    plt.subplot(1,2,2)
    plt.hexbin(txx[:,2], txx[:,3], bins='log', mincnt=1, cmap=cmap)
    cb = plt.colorbar()
    cb.set_label('log10(N)')
    plt.scatter(clusterer.cluster_centers_[:,2],
                clusterer.cluster_centers_[:,3], 
                s=4, c='black')
    plt.xlabel('tIC 3')
    plt.ylabel('tIC 4')

    plt.tight_layout()

In [ ]:
plot_clusterCenters(clusterer, tica_trajs)
plt.savefig("/Users/je714/Dropbox (Imperial)/ESAreport/Figures/tica_clusters.png", format='png', dpi=300)

MSM

We can construct an MSM from the labeled trajectories.


In [ ]:
np.asarray(range(0,10))

In [ ]:
plt.hexbin(np.asarray(range(0, np.hstack(clusterer.labels_).shape[0]))*0.00002,
           np.hstack(clusterer.labels_),
           mincnt=1,
           cmap=sns.cubehelix_palette(8, start=.5, rot=-.75, as_cmap=True))
plt.ylabel("Cluster ID")
plt.xlabel("Aggregated time ($\mu$s)")
plt.savefig("/Users/je714/Dropbox (Imperial)/ESAReport/Figures/labeled_Trajs",
            format='png', dpi=300)

In [ ]:
msm_lagtimes = [x for x in range(1,201) if (x%10==0) or (x==1)]
msm_lagtimes

In [ ]:
msm_test = MarkovStateModel(lag_time=1)
msm_test.fit(np.hstack(clusterer.labels_))

In [ ]:
msm_objects = []
for lagtime in msm_lagtimes:
    msm = MarkovStateModel(lag_time=lagtime)
    msm.fit(np.hstack(clusterer.labels_))
    msm_objects.append(msm)

In [ ]:
msm_timescales = []
for msm in msm_objects:
    msm_timescales.append(msm.timescales_)

In [ ]:
first_timescale = []
for lag_time, timescale in zip(msm_lagtimes, msm_timescales):
    print(lag_time, timescale[0])
    first_timescale.append(timescale[0])

In [ ]:
time_asNParray = np.array(first_timescale)
lag_asNParray = np.array(msm_lagtimes[0:6])

In [ ]:
plt.scatter(lag_asNParray, time_asNParray)
plt.semilogy()

In [ ]:
txx = np.concatenate(tica_trajs)
plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap="Greys")
plt.scatter(clusterer.cluster_centers_[:,0],
            clusterer.cluster_centers_[:,1],
            s=1e4 * msm.populations_, # size by population
            c=msm.left_eigenvectors_[:,1], # color by eigenvector
            cmap="RdBu") 
plt.colorbar(label='First dynamical eigenvector')
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')
plt.tight_layout()

Macrostate model


In [ ]:
from msmbuilder.lumping import PCCAPlus
pcca = PCCAPlus.from_msm(msm, n_macrostates=5)
macro_trajs = pcca.transform(np.concatenate(clusterer.labels_))

In [ ]:
print(msm.left_eigenvectors_[:,1].shape)
print(clusterer.cluster_centers_[:,0].shape)

In [ ]:
plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap="Greys")
plt.scatter(clusterer.cluster_centers_[:,0],
            clusterer.cluster_centers_[:,1],
            s=100,
            c=pcca.microstate_mapping_,
      )
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')

In [ ]:
plt.plot(msm.eigenvalues_, 'bo')
plt.xlabel("MSM state")
plt.ylim(0,1)

In [ ]:
plt.plot(msm.sample_discrete(n_steps=1000), 'bo')

In [ ]:
from msmbuilder import utils

In [ ]:
plt.plot(msm.populations_)

In [ ]:
plt.plot(msm.timescales_)

In [ ]:


In [ ]: