MD datasets are usually quite large. It doesn't make sense to load everything into memory at once. The dataset
object lazily-loads trajectories as they are needed. Below, we create a dataset out of all the trajectories we have at the moment.
In [1]:
from msmbuilder.dataset import dataset
import numpy as np
import os
from mdtraj.utils import timing
from msmbuilder.featurizer import DihedralFeaturizer
import seaborn as sns; sns.set_style("white"); sns.set_palette("Blues")
In [2]:
with timing("Loading data as dataset object"):
wt_xyz = dataset("/Users/je714/wt_data/*/05*nc", topology="/Users/je714/wt_data/test.pdb")
In [3]:
with timing("Loading data as dataset object"):
S1P_xyz = dataset("/Users/je714/p_data/run*/S1P/05*nc", topology="/Users/je714/p_data/S1P_ff14SB_newclean.prmtop")
In [4]:
with timing("Loading data as dataset object"):
SEP_xyz = dataset("/Users/je714/p_data/run*/SEP/05*nc", topology="/Users/je714/p_data/SEP_ff14SB_newclean.prmtop")
The raw (x, y, z) coordinates from the simulation do not respect the translational and rotational symmetry of our problem. A Featurizer transforms cartesian coordinates into other representations.
Here we use the DihedralFeaturizer
to turn our data into phi and psi dihedral angles.
Observe that the 6812*3-dimensional space is reduced substantially.
In [ ]:
wt_featurizer = DihedralFeaturizer(types=['phi', 'psi'])
if os.path.isfile('/Users/je714/wt_data/wt_diheds_phi-psi.tgz'):
with timing("Loading dihedrals from file..."):
wt_diheds = np.loadtxt('/Users/je714/wt_data/wt_diheds_phi-psi.tgz')
else:
with timing("Featurizing trajectory into dihedrals..."):
wt_diheds = wt_featurizer.fit_transform(wt_xyz)
np.savetxt('/Users/je714/wt_data/wt_diheds_phi-psi.tgz', np.concatenate(wt_diheds))
In [ ]:
S1P_featurizer = DihedralFeaturizer(types=['phi', 'psi'])
if os.path.isfile('/Users/je714/p_data/S1P_diheds_phi-psi.tgz'):
with timing("Loading dihedrals from file..."):
S1P_diheds = np.loadtxt('/Users/je714/p_data/S1P_diheds_phi-psi.tgz')
else:
with timing("Featurizing trajectory into dihedrals..."):
S1P_diheds = S1P_featurizer.fit_transform(S1P_xyz)
np.savetxt('/Users/je714/p_data/S1P_diheds_phi-psi.tgz', np.concatenate(S1P_diheds))
In [ ]:
SEP_featurizer = DihedralFeaturizer(types=['phi', 'psi'])
if os.path.isfile('/Users/je714/p_data/SEP_diheds_phi-psi.tgz'):
with timing("Loading dihedrals from file..."):
SEP_diheds = np.loadtxt('/Users/je714/p_data/SEP_diheds_phi-psi.tgz')
else:
with timing("Featurizing trajectory into dihedrals..."):
SEP_diheds = SEP_featurizer.fit_transform(SEP_xyz)
np.savetxt('/Users/je714/p_data/SEP_diheds_phi-psi.tgz', np.concatenate(SEP_diheds))
Featurizer based on residue-residue distances
This featurizer transforms a dataset containing MD trajectories into a vector dataset by representing each frame in each of the MD trajectories by a vector of the distances between pairs of amino-acid residues.
The exact method for computing the the distance between two residues is configurable with the scheme
parameter. In this case we use "ca"
to determine the distance between two residues as the distance between their alpha carbons.
In [ ]:
# from msmbuilder.featurizer import ContactFeaturizer
# featurizer_contact = ContactFeaturizer("all", scheme="ca")
# contacts = featurizer_contact.fit_transform(xyz)
# print(xyz[0].xyz.shape)
# print(contacts[0].shape)
In [ ]:
wt_xyz[0][0].topology
In [ ]:
wt_diheds.shape
In [5]:
from sklearn.pipeline import Pipeline
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel
In [ ]:
DihedralFeaturizer?
In [6]:
model = Pipeline([
('featurizer', DihedralFeaturizer(types=['phi', 'psi'])),
('tica', tICA(n_components=10, lag_time=20)),
('cluster', MiniBatchKMeans(n_clusters=1000)),
('msm', MarkovStateModel(lag_time=50))
])
In [7]:
model.fit(wt_xyz)
Out[7]:
In [8]:
for step in model.steps:
print(step[0])
In [9]:
diheds = model.steps[0][1]
tica_obj = model.steps[1][1]
clusterer = model.steps[2][1]
msm = model.steps[3][1]
In [ ]:
tica_trajs = tica_obj.transform(diheds.transform(wt_xyz))
In [ ]:
np.concatenate(tica_trajs).shape
In [ ]:
plt.plot(np.concatenate(tica_trajs)[::,0])
In [ ]:
def plot_ticaTrajs(tica_trajs):
txx = np.concatenate(tica_trajs)
plt.figure(figsize=(10.5,5))
cmap=sns.cubehelix_palette(8, start=.5, rot=-.75, as_cmap=True)
plt.subplot(1, 2, 1)
plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap=cmap)
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')
cb = plt.colorbar()
cb.set_label('log10(N)')
plt.subplot(1, 2, 2)
plt.hexbin(txx[:,2], txx[:,3], bins='log', mincnt=1, cmap=cmap)
plt.xlabel('tIC 3')
plt.ylabel('tIC 4')
cb = plt.colorbar()
cb.set_label('log10(N)')
In [ ]:
plot_ticaTrajs(tica_trajs)
In [ ]:
clusterer.cluster_centers_.shape
In [ ]:
def plot_clusterCenters(clusterer_object, tica_trajs):
txx = np.concatenate(tica_trajs)
plt.figure(figsize=(10.5,5))
plt.subplot(1, 2, 1)
cmap=sns.cubehelix_palette(8, start=.5, rot=-.75, as_cmap=True)
plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap=cmap)
cb = plt.colorbar()
cb.set_label('log10(N)')
plt.scatter(clusterer.cluster_centers_[:,0],
clusterer.cluster_centers_[:,1],
s=4, c='black')
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')
plt.subplot(1,2,2)
plt.hexbin(txx[:,2], txx[:,3], bins='log', mincnt=1, cmap=cmap)
cb = plt.colorbar()
cb.set_label('log10(N)')
plt.scatter(clusterer.cluster_centers_[:,2],
clusterer.cluster_centers_[:,3],
s=4, c='black')
plt.xlabel('tIC 3')
plt.ylabel('tIC 4')
plt.tight_layout()
In [ ]:
plot_clusterCenters(clusterer, tica_trajs)
plt.savefig("/Users/je714/Dropbox (Imperial)/ESAreport/Figures/tica_clusters.png", format='png', dpi=300)
In [ ]:
np.asarray(range(0,10))
In [ ]:
plt.hexbin(np.asarray(range(0, np.hstack(clusterer.labels_).shape[0]))*0.00002,
np.hstack(clusterer.labels_),
mincnt=1,
cmap=sns.cubehelix_palette(8, start=.5, rot=-.75, as_cmap=True))
plt.ylabel("Cluster ID")
plt.xlabel("Aggregated time ($\mu$s)")
plt.savefig("/Users/je714/Dropbox (Imperial)/ESAReport/Figures/labeled_Trajs",
format='png', dpi=300)
In [ ]:
msm_lagtimes = [x for x in range(1,201) if (x%10==0) or (x==1)]
msm_lagtimes
In [ ]:
msm_test = MarkovStateModel(lag_time=1)
msm_test.fit(np.hstack(clusterer.labels_))
In [ ]:
msm_objects = []
for lagtime in msm_lagtimes:
msm = MarkovStateModel(lag_time=lagtime)
msm.fit(np.hstack(clusterer.labels_))
msm_objects.append(msm)
In [ ]:
msm_timescales = []
for msm in msm_objects:
msm_timescales.append(msm.timescales_)
In [ ]:
first_timescale = []
for lag_time, timescale in zip(msm_lagtimes, msm_timescales):
print(lag_time, timescale[0])
first_timescale.append(timescale[0])
In [ ]:
time_asNParray = np.array(first_timescale)
lag_asNParray = np.array(msm_lagtimes[0:6])
In [ ]:
plt.scatter(lag_asNParray, time_asNParray)
plt.semilogy()
In [ ]:
txx = np.concatenate(tica_trajs)
plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap="Greys")
plt.scatter(clusterer.cluster_centers_[:,0],
clusterer.cluster_centers_[:,1],
s=1e4 * msm.populations_, # size by population
c=msm.left_eigenvectors_[:,1], # color by eigenvector
cmap="RdBu")
plt.colorbar(label='First dynamical eigenvector')
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')
plt.tight_layout()
In [ ]:
from msmbuilder.lumping import PCCAPlus
pcca = PCCAPlus.from_msm(msm, n_macrostates=5)
macro_trajs = pcca.transform(np.concatenate(clusterer.labels_))
In [ ]:
print(msm.left_eigenvectors_[:,1].shape)
print(clusterer.cluster_centers_[:,0].shape)
In [ ]:
plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap="Greys")
plt.scatter(clusterer.cluster_centers_[:,0],
clusterer.cluster_centers_[:,1],
s=100,
c=pcca.microstate_mapping_,
)
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')
In [ ]:
plt.plot(msm.eigenvalues_, 'bo')
plt.xlabel("MSM state")
plt.ylim(0,1)
In [ ]:
plt.plot(msm.sample_discrete(n_steps=1000), 'bo')
In [ ]:
from msmbuilder import utils
In [ ]:
plt.plot(msm.populations_)
In [ ]:
plt.plot(msm.timescales_)
In [ ]:
In [ ]: