In [ ]:
# Download example dataset
from msmbuilder.example_datasets import FsPeptide
fs_peptide = FsPeptide(verbose=False)
fs_peptide.cache()
In [ ]:
# Work in a temporary directory
import tempfile
import os
os.chdir(tempfile.mkdtemp())
dataset
objectMD datasets are usually quite large. It doesn't make sense to load everything into memory at once. The dataset
object lazily-loads trajectories as they are needed. Below, we create a dataset out of the 28 *.xtc
files we downloaded above. Since the data was saved at 50 ps / frame, we only load every 10th frame (with a new frequency of 0.5/ns).
In [ ]:
from msmbuilder.dataset import dataset
xyz = dataset(fs_peptide.data_dir + "/*.xtc",
topology=fs_peptide.data_dir + '/fs-peptide.pdb',
stride=10)
print("{} trajectories".format(len(xyz)))
# msmbuilder does not keep track of units! You must keep track of your
# data's timestep
to_ns = 0.5
print("with length {} ns".format(set(len(x)*to_ns for x in xyz)))
The raw (x, y, z)
coordinates from the simulation do not respect the translational and rotational symmetry of our problem. A Featurizer
transforms cartesian coordinates into other representations. Here we use the DihedralFeaturizer
to turn our data into phi and psi dihedral angles. Observe that the 264*3
-dimensional space is reduced to 84 dimensions
In [ ]:
from msmbuilder.featurizer import DihedralFeaturizer
featurizer = DihedralFeaturizer(types=['phi', 'psi'])
diheds = xyz.fit_transform_with(featurizer, 'diheds/', fmt='dir-npy')
print(xyz[0].xyz.shape)
print(diheds[0].shape)
In [ ]:
from msmbuilder.preprocessing import RobustScaler
scaler = RobustScaler()
scaled_diheds = diheds.fit_transform_with(scaler, 'scaled_diheds/', fmt='dir-npy')
print(diheds[0].shape)
print(scaled_diheds[0].shape)
In [ ]:
from msmbuilder.decomposition import tICA
tica_model = tICA(lag_time=2, n_components=4)
# fit and transform can be done in seperate steps:
tica_model = scaled_diheds.fit_with(tica_model)
tica_trajs = scaled_diheds.transform_with(tica_model, 'ticas/', fmt='dir-npy')
print(diheds[0].shape)
print(tica_trajs[0].shape)
In [ ]:
%matplotlib inline
import msmexplorer as msme
import numpy as np
txx = np.concatenate(tica_trajs)
msme.plot_histogram(txx)
In [ ]:
from msmbuilder.cluster import MiniBatchKMeans
clusterer = MiniBatchKMeans(n_clusters=100, random_state=42)
clustered_trajs = tica_trajs.fit_transform_with(
clusterer, 'kmeans/', fmt='dir-npy'
)
print(tica_trajs[0].shape)
print(clustered_trajs[0].shape)
In [ ]:
from matplotlib import pyplot as plt
plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap='viridis')
plt.scatter(clusterer.cluster_centers_[:,0],
clusterer.cluster_centers_[:,1],
s=100, c='w')
In [ ]:
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump
msm = MarkovStateModel(lag_time=2, n_timescales=20)
msm.fit(clustered_trajs)
In [ ]:
assignments = clusterer.partial_transform(txx)
assignments = msm.partial_transform(assignments)
msme.plot_free_energy(txx, obs=(0, 1), n_samples=10000,
pi=msm.populations_[assignments],
xlabel='tIC 1', ylabel='tIC 2')
plt.scatter(clusterer.cluster_centers_[msm.state_labels_, 0],
clusterer.cluster_centers_[msm.state_labels_, 1],
s=1e4 * msm.populations_, # size by population
c=msm.left_eigenvectors_[:, 1], # color by eigenvector
cmap="coolwarm",
zorder=3)
plt.colorbar(label='First dynamical eigenvector')
plt.tight_layout()
In [ ]:
msm.timescales_
In [ ]:
msme.plot_timescales(msm, n_timescales=5, ylabel='Implied Timescales ($ns$)')
In [ ]:
from msmbuilder.lumping import PCCAPlus
pcca = PCCAPlus.from_msm(msm, n_macrostates=4)
macro_trajs = pcca.transform(clustered_trajs)
In [ ]:
msme.plot_free_energy(txx, obs=(0, 1), n_samples=10000,
pi=msm.populations_[assignments],
xlabel='tIC 1', ylabel='tIC 2')
plt.scatter(clusterer.cluster_centers_[msm.state_labels_, 0],
clusterer.cluster_centers_[msm.state_labels_, 1],
s=50,
c=pcca.microstate_mapping_,
zorder=3
)
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')