In [39]:
from msmbuilder.dataset import dataset
import mdtraj as md
import numpy as np
from glob import glob
from mdtraj.utils import timing
import itertools
from msmbuilder.featurizer import AtomPairsFeaturizer
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.cluster import MiniBatchKMedoids
%matplotlib inline
from matplotlib import pyplot as plt
In [2]:
traj_list = sorted(glob("/Users/je714/wt_data/*/05*nc"))
In [3]:
top = md.load_prmtop("/Users/je714/wt_data/run8/WT-ff14SB_clean.prmtop")
In [4]:
ef_hand = top.select("resid 64 to 74")
cat_cal = top.select("resid 419")
In [5]:
atom_sel = np.append(ef_hand, cat_cal)
In [6]:
atom_sel
Out[6]:
In [7]:
xyz = dataset("/Users/je714/wt_data/*/05*nc", fmt='mdtraj',
topology="/Users/je714/wt_data/run8/WT-ff14SB_clean.prmtop",
atom_indices=atom_sel)
We featurize the trajectories into the pairwise distance between all the atoms in the EF hand and the catalytic Ca2+. Since we only loaded the relevant atoms, msmbuilder renumbers the indices from 0.
In [8]:
pairs = np.asarray(list(itertools.product(list(range(139)), [138])))
In [9]:
print(pairs.shape)
In [10]:
pairs_noIJ = pairs[pairs[:,0] != pairs[:,1]] # Drop out the elements that are i=j (whose distance is 0)
In [11]:
pairs_noIJ
Out[11]:
In [12]:
traj1 = xyz[0]
In [13]:
dist_featurizer = AtomPairsFeaturizer(pair_indices=pairs)
In [14]:
with timing("Featurizing into distances..."):
dists = dist_featurizer.fit_transform(xyz)
In [15]:
tica_model = tICA(n_components=10, lag_time=1)
with timing("tica..."):
tica_trajs_dists = tica_model.fit_transform(dists)
In [16]:
tica_model = tICA
In [17]:
tica_trajs_dists[0].shape
Out[17]:
In [61]:
def tica_plotter(tica_trajs):
txx = np.concatenate(tica_trajs)
plt.figure(figsize=(14, 4))
plt.subplot(1, 2, 1)
plt.hexbin(txx[:, 0], txx[:, 1], bins='log', mincnt=1)
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')
cb = plt.colorbar()
cb.set_label('log10(N)')
plt.subplot(1, 2, 2)
time = np.linspace(start=0.0, stop=txx.shape[0]*0.02, num = txx.shape[0]) / 1000
plt.plot(time, txx[:, 0])
plt.xlabel("Aggregated time ($\mu$s)")
plt.ylabel("tIC 1")
plt.tight_layout()
In [62]:
tica_plotter(tica_trajs_dists)
In [19]:
clusterer = MiniBatchKMeans(n_clusters=400)
with timing("Clustering tica trajs from dist featurization..."):
clustered_trajs = clusterer.fit_transform(tica_trajs_dists)
In [57]:
def cluster_plotter(trajectory, clusterer_object):
txx = np.concatenate(trajectory)
plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1)
plt.scatter(clusterer_object.cluster_centers_[:,0],
clusterer_object.cluster_centers_[:,1],
s=100, c='w')
In [59]:
cluster_plotter(tica_trajs_dists, clusterer)
In [21]:
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump
msm = MarkovStateModel(lag_time=50)
msm.fit(clustered_trajs)
print("The MSM has %s states.\n" % msm.n_states_)
print(msm.left_eigenvectors_.shape)
In [22]:
plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap="Greys")
plt.scatter(clusterer.cluster_centers_[:,0],
clusterer.cluster_centers_[:,1],
s=1e4 * msm.populations_, # size by population
c=msm.left_eigenvectors_[:,1], # color by eigenvector
cmap="RdBu")
plt.colorbar(label='First dynamical eigenvector')
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')
plt.tight_layout()
In [23]:
from msmbuilder.lumping import PCCAPlus
pcca = PCCAPlus.from_msm(msm, n_macrostates=5)
macro_trajs = pcca.transform(clustered_trajs)
In [24]:
plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap="Greys")
plt.scatter(clusterer.cluster_centers_[:,0],
clusterer.cluster_centers_[:,1],
s=100,
c=pcca.microstate_mapping_,
)
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')
Out[24]:
In [25]:
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import KFold
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats.distributions import randint
In [26]:
model = Pipeline([
('featurizer', AtomPairsFeaturizer(pair_indices=pairs)),
('tica', tICA(n_components=4)),
('clusterer', MiniBatchKMeans()),
('msm', MarkovStateModel(n_timescales=3, ergodic_cutoff='on', reversible_type='mle', verbose=False))
])
search = RandomizedSearchCV(model, n_iter=50, cv=3, refit=False, param_distributions={
'tica__lag_time':randint(1,100),
'clusterer__n_clusters':randint(1,1000),
'msm__lag_time':randint(1,100)
})
In [27]:
def plot_scores(score_matrix):
from matplotlib import pyplot as pp
scores = np.array([[np.mean(e.cv_validation_scores),
np.std(e.cv_validation_scores),
e.parameters['tica__lag_time'],
e.parameters['clusterer__n_clusters'],
e.parameters['msm__lag_time']]
for e in score_matrix.grid_scores_])
mean_score_value = scores[:,0]
std_score_value = scores[:,1]
lags_tica = scores[:,2]
cluster_number = scores[:,3]
lags_msm = scores[:,4]
pp.figure(figsize=(14,4))
pp.grid(False)
interval = 2*std_score_value
# subplot1
pp.subplot(1,3,1, axisbg='white')
pp.errorbar(x=lags_tica, y=mean_score_value, yerr=interval, color = 'b', fmt='o')
pp.plot(pp.xlim(), [search.best_score_]*2, 'k-.', label = 'best')
pp.xlabel("tICA lag time (Frame)")
pp.ylabel("Score")
# subplot2
pp.subplot(1,3,2, axisbg='white')
pp.errorbar(x=cluster_number, y=mean_score_value, yerr=interval, color = 'r', fmt='o')
pp.plot(pp.xlim(), [search.best_score_]*2, 'k-.', label = 'best')
pp.xlabel("Number of clusters")
pp.ylabel("Score")
# subplot3
pp.subplot(1,3,3, axisbg='white')
pp.errorbar(x=lags_msm, y=mean_score_value, yerr=interval, color = 'g', fmt='o')
pp.plot(pp.xlim(), [search.best_score_]*2, 'k-.', label = 'best')
pp.xlabel("MSM lag time (Frame)")
pp.ylabel("Score")
pp.tight_layout()
In [28]:
# with timing("Cross validation..."):
# xyz_scores = search.fit(xyz)
# for key, value in xyz_scores.best_params_.items():
# print("%s\t%s" % (key, value))
# print("Score = %.2f" % xyz_scores.best_score_)
In [29]:
# plot_scores(xyz_scores)
In [30]:
model = Pipeline([
('tica', tICA()),
('clusterer', MiniBatchKMedoids()),
('msm', MarkovStateModel(ergodic_cutoff='on', reversible_type='mle', verbose=False))
])
search = RandomizedSearchCV(model, n_iter=100, cv=5, refit=False, param_distributions={
'tica__lag_time':randint(1,200),
'tica__n_components':randint(1,15),
'clusterer__n_clusters':randint(1,1000),
'msm__n_timescales':randint(1,15),
'msm__lag_time':randint(1,100)
})
In [31]:
print(search)
In [32]:
with timing("Cross validation on trajectories already fit to distance feature..."):
feat_scores = search.fit(dists)
In [33]:
feat_scores.best_params_
Out[33]:
In [34]:
plot_scores(feat_scores)
feat_scores.gridscores
Use best scores from cross-validation to build an MSM
In [110]:
opt_tica = tICA(lag_time=118, n_components=13)
opt_cluster = MiniBatchKMedoids(n_clusters=346)
opt_msm = MarkovStateModel(lag_time=20, n_timescales=14)
In [111]:
opt_tica_trajs = opt_tica.fit_transform(dists)
In [112]:
tica_plotter(opt_tica_trajs)
In [113]:
clustered_opt_tica_trajs = opt_cluster.fit_transform(opt_tica_trajs)
In [114]:
cluster_plotter(opt_tica_trajs, opt_cluster)
In [115]:
opt_msm.fit(clustered_opt_tica_trajs)
Out[115]:
In [104]:
lag_times = [x*2 for x in range(1,55,2)]
print(lag_times)
In [105]:
opt_msm.timescales_[0:3]
Out[105]:
In [106]:
first_three_timescales = []
for lag_time in lag_times:
msm = MarkovStateModel(lag_time=lag_time, n_timescales=14)
msm.fit(clustered_opt_tica_trajs)
first_three_timescales.append(msm.timescales_[0:3])
In [107]:
first_three_timescales
Out[107]:
In [108]:
first_timescale = []
for comb in first_three_timescales:
first_timescale.append(comb[0])
In [109]:
plt.scatter(x=lag_times, y=first_timescale)
Out[109]:
In [ ]: