In [1]:
import numpy as np
import numpy.random as npr
In [2]:
path = '../../../Downloads/abl_snapshot/abl_snapshot/*.h5'
In [3]:
from glob import glob
fnames = glob(path)[:-1]
fnames
Out[3]:
In [4]:
import mdtraj as md
trajs = [md.load(fname) for fname in fnames]
traj = trajs[0]
In [5]:
trajs
Out[5]:
In [10]:
n_atoms = traj.n_atoms
n_atoms
Out[10]:
In [6]:
import pyemma
import matplotlib.pyplot as plt
%matplotlib inline
In [70]:
from msmbuilder.featurizer import DihedralFeaturizer
dih = DihedralFeaturizer(['phi', 'psi', 'omega', 'chi1', 'chi2', 'chi3', 'chi4'])
X_dih = dih.fit_transform(trajs)
In [76]:
X_dih[0].shape[1] / 2,X_dih[0].shape[1]
Out[76]:
In [45]:
from msmbuilder.featurizer import AlphaAngleFeaturizer
aaf = AlphaAngleFeaturizer()
X_aaf = aaf.fit_transform(trajs)
In [46]:
X_aaf[0].shape
Out[46]:
In [47]:
from msmbuilder.featurizer import AtomPairsFeaturizer
all_pairs = []
for i in range(n_atoms):
for j in range(i):
all_pairs.append((i,j))
print(len(all_pairs))
In [48]:
all_pairs[:10]
Out[48]:
In [49]:
npr.seed(0)
npr.shuffle(all_pairs)
In [50]:
all_pairs[:10]
Out[50]:
In [61]:
some_pairs = all_pairs[:3000]
In [62]:
apf = AtomPairsFeaturizer(some_pairs)
X_apf = apf.fit_transform(trajs)
In [53]:
import pyemma
tica = pyemma.coordinates.tica(X_aaf)
X_tica = tica.get_output()
In [63]:
tica_p = pyemma.coordinates.tica(X_apf)
X_tica_p = tica.get_output()
In [71]:
tica_dih = pyemma.coordinates.tica(X_dih)
X_tica_dih = tica.get_output()
In [64]:
Xs = [X_apf,X_aaf]
X_feat = [np.hstack([x[i] for x in Xs]) for i in range(len(trajs))]
tica_tot = pyemma.coordinates.tica(X_feat)
In [ ]:
# also throw in dihedrals?
In [56]:
X_tot = tica_tot.get_output()
In [66]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(np.cumsum(tica.eigenvalues**2))
plt.plot(np.cumsum(tica_p.eigenvalues**2))
plt.plot(np.cumsum(tica_tot.eigenvalues**2))
plt.plot(range(180),range(180),'--')
#plt.ylim(0,180)
#plt.xlim(0,500)
Out[66]:
In [79]:
from triangle import corner
n_tics = 6
corner(np.vstack(X_tot)[:,:n_tics],
plot_contours=False,
labels=['tIC{0}'.format(i+1) for i in range(n_tics)]
)
Out[79]:
In [62]:
tica_tot.dimension()
Out[62]:
In [68]:
kmeans = pyemma.coordinates.cluster_mini_batch_kmeans(X_tot,k=100,max_iter=100)
In [69]:
dtrajs = [dtraj.flatten() for dtraj in kmeans.get_output()]
In [70]:
nits = 20
its = pyemma.msm.its(dtrajs,lags=range(1,101),nits=nits)
pyemma.plots.plot_implied_timescales(its)
Out[70]:
In [9]:
from math import factorial
factorial(n_atoms) / (factorial(n_atoms - 3) * factorial(3))
# The full Grassmann vector has 11 billion entries!
Out[9]:
In [10]:
n_residues = 252
factorial(n_residues) / (factorial(n_residues - 3) * factorial(3))
# The Grassmann vector using only alpha-carbons has >2.5 million entries
Out[10]:
In [33]:
def select_random_features(n_atoms,n_features=1000,seed=0):
npr.seed(0)
all_indices = np.arange(n_atoms)
feature_indices = []
for i in range(n_features):
npr.shuffle(all_indices)
feature_ind = np.zeros(n_atoms,dtype=bool)
feature_ind[all_indices[:3]] = True
feature_indices.append(feature_ind)
return feature_indices
def grassmann_vector(X,indices):
return np.array([np.linalg.det(X[s]) for s in indices])
def grassmann_featurize(trajs,indices):
return [np.array([grassmann_vector(x,indices) for x in traj.xyz]) for traj in trajs]
def parallel_grassmann_featurize(trajs,indices,n_jobs=8):
from joblib import Parallel,delayed
def f(traj):
return np.array([grassmann_vector(x,indices) for x in traj.xyz])
return Parallel(n_jobs=n_jobs,backend='threading')(delayed(f)(traj) for traj in trajs)
In [36]:
feature_indices = select_random_features(n_atoms,n_features=3000)
In [37]:
%%time
X_g = grassmann_featurize(trajs,feature_indices)
In [42]:
np.save('X_grassmann_abl_run_0.npy',X_g)
In [35]:
X_g_ = parallel_grassmann_featurize(trajs,feature_indices)
In [38]:
tica_g = pyemma.coordinates.tica(X_g)
X_tica_g = tica_g.get_output()
n_tics = 6
from triangle import corner
corner(np.vstack(X_tica_g)[:,:n_tics],
plot_contours=False,
labels=['tIC{0}'.format(i+1) for i in range(n_tics)]
)
Out[38]:
In [43]:
plt.plot(np.cumsum(tica_g.eigenvalues**2),label='Grassmann')
plt.plot(range(300),range(300))
Out[43]:
In [81]:
Out[81]:
In [77]:
plt.plot(np.cumsum(tica.eigenvalues**2),label = '500 alpha angles')
plt.plot(np.cumsum(tica_dih.eigenvalues**2), label = '1300 (x 2) dihedral angles (sin, cos)')
plt.plot(np.cumsum(tica_p.eigenvalues**2),label = '3000 atom-pairs')
plt.plot(np.cumsum(tica_tot.eigenvalues**2), label = '3500 alpha angles + atom-pairs')
plt.plot(np.cumsum(tica_g.eigenvalues**2), linewidth = 3, label = '3000 Grassmann features')
plt.plot(range(300),range(300),'--')
plt.legend(loc='best')
Out[77]:
In [82]:
X_apf_g = [np.hstack([x[i] for x in [X_g,X_apf]]) for i in range(len(trajs))]
In [83]:
tica_apf_g = pyemma.coordinates.tica(X_apf_g)
In [89]:
plt.plot(np.cumsum(tica.eigenvalues**2),label = '500 alpha angles')
plt.plot(np.cumsum(tica_dih.eigenvalues**2), label = '1300 (x 2) dihedral angles (sin, cos)')
plt.plot(np.cumsum(tica_p.eigenvalues**2),label = '3000 atom-pairs')
plt.plot(np.cumsum(tica_tot.eigenvalues**2), label = '3500 alpha angles + atom-pairs')
plt.plot(np.cumsum(tica_g.eigenvalues**2), linewidth = 3, label = '3000 Grassmann features')
plt.plot(np.cumsum(tica_apf_g.eigenvalues**2), '--', linewidth = 3, label = '6000 Grassmann + pairs')
plt.plot(range(600),range(600),'--')
plt.legend(loc=(0.5,0))
Out[89]:
In [90]:
X_tica_apf_g = tica_apf_g.get_output()
In [92]:
del(X_apf_g)
del(Xs)
In [ ]:
# next-- draw a few samples of size 1000 or 3000
In [44]:
nits = 20
k = 100
kmeans = pyemma.coordinates.cluster_mini_batch_kmeans(X_tica_g,k=k,max_iter=100)
dtrajs_g = [dtraj.flatten() for dtraj in kmeans.get_output()]
its = pyemma.msm.its(dtrajs_g,lags=range(1,101)+range(100,1000)[::10],nits=nits)
pyemma.plots.plot_implied_timescales(its)
Out[44]:
In [59]:
its = pyemma.msm.its(dtrajs_g,lags=range(1,101)+range(100,500)[::10],nits=nits)
pyemma.plots.plot_implied_timescales(its)
Out[59]:
In [ ]:
its = pyemma.msm.its(dtrajs_g,lags=range(1,101)+range(100,500)[::10],nits=nits)
pyemma.plots.plot_implied_timescales(its)
In [41]:
k = 500
kmeans = pyemma.coordinates.cluster_mini_batch_kmeans(X_tica_g,k=k,max_iter=100)
dtrajs_g = [dtraj.flatten() for dtraj in kmeans.get_output()]
its = pyemma.msm.its(dtrajs_g,lags=range(1,101),nits=nits)
pyemma.plots.plot_implied_timescales(its)
Out[41]:
In [95]:
X_tica_apf_g[0].shape
Out[95]:
In [96]:
k = 100
kmeans = pyemma.coordinates.cluster_mini_batch_kmeans(X_tica_apf_g,k=k,max_iter=100)
dtrajs_apf_g = [dtraj.flatten() for dtraj in kmeans.get_output()]
its = pyemma.msm.its(dtrajs_apf_g,lags=range(1,101)+range(100,1000)[::10],nits=nits)
pyemma.plots.plot_implied_timescales(its)
Out[96]:
In [97]:
its = pyemma.msm.its(dtrajs_apf_g,lags=range(1,101)+range(100,500)[::10],nits=nits)
pyemma.plots.plot_implied_timescales(its)
Out[97]:
In [98]:
n_tics = 6
X_comb = np.vstack(X_tica_apf_g)[:,:n_tics]
In [99]:
from triangle import corner
corner(X_comb,
plot_contours=False,
labels=['tIC{0}'.format(i+1) for i in range(n_tics)]
)
Out[99]:
In [ ]: