In [5]:
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
%matplotlib inline
import mdtraj
plt.rc('font', family='serif')
In [6]:
# we want to find the atoms with the fastest fluctuations when optimally aligned, and then
# deemphasize them in a weighted RMSD Mahalanobis distance
# the result will be a vector of length N where N is the number of atoms in the molecule
# which will be written on the diagonal of an NxN matrix and multiplied into RMSD or Binet-Cauchy computations
In [7]:
from mdtraj.geometry import alignment
In [8]:
npr.seed(0)
X = npr.randn(100,3)
Y = X.dot(np.diag([1,2,1]))
In [9]:
def compute_atomwise_deviation_xyz(X_xyz, Y_xyz):
    """Per-atom Euclidean deviation after optimally aligning X onto Y.

    Parameters
    ----------
    X_xyz, Y_xyz : np.ndarray, shape (n_atoms, 3)
        Cartesian coordinates of two conformations.

    Returns
    -------
    np.ndarray, shape (n_atoms,)
        Distance each atom of aligned-X lies from the matching atom of Y.
    """
    aligned = alignment.transform(X_xyz, Y_xyz)
    displacement = aligned - Y_xyz
    return np.sqrt((displacement ** 2).sum(axis=1))

def compute_atomwise_deviation(X, Y):
    """Per-atom deviation between the first frames of two mdtraj trajectories."""
    return compute_atomwise_deviation_xyz(X.xyz[0], Y.xyz[0])
In [10]:
from msmbuilder.example_datasets import FsPeptide
dataset = FsPeptide().get()
fs_trajectories = dataset.trajectories
fs_t = fs_trajectories[0]
In [16]:
dataset.DESCR
Out[16]:
In [ ]:
# from simtk.u  -- incomplete import left mid-typing (likely simtk.unit); complete or delete this cell
In [11]:
compute_atomwise_deviation(fs_t[0],fs_t[1])
Out[11]:
In [17]:
# Collect per-atom deviations between frames separated by a fixed lag `tau`
# into an (n_frames, n_atoms) array; row i compares frame i with frame i+tau.
tau=20
n_frames=len(fs_t)-tau
n_atoms = fs_t.n_atoms
atomwise_deviations=np.zeros((n_frames,n_atoms))
for i in range(n_frames):
    atomwise_deviations[i] = compute_atomwise_deviation(fs_t[i],fs_t[i+tau])
In [13]:
atomwise_deviations.shape
Out[13]:
In [15]:
# fs  -- abandoned cell fragment (NameError on a fresh run); remove or complete
In [27]:
mean = np.mean(atomwise_deviations,0)
stdev = np.std(atomwise_deviations,0)
plt.plot(mean,c='darkblue')
#plt.plot(mean+stdev,c='blue',linestyle='--')
#plt.plot(mean-stdev,c='blue',linestyle='--')
plt.fill_between(range(len(mean)),mean-stdev,mean+stdev,color='blue',alpha=0.3)
plt.xlabel('Atom index')
plt.ylabel(r'Mean displacement, computed at $\tau=1$ns')
plt.title('Atomwise deviation from Fs peptide simulation')
plt.xlim(0,263)
plt.savefig('atomwise_deviation.pdf')
In [31]:
plt.plot((1/mean) / np.max(1/mean),c='darkblue')
plt
Out[31]:
In [38]:
from msmbuilder.cluster import MiniBatchKMedoids
kmed = MiniBatchKMedoids(metric='rmsd')
kmed.fit(fs_trajectories)
Out[38]:
In [39]:
clustered = kmed.transform(fs_trajectories)
In [41]:
from msmbuilder.msm import MarkovStateModel
msm = MarkovStateModel()
msm.fit(clustered)
Out[41]:
In [42]:
msm.score_
Out[42]:
In [44]:
from msmbuilder.featurizer import DihedralFeaturizer
dih_model = DihedralFeaturizer()
dih_traj = dih_model.fit_transform(fs_trajectories)
In [48]:
kmed = MiniBatchKMedoids()
clustered = kmed.fit_transform(dih_traj)
msm = MarkovStateModel()
msm.fit(clustered)
msm.score_
Out[48]:
In [61]:
from msmbuilder.decomposition import tICA
tica = tICA(lag_time=10,weighted_transform=True)
tica_traj = tica.fit_transform(dih_traj)
kmed = MiniBatchKMedoids()
clustered = kmed.fit_transform(tica_traj)
msm = MarkovStateModel()
msm.fit(clustered)
msm.score_
Out[61]:
In [63]:
trajectory = fs_trajectories[0]
In [102]:
def wrmsd_dmat(trajectory, weights):
    """All-pairs weighted deviation matrix over the frames of a trajectory.

    Parameters
    ----------
    trajectory : mdtraj.Trajectory
        Frames to compare. WARNING: mutated in place — `superpose` re-aligns
        the whole trajectory on every iteration of the loop below.
    weights : np.ndarray, shape (n_atoms,)
        Per-atom weights dotted into the atomwise squared deviations.

    Returns
    -------
    np.ndarray, shape (n_frames, n_frames)
        Symmetric matrix of weighted mean squared deviations.

    NOTE(review): despite the name, no square root is taken, so entries are
    weighted *mean squared* deviations rather than RMSDs — confirm this is
    intended before comparing against mdtraj.rmsd output.
    """
    dmat = np.zeros((len(trajectory),len(trajectory)))
    N = trajectory.n_atoms
    for i in range(1,len(dmat)):
        # align everything to the current "pivot" element
        trajectory.superpose(trajectory,i)
        # compute atomwise squared deviations
        # (only frames 0..i-1 vs frame i: fills the strict lower triangle)
        atomwise_deviations = ((trajectory[:i].xyz - trajectory[i].xyz)**2).sum(-1)
        # dot with a weight vector and divide by the number of atoms
        weighted_rmsd = atomwise_deviations.dot(weights) / N
        # assign to the lower-left half of the distance matrix
        dmat[i,:i] = weighted_rmsd
    # return a full distance matrix (diagonal is zero, so no double-counting)
    return dmat + dmat.T
In [114]:
trajectory = fs_trajectories[0][::20]
len(trajectory)
Out[114]:
In [115]:
dmat = wrmsd_dmat(trajectory,np.ones(trajectory.n_atoms))
In [125]:
%timeit dmat = wrmsd_dmat(trajectory,np.ones(trajectory.n_atoms))
In [ ]:
In [116]:
dmat.shape
Out[116]:
In [117]:
plt.imshow(dmat,interpolation='none',cmap='Blues')
Out[117]:
In [118]:
dmat_1 = wrmsd_dmat(trajectory,mean)
plt.imshow(dmat_1,interpolation='none',cmap='Blues')
Out[118]:
In [119]:
dmat_2 = wrmsd_dmat(trajectory,1/mean)
plt.imshow(dmat_2,interpolation='none',cmap='Blues')
Out[119]:
In [121]:
from sklearn.manifold import MDS
In [124]:
# unweighted
mds = MDS(dissimilarity='precomputed')
X = mds.fit_transform(dmat)
plt.scatter(X[:,0],X[:,1],linewidths=0,c=range(len(X)))
# weighted (dumbly)
plt.figure()
mds = MDS(dissimilarity='precomputed')
X = mds.fit_transform(dmat_1)
plt.scatter(X[:,0],X[:,1],linewidths=0,c=range(len(X)))
# weighted (cleverly)
plt.figure()
mds = MDS(dissimilarity='precomputed')
X = mds.fit_transform(dmat_2)
plt.scatter(X[:,0],X[:,1],linewidths=0,c=range(len(X)))
Out[124]:
In [90]:
deviations = (trajectory[:1000].xyz - trajectory[1].xyz).sum(-1)
deviations.shape,deviations.sum(1).shape
In [91]:
plt.plot(deviations);
Out[91]:
In [83]:
weights = np.ones(trajectory.n_atoms)
weights.shape
Out[83]:
In [85]:
deviations.dot(weights).shape
Out[85]:
In [ ]:
In [ ]:
class AltMiniBatchKMedoids(MiniBatchKMedoids):
    """MiniBatchKMedoids variant that also accepts a weighted-RMSD metric.

    metric : str or tuple
        Any metric accepted by MiniBatchKMedoids, or a tuple
        ('wrmsd', weights) where `weights` is an np.ndarray of per-atom
        weights consumed by wrmsd_dmat.

    Implemented by replacing the libdistance.pdist call inside fit();
    a better long-term approach would be to augment libdistance to accept
    callables instead of just metric-name strings.
    """
    def fit(self, X, y=None):
        """Cluster X with mini-batch k-medoids.

        Parameters
        ----------
        X : trajectory-like or array-like
            Data to cluster; must support integer-array indexing and len().
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        n_samples = len(X)
        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        n_iter = int(self.max_iter * n_batches)
        random_state = check_random_state(self.random_state)
        # random initial medoids and labels
        # (random_integers is deprecated in modern numpy; kept for parity
        # with the msmbuilder implementation this was copied from)
        cluster_ids_ = random_state.random_integers(
            low=0, high=n_samples - 1, size=self.n_clusters)
        labels_ = random_state.random_integers(
            low=0, high=self.n_clusters - 1, size=n_samples)
        n_iters_no_improvement = 0
        for kk in range(n_iter):
            # each minibatch includes the random indices AND the
            # current cluster centers
            minibatch_indices = np.concatenate([
                cluster_ids_,
                random_state.random_integers(
                    0, n_samples - 1, self.batch_size),
            ])
            X_indices = np.array(minibatch_indices, dtype=np.intp)
            # BUGFIX: original read the undefined name `metric` (NameError at
            # runtime) and compared against the misspelling 'wmrsd'.
            if self.metric[0] == 'wrmsd':
                dmat = wrmsd_dmat(X[X_indices], weights=self.metric[1])
            else:
                dmat = libdistance.pdist(X, metric=self.metric,
                                         X_indices=X_indices)
            minibatch_labels = np.array(np.concatenate([
                np.arange(self.n_clusters),
                labels_[minibatch_indices[self.n_clusters:]]
            ]), dtype=np.intp)
            ids, inertia, _ = _kmedoids.kmedoids(
                self.n_clusters, dmat, 0, minibatch_labels,
                random_state=random_state)
            minibatch_labels, m = _kmedoids.contigify_ids(ids)
            # Copy back the new cluster_ids_ for the centers
            minibatch_cluster_ids = np.array(
                sorted(m.items(), key=itemgetter(1)))[:, 0]
            cluster_ids_ = minibatch_indices[minibatch_cluster_ids]
            # Copy back the new labels for the elements
            n_changed = np.sum(labels_[minibatch_indices] != minibatch_labels)
            if n_changed == 0:
                n_iters_no_improvement += 1
            else:
                labels_[minibatch_indices] = minibatch_labels
                n_iters_no_improvement = 0
            if n_iters_no_improvement >= self.max_no_improvement:
                break
        self.cluster_ids_ = cluster_ids_
        self.cluster_centers_ = X[cluster_ids_]
        # NOTE(review): assign_nearest only understands string metrics, so a
        # ('wrmsd', weights) tuple will fail here — the final assignment still
        # needs a weighted code path. TODO confirm and extend.
        self.labels_, self.inertia_ = libdistance.assign_nearest(
            X, self.cluster_centers_, metric=self.metric)
        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.

        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        Y : array, shape [n_samples,]
            Index of the closest center each sample belongs to.
        """
        labels, _ = libdistance.assign_nearest(
            X, self.cluster_centers_, metric=self.metric)
        return labels
In [18]:
import mdtraj as md
# md.  -- incomplete expression left mid-typing; remove (mdtraj is already imported above)
In [8]:
# Repeat the atomwise-deviation calculation over a range of lag times,
# keyed by tau, to see how the per-atom fluctuation profile depends on lag.
atomwise_deviations = dict()
for tau in list(range(1,10)) + [25,50,100]:
    print(tau)  # progress indicator: one line per lag time
    n_frames=len(fs_t)-tau
    n_atoms = fs_t[0].n_atoms
    atomwise_deviations[tau]=np.zeros((n_frames,n_atoms))
    for i in range(n_frames):
        atomwise_deviations[tau][i] = compute_atomwise_deviation(fs_t[i],fs_t[i+tau])
In [99]:
mean = np.mean(atomwise_deviations[1],0)
stdev = np.std(atomwise_deviations[1],0)
plt.plot(mean,c='green')
plt.plot(mean+stdev,c='blue',linestyle='--')
plt.plot(mean-stdev,c='blue',linestyle='--')
Out[99]:
In [100]:
plt.plot(1.0/mean)
Out[100]:
In [101]:
plt.scatter(mean,stdev)
Out[101]:
In [102]:
means = np.array([np.mean(d,0) for d in atomwise_deviations.values()]).T
In [ ]:
plt.plot(np.arange(len(means))+1,means[:,0],label='Lag-time: 50ps');
#plt.plot(np.arange(len(means))+1,means[:,-1],label='Lag-time: 5000ps');
plt.xlabel('Residue #')
plt.ylabel('Mean distance (nm)')
plt.title('Fs Peptide')
plt.legend(loc='best',frameon=False)
In [107]:
plt.plot(np.arange(len(means))+1,means[:,0],label='Lag-time: 50 ps');
plt.plot(np.arange(len(means))+1,means[:,-1],label='Lag-time: 5 ns');
plt.xlabel('Atom #')
plt.ylabel('Mean distance (nm)')
plt.title('Fs Peptide')
plt.legend(loc='best',frameon=False)
Out[107]:
In [ ]:
In [ ]:
fs_trajectories
In [75]:
s = sorted(np.arange(len(means)),key=lambda i:means[i][-1])
In [79]:
plt.plot(means[s]);
plt.xlabel('Atom (sorted)')
In [85]:
plt.plot(means[:,[0,4,8,9,10,11]][s]);
In [80]:
means.shape
Out[80]:
In [84]:
means[:,[0,4,8,9,10,11]].shape
Out[84]:
In [90]:
print(FsPeptide.description())
In [127]:
atoms,_ = fs_t.topology.to_dataframe()
atoms.element
Out[127]:
In [129]:
a = np.array(atoms.element)
a
Out[129]:
In [136]:
atom_color_dict = {'C':0,'H':1,'O':2,'N':3}
atom_colors = np.array([atom_color_dict[e] for e in a])
In [ ]:
In [139]:
plt.scatter(np.arange(len(means))+1,means[:,0],#label='Lag-time: 50 ps',
c=atom_colors);
plt.xlabel('Atom #')
plt.ylabel('Mean distance (nm)')
plt.title('Fs Peptide')
plt.colorbar()
#plt.legend(loc='best',frameon=False)
Out[139]:
In [141]:
m = means[:,0]
In [142]:
m.shape
Out[142]:
In [144]:
for atom in atom_color_dict:
print(atom,np.mean(m[a==atom]))
In [190]:
weights = 1/means[:,0]
weights.shape
Out[190]:
In [150]:
from msmbuilder.cluster import MiniBatchKMedoids
In [152]:
kmedoids = MiniBatchKMedoids(metric='rmsd')
In [166]:
X_ = kmedoids.fit_transform(fs_t)
In [167]:
X_ = np.array(X_)[:,0]
X_.shape
Out[167]:
In [168]:
from msmbuilder import msm
m = msm.MarkovStateModel()
In [169]:
m.fit(X_)
Out[169]:
In [176]:
print(m.summarize())
In [173]:
plt.imshow(m.transmat_,interpolation='none',cmap='Blues')
plt.colorbar()
Out[173]:
In [177]:
n=5000
t = fs_t[:n]
In [180]:
rmsd = np.zeros((n,n))
for i in range(n):
rmsd[i,i:] = mdtraj.rmsd(t[i:n],t[i])
In [206]:
rmsd = rmsd + rmsd.T
In [207]:
plt.imshow(rmsd,interpolation='none',cmap='Blues')
Out[207]:
In [ ]:
In [201]:
def wRMSD(X, Y, w='unweighted'):
    """Weighted RMSD-style distance between two single-frame trajectories.

    Parameters
    ----------
    X, Y : mdtraj.Trajectory (single frame each)
        Conformations to compare.
    w : 'unweighted' or np.ndarray, shape (n_atoms,)
        Per-atom weights; the default sums the deviations unweighted.

    Returns
    -------
    float
    """
    dev = compute_atomwise_deviation(X, Y)
    # BUGFIX: the original tested `w == 'unweighted'` directly, which is an
    # elementwise comparison (FutureWarning, dtype-dependent result) when w
    # is an ndarray. Test the type explicitly instead.
    if isinstance(w, str) and w == 'unweighted':
        wdev = sum(dev)
    else:
        wdev = w.dot(dev)
    # NOTE(review): len(X.xyz.T) is the number of spatial dimensions (3), not
    # the number of atoms — X.n_atoms was probably intended. Preserved as-is
    # so existing results stay comparable; confirm before relying on scale.
    return np.sqrt(wdev) / len(X.xyz.T)
In [ ]:
In [202]:
wRMSD(t[0],t[1])
Out[202]:
In [204]:
wRMSD(t[0],t[1],weights)
Out[204]:
In [209]:
wrmsd_mat = np.zeros((n,n))
for i in range(len(t)):
for j in range(i):
wrmsd_mat[i,j] = wRMSD(t[i],t[j],weights)
if i%10==0:
print(i)
In [222]:
wrmsd_subset = wrmsd_mat[:500,:500] + wrmsd_mat[:500,:500].T
rmsd_subset = rmsd[:500,:500]
In [230]:
plt.imshow(np.log(wrmsd_subset),cmap='Blues',interpolation='none')
plt.colorbar()
Out[230]:
In [232]:
plt.imshow(np.log(rmsd_subset+1),cmap='Blues',interpolation='none')
plt.colorbar()
Out[232]:
In [220]:
plt.hist(rmsd.flatten(),bins=100);
In [227]:
plt.hist(rmsd_subset.flatten(),bins=100);
plt.figure()
plt.hist(wrmsd_subset.flatten(),bins=100);
In [203]:
dev = compute_atomwise_deviation(t[0],t[1])
dev.shape
Out[203]:
In [ ]:
# mdtraj.rmsd(  -- incomplete call left mid-typing (SyntaxError on a fresh run); remove or complete
In [248]:
from msmbuilder import featurizer
rpf = featurizer.RawPositionsFeaturizer()
rpft = rpf.fit_transform(t.center_coordinates())
In [250]:
rpft[0].shape
Out[250]:
In [253]:
np.reshape(rpft,(len(t),n_atoms,3)).shape
Out[253]:
In [255]:
rpft = np.reshape(rpft,(len(t),n_atoms,3))
In [257]:
dev_mat = np.zeros((n,n,n_atoms))
for i in range(len(t)):
for j in range(i):
dev_mat[i,j] = compute_atomwise_deviation_xyz(rpft[i],rpft[j])
if i%100==0:
print(i)
In [258]:
n=2000
dev_mat = dev_mat[:n,:n]
In [263]:
dev_mat[0,:].shape
Out[263]:
In [281]:
weights/=weights.sum()
weights.sum()
Out[281]:
In [282]:
weighted_rmsd = dev_mat.dot(weights)
weighted_rmsd = weighted_rmsd+weighted_rmsd.T
In [267]:
plt.imshow(np.log(weighted_rmsd),cmap='Blues',interpolation='none')
plt.colorbar()
Out[267]:
In [272]:
plt.imshow(np.log(rmsd[:2000,:2000]+10),cmap='Blues',interpolation='none')
Out[272]:
In [277]:
plt.scatter(rmsd[:2000,:2000].flatten(),weighted_rmsd.flatten(),alpha=0.1,linewidths=0)
Out[277]:
In [241]:
from numpy.linalg import det
def BC(X, Y):
    """Binet-Cauchy similarity between two (n_atoms, 3) coordinate matrices.

    Returns det(X^T Y) normalized by sqrt(det(X^T X) * det(Y^T Y)); equals 1
    when X == Y (for nonsingular Gram matrices).
    """
    return det(X.T.dot(Y)) / np.sqrt(det(X.T.dot(X)) * det(Y.T.dot(Y)))

def BC_w(X, Y, M=None):
    """Weighted Binet-Cauchy similarity with diagonal weight matrix M.

    Parameters
    ----------
    X, Y : np.ndarray, shape (n_atoms, 3)
    M : np.ndarray, shape (n_atoms, n_atoms), optional
        Weight matrix; defaults to the identity (reduces to plain BC).
    """
    # BUGFIX: the original used `M == None`, which is an elementwise
    # comparison for ndarrays (truth test raises ValueError in modern numpy).
    # Identity comparison with `is None` is the correct test.
    if M is None:
        M = np.diag(np.ones(len(X)))
    return det(X.T.dot(M).dot(Y)) / np.sqrt(det(X.T.dot(M).dot(X)) * det(Y.T.dot(M).dot(Y)))
In [284]:
n
Out[284]:
In [286]:
%%prun
bc_mat = np.zeros((n,n))
for i in range(n):
for j in range(i):
bc_mat[i,j] = BC(rpft[i],rpft[j])
if i % 100==0:
print(i)
In [289]:
wbc_mat = np.zeros((n,n))
weight_matrix = np.diag(weights)
for i in range(n):
for j in range(i):
wbc_mat[i,j] = BC_w(rpft[i],rpft[j],weight_matrix)
if i % 100==0:
print(i)
In [314]:
plt.imshow(bc_mat)
plt.figure()
plt.imshow(wbc_mat)
Out[314]:
In [293]:
plt.scatter(wbc_mat.flatten(),bc_mat.flatten(),alpha=0.01,linewidths=0)
Out[293]:
In [309]:
plt.plot(weights)
Out[309]:
In [301]:
plt.plot(weights**-3)
Out[301]:
In [308]:
plt.plot(np.exp(-weights))
Out[308]:
In [246]:
%prun BC(t[0].xyz[0],t[1].xyz[0])
In [240]:
%prun [compute_atomwise_deviation_xyz(t[0].xyz[0],t[1].xyz[0]) for i in range(1000)]
In [239]:
t[0].xyz[0]
Out[239]:
In [188]:
weights.shape
Out[188]:
In [ ]:
mini_medoids = MiniBatchKMedoids(metric=RMSD)
In [313]:
plt.imshow(wrmsd_subset)
Out[313]:
In [151]:
from sklearn.cluster import MiniBatchKMeans  # fixed: trailing comma was a SyntaxError
In [ ]:
# kmeans = MiniBatchKMeans(  -- incomplete call left mid-typing (SyntaxError on a fresh run); remove or complete