In [15]:
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
%matplotlib inline
fnames = glob('/Users/joshuafass/src_backbone_torsions/*.npy')
len(fnames)
Out[15]:
In [2]:
fnames[0]
Out[2]:
In [3]:
def parse_run_from_fname(fname):
start = fname.find('/run') + 4
end = fname.find('-clone')
return int(fname[start:end])
parse_run_from_fname(fnames[0]), parse_run_from_fname(fnames[-1])
Out[3]:
In [4]:
# for now, let me just look at a particular pair of RUNs: first, what do the distributions look like?
X = np.load(fnames[0])
In [5]:
# so there are a thousand here, and we should recall that these are sins and cosines of angles
X.shape
Out[5]:
In [6]:
np.arccos(X[:,0])
Out[6]:
In [7]:
import pyemma
In [12]:
import matplotlib.pyplot as plt
%matplotlib inline
In [8]:
# first, let's generate a thousand histograms?
run_0 = []
run_1 = []
X_raw = []
Y_raw = []
for fname in fnames:
try:
raw = np.load(fname)
x = np.arctan2(raw[:,::2], raw[:,1::2]) + np.pi
run = parse_run_from_fname(fname)
if run == 0:
run_0.append(x)
X_raw.append(raw)
if run == 1:
run_1.append(x)
Y_raw.append(raw)
except:
print('Something went wrong!')
In [9]:
starting_0 = np.vstack([x[0] for x in run_0])
starting_1 = np.vstack([x[0] for x in run_1])
starting_0.shape, starting_1.shape
Out[9]:
In [10]:
X = np.vstack(run_0)
Y = np.vstack(run_1)
In [11]:
X.shape
Out[11]:
In [14]:
xlim = 0, 2 * np.pi
plt.hist(X[:,0],bins=100,histtype='stepfilled',alpha=0.5,normed=True);
plt.hist(Y[:,0],bins=100,histtype='stepfilled',alpha=0.5,normed=True);
plt.xlabel('Angle (radians)')
plt.ylabel('Observed probability density')
plt.title('')
plt.xlim(*xlim)
Out[14]:
In [30]:
# let me grab the feature names real quick, so I can label the plots properly
In [31]:
# let me also find the argmax_i of D_KL(p(X[:,i]), p(Y[:,i])), the torsion with the most different of a distribution here
In [32]:
import mdtraj as md
src_traj = md.load('run0-clone0.h5')
In [ ]:
import nglview
view = nglview.show_mdtraj(src_traj[::10].superpose(src_traj,0))
view
In [33]:
import nglview
view = nglview.show_mdtraj(src_traj[::10].superpose(src_traj,0))
view.clear_representations()
#view.add_cartoon()
view.add_backbone()
view
In [34]:
feat_src = pyemma.coordinates.featurizer(src_traj.top)
feat_src.add_backbone_torsions()
feats = feat_src.describe()
In [35]:
starting_0.shape
Out[35]:
In [36]:
def plot(i):
plt.hist(X[:,i],bins=100,histtype='stepfilled',alpha=0.5,normed=True);
plt.hist(Y[:,i],bins=100,histtype='stepfilled',alpha=0.5,normed=True);
plt.xlabel('Angle (radians)')
plt.ylabel('Observed probability density')
plt.title(feats[i])
plt.vlines(starting_0[:,i],0,1,color='blue',linestyles='--')
plt.vlines(starting_1[:,i],0,1,color='green',linestyles='--')
plt.xlim(*xlim)
plot(4)
In [37]:
from scipy.stats import entropy
from tqdm import tqdm
# get histograms
X_hists = []
Y_hists = []
KL_divs = []
n_bins = 100
xlim = 0, 2 * np.pi
for i in tqdm(range(len(feat_src.describe()))):
X_hists.append(np.histogram(X[:,i],bins=n_bins,range=xlim)[0])
Y_hists.append(np.histogram(Y[:,i],bins=n_bins,range=xlim)[0])
KL_divs.append(entropy(X_hists[i]+1, Y_hists[i]+1))
In [38]:
np.max(KL_divs), np.argmax(KL_divs)
Out[38]:
In [39]:
plot(np.argmax(KL_divs))
In [40]:
plt.plot(sorted(KL_divs),'.')
Out[40]:
In [41]:
inds = np.argsort(KL_divs)[::-1]
In [42]:
inds[0]
Out[42]:
In [43]:
for i in inds[:20]:
plt.figure()
plot(i)
plt.ylim(0,3)
#plt.yscale('log')
In [1]:
# okay, let's look at the energy term!
from simtk.openmm import app
import simtk.openmm as mm
from simtk.unit import *
import openmmtools
from openmmtools.integrators import kB
forcefield = app.ForceField('amber99sbildn.xml', 'tip3p')
temperature = 300*kelvin
system = forcefield.createSystem(src_traj.topology.to_openmm(), nonbondedMethod=app.CutoffNonPeriodic,
nonbondedCutoff=1.0*nanometers, constraints=app.HBonds, rigidWater=True)
In [66]:
# what's the index of that completely disconnected
In [67]:
system.getForces()
Out[67]:
In [68]:
torsions = system.getForce(2)
In [ ]:
In [119]:
worst_feature_index = inds[0]
feat_torsions = feat_src.active_features[0]
worst_dih_inds = feat_torsions.angle_indexes[worst_feature_index]
In [120]:
worst_dih_inds
Out[120]:
In [121]:
feat_src.describe()[worst_feature_index]
Out[121]:
In [122]:
force_term_inds = []
all_angles = []
for i in range(torsions.getNumTorsions()):
abcd = torsions.getTorsionParameters(i)[:4]
if set(abcd) == set(worst_dih_inds):
print('Found one!')
force_term_inds.append(i)
all_angles.append(abcd)
In [123]:
for i in force_term_inds:
print(torsions.getTorsionParameters(i))
In [124]:
def parse_torsion_parameters(torsion_parameters):
param_dict = dict()
#param_dict['a'] = torsion_parameters[0]
#param_dict['b'] = torsion_parameters[1]
#param_dict['c'] = torsion_parameters[2]
#param_dict['d'] = torsion_parameters[3]
param_dict['n'] = torsion_parameters[4]
param_dict['theta_0'] = torsion_parameters[5]
param_dict['k'] = torsion_parameters[6]
return param_dict
def periodic_torsion_force(theta, theta_0, k, n):
return k * (1 + np.cos(n*theta - theta_0 / radian))
param_dicts = [parse_torsion_parameters(torsions.getTorsionParameters(i)) for i in force_term_inds]
In [125]:
x = np.linspace(0, 2 * np.pi, 1000)
y = sum([periodic_torsion_force(x, **p) for p in param_dicts])
In [126]:
plt.plot(x,y)
Out[126]:
In [127]:
q = np.exp(-y * kilojoule_per_mole / (kB * temperature))
q /= np.trapz(q, x)
In [128]:
plot(worst_feature_index)
plt.plot(x,q,linewidth=3,color='purple')
plt.fill_between(x,q,alpha=0.3, color='purple')
plt.xlim(x[0],x[-1])
Out[128]:
In [129]:
# plot_
In [130]:
DFG_flip = [2190,2188,2198,2203]
In [131]:
[i for i in range(len(feat_torsions.describe())) if set(feat_torsions.angle_indexes[i]) == set(DFG_flip)]
Out[131]:
In [132]:
np.min(feat_torsions.angle_indexes), np.max(feat_torsions.angle_indexes)
Out[132]:
In [133]:
np.min(all_angles), np.max(all_angles)
Out[133]:
In [134]:
# starting structures
In [135]:
# hmm maybe that didn't work!
# let's see if they're still separable if I omit that feature
In [136]:
i,j = 2 * worst_feature_index, 2 * worst_feature_index + 1
i,j
Out[136]:
In [137]:
X_ = np.vstack(X_raw)
Y_ = np.vstack(Y_raw)
In [138]:
ndim = X_.shape[1]
censored_set = set(range(ndim))-set([i,j])
len(censored_set), ndim
Out[138]:
In [49]:
censored_inds = np.array(sorted(censored_set))
censored_inds
Out[49]:
In [ ]:
In [51]:
X_censored = X_[:,censored_inds]
Y_censored = Y_[:,censored_inds]
ins = np.vstack((X_censored, Y_censored))
outs = np.hstack((np.zeros(len(X_)), np.ones(len(Y_))))
ins.shape, outs.shape
Out[51]:
In [53]:
from sklearn.linear_model import LogisticRegression
In [142]:
lr = LogisticRegression(penalty='l1')
lr.fit(ins, outs)
lr.score(ins, outs)
Out[142]:
In [143]:
sum(lr.coef_==0)
Out[143]:
In [176]:
# full / un-censored
ins = np.vstack((X_, Y_))
outs = np.hstack((np.zeros(len(X_)), np.ones(len(Y_))))
lr = LogisticRegression(penalty='l1',C=0.001)
lr.fit(ins, outs)
lr.score(ins, outs)
Out[176]:
In [177]:
np.sum(lr.coef_!=0)
Out[177]:
In [178]:
lr.coef_.shape
Out[178]:
In [179]:
discriminative_features = np.arange(len(lr.coef_[0]))[lr.coef_[0]!=0]
In [181]:
feat_src_cossin = pyemma.coordinates.featurizer(src_traj.top)
feat_src_cossin.add_backbone_torsions(cossin=True)
In [182]:
interesting_residues = [int(feat_src_cossin.describe()[i].split()[-1][:-1]) for i in discriminative_features]
In [183]:
len(interesting_residues)
Out[183]:
In [184]:
len(set(interesting_residues))
Out[184]:
In [185]:
res = ', '.join([str(a) for a in sorted(set(interesting_residues))])
res
Out[185]:
In [186]:
# let me just highlight the residues in question
import nglview
view = nglview.show_mdtraj(src_traj[::10].superpose(src_traj,0))
view.clear_representations()
view.add_cartoon()
#view.add_backbone()
view.add_hyperball(selection=res, color='red')
view
In [ ]:
atoms = [a for a in src_traj.top.atoms]
[atoms[a] for a in
In [187]:
# how many times can I remove all discriminative features?
In [ ]:
current_censored_set = set()
scores = []
censored_sets = []
for i in range(10):
X_censored = X_[:,censored_inds]
Y_censored = Y_[:,censored_inds]
ins = np.vstack((X_censored, Y_censored))
outs = np.hstack((np.zeros(len(X_)), np.ones(len(Y_))))
In [191]:
# which of the features has any informaation about c?
from sklearn.feature_selection import chi2, f_classif
chi2, pval = chi2(X_, outs)
In [192]:
np.min(X_)
Out[192]:
In [194]:
F, pval = f_classif(ins, outs)
In [196]:
plt.plot(sorted(F))
Out[196]:
In [199]:
np.max(pval), np.min(pval)
Out[199]:
In [200]:
pval
Out[200]:
In [202]:
F[10], pval[10]
Out[202]:
In [205]:
ins = np.vstack((X_, Y_))
outs = np.hstack((np.zeros(len(X_)), np.ones(len(Y_))))
ins.shape, outs.shape
Out[205]:
In [19]:
from sklearn.lda import LDA
lda = LDA(n_components=1)
In [213]:
%%time
lda.fit(ins_[::10],outs[::10])
Out[213]:
In [214]:
lda.score(ins_,outs)
Out[214]:
In [220]:
len(feat_src.describe())
Out[220]:
In [23]:
from tqdm import tqdm
In [39]:
# X_raw, Y_raw are the raw torsion angles for each
X_ = np.vstack(X_raw)
Y_ = np.vstack(Y_raw)
# if I used just a small random subset of features, can I still get high classification accuracy?
# --> if so, then this result isn't extremely sensitive to a single isolate region of the protein.
ins = np.vstack((X_, Y_))
outs = np.hstack((np.zeros(len(X_)), np.ones(len(Y_))))
def subset_classifiability(ins, outs, n_features, n_iterations= 100, train_stride=10):
scores = []
np.random.seed(0)
all_indices = np.arange(ins.shape[1])
train_stride = 10
for i in tqdm(range(n_iterations)):
np.random.shuffle(all_indices)
ins_ = ins[:,all_indices[:n_features]]
#clf = LogisticRegression(penalty='l1')
clf = LDA(n_components=1)
clf.fit(ins_[::train_stride], outs[::train_stride])
score = clf.score(ins_, outs)
scores.append(score)
return scores
In [40]:
all_scores = []
for fraction in [0.1,0.25,0.5]:
n_features = int(fraction*ins.shape[1])
print(fraction, n_features)
all_scores.append(subset_classifiability(ins, outs, n_features))
print(np.mean(all_scores[-1]))
In [41]:
# what fraction are perfectly classified?
[np.sum(np.array(s)==1) for s in all_scores]
Out[41]:
In [43]:
[np.mean(s) for s in all_scores]
Out[43]:
In [42]:
# what are the score distributions?
for scores in all_scores:
plt.figure()
plt.hist(scores, bins=20);
In [ ]: