In [15]:
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
%matplotlib inline

fnames = glob('/Users/joshuafass/src_backbone_torsions/*.npy')
len(fnames)


Out[15]:
788

In [2]:
fnames[0]


Out[2]:
'/Users/joshuafass/src_backbone_torsions/run0-clone0_backbone_torsions.npy'

In [3]:
def parse_run_from_fname(fname):
    start = fname.find('/run') + 4
    end = fname.find('-clone')
    return int(fname[start:end])
parse_run_from_fname(fnames[0]), parse_run_from_fname(fnames[-1])


Out[3]:
(0, 99)

In [4]:
# for now, let me just look at a particular pair of RUNs: first, what do the distributions look like?
X = np.load(fnames[0])

In [5]:
# so there are about a thousand columns here -- recall these are the sines and cosines of the backbone torsion angles (a cos/sin pair per torsion)
X.shape


Out[5]:
(5320, 1012)

In [6]:
np.arccos(X[:,0])


Out[6]:
array([ 2.04579735,  1.53733039,  1.86796951, ...,  1.56466341,
        1.48882687,  2.26974821], dtype=float32)
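
In [ ]:
# a minimal sketch (assumption: each torsion contributes an adjacent cos/sin column pair,
# as in the conversion used below): arctan2 on the paired columns maps each pair back to an
# angle on the full circle, which arccos alone cannot do (it loses the sign of the angle)
theta = np.arctan2(X[:, ::2], X[:, 1::2]) + np.pi
theta.shape  # half as many columns as X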

In [7]:
import pyemma

In [12]:
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
# first, let's load everything and convert each cos/sin pair back to an angle, splitting the frames by RUN so we can compare histograms

run_0 = []
run_1 = []

X_raw = []
Y_raw = []

for fname in fnames:
    try:
        raw = np.load(fname)
        # convert the interleaved cos/sin columns back to angles, shifted by pi so they're non-negative
        x = np.arctan2(raw[:,::2], raw[:,1::2]) + np.pi
        run = parse_run_from_fname(fname)
        if run == 0:
            run_0.append(x)
            X_raw.append(raw)
        if run == 1:
            run_1.append(x)
            Y_raw.append(raw)
    except Exception:
        print('Something went wrong!')


Something went wrong!

In [9]:
# starting structure of each clone: stack the first frame of every trajectory (one row per clone)
starting_0 = np.vstack([x[0] for x in run_0])
starting_1 = np.vstack([x[0] for x in run_1])
starting_0.shape, starting_1.shape


Out[9]:
((13, 506), (14, 506))

In [10]:
X = np.vstack(run_0)
Y = np.vstack(run_1)

In [11]:
X.shape


Out[11]:
(58516, 506)
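
In [ ]:
# quick sanity check (a sketch): after the +pi shift, every angle should lie within [0, 2*pi]
X.min(), X.max()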

In [14]:
xlim = 0, 2 * np.pi
plt.hist(X[:,0],bins=100,histtype='stepfilled',alpha=0.5,normed=True);
plt.hist(Y[:,0],bins=100,histtype='stepfilled',alpha=0.5,normed=True);
plt.xlabel('Angle (radians)')
plt.ylabel('Observed probability density')
plt.title('')
plt.xlim(*xlim)


Out[14]:
(0, 6.283185307179586)

In [30]:
# let me grab the feature names real quick, so I can label the plots properly

In [31]:
# let me also find the argmax_i of D_KL(p(X[:,i]), p(Y[:,i])) -- the torsion whose distribution differs most between the two runs

In [32]:
import mdtraj as md
src_traj = md.load('run0-clone0.h5')

In [ ]:
import nglview
view = nglview.show_mdtraj(src_traj[::10].superpose(src_traj,0))
view

In [33]:
import nglview
view = nglview.show_mdtraj(src_traj[::10].superpose(src_traj,0))
view.clear_representations()
#view.add_cartoon()
view.add_backbone()
view

In [34]:
feat_src = pyemma.coordinates.featurizer(src_traj.top)
feat_src.add_backbone_torsions()
feats = feat_src.describe()

In [35]:
starting_0.shape


Out[35]:
(13, 506)

In [36]:
def plot(i):
    plt.hist(X[:,i],bins=100,histtype='stepfilled',alpha=0.5,normed=True);
    plt.hist(Y[:,i],bins=100,histtype='stepfilled',alpha=0.5,normed=True);
    plt.xlabel('Angle (radians)')
    plt.ylabel('Observed probability density')
    plt.title(feats[i])
    
    plt.vlines(starting_0[:,i],0,1,color='blue',linestyles='--')
    plt.vlines(starting_1[:,i],0,1,color='green',linestyles='--')
    plt.xlim(*xlim)
plot(4)



In [37]:
from scipy.stats import entropy
from tqdm import tqdm
# get histograms
X_hists = []
Y_hists = []
KL_divs = []

n_bins = 100
xlim = 0, 2 * np.pi

for i in tqdm(range(len(feat_src.describe()))):
    X_hists.append(np.histogram(X[:,i],bins=n_bins,range=xlim)[0])
    Y_hists.append(np.histogram(Y[:,i],bins=n_bins,range=xlim)[0])
    KL_divs.append(entropy(X_hists[i]+1, Y_hists[i]+1))


100%|██████████| 506/506 [00:00<00:00, 575.25it/s]
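
In [ ]:
# sanity check (a sketch) of what scipy's entropy(p, q) computes here: it normalizes the
# counts to probabilities and returns sum(p * log(p / q)); the +1 is a pseudocount that
# keeps empty bins from producing infinite divergences
p = (X_hists[0] + 1.0); p /= p.sum()
q = (Y_hists[0] + 1.0); q /= q.sum()
np.sum(p * np.log(p / q)), KL_divs[0]  # these two numbers should agree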

In [38]:
np.max(KL_divs), np.argmax(KL_divs)


Out[38]:
(8.6881729235109351, 436)

In [39]:
plot(np.argmax(KL_divs))



In [40]:
plt.plot(sorted(KL_divs),'.')


Out[40]:
[<matplotlib.lines.Line2D at 0x14b349090>]

In [41]:
inds = np.argsort(KL_divs)[::-1]

In [42]:
inds[0]


Out[42]:
436

In [43]:
for i in inds[:20]:
    plt.figure()
    plot(i)
    plt.ylim(0,3)
    #plt.yscale('log')



In [1]:
# okay, let's look at the energy term!

from simtk.openmm import app
import simtk.openmm as mm
from simtk.unit import *
import openmmtools
from openmmtools.integrators import kB

forcefield = app.ForceField('amber99sbildn.xml', 'tip3p')

temperature = 300*kelvin

system = forcefield.createSystem(src_traj.topology.to_openmm(), nonbondedMethod=app.CutoffNonPeriodic,
    nonbondedCutoff=1.0*nanometers, constraints=app.HBonds, rigidWater=True)


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-1-81138964ef70> in <module>()
      7 from openmmtools.integrators import kB
      8 
----> 9 forcefield = app.ForceField('amber99sbildn.xml', 'tip3p')
     10 
     11 temperature = 300*kelvin

/Users/joshuafass/anaconda/lib/python2.7/site-packages/simtk/openmm/app/forcefield.pyc in __init__(self, *files)
    122         self._templateGenerators = []
    123         for file in files:
--> 124             self.loadFile(file)
    125 
    126     def loadFile(self, file):

/Users/joshuafass/anaconda/lib/python2.7/site-packages/simtk/openmm/app/forcefield.pyc in loadFile(self, file)
    140             tree = etree.parse(file)
    141         except IOError:
--> 142             tree = etree.parse(os.path.join(os.path.dirname(__file__), 'data', file))
    143         except Exception as e:
    144             # Fail with an error message about which file could not be read.

/Users/joshuafass/anaconda/lib/python2.7/xml/etree/ElementTree.pyc in parse(source, parser)
   1180 def parse(source, parser=None):
   1181     tree = ElementTree()
-> 1182     tree.parse(source, parser)
   1183     return tree
   1184 

/Users/joshuafass/anaconda/lib/python2.7/xml/etree/ElementTree.pyc in parse(self, source, parser)
    645         close_source = False
    646         if not hasattr(source, "read"):
--> 647             source = open(source, "rb")
    648             close_source = True
    649         try:

IOError: [Errno 2] No such file or directory: '/Users/joshuafass/anaconda/lib/python2.7/site-packages/simtk/openmm/app/data/tip3p'
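
In [ ]:
# The cell above fails because 'tip3p' is missing its '.xml' extension ('tip3p.xml' would load).
# A sketch of a corrected setup -- and, judging from the GBSAOBCForce that appears in
# system.getForces() below, the System actually used downstream was presumably built with an
# implicit-solvent force field such as 'amber99_obc.xml' (an assumption, not shown above):
forcefield = app.ForceField('amber99sbildn.xml', 'amber99_obc.xml')

temperature = 300*kelvin

system = forcefield.createSystem(src_traj.topology.to_openmm(),
                                 nonbondedMethod=app.CutoffNonPeriodic,
                                 nonbondedCutoff=1.0*nanometers,
                                 constraints=app.HBonds)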

In [66]:
# what's the index of that completely disconnected torsion?

In [67]:
system.getForces()


Out[67]:
[<simtk.openmm.openmm.HarmonicBondForce; proxy of <Swig Object of type 'OpenMM::HarmonicBondForce *' at 0x14f67f5d0> >,
 <simtk.openmm.openmm.HarmonicAngleForce; proxy of <Swig Object of type 'OpenMM::HarmonicAngleForce *' at 0x14f67f720> >,
 <simtk.openmm.openmm.PeriodicTorsionForce; proxy of <Swig Object of type 'OpenMM::PeriodicTorsionForce *' at 0x14f67f7b0> >,
 <simtk.openmm.openmm.NonbondedForce; proxy of <Swig Object of type 'OpenMM::NonbondedForce *' at 0x14f67f5a0> >,
 <simtk.openmm.openmm.GBSAOBCForce; proxy of <Swig Object of type 'OpenMM::GBSAOBCForce *' at 0x14f67f780> >,
 <simtk.openmm.openmm.CMMotionRemover; proxy of <Swig Object of type 'OpenMM::CMMotionRemover *' at 0x14f67f900> >]

In [68]:
torsions = system.getForce(2)

In [ ]:


In [119]:
worst_feature_index = inds[0]
feat_torsions = feat_src.active_features[0]
worst_dih_inds = feat_torsions.angle_indexes[worst_feature_index]

In [120]:
worst_dih_inds


Out[120]:
array([3509, 3511, 3513, 3524])

In [121]:
feat_src.describe()[worst_feature_index]


Out[121]:
u'PHI 0 GLU 489'

In [122]:
force_term_inds = []
all_angles = []

for i in range(torsions.getNumTorsions()):
    abcd = torsions.getTorsionParameters(i)[:4]
    if set(abcd) == set(worst_dih_inds):
        print('Found one!')
        force_term_inds.append(i)
    all_angles.append(abcd)


Found one!
Found one!

In [123]:
for i in force_term_inds:
    print(torsions.getTorsionParameters(i))


[3509, 3511, 3513, 3524, 3, Quantity(value=0.0, unit=radian), Quantity(value=1.75728, unit=kilojoule/mole)]
[3509, 3511, 3513, 3524, 2, Quantity(value=0.0, unit=radian), Quantity(value=1.12968, unit=kilojoule/mole)]
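
In [ ]:
# per the printed parameters, the force field's contribution for this dihedral is a two-term
# cosine series (OpenMM's PeriodicTorsionForce energy is k * (1 + cos(n*theta - theta_0))):
#   U(theta) = 1.75728 * (1 + cos(3*theta)) + 1.12968 * (1 + cos(2*theta))   [kJ/mol]
# the next cells evaluate this and Boltzmann-invert it into a reference density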

In [124]:
def parse_torsion_parameters(torsion_parameters):
    # getTorsionParameters returns (a, b, c, d, periodicity, phase, k);
    # keep just what the energy expression needs
    param_dict = dict()
    #param_dict['a'] = torsion_parameters[0]
    #param_dict['b'] = torsion_parameters[1]
    #param_dict['c'] = torsion_parameters[2]
    #param_dict['d'] = torsion_parameters[3]
    param_dict['n'] = torsion_parameters[4]
    param_dict['theta_0'] = torsion_parameters[5]
    param_dict['k'] = torsion_parameters[6]
    return param_dict

def periodic_torsion_force(theta, theta_0, k, n):
    # periodic torsion potential energy U(theta) = k * (1 + cos(n*theta - theta_0));
    # theta_0 is divided by radian to strip its units before it enters the cosine
    return k * (1 + np.cos(n*theta - theta_0 / radian))

param_dicts = [parse_torsion_parameters(torsions.getTorsionParameters(i)) for i in force_term_inds]

In [125]:
x = np.linspace(0, 2 * np.pi, 1000)
y = sum([periodic_torsion_force(x, **p) for p in param_dicts])

In [126]:
plt.plot(x,y)


Out[126]:
[<matplotlib.lines.Line2D at 0x14b904510>]

In [127]:
# Boltzmann-invert the torsion potential into a reference density,
# q(theta) ~ exp(-U(theta) / kT), normalized with the trapezoid rule
q = np.exp(-y * kilojoule_per_mole / (kB * temperature))
q /= np.trapz(q, x)

In [128]:
plot(worst_feature_index)

plt.plot(x,q,linewidth=3,color='purple')
plt.fill_between(x,q,alpha=0.3, color='purple')
plt.xlim(x[0],x[-1])


Out[128]:
(0.0, 6.2831853071795862)

In [129]:
# plot_

In [130]:
DFG_flip = [2190,2188,2198,2203]

In [131]:
[i for i in range(len(feat_torsions.describe())) if set(feat_torsions.angle_indexes[i]) == set(DFG_flip)]


Out[131]:
[]

In [132]:
np.min(feat_torsions.angle_indexes), np.max(feat_torsions.angle_indexes)


Out[132]:
(0, 4088)

In [133]:
np.min(all_angles), np.max(all_angles)


Out[133]:
(0, 4090)
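
In [ ]:
# the DFG-flip indices fall inside the atom range, so the empty match above presumably just
# means this dihedral isn't one of the backbone phi/psi torsions; a sketch to inspect which
# atoms those indices actually are:
[src_traj.top.atom(a) for a in DFG_flip]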

In [134]:
# starting structures

In [135]:
# hmm, maybe that DFG lookup didn't work!
# let's see if the two runs are still separable if I omit that most-divergent feature

In [136]:
i,j = 2 * worst_feature_index, 2 * worst_feature_index + 1
i,j


Out[136]:
(872, 873)

In [137]:
X_ = np.vstack(X_raw)
Y_ = np.vstack(Y_raw)

In [138]:
ndim = X_.shape[1]
censored_set = set(range(ndim))-set([i,j])
len(censored_set), ndim


Out[138]:
(1010, 1012)

In [49]:
censored_inds = np.array(sorted(censored_set))
censored_inds


Out[49]:
array([   0,    1,    2, ..., 1009, 1010, 1011])

In [ ]:


In [51]:
X_censored = X_[:,censored_inds]
Y_censored = Y_[:,censored_inds]

ins = np.vstack((X_censored, Y_censored))
outs = np.hstack((np.zeros(len(X_)), np.ones(len(Y_))))

ins.shape, outs.shape


Out[51]:
((114273, 1010), (114273,))

In [53]:
from sklearn.linear_model import LogisticRegression

In [142]:
lr = LogisticRegression(penalty='l1')


lr.fit(ins, outs)
lr.score(ins, outs)


Out[142]:
1.0

In [143]:
np.sum(lr.coef_==0)


Out[143]:
950
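
In [ ]:
# a sketch: map the censored model's nonzero coefficients back to the original feature
# indices (column j of the censored matrix is original column censored_inds[j])
nonzero_cols = np.where(lr.coef_[0] != 0)[0]
censored_inds[nonzero_cols]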

In [176]:
# full / un-censored
ins = np.vstack((X_, Y_))
outs = np.hstack((np.zeros(len(X_)), np.ones(len(Y_))))
lr = LogisticRegression(penalty='l1',C=0.001)
lr.fit(ins, outs)
lr.score(ins, outs)


Out[176]:
1.0

In [177]:
np.sum(lr.coef_!=0)


Out[177]:
2

In [178]:
lr.coef_.shape


Out[178]:
(1, 1012)

In [179]:
discriminative_features = np.arange(len(lr.coef_[0]))[lr.coef_[0]!=0]
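
In [ ]:
# a sketch: each torsion contributes a cos/sin pair, so (assuming pyemma keeps the same torsion
# ordering with and without cossin=True, as the 2 * worst_feature_index step above also assumes)
# cos/sin feature index f should correspond to plain torsion index f // 2
[feats[f // 2] for f in discriminative_features]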

In [181]:
feat_src_cossin = pyemma.coordinates.featurizer(src_traj.top)
feat_src_cossin.add_backbone_torsions(cossin=True)

In [182]:
# pull the residue id out of each selected feature's name
# (the describe() strings end in something like 'GLU 489)', so take the last token and drop the ')')
interesting_residues = [int(feat_src_cossin.describe()[i].split()[-1][:-1]) for i in discriminative_features]

In [183]:
len(interesting_residues)


Out[183]:
2

In [184]:
len(set(interesting_residues))


Out[184]:
2

In [185]:
res = ', '.join([str(a) for a in sorted(set(interesting_residues))])
res


Out[185]:
'488, 489'

In [186]:
# let me just highlight the residues in question

import nglview
view = nglview.show_mdtraj(src_traj[::10].superpose(src_traj,0))
view.clear_representations()
view.add_cartoon()
#view.add_backbone()
view.add_hyperball(selection=res, color='red')
view

In [ ]:
atoms = [a for a in src_traj.top.atoms]
# (unfinished cell; presumably the intent was to inspect the atoms of the worst dihedral)
[atoms[a] for a in worst_dih_inds]

In [187]:
# how many times can I remove all discriminative features?

In [ ]:
current_censored_set = set()
scores = []
censored_sets = []

# (completing the unfinished cell -- a sketch of the intended censor-and-refit loop)
for i in range(10):
    censored_inds = np.array(sorted(set(range(ndim)) - current_censored_set))
    X_censored = X_[:,censored_inds]
    Y_censored = Y_[:,censored_inds]

    ins = np.vstack((X_censored, Y_censored))
    outs = np.hstack((np.zeros(len(X_)), np.ones(len(Y_))))

    lr = LogisticRegression(penalty='l1')
    lr.fit(ins, outs)
    scores.append(lr.score(ins, outs))
    censored_sets.append(set(current_censored_set))

    # censor whatever the classifier just found discriminative, then repeat
    current_censored_set.update(censored_inds[lr.coef_[0] != 0])

In [191]:
# which of the features has any information about the class label?
from sklearn.feature_selection import chi2, f_classif
chi2, pval = chi2(X_, outs)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-191-86a2f9689caa> in <module>()
      1 # which of the features has any informaation about c?
      2 from sklearn.feature_selection import chi2, f_classif
----> 3 chi2, pval = chi2(X_, outs)

/Users/joshuafass/anaconda/lib/python2.7/site-packages/scikit_learn-0.17.1-py2.7-macosx-10.6-x86_64.egg/sklearn/feature_selection/univariate_selection.pyc in chi2(X, y)
    212     X = check_array(X, accept_sparse='csr')
    213     if np.any((X.data if issparse(X) else X) < 0):
--> 214         raise ValueError("Input X must be non-negative.")
    215 
    216     Y = LabelBinarizer().fit_transform(y)

ValueError: Input X must be non-negative.

In [192]:
np.min(X_)


Out[192]:
-1.0

In [194]:
F, pval = f_classif(ins, outs)

In [196]:
plt.plot(sorted(F))


Out[196]:
[<matplotlib.lines.Line2D at 0x16ea78110>]

In [199]:
np.max(pval), np.min(pval)


Out[199]:
(nan, nan)
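
In [ ]:
# the nans above presumably come from features with (near-)zero within-class variance,
# which makes the F-statistic undefined; a sketch to count them and get finite extremes:
np.sum(np.isnan(pval)), np.nanmin(pval), np.nanmax(pval)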

In [200]:
pval


Out[200]:
array([  3.14585397e-106,   7.30624862e-008,   1.66745930e-004, ...,
         6.38431409e-052,   6.92883754e-001,   2.36543268e-011])

In [202]:
F[10], pval[10]


Out[202]:
(187.62216, 1.1353878807045007e-42)

In [205]:
ins = np.vstack((X_, Y_))
outs = np.hstack((np.zeros(len(X_)), np.ones(len(Y_))))
ins.shape, outs.shape


Out[205]:
((114273, 1012), (114273,))

In [19]:
from sklearn.lda import LDA
lda = LDA(n_components=1)


/Users/joshuafass/anaconda/lib/python2.7/site-packages/sklearn/lda.py:4: DeprecationWarning: lda.LDA has been moved to discriminant_analysis.LinearDiscriminantAnalysis in 0.17 and will be removed in 0.19
  "in 0.17 and will be removed in 0.19", DeprecationWarning)

In [213]:
%%time
lda.fit(ins_[::10],outs[::10])


CPU times: user 87.3 ms, sys: 7.49 ms, total: 94.8 ms
Wall time: 45.7 ms
Out[213]:
LinearDiscriminantAnalysis(n_components=1, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [214]:
lda.score(ins_,outs)


Out[214]:
0.99497694118470681

In [220]:
len(feat_src.describe())


Out[220]:
506

In [23]:
from tqdm import tqdm

In [39]:
# X_raw, Y_raw hold the raw cos/sin torsion features for the RUN 0 and RUN 1 clones
X_ = np.vstack(X_raw)
Y_ = np.vstack(Y_raw)

# if I use just a small random subset of the features, can I still get high classification accuracy?
# --> if so, then this result isn't driven by a single isolated region of the protein.
ins = np.vstack((X_, Y_))
outs = np.hstack((np.zeros(len(X_)), np.ones(len(Y_))))

def subset_classifiability(ins, outs, n_features, n_iterations=100, train_stride=10):
    scores = []
    np.random.seed(0)
    all_indices = np.arange(ins.shape[1])
    for i in tqdm(range(n_iterations)):
        np.random.shuffle(all_indices)
        ins_ = ins[:,all_indices[:n_features]]

        #clf = LogisticRegression(penalty='l1')
        clf = LDA(n_components=1)

        clf.fit(ins_[::train_stride], outs[::train_stride])
        score = clf.score(ins_, outs)
        scores.append(score)
    return scores

In [40]:
all_scores = []
for fraction in [0.1,0.25,0.5]:
    n_features = int(fraction*ins.shape[1])
    print(fraction, n_features)
    all_scores.append(subset_classifiability(ins, outs, n_features))
    print(np.mean(all_scores[-1]))


(0.1, 101)
100%|██████████| 100/100 [00:29<00:00,  3.40it/s]
0.987397372958
(0.25, 253)
100%|██████████| 100/100 [01:25<00:00,  1.20it/s]
0.999685139972
(0.5, 506)
100%|██████████| 100/100 [03:27<00:00,  2.46s/it]
0.99999667463


In [41]:
# how many of the 100 random subsets are perfectly classified?
[np.sum(np.array(s)==1) for s in all_scores]


Out[41]:
[15, 54, 95]

In [43]:
[np.mean(s) for s in all_scores]


Out[43]:
[0.98739737295774144, 0.99968513997182185, 0.99999667463005271]

In [42]:
# what are the score distributions?
for scores in all_scores:
    plt.figure()
    plt.hist(scores, bins=20);



In [ ]: