In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from small_script.myFunctions import *
import feather
import Bio.PDB as bio

from sklearn.metrics import confusion_matrix


d3_to_index = bio.Polypeptide.d3_to_index  # we may want to adjust this in the future.
three_to_one = bio.Polypeptide.three_to_one
one_to_index = bio.Polypeptide.one_to_index

%matplotlib inline
%load_ext autoreload
%autoreload 2
plt.rcParams['figure.figsize'] = [16.18033, 10]


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [2]:
# Keras model-building imports (TensorFlow backend).
# NOTE(review): imports are scattered across cells; consider consolidating them
# into the first import cell so a fresh Restart-&-Run-All declares all deps up front.
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb
from keras.utils import to_categorical

from keras.models import model_from_json

from keras.metrics import top_k_categorical_accuracy

# Hyperparameters of the fragment classifier.
max_features = 20  # vocabulary size: 20 amino-acid types (matches one_to_index range)
batch_size = 1024*2
maxlen = 9  # fragment window length in residues (the model summary shows input (None, 9))
n = int(1e4)  # NOTE(review): presumably samples per cluster (Out[17] shows cluster 1 at rows 10000-19999) — confirm


Using TensorFlow backend.

In [ ]:
def getFrags(pdb, toLocation="/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/fragment_memory/"):
    """Write a fragment-memory (.mem) file for `pdb` using the trained classifier.

    Slides a 9-residue window along the protein sequence, predicts the single
    best fragment cluster for each window with the notebook-global
    `loaded_model`, and writes one memory line per window with a fixed
    weight of 20.

    Parameters
    ----------
    pdb : str
        Four-letter PDB identifier (lower case); used to locate the .seq file.
    toLocation : str, optional
        Output directory for the generated .mem file. Defaults to the path
        that was previously hard-coded, so existing calls are unaffected.

    NOTE(review): this definition is shadowed by the more general getFrags
    defined later in the notebook (In [222]); keep only one of the two.
    """
    seqFile = f"/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/all_simulations/{pdb}/{pdb}/{pdb}.seq"
    with open(seqFile) as f:
        lines = f.readlines()
    a = lines[0].strip()  # one-letter-code sequence on the first line
    all_frag = []
    for i in range(0, len(a) - 8):  # every 9-residue window
        frag = a[i:i+9]
        # letters -> integer indices 0..19 expected by the embedding layer
        fragIndex = [one_to_index(x) for x in frag]
        all_frag.append(fragIndex)
    all_frag = np.array(all_frag)
    # Most likely cluster per window; relies on the notebook-global loaded_model.
    predicted_cluster = loaded_model.predict_classes(all_frag)
    header = '''\
[Target]
query

[Memories]
'''
    with open(toLocation + f"{pdb}.mem", "w") as out:
        out.write(header)
        for index, i in enumerate(predicted_cluster):
            # columns: gro file, query position (1-based), start residue, length, weight
            out.write(f"fraglib/{i}.gro {index+1} 1 9 20\n")

In [222]:
def getFrags(pdb, toLocation, top_n=1, evenWeight=True):
    """Write a fragment-memory (.mem) file for `pdb`, keeping the top_n
    predicted fragment clusters per 9-residue window.

    Parameters
    ----------
    pdb : str
        Four-letter PDB identifier (lower case); used to locate the .seq file.
    toLocation : str
        Output directory for the generated .mem file.
    top_n : int
        Number of highest-probability clusters to keep per window.
    evenWeight : bool
        If True, each kept cluster gets the same weight 20/top_n; if False,
        weights are the renormalized model probabilities.

    NOTE(review): depends on the notebook-global `loaded_model`.
    NOTE(review): with evenWeight the per-window weights sum to 20, but with
    evenWeight=False they sum to 1 — confirm this asymmetry is intended.
    """
    seqFile = f"/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/all_simulations/{pdb}/{pdb}/{pdb}.seq"
    with open(seqFile) as f:
        lines = f.readlines()
    a = lines[0].strip()  # one-letter-code sequence on the first line
    all_frag = []
    for i in range(0, len(a)-8):  # every 9-residue window
        frag = a[i:i+9]
        # print(frag)
        # letters -> integer indices 0..19 expected by the embedding layer
        fragIndex = [one_to_index(x) for x in frag]
        # print(fragIndex)
        all_frag.append(fragIndex)
    all_frag = np.array(all_frag)
    predict_prob = loaded_model.predict(all_frag)
    # Indices of the top_n most probable clusters per window, descending by probability.
    clusters = np.argsort(-predict_prob)[:, :top_n]
    n = predict_prob.shape[0]  # shadows the module-level `n`; local to this function
    # Gather the probabilities of the selected clusters (row-wise fancy indexing),
    # then renormalize each row so the kept probabilities sum to 1.
    prob = predict_prob[np.arange(n).reshape(n,1), clusters]
    prob /= (prob.sum(axis=1)).reshape(n,1)
    # pre = "/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/fragment_memory/"
    pre = toLocation
    header = '''\
[Target]
query

[Memories]
'''
    with open(pre+f"{pdb}.mem", "w") as out:
        out.write(header)
        if evenWeight:
            weight = 20/top_n
        for index, c in enumerate(clusters):
            for index2, i in enumerate(c):
                if not evenWeight:
                    weight = prob[index][index2]
                # NOTE(review): '{weight:.3}' is 3 significant digits (general format)
                # and can emit scientific notation for tiny probabilities — confirm
                # the .mem parser accepts that, or use ':.3f'.
                out.write(f"fraglib/{i}.gro {index+1} 1 9 {weight:.3}\n")

In [4]:
# Rebuild the trained fragment classifier: architecture from JSON, weights from HDF5.
# Using a context manager guarantees the file handle is closed even if reading fails
# (the original open()/close() pair leaked the handle on exception).
with open("/Users/weilu/Research/optimization/fragment/lstm100/model.json", "r") as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("/Users/weilu/Research/optimization/fragment/lstm100/model.h5")

In [7]:
# Load the train/test splits saved by the fragment-clustering pipeline.
pre = "/Users/weilu/Research/optimization/fragment/lstm100/"
x_train, y_train, x_test, y_test = (
    np.load(pre + fname)
    for fname in ("trainX.npy", "trainY.npy", "testX.npy", "testY.npy")
)

In [97]:
# Compile so evaluate()/predict() can run; the optimizer choice is irrelevant here
# because the model is only evaluated, not trained further.
loaded_model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

In [137]:
loaded_model.evaluate(x_test, y_test)


1000000/1000000 [==============================] - 70s 70us/step
Out[137]:
[3.547358810791254, 0.185256, 0.32267, 0.405813, 0.539875, 0.693226]

In [118]:
loaded_model.predict_classes(x_test[:10])


Out[118]:
array([13, 69, 45, 40, 45, 95, 32, 19, 70, 21])

In [179]:
predict_prob = loaded_model.predict(x_test[:10])
# NOTE(review): `top_n` is not defined in any visible cell — this line relies on
# leftover kernel state and will raise NameError on Restart-&-Run-All; define it
# explicitly (Out[182] suggests top_n = 5 was in effect).
clusters = np.argsort(-predict_prob)[:, :top_n]
loaded_model.evaluate(x_test[:10], y_test[:10])


10/10 [==============================] - 0s 189us/step
Out[179]:
[4.3088274002075195, 0.0, 0.0, 0.0, 0.20000000298023224, 0.6000000238418579]

In [185]:
np.arange(10).T


Out[185]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [182]:
clusters


Out[182]:
array([[13, 70, 37, 96, 79],
       [69,  6, 39, 73, 31],
       [45, 39, 96, 15, 74],
       [40, 84, 48, 22, 63],
       [45, 58, 74, 29, 39],
       [95, 66, 76, 98, 63],
       [32, 11,  5, 86, 93],
       [19,  4, 23, 97, 52],
       [70, 87, 37, 32,  6],
       [21,  4, 58, 72, 19]])

In [135]:
def _make_top_k_accuracy(k):
    """Build a Keras metric reporting top-k categorical accuracy for a fixed k.

    Replaces four copy-pasted wrapper functions with one factory. The closure's
    __name__ is set explicitly because Keras uses the function name to label
    the metric in training/evaluation logs.
    """
    def metric(y_true, y_pred):
        return top_k_categorical_accuracy(y_true, y_pred, k=k)
    metric.__name__ = f"top_{k}_accuracy"
    return metric

# Public names kept identical so the existing compile(...) call keeps working.
top_3_accuracy = _make_top_k_accuracy(3)
top_5_accuracy = _make_top_k_accuracy(5)
top_10_accuracy = _make_top_k_accuracy(10)
top_20_accuracy = _make_top_k_accuracy(20)

In [136]:
# Re-compile with top-k metrics so evaluate() reports top-1/3/5/10/20 accuracy
# (this matches the six-number list shown by the evaluation cell above).
loaded_model.compile('adam', 'categorical_crossentropy', metrics=['accuracy', top_3_accuracy, top_5_accuracy, 
                                                                  top_10_accuracy, top_20_accuracy])

In [223]:
# Generate top-5, probability-weighted fragment memories for the benchmark set.
pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
pre = "/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/fragment_memory_top5_noeven/"
for p in pdb_list:
    # [:4] drops a trailing chain identifier (e.g. "1FC2C" -> "1fc2") —
    # presumably intended to match the simulation folder names; verify.
    name = p.lower()[:4]
    getFrags(name, pre, top_n=5, evenWeight=False)

In [ ]:
# NOTE(review): dead scratch cell (never executed) — calling
# top_k_categorical_accuracy() with no arguments would raise TypeError; delete it.
top_k_categorical_accuracy()

In [59]:
# Manual walkthrough of the fragment pipeline on 1CTF (prototype of getFrags):
# read the one-line sequence file.
seqFile = "/Users/weilu/Research/optimization/fragment/1ctf.seq"
with open(seqFile) as f:
    lines = f.readlines()

In [63]:
a = lines[0].strip()

In [69]:
a


Out[69]:
'EFDVILKAAGANKVAVIKAVRGATGLGLKEAKDLVESAPAALKEGVSKDDAEALKKALEEAGAEVEVK'

In [79]:
# Encode every 9-residue window of the 1CTF sequence and predict its fragment
# cluster — same logic later packaged into getFrags.
all_frag = []
for i in range(0, len(a)-8):
    frag = a[i:i+9]
    # print(frag)
    fragIndex = [one_to_index(x) for x in frag]  # letters -> 0..19 indices
    # print(fragIndex)
    all_frag.append(fragIndex)
all_frag = np.array(all_frag)
# print(all_frag)
loaded_model.predict_classes(all_frag)


Out[79]:
array([ 1, 25, 21, 26, 16, 38, 40, 65, 51, 85, 25, 17, 56,  4, 21, 26, 79,
       21, 96, 90, 83, 23, 38, 13, 11, 80, 84, 32, 80,  1, 67, 27, 95, 98,
       75, 11, 80, 21, 26, 16, 46, 23, 75, 13, 11, 80, 78, 32, 80,  1,  1,
        1,  1, 54, 21, 26, 16, 46, 61,  2])

In [85]:
predicted_cluster = loaded_model.predict_classes(all_frag)

In [88]:
# Write the 1CTF memory file by hand — prototype of what getFrags now automates.
pre = "/Users/weilu/Research/optimization/fragment/"
header = '''\
[Target]
query

[Memories]
'''
with open(pre+"frags.mem", "w") as out:
    out.write(header)
    for index, i in enumerate(predicted_cluster):
        # columns: gro file, query position (1-based), start residue, length, weight
        out.write(f"fraglib/{i}.gro {index+1} 1 9 20\n")

In [ ]:


In [81]:
len(loaded_model.predict_classes(all_frag))


Out[81]:
60

In [21]:
y_train[2]


Out[21]:
array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [17]:
np.argwhere(y_train[:,1] == 1)


Out[17]:
array([[10000],
       [10001],
       [10002],
       ...,
       [19997],
       [19998],
       [19999]])

In [76]:
all_frag.shape


Out[76]:
(60, 9)

In [77]:
x_train[0:2]


Out[77]:
array([[ 6,  9, 19,  7,  3,  3,  5, 14, 16],
       [ 8,  4,  9, 19, 11,  2, 11,  5,  8]])

In [38]:
loaded_model.predict(x_train[10001:10004, :]).round(2)


Out[38]:
array([[0.  , 0.06, 0.  , 0.09, 0.  , 0.  , 0.  , 0.01, 0.  , 0.01, 0.  ,
        0.01, 0.  , 0.  , 0.01, 0.  , 0.06, 0.01, 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.01, 0.01, 0.  , 0.  , 0.01, 0.  , 0.  ,
        0.  , 0.02, 0.  , 0.01, 0.01, 0.  , 0.  , 0.  , 0.01, 0.  , 0.  ,
        0.  , 0.  , 0.01, 0.01, 0.09, 0.  , 0.  , 0.  , 0.01, 0.  , 0.05,
        0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.11, 0.01, 0.  ,
        0.06, 0.01, 0.  , 0.  , 0.  , 0.  , 0.04, 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.02, 0.  , 0.  , 0.  , 0.  , 0.  , 0.03, 0.07, 0.  , 0.01,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.01,
        0.  ],
       [0.  , 0.22, 0.01, 0.04, 0.  , 0.  , 0.01, 0.01, 0.  , 0.  , 0.  ,
        0.04, 0.  , 0.  , 0.01, 0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.05,
        0.  , 0.  , 0.01, 0.01, 0.  , 0.08, 0.  , 0.01, 0.  , 0.01, 0.02,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.01, 0.01, 0.  , 0.  , 0.  , 0.  , 0.01, 0.11,
        0.  , 0.01, 0.01, 0.  , 0.  , 0.  , 0.  , 0.01, 0.01, 0.01, 0.01,
        0.01, 0.03, 0.  , 0.02, 0.  , 0.  , 0.01, 0.01, 0.  , 0.  , 0.01,
        0.  , 0.01, 0.  , 0.02, 0.  , 0.02, 0.  , 0.  , 0.01, 0.  , 0.01,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  ,
        0.  ],
       [0.  , 0.24, 0.  , 0.07, 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  ,
        0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.04,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.04, 0.  , 0.  , 0.  , 0.  , 0.01,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.24,
        0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.04, 0.  , 0.  , 0.  , 0.  , 0.06, 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.01, 0.  , 0.1 , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  ]], dtype=float32)

In [39]:
loaded_model.predict_classes(x_train[10001:20000, :])


Out[39]:
array([63,  1,  1, ..., 78,  1, 56])

In [43]:
plt.hist(loaded_model.predict_classes(x_train[:10000, :]), bins=50)


Out[43]:
(array([550., 111., 457.,  86.,  52., 149., 122., 326., 212., 243., 338.,
        178.,  58., 213., 485., 162., 166., 127., 135., 184.,  67., 366.,
        466., 104.,  40.,  49., 143.,  89., 100., 690., 207.,  57., 234.,
        110., 259., 195., 189., 427., 129., 252.,  69., 119., 111.,  80.,
        136.,  75., 292., 214., 299.,  78.]),
 array([ 0.  ,  1.98,  3.96,  5.94,  7.92,  9.9 , 11.88, 13.86, 15.84,
        17.82, 19.8 , 21.78, 23.76, 25.74, 27.72, 29.7 , 31.68, 33.66,
        35.64, 37.62, 39.6 , 41.58, 43.56, 45.54, 47.52, 49.5 , 51.48,
        53.46, 55.44, 57.42, 59.4 , 61.38, 63.36, 65.34, 67.32, 69.3 ,
        71.28, 73.26, 75.24, 77.22, 79.2 , 81.18, 83.16, 85.14, 87.12,
        89.1 , 91.08, 93.06, 95.04, 97.02, 99.  ]),
 <a list of 50 Patch objects>)

In [48]:
df = pd.DataFrame({'data':loaded_model.predict_classes(x_train[:10000, :])})
print(df['data'].value_counts().head())


58    547
0     445
45    366
74    359
28    338
Name: data, dtype: int64

In [47]:
df = pd.DataFrame({'data':loaded_model.predict_classes(x_train[10001:20000, :])})
print(df['data'].value_counts().head())


1     2611
54     470
80     405
72     402
3      392
Name: data, dtype: int64

In [ ]:


In [ ]:


In [55]:
np.all(loaded_model.predict(x_train[10001:20000, :]).round(2) == loaded_model.predict_proba(x_train[10001:20000, :]).round(2))


Out[55]:
True

In [ ]:


In [57]:
loaded_model.predict(x_train[10002].reshape(1,9)).round(2)


Out[57]:
array([[0.  , 0.22, 0.01, 0.04, 0.  , 0.  , 0.01, 0.01, 0.  , 0.  , 0.  ,
        0.04, 0.  , 0.  , 0.01, 0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.05,
        0.  , 0.  , 0.01, 0.01, 0.  , 0.08, 0.  , 0.01, 0.  , 0.01, 0.02,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.01, 0.01, 0.  , 0.  , 0.  , 0.  , 0.01, 0.11,
        0.  , 0.01, 0.01, 0.  , 0.  , 0.  , 0.  , 0.01, 0.01, 0.01, 0.01,
        0.01, 0.03, 0.  , 0.02, 0.  , 0.  , 0.01, 0.01, 0.  , 0.  , 0.01,
        0.  , 0.01, 0.  , 0.02, 0.  , 0.02, 0.  , 0.  , 0.01, 0.  , 0.01,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  ,
        0.  ]], dtype=float32)

In [25]:
loaded_model.predict(x_train[10001].reshape(1,-1))


Out[25]:
array([[1.94163201e-03, 5.72058335e-02, 3.49636818e-03, 8.66604224e-02,
        3.45707545e-03, 2.18330737e-04, 9.23126529e-04, 1.06004775e-02,
        1.06848951e-03, 6.80391304e-03, 8.49636039e-04, 5.14316978e-03,
        5.12195111e-04, 2.44294875e-03, 6.44069584e-03, 2.51834490e-03,
        5.91256283e-02, 7.19474256e-03, 1.38767750e-03, 1.88106555e-03,
        2.71060248e-03, 3.22767720e-03, 3.61111853e-03, 4.47409088e-03,
        2.90813716e-03, 3.80009576e-03, 1.00325020e-02, 7.80387037e-03,
        6.22312305e-04, 1.90860615e-03, 8.61866213e-03, 1.83285959e-03,
        1.83846429e-03, 6.52538205e-04, 2.33797394e-02, 1.36588479e-03,
        6.12687180e-03, 1.02770468e-02, 7.04589358e-04, 1.40654051e-03,
        4.55599511e-03, 1.32390894e-02, 5.79965650e-04, 8.40307097e-04,
        4.93463944e-04, 1.28919477e-04, 5.48354723e-03, 5.70566161e-03,
        9.31848362e-02, 4.84534297e-08, 3.01808957e-03, 1.45387277e-03,
        1.09089846e-02, 4.58485028e-03, 5.19727282e-02, 1.80679350e-03,
        1.05571328e-02, 3.30582284e-03, 2.08320911e-03, 4.11883183e-03,
        4.23007784e-03, 4.35318472e-03, 1.03494525e-02, 1.12419039e-01,
        5.88001590e-03, 9.54948657e-04, 6.41854778e-02, 8.69553257e-03,
        5.19460067e-04, 2.73375167e-03, 2.55688047e-03, 3.30666138e-04,
        3.68614532e-02, 1.06919697e-03, 1.00596866e-03, 4.09205997e-04,
        3.54572199e-03, 2.97659356e-03, 2.00647507e-02, 5.06437907e-04,
        4.14454378e-03, 2.06699013e-03, 4.55147092e-04, 9.62844177e-04,
        2.92926263e-02, 6.65543303e-02, 6.94774673e-04, 8.21918901e-03,
        1.90614595e-03, 3.97215551e-03, 2.03623972e-03, 4.78787285e-07,
        6.77176751e-04, 3.30246985e-04, 4.91157523e-04, 9.59743280e-03,
        3.87088861e-04, 6.53018185e-04, 8.71430244e-03, 4.61652334e-08]],
      dtype=float32)

In [13]:
loaded_model.predict(x_train[0].reshape(1,-1))


Out[13]:
array([[4.01732400e-02, 1.69577741e-03, 9.31578863e-04, 6.02654042e-03,
        2.79191788e-02, 5.37388446e-03, 2.46933708e-03, 6.26293186e-04,
        2.28643906e-03, 2.71171506e-04, 5.15630119e-04, 1.07777992e-03,
        2.43215403e-03, 1.25367625e-03, 2.61013559e-03, 7.90676549e-02,
        6.84397179e-04, 4.37424937e-03, 3.29821691e-04, 3.27693187e-02,
        2.87527661e-03, 4.19266755e-03, 1.07942172e-03, 1.31846359e-03,
        1.09363990e-02, 6.94359234e-03, 9.84150693e-02, 7.97016174e-03,
        7.58362710e-02, 1.56424951e-03, 3.72109488e-02, 1.14058200e-02,
        8.15052481e-04, 4.24836995e-04, 3.60342301e-03, 5.73986035e-04,
        8.73260724e-04, 7.17182935e-04, 1.43264467e-03, 5.70024690e-03,
        5.57794760e-04, 7.73043255e-04, 1.49696376e-02, 1.13147400e-01,
        1.69002183e-03, 5.74871758e-03, 6.21584360e-04, 2.30530072e-02,
        7.12187961e-04, 3.98535803e-06, 4.33705514e-03, 2.67115724e-03,
        1.87868334e-03, 1.90732593e-03, 3.12483124e-03, 4.92888852e-04,
        9.26003046e-03, 3.99711635e-03, 5.51565923e-02, 8.58825259e-03,
        1.28149623e-02, 7.72951462e-04, 1.07682694e-03, 2.54802126e-03,
        1.02200955e-02, 4.99993283e-03, 3.95059306e-03, 1.78430937e-02,
        1.13429083e-02, 2.92136543e-03, 2.36797542e-03, 2.77024489e-02,
        4.09558183e-03, 8.05317541e-04, 2.81111468e-02, 1.03234937e-02,
        2.89648678e-03, 6.77143410e-03, 9.08930786e-04, 3.88994487e-03,
        2.99906969e-04, 6.82204089e-04, 2.35432340e-03, 1.44438716e-04,
        3.93802812e-03, 1.41847448e-03, 1.33123028e-03, 2.21472536e-03,
        2.35495670e-03, 3.47273611e-02, 1.88372005e-03, 1.08211680e-06,
        1.10989464e-02, 3.50592425e-03, 1.84537517e-03, 5.95731474e-03,
        1.96734834e-02, 8.96637514e-03, 1.54045341e-03, 2.29544923e-04]],
      dtype=float32)

In [6]:
loaded_model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 9, 20)             400       
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               152576    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               25700     
=================================================================
Total params: 178,676
Trainable params: 178,676
Non-trainable params: 0
_________________________________________________________________

In [ ]: