In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from small_script.myFunctions import *
import feather
import Bio.PDB as bio

from sklearn.metrics import confusion_matrix


d3_to_index = bio.Polypeptide.d3_to_index  # we may want to adjust this in the future.
three_to_one = bio.Polypeptide.three_to_one
one_to_index = bio.Polypeptide.one_to_index

%matplotlib inline
%load_ext autoreload
%autoreload 2
plt.rcParams['figure.figsize'] = [16.18033, 10]

In [2]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb
from keras.utils import to_categorical

from keras.models import model_from_json

from keras.metrics import top_k_categorical_accuracy

max_features = 20  # amino-acid alphabet size
batch_size = 1024*2
maxlen = 9         # fragment length
n = int(1e4)


Using TensorFlow backend.

Get test data into CSV format


In [3]:
# get the real frag info
min_seq_sep=3
max_seq_sep=9
parser = bio.PDBParser(QUIET=True)
three_to_one = bio.Polypeptide.three_to_one

#         out.write("pdb,i,j,res1,res2,dis_ca_ca,dis_ca_cb,dis_cb_ca,dis_cb_cb\n")


def getFragInfoFromPdb(pdb_list, pre, testpre):
    for p in pdb_list:
        pdbId = p.lower()[:4]
        location = pre+f"{pdbId}/{pdbId}/crystal_structure.pdb"
        structure = parser.get_structure("x", location)
        with open(testpre+f"{pdbId}.csv", "w") as out:
            for model in structure:
                for chain in model:
                    all_residues = list(chain)
                    # print(all_residues)
                    for i, residue in enumerate(all_residues):
                        outLine = ""
                        need = True
                        dis_ca_ca = []
                        dis_ca_cb = []
                        dis_cb_ca = []
                        dis_cb_cb = []
                        resId = residue.get_id()[1]
                        frag = all_residues[i:i+max_seq_sep]
                        resseq_list = [x.get_id()[1] for x in frag]
                        fragSeq = "".join([three_to_one(x.get_resname()) for x in frag])
                        # print(i, fragSeq)
                        if len(frag) != max_seq_sep:
                            continue
                        # skip fragments spanning a chain break (non-consecutive residue numbers)
                        if not np.all(np.ediff1d(resseq_list) == 1):
                            continue
                        # 6 start positions with separations 3..8 give 21 pairs per distance type
                        for ii in range(max_seq_sep - min_seq_sep):
                            if not need:
                                break
                            r1 = frag[ii]  # fragment length already validated above
                            for j, r2 in enumerate(frag[(min_seq_sep+ii):]):
                                # The H of GLY is replaced with CB in this dataset
                                try:
                                    r2_cb = r2["CB"]
                                except Exception as ex:
                                    try:
                                        r2_cb = r2["CA"]
                                    except Exception as ex:
                                        # print(pdbId, resId)
                                        need = False
                                        break
                                try:
                                    r1_cb = r1["CB"]
                                except Exception as ex:
                                    try:
                                        r1_cb = r1["CA"]
                                    except Exception as ex:
                                        # print(pdbId, resId)
                                        need = False
                                        break
                                try:
                                    r1_ca = r1["CA"]
                                    r2_ca = r2["CA"]
                                except Exception as ex:
                                    print(pdbId, resId)
                                    # os.system(f"echo '{pdbId}' >> {pre}/without_discontinues_and_gly_exception_2")
                                    need = False
                                    break
                                # Biopython overloads "-" on Atom objects to return the inter-atomic distance
                                dis_ca_ca.append(str(r1_ca - r2_ca))
                                dis_ca_cb.append(str(r1_ca - r2_cb))
                                dis_cb_ca.append(str(r1_cb - r2_ca))
                                dis_cb_cb.append(str(r1_cb - r2_cb))
                        if need:
                            outLine = f"{pdbId},{i},{fragSeq},"+",".join(dis_ca_ca)+\
                            ","+",".join(dis_ca_cb)+\
                            ","+",".join(dis_cb_ca)+\
                            ","+",".join(dis_cb_cb)+"\n"
                            out.write(outLine)
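
A quick check on the column count (a standalone sketch, not part of the pipeline): each 9-residue fragment yields 21 ordered residue pairs with sequence separation between 3 and 8, which is where the 21 caca/cacb/cbca/cbcb columns per distance type in the header below come from.

pairs = [(ii, jj) for ii in range(9) for jj in range(ii + 3, 9)]
assert len(pairs) == 21  # 6 + 5 + 4 + 3 + 2 + 1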

In [35]:
pre = "/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/all_simulations/"
testpre = "/Users/weilu/Research/optimization/fragment/testSet/"
pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
getFragInfoFromPdb(pdb_list, pre, testpre)


headline = "pdb,i,seq," + ",".join([f"caca_{i}" for i in range(1,22)]) + \
        "," + ",".join([f"cacb_{i}" for i in range(1,22)]) + \
        "," + ",".join([f"cbca_{i}" for i in range(1,22)]) + \
        "," + ",".join([f"cbcb_{i}" for i in range(1,22)]) 

# combine all to one data.
folder = "testSet"
combined_csv = "test_data"
os.system(f"echo '{headline}' > /Users/weilu/Research/optimization/fragment/{combined_csv}.csv")
pdb_list = os.listdir(f"/Users/weilu/Research/optimization/fragment/{folder}/")
for cc, pdb in enumerate(pdb_list):
    os.system(f"cat /Users/weilu/Research/optimization/fragment/{folder}/{pdb} >> /Users/weilu/Research/optimization/fragment/{combined_csv}.csv")

Pipeline from test data to accuracy


In [3]:
data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/test_data.csv")

In [4]:
import pickle
# pickle.dump(kmeans, open("kmeans_cluster100_v2", "wb"))
kmeans = pickle.load(open("/Users/weilu/Research/optimization/fragment/kmeans_cluster100_v2_2", "rb"))

In [5]:
data_original["cluster"] = kmeans.predict(data_original.iloc[:, 3:87].values)
def getScore(data, km):
    return np.sqrt(((km.cluster_centers_[int(data.values[-1])] - data.values[:-1])**2).sum())
data_original["rmsd"] = data_original.iloc[:,3:88].apply(lambda x: getScore(x, kmeans), axis=1)

In [6]:
data = data_original[["pdb", "i", "seq","cluster", "rmsd"]].reset_index(drop=True)
data["cluster"] = data["cluster"].astype(int)
for i in range(1,10):
    data[f"s{i}"] = data["seq"].apply(lambda x: one_to_index(x[i-1]))
# data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_v2_processed.feather")

In [7]:
clusterdata = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_v2_2.feather")


/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/pyarrow/pandas_compat.py:752: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.
  labels, = index.labels

In [8]:
data["frequency"] = data["cluster"].apply(lambda x: clusterdata["cluster"].value_counts()[x])

In [10]:
data.head()


Out[10]:
pdb i seq cluster rmsd s1 s2 s3 s4 s5 s6 s7 s8 s9 frequency
0 1fc2 0 FNKEQQNAF 19 11.223527 4 11 8 3 13 13 11 0 4 36340
1 1fc2 1 NKEQQNAFY 70 8.539869 11 8 3 13 13 11 0 4 19 37302
2 1fc2 2 KEQQNAFYE 55 7.491690 8 3 13 13 11 0 4 19 3 337166
3 1fc2 3 EQQNAFYEI 55 4.934995 3 13 13 11 0 4 19 3 7 337166
4 1fc2 4 QQNAFYEIL 55 3.384191 13 13 11 0 4 19 3 7 9 337166

In [11]:
# pre = "/Users/weilu/Research/server/jan_2019/lstm100_v3/"
# pre = "/Users/weilu/Research/server/feb_2019/lstm100_2/"
pre = "/Users/weilu/Research/server/march_2019/lstm100_train_based_on_frequency/"
json_file = open(f"{pre}/model_2019-03-07.json", "r")
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights(f"{pre}/model_2019-03-07.h5")

In [12]:
x_test = data.iloc[:, 5:14].values  # the s1..s9 residue-index columns
y_test_value = data["cluster"].values
y_test = to_categorical(np.array(y_test_value), num_classes=100)

In [13]:
def top_3_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=3)
def top_5_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=5)
def top_10_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=10)
def top_20_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=20)
loaded_model.compile('adam', 'categorical_crossentropy', metrics=['accuracy', top_3_accuracy, 
                                                                  top_5_accuracy, 
                                                                  top_10_accuracy, top_20_accuracy])
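
For reference, a plain-numpy equivalent of the top-k metrics (a sketch for sanity-checking evaluate(), not part of Keras):

def top_k_accuracy_np(y_true_idx, y_prob, k):
    # fraction of rows whose true cluster is among the k highest-probability predictions
    topk = np.argsort(-y_prob, axis=1)[:, :k]
    return np.mean([t in row for t, row in zip(y_true_idx, topk)])

# e.g. top_k_accuracy_np(y_test_value, loaded_model.predict(x_test), k=5)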

In [14]:
top_n = 20
all_frag = []
for fragSeq in data_original["seq"]:
    fragIndex = [one_to_index(x) for x in fragSeq]
    all_frag.append(fragIndex)
all_frag = np.array(all_frag)
predict_prob = loaded_model.predict(all_frag)
clusters = np.argsort(-predict_prob)[:, :top_n]  # top_n most probable clusters per fragment
for i in range(top_n):
    data_original[f"pred_top{i+1}"] = clusters[:,i]

In [20]:
data_original[["pdb", "i", "seq", "cluster", "rmsd", "pred_top1", "pred_top2", "pred_top3", "pred_top4", "pred_top5", "pred_top6"]]


Out[20]:
pdb i seq cluster rmsd pred_top1 pred_top2 pred_top3 pred_top4 pred_top5 pred_top6
0 1fc2 0 FNKEQQNAF 19 11.223527 19 55 79 0 99 54
1 1fc2 1 NKEQQNAFY 70 8.539869 55 45 1 0 82 64
2 1fc2 2 KEQQNAFYE 55 7.491690 55 1 16 74 86 56
3 1fc2 3 EQQNAFYEI 55 4.934995 84 83 18 57 25 55
4 1fc2 4 QQNAFYEIL 55 3.384191 55 70 1 14 19 96
5 1fc2 5 QNAFYEILH 55 3.406676 55 1 71 70 2 38
6 1fc2 6 NAFYEILHL 55 3.127405 55 1 2 45 71 0
7 1fc2 7 AFYEILHLP 48 8.780100 71 86 8 99 1 23
8 1fc2 8 FYEILHLPN 23 6.889736 23 99 79 0 33 54
9 1fc2 9 YEILHLPNL 56 8.923290 56 12 74 55 99 32
10 1fc2 10 EILHLPNLN 53 13.655668 17 99 55 79 8 53
11 1fc2 11 ILHLPNLNE 75 14.797442 76 88 75 87 50 67
12 1fc2 12 LHLPNLNEE 83 9.137923 4 83 99 90 34 36
13 1fc2 13 HLPNLNEEQ 13 14.942413 49 32 13 5 63 44
14 1fc2 14 LPNLNEEQR 10 9.195280 63 35 3 77 7 42
15 1fc2 15 PNLNEEQRN 96 5.441713 96 14 47 19 55 1
16 1fc2 16 NLNEEQRNG 30 5.600426 30 1 55 70 18 74
17 1fc2 17 LNEEQRNGF 19 9.968624 19 55 49 70 0 97
18 1fc2 18 NEEQRNGFI 55 3.893249 16 53 62 98 55 13
19 1fc2 19 EEQRNGFIQ 55 3.826803 55 1 72 48 62 58
20 1fc2 20 EQRNGFIQS 55 4.374872 24 55 8 70 37 52
21 1fc2 21 QRNGFIQSL 55 5.561482 0 66 55 90 57 82
22 1fc2 22 RNGFIQSLK 55 4.858648 2 55 45 71 66 1
23 1fc2 23 NGFIQSLKD 55 4.537867 55 1 71 48 23 38
24 1fc2 24 GFIQSLKDD 55 5.359258 55 51 16 1 74 86
25 1fc2 25 FIQSLKDDP 1 5.444276 48 55 1 86 16 62
26 1fc2 26 IQSLKDDPS 86 4.391852 23 86 76 75 88 54
27 1fc2 27 QSLKDDPSQ 74 4.613582 49 13 97 32 5 44
28 1fc2 28 SLKDDPSQS 98 10.517590 98 73 53 12 46 27
29 1fc2 29 LKDDPSQSA 62 15.328637 75 15 52 69 39 78
... ... ... ... ... ... ... ... ... ... ... ...
285 2gb1 18 EAVDAATAE 47 7.264854 48 47 55 96 53 1
286 2gb1 19 AVDAATAEK 57 6.551999 18 30 55 23 1 57
287 2gb1 20 VDAATAEKV 19 6.362199 19 70 55 12 1 6
288 2gb1 21 DAATAEKVF 55 4.402400 55 1 19 62 70 48
289 2gb1 22 AATAEKVFK 55 4.360743 55 1 86 70 30 19
290 2gb1 23 ATAEKVFKQ 55 4.491683 55 19 1 70 12 48
291 2gb1 24 TAEKVFKQY 55 3.767578 55 1 19 70 48 99
292 2gb1 25 AEKVFKQYA 55 3.443909 55 1 19 48 62 70
293 2gb1 26 EKVFKQYAN 55 3.407581 55 62 1 48 75 86
294 2gb1 27 KVFKQYAND 1 3.607286 55 1 86 48 70 20
295 2gb1 28 VFKQYANDN 1 4.206800 55 1 48 70 12 74
296 2gb1 29 FKQYANDNG 55 3.775696 1 17 99 79 0 23
297 2gb1 30 KQYANDNGV 55 13.862867 53 81 86 99 48 55
298 2gb1 31 QYANDNGVD 16 6.800551 16 69 52 7 86 31
299 2gb1 32 YANDNGVDG 72 8.327533 72 42 31 68 98 35
300 2gb1 33 ANDNGVDGE 33 8.156164 95 33 31 47 84 40
301 2gb1 34 NDNGVDGEW 0 15.755901 31 7 69 52 55 14
302 2gb1 35 DNGVDGEWT 21 11.960841 31 22 84 52 16 40
303 2gb1 36 NGVDGEWTY 21 12.594809 45 82 66 2 79 8
304 2gb1 37 GVDGEWTYD 65 11.807746 8 4 71 66 90 99
305 2gb1 38 VDGEWTYDD 45 8.199402 66 45 2 82 71 8
306 2gb1 39 DGEWTYDDA 80 6.228466 80 71 2 38 51 89
307 2gb1 40 GEWTYDDAT 51 10.440306 5 51 89 32 44 80
308 2gb1 41 EWTYDDATK 32 11.438643 91 77 69 5 7 63
309 2gb1 42 WTYDDATKT 69 11.758261 69 31 7 35 98 19
310 2gb1 43 TYDDATKTF 31 7.781009 31 47 30 7 55 25
311 2gb1 44 YDDATKTFT 6 10.954181 6 95 33 79 17 72
312 2gb1 45 DDATKTFTV 33 11.730163 79 33 54 0 99 17
313 2gb1 46 DATKTFTVT 79 12.598200 90 2 45 82 55 71
314 2gb1 47 ATKTFTVTE 45 11.398768 8 71 38 2 55 89

315 rows × 11 columns


In [15]:
loaded_model.evaluate(x_test, y_test)


315/315 [==============================] - 0s 1ms/step
Out[15]:
[2.3251699621715245,
 0.47619047647430784,
 0.711111112814101,
 0.7714285731315613,
 0.8507936494691032,
 0.9142857161779252]
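
In each evaluate() output here, the six numbers follow the compile order: loss, top-1 accuracy, then top-3, top-5, top-10, and top-20 accuracy.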

In [18]:
loaded_model.evaluate(x_test, y_test)


315/315 [==============================] - 0s 1ms/step
Out[18]:
[2.9926906101287356,
 0.3238095232891658,
 0.5238095233364711,
 0.6285714295175341,
 0.7904761910438538,
 0.8952380939135476]

In [33]:
loaded_model.evaluate(x_test, y_test)


315/315 [==============================] - 0s 1ms/step
Out[33]:
[2.9926906101287356,
 0.3238095232891658,
 0.5238095233364711,
 0.6285714295175341,
 0.7904761910438538,
 0.8952380939135476]

In [15]:
loaded_model.evaluate(x_test, y_test)


315/315 [==============================] - 0s 162us/step
Out[15]:
[2.9494901664673336,
 0.24761904769000553,
 0.4571428561021411,
 0.5904761914222959,
 0.7333333344686599,
 0.8666666691265409]

In [119]:
loaded_model.evaluate(x_test, y_test)


315/315 [==============================] - 0s 928us/step
Out[119]:
[3.0338871017334954,
 0.2730158725428203,
 0.46984127012510146,
 0.561904764175415,
 0.6793650791758583,
 0.8253968278566997]

In [85]:
def getRMSD(data, km, cluster_index_column):
    # distance score between the row's 84 pairwise distances and the center of the
    # cluster whose index sits at position cluster_index_column of the row slice
    return np.sqrt(((km.cluster_centers_[int(data.values[cluster_index_column])] - data.values[:84])**2).sum())
for i in range(top_n):
    data_original[f"pred_top{i+1}_dRMSD"] = data_original.iloc[:, 3:].apply(lambda x: getRMSD(x, kmeans, 86 + i), axis=1)

In [87]:
data = data_original.iloc[:, 87:]

Minimum dRMSD over the top-n predictions


In [132]:
p1 = data.iloc[:, 22:]
def minRMSD(data, n):
    # smallest dRMSD among the top-n predicted clusters
    return np.min(data.values[:n])
for i in range(top_n):
    p1[f"minTop{i+1}"] = p1.apply(lambda x: minRMSD(x, i+1), axis=1)
p1 = p1.iloc[:, 20:].reset_index().melt(id_vars=["index"], var_name="dRMSD")
# recover n from the "minTop{n}" column names
p1["Top_dRMSD"] = p1["dRMSD"].str.replace("minTop", "").astype(int)
sns.boxplot("Top_dRMSD", "value", data=p1)


Out[132]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a43efdfd0>

Average dRMSD over the top-n predictions


In [127]:
p1 = data.iloc[:, 22:]
def averageRMSD(data, n):
    # mean dRMSD over the top-n predicted clusters
    return np.average(data.values[:n])
for i in range(top_n):
    p1[f"averageTop{i+1}"] = p1.apply(lambda x: averageRMSD(x, i+1), axis=1)
p1 = p1.iloc[:, 20:].reset_index().melt(id_vars=["index"], var_name="dRMSD")
# recover n from the "averageTop{n}" column names
p1["Top_dRMSD"] = p1["dRMSD"].str.replace("averageTop", "").astype(int)

In [130]:
sns.boxplot("Top_dRMSD", "value", data=p1)


Out[130]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a40852cf8>

dRMSD of the i-th prediction alone


In [100]:
processed_data = data.iloc[:, 22:].reset_index().melt(id_vars=["index"], var_name="dRMSD")
# extract n from column names like "pred_top3_dRMSD"
processed_data["Top_dRMSD"] = processed_data["dRMSD"].str.split("_").apply(lambda x: int(x[1][3:]))

In [111]:
sns.boxplot("Top_dRMSD", "value", data=processed_data)


Out[111]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a40ba1240>

In [93]:
data.describe().T


Out[93]:
count mean std min 25% 50% 75% max
cluster 315.0 47.311111 26.460493 0.000000 28.500000 55.000000 56.000000 98.000000
rmsd 315.0 8.754763 4.702357 1.202327 4.425866 8.587541 12.565353 19.827993
pred_top1 315.0 46.342857 30.840403 0.000000 19.000000 48.000000 72.500000 99.000000
pred_top2 315.0 45.688889 31.018359 0.000000 16.000000 48.000000 71.000000 99.000000
pred_top3 315.0 45.888889 29.775653 0.000000 19.000000 48.000000 70.000000 99.000000
pred_top4 315.0 48.787302 29.261219 0.000000 23.000000 52.000000 71.000000 99.000000
pred_top5 315.0 45.206349 30.882641 0.000000 17.500000 44.000000 71.000000 99.000000
pred_top6 315.0 49.130159 28.683517 0.000000 23.000000 53.000000 71.000000 99.000000
pred_top7 315.0 49.755556 30.528163 0.000000 20.500000 52.000000 75.000000 99.000000
pred_top8 315.0 43.968254 28.942051 0.000000 19.000000 45.000000 67.000000 99.000000
pred_top9 315.0 48.219048 29.483877 0.000000 20.000000 49.000000 73.500000 99.000000
pred_top10 315.0 47.749206 28.397496 0.000000 23.000000 50.000000 71.000000 99.000000
pred_top11 315.0 46.130159 29.501142 0.000000 20.000000 48.000000 71.000000 99.000000
pred_top12 315.0 46.682540 30.626695 0.000000 18.500000 48.000000 71.000000 99.000000
pred_top13 315.0 46.257143 29.524910 0.000000 20.000000 48.000000 70.000000 99.000000
pred_top14 315.0 48.273016 30.103988 0.000000 20.000000 48.000000 74.000000 99.000000
pred_top15 315.0 44.993651 29.745737 0.000000 19.000000 45.000000 70.000000 99.000000
pred_top16 315.0 46.533333 29.365824 0.000000 20.500000 47.000000 71.000000 99.000000
pred_top17 315.0 51.434921 30.132056 0.000000 22.500000 53.000000 76.000000 99.000000
pred_top18 315.0 49.295238 29.635876 0.000000 22.500000 50.000000 73.000000 99.000000
pred_top19 315.0 49.012698 30.437361 0.000000 20.500000 49.000000 78.000000 99.000000
pred_top20 315.0 48.085714 29.681681 0.000000 21.000000 49.000000 73.000000 99.000000
pred_top1_dRMSD 315.0 23.030612 18.088406 1.410373 10.654878 16.529025 28.882869 75.406481
pred_top2_dRMSD 315.0 27.255306 18.608782 1.297280 14.360756 22.235833 34.781450 78.069498
pred_top3_dRMSD 315.0 28.577873 17.424437 1.545246 16.567997 24.199660 35.811124 82.929281
pred_top4_dRMSD 315.0 29.072456 16.653237 1.202327 17.608105 25.122476 36.780830 75.599327
pred_top5_dRMSD 315.0 31.113994 16.561859 1.549842 18.829451 27.459261 39.198370 78.449659
pred_top6_dRMSD 315.0 31.716423 16.839438 1.885028 18.822552 27.562002 40.082998 80.171303
pred_top7_dRMSD 315.0 31.876523 16.276070 2.105131 21.970128 28.306990 38.498697 81.909514
pred_top8_dRMSD 315.0 34.070995 17.844686 2.204191 19.415756 30.070805 45.350051 79.581063
pred_top9_dRMSD 315.0 32.650358 16.731960 6.044274 20.884073 28.131805 42.586969 84.179514
pred_top10_dRMSD 315.0 34.594523 17.055663 1.705854 22.370933 30.406661 44.722559 78.680332
pred_top11_dRMSD 315.0 34.432532 16.678536 4.858648 21.918756 30.481420 43.545270 79.773496
pred_top12_dRMSD 315.0 35.881485 16.613546 3.313642 23.885228 31.788766 47.758629 85.360100
pred_top13_dRMSD 315.0 34.539124 15.984334 1.930836 23.204128 30.256104 44.465191 76.559508
pred_top14_dRMSD 315.0 34.858715 16.504184 2.032575 22.234683 31.019255 43.794278 80.212929
pred_top15_dRMSD 315.0 36.800321 16.422088 2.247318 23.840819 33.651171 46.945040 81.837867
pred_top16_dRMSD 315.0 37.092710 15.738115 2.299054 25.622140 33.956876 46.779491 83.687315
pred_top17_dRMSD 315.0 35.604281 16.112955 3.932949 23.294304 32.778882 47.085098 79.524947
pred_top18_dRMSD 315.0 37.696267 15.972762 5.778708 25.545528 34.738157 48.341967 78.691357
pred_top19_dRMSD 315.0 36.792103 16.659858 1.826833 24.620409 33.373709 48.943923 74.980444
pred_top20_dRMSD 315.0 36.772653 16.665938 6.489628 24.492256 33.392280 49.372151 79.804689

In [7]:
# pre = "/Users/weilu/Research/server/jan_2019/lstm100_v3/"
# pre = "/Users/weilu/Research/server/feb_2019/lstm100_2/"
# json_file = open(f"{pre}/model.json", "r")
# loaded_model_json = json_file.read()
# json_file.close()
# loaded_model = model_from_json(loaded_model_json)
# loaded_model.load_weights(f"{pre}/model.h5")

pre = "/Users/weilu/Research/server/march_2019/lstm100_train_based_on_frequency/"
json_file = open(f"{pre}/model_2019-03-07.json", "r")
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights(f"{pre}/model_2019-03-07.h5")

def getFrags(pdb, toLocation, top_n=1, evenWeight=True, totalWeight=20.0):
    # seqFile = f"/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/all_simulations/{pdb}/{pdb}/{pdb}.seq"
    seqFile = f"/Users/weilu/Research/server/march_2019/frag_memory_explore/all_simulations/{pdb}/{pdb}/{pdb}.seq"
    with open(seqFile) as f:
        lines = f.readlines()
    a = lines[0].strip()
    all_frag = []
    for i in range(0, len(a)-8):
        frag = a[i:i+9]
        # print(frag)
        fragIndex = [one_to_index(x) for x in frag]
        # print(fragIndex)
        all_frag.append(fragIndex)
    all_frag = np.array(all_frag)
    predict_prob = loaded_model.predict(all_frag)
    # top_n most probable clusters per fragment position
    clusters = np.argsort(-predict_prob)[:, :top_n]
    n = predict_prob.shape[0]
    # gather the probabilities of the selected clusters and renormalize
    # so each position's top_n weights sum to 1
    prob = predict_prob[np.arange(n).reshape(n, 1), clusters]
    prob /= (prob.sum(axis=1)).reshape(n, 1)
    # pre = "/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/fragment_memory/"
    pre = toLocation
    header = '''\
[Target]
query

[Memories]
'''
    with open(pre+f"{pdb}.mem", "w") as out:
        out.write(header)
        if evenWeight:
            weight = totalWeight/top_n
        for index, c in enumerate(clusters):
            for index2, i in enumerate(c):
                if not evenWeight:
                    weight = prob[index][index2]
                out.write(f"fraglib/{i}.gro {index+1} 1 9 {weight:.3}\n")

In [8]:
pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
top_n = 3
pre = f"/Users/weilu/Research/server/march_2019/frag_memory_explore/fragment_memory_top{top_n}/"
os.system(f"mkdir -p {pre}")
for p in pdb_list:
    name = p.lower()[:4]
    getFrags(name, pre, top_n=top_n, evenWeight=True, totalWeight=6)

In [10]:
pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
top_n = 20
pre = f"/Users/weilu/Research/server/march_2019/frag_memory_explore_2/fragment_memory_top{top_n}/"
os.system(f"mkdir -p {pre}")
for p in pdb_list:
    name = p.lower()[:4]
    getFrags(name, pre, top_n=top_n, evenWeight=True, totalWeight=20)

In [11]:
pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
top_n = 1
pre = f"/Users/weilu/Research/server/march_2019/frag_memory_explore_2/fragment_memory_top{top_n}/"
os.system(f"mkdir -p {pre}")
for p in pdb_list:
    name = p.lower()[:4]
    getFrags(name, pre, top_n=top_n, evenWeight=True, totalWeight=6)

In [ ]:
pdb = "1ctf"
top_n = 2
seqFile = f"/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/all_simulations/{pdb}/{pdb}/{pdb}.seq"
with open(seqFile) as f:
    lines = f.readlines()
a = lines[0].strip()
all_frag = []
for i in range(0, len(a)-8):
    frag = a[i:i+9]
    # print(frag)
    fragIndex = [one_to_index(x) for x in frag]
    # print(fragIndex)
    all_frag.append(fragIndex)
all_frag = np.array(all_frag)
predict_prob = loaded_model.predict(all_frag)
clusters = np.argsort(-predict_prob)[:, :top_n]
# cluster100_centers.to_feather("/Users/weilu/Research/optimization/fragment/cluster_centers.feather")
cluster100_centers = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster_centers.feather")

In [77]:
fragSeq = "NEEQRNGFI"
predict_prob = loaded_model.predict(np.array([one_to_index(x) for x in fragSeq]).reshape(1,9))
np.argsort(-predict_prob)[:, :10]


Out[77]:
array([[16, 86, 48, 23, 55,  1, 62, 58, 12, 99]])

In [67]:
top_n = 2
all_frag = []
for fragSeq in data_original["seq"]:
    fragIndex = [one_to_index(x) for x in fragSeq]
    all_frag.append(fragIndex)
all_frag = np.array(all_frag)
predict_prob = loaded_model.predict(all_frag)
clusters = np.argsort(-predict_prob)[:, :top_n]

In [73]:
data_original = data_original.assign(pred_top1=clusters[:,0], pred_top2=clusters[:,1])

In [78]:
from keras.metrics import top_k_categorical_accuracy

In [85]:
data.head()


Out[85]:
pdb i seq cluster rmsd s1 s2 s3 s4 s5 s6 s7 s8 s9
0 1bg6A02 107 RAVNVPTPL 0 5.756177 14 0 17 11 17 12 16 12 9
1 3g85A02 38 HKNGIKISE 0 6.329374 6 8 11 5 7 8 7 15 3
2 4je5C00 166 EAQGVITFP 0 6.365145 3 0 13 5 17 7 16 4 12
3 3q41A01 88 GFYRIPVLG 0 6.389094 5 4 19 14 7 12 17 9 5
4 1auqA00 144 KKKKVIVIP 0 6.450880 8 8 8 8 17 7 17 7 12

In [65]:
np.argsort(-predict_prob)[:, :top_n]


Out[65]:
array([[19, 12]])

In [ ]:
pdb = "1ctf"

seqFile = f"/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/all_simulations/{pdb}/{pdb}/{pdb}.seq"
with open(seqFile) as f:
    lines = f.readlines()
a = lines[0].strip()
all_frag = []
for i in range(0, len(a)-8):
    frag = a[i:i+9]
    # print(frag)
    fragIndex = [one_to_index(x) for x in frag]
    # print(fragIndex)
    all_frag.append(fragIndex)
all_frag = np.array(all_frag)
predict_prob = loaded_model.predict(all_frag)
clusters = np.argsort(-predict_prob)[:, :top_n]
