In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from small_script.myFunctions import *
import feather
import Bio.PDB as bio
from sklearn.metrics import confusion_matrix
d3_to_index = bio.Polypeptide.d3_to_index # we may want to adjust this in the future.
three_to_one = bio.Polypeptide.three_to_one
one_to_index = bio.Polypeptide.one_to_index
%matplotlib inline
%load_ext autoreload
%autoreload 2
plt.rcParams['figure.figsize'] = [16.18033, 10]
In [2]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb
from keras.utils import to_categorical
from keras.models import model_from_json
from keras.metrics import top_k_categorical_accuracy
max_features = 20
batch_size = 1024*2
maxlen = 9
n = int(1e4)
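These constants mirror the fragment encoding used throughout: each example is a 9-residue window (maxlen = 9) over an alphabet of 20 amino acids (max_features = 20), with residues mapped to integers by Bio.PDB's one_to_index. A minimal sketch of that encoding, using a 9-mer that appears later in this notebook:

# encode a 9-mer the same way the model inputs are built below
example = "NEEQRNGFI"
encoded = [one_to_index(x) for x in example]  # one_to_index('A') == 0, ..., 'Y' == 19
assert len(encoded) == maxlen and all(0 <= v < max_features for v in encoded)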
In [3]:
# get the real frag info
min_seq_sep=3
max_seq_sep=9
parser = bio.PDBParser(QUIET=True)
three_to_one = bio.Polypeptide.three_to_one
# out.write("pdb,i,j,res1,res2,dis_ca_ca,dis_ca_cb,dis_cb_ca,dis_cb_cb\n")
def getFragInfoFromPdb(pdb_list, pre, testpre):
for cc, p in enumerate(pdb_list):
has_something = False
name = p.lower()[:4]
pdbId = pdb = name
location = pre+f"{pdbId}/{pdbId}/crystal_structure.pdb"
structure = parser.get_structure("x", location)
with open(testpre+f"{pdbId}.csv", "w") as out:
for model in structure:
for chain in model:
all_residues = list(chain)
# print(all_residues)
for i, residue in enumerate(all_residues):
outLine = ""
need = True
dis_ca_ca = []
dis_ca_cb = []
dis_cb_ca = []
dis_cb_cb = []
resId = residue.get_id()[1]
frag = all_residues[i:i+max_seq_sep]
resseq_list = [x.get_id()[1] for x in frag]
fragSeq = "".join([three_to_one(x.get_resname()) for x in frag])
# print(i, fragSeq)
if len(frag) != 9:
continue
if not np.all(np.ediff1d(resseq_list)==1):
# print(f"mismatch, {resId}, {resseq_list}")
continue
for ii in range(7):
if not need:
break
try:
r1 = frag[ii]
except Exception as ex:
need = False
break
# print(i, residue.get_resname())
for j, r2 in enumerate(frag[(min_seq_sep+ii):]):
# The H of GLY is replaced with CB in this dataset
try:
r2_cb = r2["CB"]
except Exception as ex:
try:
r2_cb = r2["CA"]
except Exception as ex:
# print(pdbId, resId)
need = False
break
try:
r1_cb = r1["CB"]
except Exception as ex:
try:
r1_cb = r1["CA"]
except Exception as ex:
# print(pdbId, resId)
need = False
break
try:
r1_ca = r1["CA"]
r2_ca = r2["CA"]
except Exception as ex:
print(pdbId, resId)
# os.system(f"echo '{pdbId}' >> {pre}/without_discontinues_and_gly_exception_2")
need = False
break
dis_ca_ca.append(str(r1_ca-r2_ca))
dis_ca_cb.append(str(r1_ca-r2_cb))
dis_cb_ca.append(str(r1_cb-r2_ca))
dis_cb_cb.append(str(r1_cb-r2_cb))
if need:
outLine = f"{pdbId},{i},{fragSeq},"+",".join(dis_ca_ca)+\
","+",".join(dis_ca_cb)+\
","+",".join(dis_cb_ca)+\
","+",".join(dis_cb_cb)+"\n"
out.write(outLine)
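Each row written by getFragInfoFromPdb carries four blocks of 21 pairwise distances. The block size of 21 follows from the window geometry: anchor position ii pairs with the 9 - (min_seq_sep + ii) residues at sequence separation of at least 3, summed over ii = 0..6. A quick arithmetic check, matching the headline built in the next cell:

# 6 + 5 + 4 + 3 + 2 + 1 + 0 == 21 pairs per distance type
assert sum(max_seq_sep - (min_seq_sep + ii) for ii in range(max_seq_sep - min_seq_sep + 1)) == 21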
In [35]:
pre = "/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/all_simulations/"
testpre = "/Users/weilu/Research/optimization/fragment/testSet/"
pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
getFragInfoFromPdb(pdb_list, pre, testpre)
headline = "pdb,i,seq," + ",".join([f"caca_{i}" for i in range(1,22)]) + \
"," + ",".join([f"cacb_{i}" for i in range(1,22)]) + \
"," + ",".join([f"cbca_{i}" for i in range(1,22)]) + \
"," + ",".join([f"cbcb_{i}" for i in range(1,22)])
# combine all to one data.
folder = "testSet"
combined_csv = "test_data"
os.system(f"echo '{headline}' > /Users/weilu/Research/optimization/fragment/{combined_csv}.csv")
pdb_list = os.listdir(f"/Users/weilu/Research/optimization/fragment/{folder}/")
for cc, pdb in enumerate(pdb_list):
os.system(f"cat /Users/weilu/Research/optimization/fragment/{folder}/{pdb} >> /Users/weilu/Research/optimization/fragment/{combined_csv}.csv")
In [3]:
data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/test_data.csv")
In [4]:
import pickle
# pickle.dump(kmeans, open("kmeans_cluster100_v2", "wb"))
kmeans = pickle.load(open("/Users/weilu/Research/optimization/fragment/kmeans_cluster100_v2_2", "rb"))
In [5]:
data_original["cluster"] = kmeans.predict(data_original.iloc[:, 3:87].values)
def getScore(data, km):
    # Euclidean distance from the 84 distance features (cols 3:87) to the
    # assigned cluster center; the cluster index is the last value passed in
    return np.sqrt(((km.cluster_centers_[int(data.values[-1])] - data.values[:-1])**2).sum())
data_original["rmsd"] = data_original.iloc[:, 3:88].apply(lambda x: getScore(x, kmeans), axis=1)
In [6]:
data = data_original[["pdb", "i", "seq","cluster", "rmsd"]].reset_index(drop=True)
data["cluster"] = data["cluster"].astype(int)
for i in range(1, 10):
    # one integer column per fragment position; these become the model input later
    data[f"s{i}"] = data["seq"].apply(lambda x: one_to_index(x[i-1]))
# data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_v2_processed.feather")
In [7]:
clusterdata = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_v2_2.feather")
In [8]:
data["frequency"] = data["cluster"].apply(lambda x: clusterdata["cluster"].value_counts()[x])
In [10]:
data.head()
Out[10]:
In [11]:
# pre = "/Users/weilu/Research/server/jan_2019/lstm100_v3/"
# pre = "/Users/weilu/Research/server/feb_2019/lstm100_2/"
pre = "/Users/weilu/Research/server/march_2019/lstm100_train_based_on_frequency/"
json_file = open(f"{pre}/model_2019-03-07.json", "r")
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights(f"{pre}/model_2019-03-07.h5")
In [12]:
x_test = data.iloc[:, 5:14].values
y_test_value = data["cluster"].values
y_test = to_categorical(np.array(y_test_value), num_classes=100)
In [13]:
def top_3_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=3)
def top_5_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=5)
def top_10_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=10)
def top_20_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=20)
loaded_model.compile('adam', 'categorical_crossentropy',
                     metrics=['accuracy', top_3_accuracy, top_5_accuracy,
                              top_10_accuracy, top_20_accuracy])
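top_k_categorical_accuracy counts a sample as correct when its true cluster is among the k highest-probability predictions. A plain-numpy cross-check of what these metrics report (a sketch, not part of the original run):

def top_k_accuracy_np(y_true_idx, probs, k):
    # fraction of samples whose true class index appears among the k largest predicted probabilities
    topk = np.argsort(-probs, axis=1)[:, :k]
    return np.mean([t in row for t, row in zip(y_true_idx, topk)])

For example, top_k_accuracy_np(y_test_value, loaded_model.predict(x_test), 5) should agree with the top_5_accuracy value reported by evaluate below.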
In [14]:
top_n = 20
all_frag = []
for fragSeq in data_original["seq"]:
    fragIndex = [one_to_index(x) for x in fragSeq]
    all_frag.append(fragIndex)
all_frag = np.array(all_frag)
predict_prob = loaded_model.predict(all_frag)
# top_n most probable clusters per fragment, most probable first
clusters = np.argsort(-predict_prob)[:, :top_n]
for i in range(top_n):
    data_original[f"pred_top{i+1}"] = clusters[:, i]
In [20]:
data_original[["pdb", "i", "seq", "cluster", "rmsd", "pred_top1", "pred_top2", "pred_top3", "pred_top4", "pred_top5", "pred_top6"]]
Out[20]:
In [15]:
loaded_model.evaluate(x_test, y_test)
Out[15]:
In [85]:
def getRMSD(data, km, cluster_index_column):
    # distance between the 84 pairwise-distance features and the center of the
    # cluster whose index sits in the given column
    return np.sqrt(((km.cluster_centers_[int(data.values[cluster_index_column])] - data.values[:84])**2).sum())
for i in range(top_n):
    data_original[f"pred_top{i+1}_dRMSD"] = data_original.iloc[:, 3:].apply(lambda x: getRMSD(x, kmeans, 86 + i), axis=1)
In [87]:
data = data_original.iloc[:, 87:]
In [132]:
p1 = data.iloc[:, 22:].copy()
def minRMSD(data, n):
    # best (smallest) dRMSD among the top-n predicted clusters
    return np.min(data.values[:n])
for i in range(top_n):
    p1[f"minTop{i+1}"] = p1.apply(lambda x: minRMSD(x, i+1), axis=1)
p1 = p1.iloc[:, 20:].reset_index().melt(id_vars=["index"], var_name="dRMSD")
# "minTop3" -> 3
p1["Top_dRMSD"] = p1["dRMSD"].str.extract(r"(\d+)", expand=False).astype(int)
sns.boxplot(x="Top_dRMSD", y="value", data=p1)
Out[132]:
In [127]:
p1 = data.iloc[:, 22:].copy()
def averageRMSD(data, n):
    # mean dRMSD over the top-n predicted clusters
    return np.average(data.values[:n])
for i in range(top_n):
    p1[f"averageTop{i+1}"] = p1.apply(lambda x: averageRMSD(x, i+1), axis=1)
p1 = p1.iloc[:, 20:].reset_index().melt(id_vars=["index"], var_name="dRMSD")
# "averageTop3" -> 3
p1["Top_dRMSD"] = p1["dRMSD"].str.extract(r"(\d+)", expand=False).astype(int)
In [130]:
sns.boxplot("Top_dRMSD", "value", data=p1)
Out[130]:
In [100]:
processed_data = data.iloc[:, 22:].reset_index().melt(id_vars=["index"], var_name="dRMSD")
# "pred_top3_dRMSD".split("_")[1][3:] -> "3"
processed_data["Top_dRMSD"] = processed_data["dRMSD"].str.split("_").apply(lambda x: int(x[1][3:]))
In [111]:
sns.boxplot("Top_dRMSD", "value", data=processed_data)
Out[111]:
In [93]:
data.describe().T
Out[93]:
In [7]:
# pre = "/Users/weilu/Research/server/jan_2019/lstm100_v3/"
# pre = "/Users/weilu/Research/server/feb_2019/lstm100_2/"
# json_file = open(f"{pre}/model.json", "r")
# loaded_model_json = json_file.read()
# json_file.close()
# loaded_model = model_from_json(loaded_model_json)
# loaded_model.load_weights(f"{pre}/model.h5")
pre = "/Users/weilu/Research/server/march_2019/lstm100_train_based_on_frequency/"
json_file = open(f"{pre}/model_2019-03-07.json", "r")
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights(f"{pre}/model_2019-03-07.h5")
def getFrags(pdb, toLocation, top_n=1, evenWeight=True, totalWeight=20.0):
    # seqFile = f"/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/all_simulations/{pdb}/{pdb}/{pdb}.seq"
    seqFile = f"/Users/weilu/Research/server/march_2019/frag_memory_explore/all_simulations/{pdb}/{pdb}/{pdb}.seq"
    with open(seqFile) as f:
        lines = f.readlines()
    a = lines[0].strip()
    # slide a 9-residue window over the sequence and encode each window
    all_frag = []
    for i in range(0, len(a) - 8):
        frag = a[i:i+9]
        fragIndex = [one_to_index(x) for x in frag]
        all_frag.append(fragIndex)
    all_frag = np.array(all_frag)
    predict_prob = loaded_model.predict(all_frag)
    clusters = np.argsort(-predict_prob)[:, :top_n]
    n = predict_prob.shape[0]
    # gather each fragment's top_n probabilities and renormalize them to sum to 1
    prob = predict_prob[np.arange(n).reshape(n, 1), clusters]
    prob /= (prob.sum(axis=1)).reshape(n, 1)
    pre = toLocation
    header = '''\
[Target]
query
[Memories]
'''
    with open(pre + f"{pdb}.mem", "w") as out:
        out.write(header)
        if evenWeight:
            weight = totalWeight / top_n
        for index, c in enumerate(clusters):
            for index2, i in enumerate(c):
                if not evenWeight:
                    weight = prob[index][index2]
                out.write(f"fraglib/{i}.gro {index+1} 1 9 {weight:.3}\n")
In [8]:
pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
top_n = 3
pre = f"/Users/weilu/Research/server/march_2019/frag_memory_explore/fragment_memory_top{top_n}/"
os.system(f"mkdir -p {pre}")
for p in pdb_list:
    name = p.lower()[:4]
    getFrags(name, pre, top_n=top_n, evenWeight=True, totalWeight=6)
In [10]:
pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
top_n = 20
pre = f"/Users/weilu/Research/server/march_2019/frag_memory_explore_2/fragment_memory_top{top_n}/"
os.system(f"mkdir -p {pre}")
for p in pdb_list:
    name = p.lower()[:4]
    getFrags(name, pre, top_n=top_n, evenWeight=True, totalWeight=20)
In [11]:
pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
top_n = 1
pre = f"/Users/weilu/Research/server/march_2019/frag_memory_explore_2/fragment_memory_top{top_n}/"
os.system(f"mkdir -p {pre}")
for p in pdb_list:
    name = p.lower()[:4]
    getFrags(name, pre, top_n=top_n, evenWeight=True, totalWeight=6)
In [ ]:
pdb = "1ctf"
top_n = 2
seqFile = f"/Users/weilu/Research/server/jan_2019/iterative_optimization_another_set/all_simulations/{pdb}/{pdb}/{pdb}.seq"
with open(seqFile) as f:
lines = f.readlines()
a = lines[0].strip()
all_frag = []
for i in range(0, len(a)-8):
frag = a[i:i+9]
# print(frag)
fragIndex = [one_to_index(x) for x in frag]
# print(fragIndex)
all_frag.append(fragIndex)
all_frag = np.array(all_frag)
predict_prob = loaded_model.predict(all_frag)
clusters = np.argsort(-predict_prob)[:, :top_n]
# cluster100_centers.to_feather("/Users/weilu/Research/optimization/fragment/cluster_centers.feather")
cluster100_centers = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster_centers.feather")
In [77]:
fragSeq = "NEEQRNGFI"
predict_prob = loaded_model.predict(np.array([one_to_index(x) for x in fragSeq]).reshape(1,9))
np.argsort(-predict_prob)[:, :10]
Out[77]:
In [67]:
top_n = 2
all_frag = []
for fragSeq in data_original["seq"]:
    fragIndex = [one_to_index(x) for x in fragSeq]
    all_frag.append(fragIndex)
all_frag = np.array(all_frag)
predict_prob = loaded_model.predict(all_frag)
clusters = np.argsort(-predict_prob)[:, :top_n]
In [73]:
data_original = data_original.assign(pred_top1=clusters[:,0], pred_top2=clusters[:,1])
In [85]:
data.head()
Out[85]:
In [65]:
np.argsort(-predict_prob)[:, :top_n]
Out[65]: