In [1]:
import datetime
import glob
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): star import hides where helpers like get_inside_or_not_table,
# extractTransmembrane and getSeqFromPDB come from — prefer explicit imports.
from small_script.myFunctions import *
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
# Global figure defaults: golden-ratio canvas, white background, 100 dpi.
plt.rcParams.update({
    "figure.figsize": [16.18033, 10],  # golden ratio
    "figure.facecolor": "w",
    "figure.dpi": 100,
})
In [3]:
# Load the membrane-protein training-set table (2019-05-01 export).
data = pd.read_csv("/Users/weilu/Research/database/membrane_training_set/proteins-2019-05-01.csv")
# pdbid was serialized as a bytes repr (e.g. "b'1abc'"); strip the leading
# b' and the trailing ' to recover the bare 4-character PDB id.
data.pdbid = data.pdbid.apply(lambda x: x[2:-1])
In [4]:
data.head()
Out[4]:
In [5]:
# Keep only classtype_id == 1 (presumably the alpha-helical class —
# TODO confirm against the database schema).
d = data.query("classtype_id == 1")
# Earlier variant sampled 5 entries per superfamily; now the whole class is kept.
# dd = d.groupby("superfamily_id").apply(pd.DataFrame.sample, 5, replace=True).reset_index(drop=True)
dd = d.reset_index(drop=True)
In [6]:
dd.shape
Out[6]:
In [7]:
ddd = dd.drop_duplicates().reset_index(drop=True)
In [8]:
# Persist the chosen set for later sessions / other notebooks.
ddd.to_csv("/Users/weilu/Research/database/membrane_training_set/chosen_large_data.csv")
In [16]:
# Reload the saved selection (execution counts jump here: the cells below
# were run in a later session).
a = pd.read_csv("/Users/weilu/Research/database/membrane_training_set/chosen_large_data.csv", index_col=0)
In [17]:
a.shape
Out[17]:
In [18]:
# print(a.pdbid.tolist())
pdb_list = a.pdbid.tolist()
In [30]:
# Keep only structures where at least 40% of residues are flagged "inside"
# by get_inside_or_not_table (membrane interior, presumably — confirm in
# small_script.myFunctions).
pre = "/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/cleaned_pdbs/"
filtered_list = []
for pdb in pdb_list:
    location = pre + f"{pdb}.pdb"
    inside_table = get_inside_or_not_table(location)
    # Explicit empty guard instead of the old `len(a) + 1e-6` fudge, which
    # slightly skewed every printed ratio; an empty table gives ratio 0.0
    # and is rejected, exactly as before.
    n = len(inside_table)
    ratio = sum(inside_table) / n if n else 0.0
    if ratio < 0.4:
        print("not good", pdb, ratio)
    else:
        filtered_list.append(pdb)
In [31]:
len(filtered_list)
Out[31]:
In [15]:
In [21]:
# Rebuild pdb_list from the cleaned_pdbs directory contents instead.
# NOTE(review): `glob` is only imported much further down (In [50]) — this
# cell relied on out-of-order execution; import glob at the top.
pre = "/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/"
a = glob.glob(pre+"cleaned_pdbs/*")
In [24]:
# First 4 characters of each file name are the PDB id.
b = [i.split("/")[-1][:4] for i in a]
In [25]:
pdb_list = b
In [33]:
# Extract the transmembrane region of every structure: dompdb/<pdb>.pdb is
# the source, cleaned_pdbs/<pdb>.pdb the destination.
pre = "/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/"
for pdb in pdb_list:
    toLocation = pre + f"dompdb/{pdb}.pdb"
    location = pre + f"cleaned_pdbs/{pdb}.pdb"
    try:
        extractTransmembrane(toLocation, location)
    except Exception as e:
        # Keep the best-effort behavior (continue with the next structure),
        # but report the failure instead of the old bare `except: pass`,
        # which silently swallowed everything — even KeyboardInterrupt.
        print("extractTransmembrane failed for", pdb, ":", e)
In [28]:
# Same inside-ratio filter as In [30] above, but applied to the raw dompdb
# files (copy-pasted loop — candidate for a shared helper function).
pre = "/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/dompdb/"
filtered_list = []
for pdb in pdb_list:
    location = pre + f"{pdb}.pdb"
    a = get_inside_or_not_table(location)
    # ratio = sum(a)/len(a)
    # +1e-6 guards against division by zero for an empty table.
    ratio = sum(a)/(len(a)+1e-6)
    if ratio < 0.4:
        print("not good", pdb, ratio)
    else:
        filtered_list.append(pdb)
        # print(pdb, ratio)
In [29]:
len(filtered_list)
Out[29]:
In [32]:
# Surviving structures become the working set for everything below.
pdb_list = filtered_list
In [39]:
pre = "/Users/weilu/Research/server/may_2019/four_body_helix_large_data/"
In [40]:
# Write the kept PDB ids, one per line, for the optimization pipeline.
with open(f"{pre}/optimization/protein_list", "w") as out:
    for pdb in pdb_list:
        # print(pdb)
        out.write(pdb+"\n")
In [38]:
# Extract each structure's sequence into S20_seq/<pdb>.seq.
# considerGap=False: chain breaks are NOT marked with spaces/-1 entries.
for pdb in pdb_list:
    location = f"{pre}/dompdb/{pdb}.pdb"
    toLocation = f"{pre}/S20_seq/{pdb}.seq"
    seq,resseqs = getSeqFromPDB(location, considerGap=False)
    with open(toLocation, "w") as out:
        out.write(seq+'\n')
In [46]:
# Gamma vectors from runs that differ only in decoy count. Each run stores
# the same gamma filename under <run_dir>/optimization/, so load them with
# one helper instead of five copy-pasted pre/location/loadtxt blocks.
base = "/Users/weilu/Research/server/may_2019/"
gamma_name = "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_10_True_gamma"

def load_gamma(run_dir, name=gamma_name):
    """Load an optimized gamma vector from base/<run_dir>/optimization/<name>."""
    return np.loadtxt(base + run_dir + "/optimization/" + name)

gamma_k1000 = load_gamma("four_body_helix")
gamma_k2000 = load_gamma("four_body_helix_repeat_n_decoys2000")
gamma_k4000 = load_gamma("four_body_helix_repeat_n_decoys4000")
gamma_k8000 = load_gamma("four_body_helix_repeat_n_decoys8000")
# The k8000 run also has a filtered variant of the same gamma file.
gamma_k8000_f = load_gamma("four_body_helix_repeat_n_decoys8000", gamma_name + "_filtered")
In [23]:
In [3]:
In [4]:
# Gamma optimized with the complete set of phi terms (four-body helix
# docking + pairwise contact + density-mediated contact + burial).
pre = '/Users/weilu/Research/server/may_2019/four_body/four_body_helix_use_cb_complete_phi/optimization/'
location = pre + "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_6_Truephi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma"
gamma_complete_phis = np.loadtxt(location)
In [5]:
# Same run, but load the raw optimization matrices (A, B, filtered variants,
# eigenvalue spectra, etc.) rather than the final gamma.
name = "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_6_Truephi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
A,B,B_filtered,Gamma,Gamma_filtered,Lamb,Lamb_filtered,half_B,other_half_B,std_half_B,A_prime = get_raw_optimization_data(pre, name)
In [6]:
# NOTE(review): mid-notebook star import — move to the top imports cell.
from pyCodeLib import *
In [7]:
total_phis = len(A)
num_decoys = 1000
# Recompute gamma with noisy modes of B filtered out (setCutoff=726 kept modes —
# TODO confirm the meaning of setCutoff in pyCodeLib).
filtered_gamma, filtered_B, filtered_lamb, P, lamb = get_filtered_gamma_B_lamb_P_and_lamb(
    A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, setCutoff=726)
In [9]:
# The 126 six-letter composition codes labeling the four-body phi terms
# (each code's digits sum to 4).
six_letter_code_combinations = ['000004', '000013', '000022', '000031', '000040', '000103', '000112', '000121', '000130', '000202', '000211', '000220', '000301', '000310', '000400', '001003', '001012', '001021', '001030', '001102', '001111', '001120', '001201', '001210', '001300', '002002', '002011', '002020', '002101', '002110', '002200', '003001', '003010', '003100', '004000', '010003', '010012', '010021', '010030', '010102', '010111', '010120', '010201', '010210', '010300', '011002', '011011', '011020', '011101', '011110', '011200', '012001', '012010', '012100', '013000', '020002', '020011', '020020', '020101', '020110', '020200', '021001', '021010', '021100', '022000', '030001', '030010', '030100', '031000', '040000', '100003', '100012', '100021', '100030', '100102', '100111', '100120', '100201', '100210', '100300', '101002', '101011', '101020', '101101', '101110', '101200', '102001', '102010', '102100', '103000', '110002', '110011', '110020', '110101', '110110', '110200', '111001', '111010', '111100', '112000', '120001', '120010', '120100', '121000', '130000', '200002', '200011', '200020', '200101', '200110', '200200', '201001', '201010', '201100', '202000', '210001', '210010', '210100', '211000', '220000', '300001', '300010', '300100', '301000', '310000', '400000']
six_letter_code_combinations = np.array(six_letter_code_combinations)
In [10]:
# Negate the first 126 gamma entries and sort, so the most favorable codes
# come first; sorted_letter and y_g stay aligned by construction.
g = -filtered_gamma[:126]
sorted_letter = six_letter_code_combinations[np.argsort(g)]
y_g = g[np.argsort(g)]
In [47]:
# NOTE(review): imported but never used in the visible cells.
from adjustText import adjust_text
In [14]:
# Full sorted curve with every six-letter code annotated at its point.
fig, ax = plt.subplots()
ax.plot(sorted_letter, y_g)
for i, txt in enumerate(sorted_letter):
    ax.annotate(txt, (i, y_g[i]))
In [11]:
# Ten lowest entries; i runs 0..9 so y_g[i] matches the plotted slice.
fig, ax = plt.subplots()
ax.plot(sorted_letter[:10], y_g[:10])
for i, txt in enumerate(sorted_letter[:10]):
    ax.annotate(txt, (i, y_g[i]))
In [16]:
# Plot the 20 highest entries of the sorted gamma curve with their codes.
fig, ax = plt.subplots()
tail_letters = sorted_letter[-20:]
tail_values = y_g[-20:]
ax.plot(tail_letters, tail_values)
for i, txt in enumerate(tail_letters):
    # Bug fix: annotate at the plotted tail values. The original used
    # y_g[i] with i in 0..19, i.e. the HEAD of the full array, so every
    # label was placed at the wrong height.
    ax.annotate(txt, (i, tail_values[i]))
In [12]:
gamma_complete_phis.shape
Out[12]:
In [24]:
# NOTE(review): self-referential truncation — idempotent on re-run, but it
# permanently discards entries beyond 126 (the non-four-body phi terms,
# presumably — confirm against the phi layout).
gamma_complete_phis = gamma_complete_phis[:126]
sorted_letter = six_letter_code_combinations[np.argsort(gamma_complete_phis)]
In [30]:
gamma_complete_phis.round(1)
Out[30]:
In [29]:
# NOTE(review): `y` is defined below in In [20] — this cell only works when
# run out of order; the execution counts confirm the scrambled history.
y.round(1)
Out[29]:
In [20]:
y = gamma_complete_phis[np.argsort(gamma_complete_phis)]
In [21]:
fig, ax = plt.subplots()
ax.plot(sorted_letter, gamma_complete_phis[np.argsort(gamma_complete_phis)])
for i, txt in enumerate(sorted_letter):
    ax.annotate(txt, (i, y[i]))
In [ ]:
In [18]:
plt.plot(sorted_letter, gamma_complete_phis[np.argsort(gamma_complete_phis)])
Out[18]:
In [10]:
# NOTE(review): [:125] here vs [:126] everywhere else — off-by-one? confirm.
plt.plot(gamma_complete_phis[:125])
Out[10]:
In [45]:
# Gamma from the large training set (filtered variant of the gamma file).
pre = '/Users/weilu/Research/server/may_2019/four_body_helix_large_data/optimization/'
location = pre + "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_6_True_gamma_filtered"
gamma_large = np.loadtxt(location)
In [47]:
# Gamma from the "more data" run (unfiltered).
pre = '/Users/weilu/Research/server/may_2019/four_body_helix_more_data/optimization/'
location = pre + "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_6_True_gamma"
gamma_more = np.loadtxt(location)
In [53]:
In [55]:
# Re-run of the conversion in In [9]; np.array on an ndarray just copies.
six_letter_code_combinations = np.array(six_letter_code_combinations)
In [52]:
np.argsort(gamma_large)
Out[52]:
In [56]:
# Codes reordered from most to least favorable under gamma_large.
sorted_letter = six_letter_code_combinations[np.argsort(gamma_large)]
In [63]:
# Count of strongly favorable (< -3) interaction codes.
np.sum(gamma_large < -3)
Out[63]:
In [64]:
# The 8 most favorable codes (matches the < -3 count above, presumably).
plt.plot(sorted_letter[:8], gamma_large[np.argsort(gamma_large)][:8])
Out[64]:
In [59]:
plt.plot(sorted_letter, gamma_large[np.argsort(gamma_large)])
Out[59]:
In [57]:
sorted_letter
Out[57]:
In [61]:
# Compare the two training-set sizes term by term.
plt.plot(gamma_more, label="gamma_more")
plt.plot(gamma_large, label="gamma_large")
# plt.plot(gamma_k8000, label="k8000")
# plt.plot(gamma_k8000_centered, label="k8000_center")
plt.legend()
# plt.ylim(-5,5)
Out[61]:
In [60]:
# Compare gammas across decoy counts (k = 1000 / 2000 / 4000).
plt.plot(gamma_k1000, label="k1000")
plt.plot(gamma_k2000, label="k2000")
plt.plot(gamma_k4000, label="k4000")
# plt.plot(gamma_more, label="gamma_more")
# plt.plot(gamma_large, label="gamma_large")
# plt.plot(gamma_k8000, label="k8000")
# plt.plot(gamma_k8000_centered, label="k8000_center")
plt.legend()
Out[60]:
In [48]:
# Same comparison with gamma_more overlaid, y-axis clipped to [-5, 5].
plt.plot(gamma_k1000, label="k1000")
plt.plot(gamma_k2000, label="k2000")
plt.plot(gamma_k4000, label="k4000")
plt.plot(gamma_more, label="gamma_more")
# plt.plot(gamma_k8000, label="k8000")
# plt.plot(gamma_k8000_centered, label="k8000_center")
plt.legend()
plt.ylim(-5,5)
Out[48]:
In [50]:
import glob
# Parse every per-protein phi data file into one long DataFrame.
a = glob.glob("/Users/weilu/Research/server/may_2019/four_body_helix_more_data/optimization/data/*.dat")
data_all = []
for d in a:
    # File-name stem before the first '-' is the protein name.
    name = d.split("/")[-1].split("-")[0]
    tmp = pd.read_csv(d, sep=" ", names=["res_pair1_chain", "res_pair1_index_1", "res_pair1_index_2",
                                         "res_pair2_chain", "res_pair2_index_1", "res_pair2_index_2",
                                         "res_type", "six_letter_string", "total_phi",
                                         "d00", "d11", "d01", "d10"])
    data_all.append(tmp.assign(protein=name))
# NOTE(review): `data` here shadows the training-set table loaded in In [3].
data = pd.concat(data_all)
data = data.reset_index(drop=True)
# Codes lost their leading zeros when parsed as integers; restore 6-char width.
data["six_letter_string"] = data["six_letter_string"].astype(str).str.pad(width=6, fillchar="0")
In [52]:
data.head()
Out[52]:
In [53]:
data["six_letter_string"].value_counts()
Out[53]:
In [ ]:
def getSeqFromPDB(location, considerGap=True):
    """Read a PDB file and return its one-letter sequence plus residue numbers.

    Parameters
    ----------
    location : str
        Path to the PDB file.
    considerGap : bool
        When True, any jump in residue numbering inserts a " " into the
        sequence and a -1 into the residue-number list to mark the gap.

    Returns
    -------
    (str, list of int)
        One-letter sequence and the parallel list of residue sequence ids
        (with -1 entries marking gaps when considerGap is True).

    NOTE(review): this local definition may shadow a same-named helper from
    the `small_script.myFunctions` star import — confirm which one callers get.
    """
    structure = PDBParser().get_structure("x", location)
    seq = ""
    resseqs = []
    prev_id = 0  # a first residue not numbered 1 therefore also counts as a gap
    for res in structure.get_residues():
        res_id = res.get_id()[1]
        if considerGap and res_id != prev_id + 1:
            seq += " "
            resseqs.append(-1)
        seq += three_to_one(res.get_resname())
        resseqs.append(res_id)
        prev_id = res_id
    return seq, resseqs