In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
# from small_script.myFunctions import *
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
plt.rcParams['figure.figsize'] = [16.18033, 10] #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100
In [3]:
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Polypeptide import PPBuilder
from Bio.PDB.Polypeptide import three_to_one
def get_inside_or_not_table(pdb_file):
parser = PDBParser(PERMISSIVE=1,QUIET=True)
try:
structure = parser.get_structure('X', pdb_file)
except:
return [0]
inside_or_not_table = []
for res in structure.get_residues():
if res.get_id()[0] != " ":
continue# skip
try:
res["CA"].get_vector()
except:
print(pdb_file, res.get_id())
return [0]
inside_or_not_table.append(int(abs(res["CA"].get_vector()[-1]) < 15))
return inside_or_not_table
def extractTransmembrane(toLocation, location):
x = PDBParser().get_structure("x", location)
class Transmembrane(Select):
def accept_residue(self, residue):
if abs(residue["CA"].get_vector()[-1]) < 15:
return 1
else:
return 0
io = PDBIO()
io.set_structure(x)
io.save(toLocation, Transmembrane())
def getSeqFromPDB(location, considerGap=True):
x = PDBParser().get_structure("x", location)
seq = ""
resseqs = []
preResId = 0
for res in x.get_residues():
resId = res.get_id()[1]
if considerGap and resId != preResId + 1:
seq += " "
resseqs.append(-1)
seq += three_to_one(res.get_resname())
resseqs.append(res.get_id()[1])
preResId = resId
return seq,resseqs
In [14]:
info = pd.read_csv("/Users/weilu/Research/database/membrane_contact_dtabase/for_iter0_training_complete_jun06.csv", index_col=0)
In [25]:
a = info.query("InMembraneRatio > 0.2 and InMembraneRatio < 0.8 and Length < 2000").reset_index()
a.to_csv("/Users/weilu/Research/database/relative_k/chosen_jun19.csv")
In [22]:
a.hist("Length", bins=100)
Out[22]:
In [23]:
a.shape
Out[23]:
In [18]:
a.columns
Out[18]:
In [31]:
chosen = pd.read_csv("/Users/weilu/Research/database/relative_k/chosen_jun19.csv", index_col=0)
parser = PDBParser(PERMISSIVE=1,QUIET=True)
In [33]:
pre = "/Users/weilu/Research/database/hybrid_contact_database/"
toPre = "/Users/weilu/Research/server/jun_2019/relative_k/database/"
pdb_list = chosen.Protein.tolist()
for pdb in pdb_list:
fromLocation = f'{pre}/cleaned/{pdb}.pdb'
toLocation = f"{toPre}/dompdb/"
os.system(f"cp {fromLocation} {toLocation}")
# seq = ""
# for res in structure.get_residues():
# resName = three_to_one(res.resname)
# seq += resName
# with open(f"{toPre}/S20_seq/{pdb}.seq", "w") as out:
# out.write(seq+"\n")
In [37]:
for pdb in pdb_list:
fromLocation = f'{pre}/cleaned/{pdb}.pdb'
# toLocation = f"{toPre}/dompdb/"
# os.system(f"cp {fromLocation} {toLocation}")
try:
structure = parser.get_structure(pdb, fromLocation)
except:
print(fromLocation)
print("cannot get ", pdb)
continue
seq = ""
for res in structure.get_residues():
resName = three_to_one(res.resname)
seq += resName
with open(f"{toPre}/S20_seq/{pdb}.seq", "w") as out:
out.write(seq+"\n")
In [38]:
np.random.shuffle(pdb_list)
with open(f"{toPre}..//optimization/protein_list", "w") as out:
for pdb in pdb_list:
out.write(pdb+"\n")
In [88]:
with open("/Users/weilu/Research/server/jun_2019/relative_k/optimization/small_protein_list") as f:
names = f.readlines()
In [171]:
with open("/Users/weilu/Research/server/jun_2019/relative_k/optimization/protein_list") as f:
names = f.readlines()
In [172]:
names = [i.strip() for i in names]
In [173]:
name = names[4].strip()
name
Out[173]:
In [174]:
len(names)
Out[174]:
In [179]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
name = names[0]
cutoff = 1.0
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
value_list, value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=cutoff)
for i, name in enumerate(names[1:]):
if i % 100 == 0:
print(i)
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
one_value_list, one_value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=cutoff)
value_list += one_value_list
value_sum_list += one_value_sum_list
value_list /= len(names)
one_value_sum_list /= len(names)
In [180]:
plt.plot(gamma_list, value_list)
# plt.plot(gamma_list, value_sum_list)
Out[180]:
In [175]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
name = names[0]
cutoff = 0.9
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
value_list, value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=cutoff)
for i, name in enumerate(names[1:]):
if i % 100 == 0:
print(i)
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
one_value_list, one_value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=cutoff)
value_list += one_value_list
value_sum_list += one_value_sum_list
value_list /= len(names)
one_value_sum_list /= len(names)
In [178]:
plt.plot(gamma_list, value_list)
# plt.plot(gamma_list, value_sum_list)
Out[178]:
In [170]:
plt.plot(gamma_list, value_list)
# plt.plot(gamma_list, value_sum_list)
Out[170]:
In [ ]:
In [ ]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
In [163]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
name = names[0]
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
value_list, value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=0.9)
for name in names[1:]:
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
one_value_list, one_value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=0.9)
value_list += one_value_list
value_sum_list += one_value_sum_list
value_list /= len(names)
one_value_sum_list /= len(names)
In [166]:
plt.plot(gamma_list, value_sum_list)
Out[166]:
In [165]:
plt.plot(gamma_list, value_list)
# plt.plot(gamma_list, value_sum_list)
Out[165]:
In [131]:
def get_value(gamma, decoy, decoyQ, native, cutoff=0.95):
# gamma = 1
limit =60
energy = [a[0]+a[1]*gamma for a in decoy]
z_m = list(np.arange(1, limit, 1)) + list(np.arange(-limit, -1, 1))
d = pd.DataFrame([z_m, energy, list(decoyQ)]).T
d.columns = ["z_m", "energy", "Q"]
d = d.sort_values("z_m").reset_index()
native_energy =native[0] + native[1]*gamma
d["z"] = native_energy/d["energy"]
# dz = d["z"].tolist()
value = ( ((d["z"] > cutoff)) * (1-d["Q"]) ).sum()/(1-d["Q"]).sum()
value_sum = ((d["z"]) * (1-d["Q"]) ).sum()/(1-d["Q"]).sum()
return value, value_sum
In [162]:
def get_value_array(**kwargs):
gamma_list = np.linspace(1,4, num=200)
value_list = []
value_sum_list = []
for gamma in gamma_list:
value, value_sum = get_value(gamma, **kwargs)
# print(gamma, value)
value_list.append(value)
value_sum_list.append(value_sum)
return np.array(value_list), np.array(value_sum_list)
In [150]:
value_list, value_sum_list = get_value_array(gamma=gamma, decoy=decoy, decoyQ=decoyQ, native=native, cutoff=0.9)
In [132]:
gamma_list = np.linspace(1,4, num=200)
value_list = []
value_sum_list = []
for gamma in gamma_list:
value, value_sum = get_value(gamma, decoy, decoyQ, native, cutoff=0.9)
# print(gamma, value)
value_list.append(value)
value_sum_list.append(value_sum)
In [133]:
plt.plot(gamma_list, value_list)
plt.plot(gamma_list, value_sum_list)
Out[133]:
In [128]:
plt.plot(gamma_list, value_list)
plt.plot(gamma_list, value_sum_list)
Out[128]:
In [82]:
plt.plot(gamma_list, value_list)
plt.plot(gamma_list, value_sum_list)
Out[82]:
In [83]:
get_value(2, decoy, decoyQ, native)
Out[83]:
In [117]:
native_energy
Out[117]:
In [126]:
native_energy/ (2.5 + 128*2 )
Out[126]:
In [144]:
gamma = 1.8
limit =60
cutoff = 1.0
energy = [a[0]+a[1]*gamma for a in decoy]
z_m = list(np.arange(1, limit, 1)) + list(np.arange(-limit, -1, 1))
d = pd.DataFrame([z_m, energy, list(decoyQ)]).T
d.columns = ["z_m", "energy", "Q"]
d = d.sort_values("z_m")
d = d.reset_index()
native_energy =native[0] + native[1]*gamma
d["z"] = native_energy/d["energy"]
value = ( ((d["z"] > cutoff)) * (1-d["Q"]) ).sum()/(1-d["Q"]).sum()
print(value)
d.plot("z_m", "z")
Out[144]:
In [111]:
gamma = 2
limit =60
cutoff = 1.0
energy = [a[0]+a[1]*gamma for a in decoy]
z_m = list(np.arange(1, limit, 1)) + list(np.arange(-limit, -1, 1))
d = pd.DataFrame([z_m, energy, list(decoyQ)]).T
d.columns = ["z_m", "energy", "Q"]
d = d.sort_values("z_m")
d = d.reset_index()
native_energy =native[0] + native[1]*gamma
d["z"] = native_energy/d["energy"]
value = ( ((d["z"] > cutoff)) * (1-d["Q"]) ).sum()/(1-d["Q"]).sum()
print(value)
d.plot("z_m", "z")
Out[111]:
In [105]:
d
Out[105]:
In [84]:
gamma = 2
limit =60
cutoff = 1.0
energy = [a[0]+a[1]*gamma for a in decoy]
z_m = list(np.arange(1, limit, 1)) + list(np.arange(-limit, -1, 1))
d = pd.DataFrame([z_m, energy, list(decoyQ)]).T
d.columns = ["z_m", "energy", "Q"]
d = d.sort_values("z_m")
native_energy =native[0] + native[1]*gamma
d["z"] = native_energy/d["energy"]
value = ( ((d["z"] > cutoff)) * (1-d["Q"]) ).sum()/(1-d["Q"]).sum()
print(value)
d.plot("z_m", "z")
Out[84]:
In [47]:
plt.plot( ((d["z"].reset_index()["z"] > 0.95) ) )
Out[47]:
In [227]:
d
Out[227]:
In [192]:
decoys_all = np.loadtxt("/Users/weilu/Research/server/jun_2019/relative_k/phis/protein_list_phi_relative_k_well4.5_6.5_5.0_10_phi_decoy_all_summary.txt")
decoy_Q = np.loadtxt("/Users/weilu/Research/server/jun_2019/relative_k/phis/phi_relative_k_well_5j4i_decoysQ_shifted_4.5_6.5_5.0_10")
In [197]:
normalized = decoys_all.sum(axis=0) / (1-decoy_Q).sum()
In [198]:
normalized
Out[198]:
In [183]:
(1-decoy_Q).sum()
Out[183]:
In [ ]:
In [174]:
decoys_all.mean(axis=0)
Out[174]:
In [ ]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
decoy_high = np.loadtxt(f"{pre}/{name}_phis_high")
decoy_low = np.loadtxt(f"{pre}/{name}_phis_low")
In [220]:
gamma = 2
energy = [a[0]+a[1]*gamma for a in decoy_low]
z_m = list(z_m_high) + list(z_m_low)
d = pd.DataFrame([z_m, energy]).T
d.columns = ["z_m", "energy"]
d = d.sort_values("z_m")
native_energy = d.query("abs(z_m) == 15").iloc[0][1]
d["z"] = native_energy/d["energy"]
d.plot("z_m", "z")
Out[220]:
In [216]:
gamma = 10
energy = [a[0]+a[1]*gamma for a in decoy_low]
z_m = list(z_m_high) + list(z_m_low)
d = pd.DataFrame([z_m, energy]).T
d.columns = ["z_m", "energy"]
d = d.sort_values("z_m")
native_energy = d.query("abs(z_m) == 15").iloc[0][1]
d["z"] = native_energy/d["energy"]
d.plot("z_m", "z")
Out[216]:
In [212]:
gamma = 0.1
energy = [a[0]+a[1]*gamma for a in decoy_low]
z_m = list(z_m_high) + list(z_m_low)
d = pd.DataFrame([z_m, energy]).T
d.columns = ["z_m", "energy"]
d = d.sort_values("z_m")
native_energy = d.query("abs(z_m) == 15").iloc[0][1]
d["z"] = native_energy/d["energy"]
d.plot("z_m", "z")
Out[212]:
In [151]:
In [157]:
In [169]:
def interaction_well(r, r_min, r_max, kappa):
return 0.5 * (np.tanh(kappa * (r - r_min)) * np.tanh(kappa * (r_max - r))) + 0.5
# r = np.linspace(10, 20, num=200)
r = np.linspace(-60,60, num=200)
y = interaction_well(r, 12, 18, 1) + interaction_well(r, -18, -12, 1)
plt.plot(r, y)
Out[169]:
In [167]:
r = np.linspace(10, 20, num=200)
y = interaction_well(r, 12, 18, 1)
plt.plot(r, y)
Out[167]:
In [134]:
d.plot("z_m", "energy")
Out[134]:
In [122]:
len(energy)
Out[122]:
In [46]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_5j4i_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_5j4i_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_5j4i_decoysQ_shifted_4.5_6.5_5.0_10")
In [54]:
def compute_energy(decoy, native, decoyQ, gamma):
energy = [a[0]+a[1]*gamma for a in decoy]
return np.array(energy)
In [70]:
gamma = 10
native_energy = native[0] + native[1]*gamma
In [71]:
decoy_energy = compute_energy(decoy, native, decoyQ, gamma)
In [72]:
offset_all = np.arange(-50, 50, 1)
z = native_energy/decoy_energy
plt.plot(offset_all, z)
Out[72]:
In [45]:
decoyQ
Out[45]:
In [ ]:
In [3]:
data = pd.read_csv("/Users/weilu/Research/database/membrane_training_set/proteins-2019-05-01.csv")
data.pdbid = data.pdbid.apply(lambda x: x[2:-1])
In [4]:
data.head()
Out[4]:
In [4]:
dd = pd.read_csv("/Users/weilu/Research/database/membrane_training_set/chosen.csv", index_col=0)
pdb_list = dd.pdbid.tolist()
In [ ]:
In [5]:
filtered_list = []
for pdb in pdb_list:
location = f"/Users/weilu/Research/database/relative_k/cleaned_pdbs/{pdb}.pdb"
a = get_inside_or_not_table(location)
ratio = sum(a)/len(a)
if ratio < 0.2 or ratio > 0.8:
print("not good", pdb, ratio)
else:
filtered_list.append(pdb)
# print("good", pdb, ratio)
# print(pdb, ratio)
In [6]:
len(filtered_list)
Out[6]:
In [7]:
pdb_list = filtered_list
In [8]:
pdb_list
Out[8]:
In [23]:
with open("/Users/weilu/Research/server/may_2019/relative_k/optimization/protein_list", "w") as out:
for pdb in pdb_list:
# print(pdb)
out.write(pdb+"\n")
In [28]:
for pdb in pdb_list:
location = f"/Users/weilu/Research/server/may_2019/relative_k/database/dompdb/{pdb}.pdb"
toLocation = f"/Users/weilu/Research/server/may_2019/relative_k/database/S20_seq/{pdb}.seq"
seq,resseqs = getSeqFromPDB(location, considerGap=False)
with open(toLocation, "w") as out:
out.write(seq+'\n')
In [ ]:
0.01442
-0.31612
In [29]:
complete_proteins = "../database/cath-dataset-nonredundant-S20Clean.list"
In [33]:
a = "proteins_name_list/proteins_name_list_0.txt"
In [34]:
a.split('/')[-1].split('.')[0]
Out[34]:
In [ ]: