In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
# from small_script.myFunctions import *



%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
plt.rcParams['figure.figsize'] = [16.18033, 10]    #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100

In [3]:
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Polypeptide import PPBuilder
from Bio.PDB.Polypeptide import three_to_one
def get_inside_or_not_table(pdb_file):
    parser = PDBParser(PERMISSIVE=1,QUIET=True)
    try:
        structure = parser.get_structure('X', pdb_file)
    except:
        return [0]
    inside_or_not_table = []
    for res in structure.get_residues():
        if res.get_id()[0] != " ":
            continue# skip
        try:
            res["CA"].get_vector()
        except:
            print(pdb_file, res.get_id())
            return [0]
        inside_or_not_table.append(int(abs(res["CA"].get_vector()[-1]) < 15))
    return inside_or_not_table
def extractTransmembrane(toLocation, location):
    x = PDBParser().get_structure("x", location)
    class Transmembrane(Select):
        def accept_residue(self, residue):
            if abs(residue["CA"].get_vector()[-1]) < 15:
                return 1
            else:
                return 0

    io = PDBIO()
    io.set_structure(x)
    io.save(toLocation, Transmembrane())

def getSeqFromPDB(location, considerGap=True):
    x = PDBParser().get_structure("x", location)
    seq = ""
    resseqs = []
    preResId = 0
    for res in x.get_residues():
        resId = res.get_id()[1]
        if considerGap and resId != preResId + 1:
            seq += " "
            resseqs.append(-1)
        seq += three_to_one(res.get_resname())
        resseqs.append(res.get_id()[1])
        preResId = resId
    return seq,resseqs

In [14]:
info = pd.read_csv("/Users/weilu/Research/database/membrane_contact_dtabase/for_iter0_training_complete_jun06.csv", index_col=0)

In [25]:
a = info.query("InMembraneRatio > 0.2 and InMembraneRatio < 0.8 and Length < 2000").reset_index()
a.to_csv("/Users/weilu/Research/database/relative_k/chosen_jun19.csv")

In [22]:
a.hist("Length", bins=100)


Out[22]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a1acbe940>]],
      dtype=object)

In [23]:
a.shape


Out[23]:
(1116, 3)

In [18]:
a.columns


Out[18]:
Index(['Protein', 'Length', 'InMembraneRatio'], dtype='object')

I will compute the z score for those 1116 structures.


In [31]:
chosen = pd.read_csv("/Users/weilu/Research/database/relative_k/chosen_jun19.csv", index_col=0)
parser = PDBParser(PERMISSIVE=1,QUIET=True)

In [33]:
pre = "/Users/weilu/Research/database/hybrid_contact_database/"
toPre = "/Users/weilu/Research/server/jun_2019/relative_k/database/"
pdb_list = chosen.Protein.tolist()
for pdb in pdb_list:
    fromLocation = f'{pre}/cleaned/{pdb}.pdb'
    toLocation = f"{toPre}/dompdb/"
    os.system(f"cp {fromLocation} {toLocation}")
#     seq = ""
#     for res in structure.get_residues():
#         resName = three_to_one(res.resname)
#         seq += resName
#     with open(f"{toPre}/S20_seq/{pdb}.seq", "w") as out:
#         out.write(seq+"\n")

In [37]:
for pdb in pdb_list:
    fromLocation = f'{pre}/cleaned/{pdb}.pdb'
#     toLocation = f"{toPre}/dompdb/"
#     os.system(f"cp {fromLocation} {toLocation}")
    try:
        structure = parser.get_structure(pdb, fromLocation)
    except:
        print(fromLocation)
        print("cannot get ", pdb)
        continue
    seq = ""
    for res in structure.get_residues():
        resName = three_to_one(res.resname)
        seq += resName
    with open(f"{toPre}/S20_seq/{pdb}.seq", "w") as out:
        out.write(seq+"\n")

In [38]:
np.random.shuffle(pdb_list)
with open(f"{toPre}..//optimization/protein_list", "w") as out:
    for pdb in pdb_list:
        out.write(pdb+"\n")

In [88]:
with open("/Users/weilu/Research/server/jun_2019/relative_k/optimization/small_protein_list") as f:
    names = f.readlines()

In [171]:
with open("/Users/weilu/Research/server/jun_2019/relative_k/optimization/protein_list") as f:
    names = f.readlines()

In [172]:
names = [i.strip() for i in names]

In [173]:
name = names[4].strip()
name


Out[173]:
'4eiy'

In [174]:
len(names)


Out[174]:
1116

In [179]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
name = names[0]
cutoff = 1.0
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
value_list, value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=cutoff)
for i, name in enumerate(names[1:]):
    if i % 100 == 0:
        print(i)
    decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
    native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
    decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
    one_value_list, one_value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=cutoff)
    value_list += one_value_list
    value_sum_list += one_value_sum_list

value_list /= len(names)
one_value_sum_list /= len(names)


0
100
200
300
400
500
600
700
800
900
1000
1100

In [180]:
plt.plot(gamma_list, value_list)
# plt.plot(gamma_list, value_sum_list)


Out[180]:
[<matplotlib.lines.Line2D at 0x1a348b3550>]

In [175]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
name = names[0]
cutoff = 0.9
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
value_list, value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=cutoff)
for i, name in enumerate(names[1:]):
    if i % 100 == 0:
        print(i)
    decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
    native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
    decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
    one_value_list, one_value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=cutoff)
    value_list += one_value_list
    value_sum_list += one_value_sum_list

value_list /= len(names)
one_value_sum_list /= len(names)


0
100
200
300
400
500
600
700
800
900
1000
1100

In [178]:
plt.plot(gamma_list, value_list)
# plt.plot(gamma_list, value_sum_list)


Out[178]:
[<matplotlib.lines.Line2D at 0x1a33feab38>]

In [170]:
plt.plot(gamma_list, value_list)
# plt.plot(gamma_list, value_sum_list)


Out[170]:
[<matplotlib.lines.Line2D at 0x1a33225da0>]

In [ ]:


In [ ]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")

In [163]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
name = names[0]
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
value_list, value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=0.9)
for name in names[1:]:
    decoy = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoys_shifted_4.5_6.5_5.0_10")
    native = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_native_4.5_6.5_5.0_10")
    decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_{name}_decoysQ_shifted_4.5_6.5_5.0_10")
    one_value_list, one_value_sum_list = get_value_array(decoy=decoy, decoyQ=decoyQ, native=native, cutoff=0.9)
    value_list += one_value_list
    value_sum_list += one_value_sum_list

value_list /= len(names)
one_value_sum_list /= len(names)

In [166]:
plt.plot(gamma_list, value_sum_list)


Out[166]:
[<matplotlib.lines.Line2D at 0x1a32902b00>]

In [165]:
plt.plot(gamma_list, value_list)
# plt.plot(gamma_list, value_sum_list)


Out[165]:
[<matplotlib.lines.Line2D at 0x1a328942b0>]

In [131]:
def get_value(gamma, decoy, decoyQ, native, cutoff=0.95):
    # gamma = 1
    limit =60
    energy = [a[0]+a[1]*gamma for a in decoy]
    z_m =  list(np.arange(1, limit, 1)) + list(np.arange(-limit, -1, 1))
    d = pd.DataFrame([z_m, energy, list(decoyQ)]).T
    d.columns = ["z_m", "energy", "Q"]
    d = d.sort_values("z_m").reset_index()
    native_energy =native[0] + native[1]*gamma
    d["z"] = native_energy/d["energy"]
    # dz = d["z"].tolist()
    value = ( ((d["z"] > cutoff)) * (1-d["Q"]) ).sum()/(1-d["Q"]).sum()
    value_sum =  ((d["z"]) * (1-d["Q"]) ).sum()/(1-d["Q"]).sum()
    return value, value_sum

In [162]:
def get_value_array(**kwargs):
    gamma_list = np.linspace(1,4, num=200)
    value_list = []
    value_sum_list = []
    for gamma in gamma_list:
        value, value_sum = get_value(gamma, **kwargs)
        # print(gamma, value)
        value_list.append(value)
        value_sum_list.append(value_sum)
    return np.array(value_list), np.array(value_sum_list)

In [150]:
value_list, value_sum_list = get_value_array(gamma=gamma, decoy=decoy, decoyQ=decoyQ, native=native, cutoff=0.9)

In [132]:
gamma_list = np.linspace(1,4, num=200)
value_list = []
value_sum_list = []
for gamma in gamma_list:
    value, value_sum = get_value(gamma, decoy, decoyQ, native, cutoff=0.9)
    # print(gamma, value)
    value_list.append(value)
    value_sum_list.append(value_sum)

In [133]:
plt.plot(gamma_list, value_list)
plt.plot(gamma_list, value_sum_list)


Out[133]:
[<matplotlib.lines.Line2D at 0x1a2dfc3320>]

In [128]:
plt.plot(gamma_list, value_list)
plt.plot(gamma_list, value_sum_list)


Out[128]:
[<matplotlib.lines.Line2D at 0x1a2db69898>]

In [82]:
plt.plot(gamma_list, value_list)
plt.plot(gamma_list, value_sum_list)


Out[82]:
[<matplotlib.lines.Line2D at 0x1a27d66860>]

In [83]:
get_value(2, decoy, decoyQ, native)


Out[83]:
(1.0, 0.9755688843602008)

In [117]:
native_energy


Out[117]:
221.33339999999998

In [126]:
native_energy/ (2.5 + 128*2 )


Out[126]:
0.8562220502901353

In [144]:
gamma = 1.8
limit =60
cutoff = 1.0
energy = [a[0]+a[1]*gamma for a in decoy]
z_m =  list(np.arange(1, limit, 1)) + list(np.arange(-limit, -1, 1))
d = pd.DataFrame([z_m, energy, list(decoyQ)]).T
d.columns = ["z_m", "energy", "Q"]
d = d.sort_values("z_m")
d = d.reset_index()
native_energy =native[0] + native[1]*gamma
d["z"] = native_energy/d["energy"]

value = ( ((d["z"] > cutoff)) * (1-d["Q"]) ).sum()/(1-d["Q"]).sum()
print(value)
d.plot("z_m", "z")


0.48108864356347847
Out[144]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a3122e0f0>

In [111]:
gamma = 2
limit =60
cutoff = 1.0
energy = [a[0]+a[1]*gamma for a in decoy]
z_m =  list(np.arange(1, limit, 1)) + list(np.arange(-limit, -1, 1))
d = pd.DataFrame([z_m, energy, list(decoyQ)]).T
d.columns = ["z_m", "energy", "Q"]
d = d.sort_values("z_m")
d = d.reset_index()
native_energy =native[0] + native[1]*gamma
d["z"] = native_energy/d["energy"]

value = ( ((d["z"] > cutoff)) * (1-d["Q"]) ).sum()/(1-d["Q"]).sum()
print(value)
d.plot("z_m", "z")


0.41069983150911604
Out[111]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2d205588>

In [105]:
d


Out[105]:
index z_m energy Q z
0 59 -60.0 162.9544 0.0 1.014328
1 60 -59.0 162.9544 0.0 1.014328
2 61 -58.0 162.9544 0.0 1.014328
3 62 -57.0 162.9544 0.0 1.014328
4 63 -56.0 162.9544 0.0 1.014328
5 64 -55.0 162.9544 0.0 1.014328
6 65 -54.0 162.9544 0.0 1.014328
7 66 -53.0 162.9544 0.0 1.014328
8 67 -52.0 162.9544 0.0 1.014328
9 68 -51.0 162.9544 0.0 1.014328
10 69 -50.0 162.9544 0.0 1.014328
11 70 -49.0 162.9544 0.0 1.014328
12 71 -48.0 162.9544 0.0 1.014328
13 72 -47.0 162.9544 0.0 1.014328
14 73 -46.0 162.9544 0.0 1.014328
15 74 -45.0 162.9544 0.0 1.014328
16 75 -44.0 162.9544 0.0 1.014328
17 76 -43.0 162.9544 0.0 1.014328
18 77 -42.0 162.9544 0.0 1.014328
19 78 -41.0 162.9544 0.0 1.014328
20 79 -40.0 162.9544 0.0 1.014328
21 80 -39.0 162.9544 0.0 1.014328
22 81 -38.0 162.9544 0.0 1.014328
23 82 -37.0 162.9544 0.0 1.014328
24 83 -36.0 162.9544 0.0 1.014328
25 84 -35.0 162.9544 0.0 1.014328
26 85 -34.0 162.9544 0.0 1.014328
27 86 -33.0 162.9544 0.0 1.014328
28 87 -32.0 162.9544 0.0 1.014328
29 88 -31.0 162.9544 0.0 1.014328
... ... ... ... ... ...
88 29 30.0 158.7718 0.0 1.041049
89 30 31.0 161.3037 0.0 1.024708
90 31 32.0 166.8417 0.0 0.990695
91 32 33.0 170.1233 0.0 0.971585
92 33 34.0 170.3250 0.0 0.970434
93 34 35.0 169.6295 0.0 0.974413
94 35 36.0 167.1150 0.0 0.989075
95 36 37.0 161.5368 0.0 1.023229
96 37 38.0 158.1782 0.0 1.044956
97 38 39.0 143.5881 0.0 1.151134
98 39 40.0 140.9730 0.0 1.172488
99 40 41.0 140.2142 0.0 1.178834
100 41 42.0 141.7289 0.0 1.166235
101 42 43.0 131.9997 0.0 1.252194
102 43 44.0 129.6248 0.0 1.275136
103 44 45.0 130.4661 0.0 1.266913
104 45 46.0 128.8012 0.0 1.283289
105 46 47.0 128.8794 0.0 1.282511
106 47 48.0 127.7465 0.0 1.293884
107 48 49.0 129.3980 0.0 1.277371
108 49 50.0 129.1467 0.0 1.279856
109 50 51.0 128.3426 0.0 1.287875
110 51 52.0 129.5045 0.0 1.276320
111 52 53.0 131.4498 0.0 1.257432
112 53 54.0 131.2152 0.0 1.259680
113 54 55.0 131.2152 0.0 1.259680
114 55 56.0 131.3004 0.0 1.258863
115 56 57.0 131.6649 0.0 1.255378
116 57 58.0 131.6649 0.0 1.255378
117 58 59.0 131.6649 0.0 1.255378

118 rows × 5 columns


In [84]:
gamma = 2
limit =60
cutoff = 1.0
energy = [a[0]+a[1]*gamma for a in decoy]
z_m =  list(np.arange(1, limit, 1)) + list(np.arange(-limit, -1, 1))
d = pd.DataFrame([z_m, energy, list(decoyQ)]).T
d.columns = ["z_m", "energy", "Q"]
d = d.sort_values("z_m")
native_energy =native[0] + native[1]*gamma
d["z"] = native_energy/d["energy"]

value = ( ((d["z"] > cutoff)) * (1-d["Q"]) ).sum()/(1-d["Q"]).sum()
print(value)
d.plot("z_m", "z")


0.1512625495519803
Out[84]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a27bd3ba8>

In [47]:
plt.plot( ((d["z"].reset_index()["z"] > 0.95) ) )


Out[47]:
[<matplotlib.lines.Line2D at 0x1a1e235a58>]

In [227]:
d


Out[227]:
z_m energy z
59 -60.0 687.9749 0.964768
60 -59.0 687.9749 0.964768
61 -58.0 687.9749 0.964768
62 -57.0 687.9749 0.964768
63 -56.0 687.9749 0.964768
64 -55.0 687.9749 0.964768
65 -54.0 687.9749 0.964768
66 -53.0 687.9749 0.964768
67 -52.0 687.9749 0.964768
68 -51.0 687.9749 0.964768
69 -50.0 687.9749 0.964768
70 -49.0 687.9749 0.964768
71 -48.0 687.9749 0.964768
72 -47.0 687.9749 0.964768
73 -46.0 687.9749 0.964768
74 -45.0 687.9749 0.964768
75 -44.0 687.9749 0.964768
76 -43.0 687.9749 0.964768
77 -42.0 687.9749 0.964768
78 -41.0 687.9749 0.964768
79 -40.0 687.9749 0.964768
80 -39.0 687.9749 0.964768
81 -38.0 687.9749 0.964768
82 -37.0 687.9749 0.964768
83 -36.0 687.9749 0.964768
84 -35.0 687.9749 0.964768
85 -34.0 687.9749 0.964768
86 -33.0 687.3165 0.965692
87 -32.0 687.3163 0.965693
88 -31.0 687.4239 0.965541
... ... ... ...
29 30.0 693.2924 0.957368
30 31.0 693.2924 0.957368
31 32.0 693.2924 0.957368
32 33.0 693.2924 0.957368
33 34.0 693.2924 0.957368
34 35.0 693.2924 0.957368
35 36.0 693.2924 0.957368
36 37.0 693.2924 0.957368
37 38.0 693.2924 0.957368
38 39.0 693.2924 0.957368
39 40.0 693.2924 0.957368
40 41.0 693.2924 0.957368
41 42.0 693.2924 0.957368
42 43.0 693.2924 0.957368
43 44.0 693.2924 0.957368
44 45.0 693.2924 0.957368
45 46.0 693.2924 0.957368
46 47.0 693.2924 0.957368
47 48.0 693.2924 0.957368
48 49.0 693.2924 0.957368
49 50.0 693.2924 0.957368
50 51.0 693.2924 0.957368
51 52.0 693.2924 0.957368
52 53.0 693.2924 0.957368
53 54.0 693.2924 0.957368
54 55.0 693.2924 0.957368
55 56.0 693.2924 0.957368
56 57.0 693.2924 0.957368
57 58.0 693.2924 0.957368
58 59.0 693.2924 0.957368

118 rows × 3 columns


In [192]:
decoys_all = np.loadtxt("/Users/weilu/Research/server/jun_2019/relative_k/phis/protein_list_phi_relative_k_well4.5_6.5_5.0_10_phi_decoy_all_summary.txt")
decoy_Q = np.loadtxt("/Users/weilu/Research/server/jun_2019/relative_k/phis/phi_relative_k_well_5j4i_decoysQ_shifted_4.5_6.5_5.0_10")

In [197]:
normalized = decoys_all.sum(axis=0) / (1-decoy_Q).sum()

In [198]:
normalized


Out[198]:
array([ 89.09506801, 295.90188123])

In [183]:
(1-decoy_Q).sum()


Out[183]:
105.9998

In [ ]:


In [174]:
decoys_all.mean(axis=0)


Out[174]:
array([ 80.03440161, 265.80966297])

In [ ]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
decoy_high = np.loadtxt(f"{pre}/{name}_phis_high")
decoy_low = np.loadtxt(f"{pre}/{name}_phis_low")

In [220]:
gamma = 2
energy = [a[0]+a[1]*gamma for a in decoy_low]
z_m =  list(z_m_high) + list(z_m_low)
d = pd.DataFrame([z_m, energy]).T
d.columns = ["z_m", "energy"]
d = d.sort_values("z_m")
native_energy = d.query("abs(z_m) == 15").iloc[0][1]
d["z"] = native_energy/d["energy"]
d.plot("z_m", "z")


Out[220]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1e189b70>

In [216]:
gamma = 10
energy = [a[0]+a[1]*gamma for a in decoy_low]
z_m =  list(z_m_high) + list(z_m_low)
d = pd.DataFrame([z_m, energy]).T
d.columns = ["z_m", "energy"]
d = d.sort_values("z_m")
native_energy = d.query("abs(z_m) == 15").iloc[0][1]
d["z"] = native_energy/d["energy"]
d.plot("z_m", "z")


Out[216]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1d0fb4e0>

In [212]:
gamma = 0.1
energy = [a[0]+a[1]*gamma for a in decoy_low]
z_m =  list(z_m_high) + list(z_m_low)
d = pd.DataFrame([z_m, energy]).T
d.columns = ["z_m", "energy"]
d = d.sort_values("z_m")
native_energy = d.query("abs(z_m) == 15").iloc[0][1]
d["z"] = native_energy/d["energy"]
d.plot("z_m", "z")


Out[212]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1c9d3c18>

In [151]:


In [157]:


In [169]:
def interaction_well(r, r_min, r_max, kappa):
    return 0.5 * (np.tanh(kappa * (r - r_min)) * np.tanh(kappa * (r_max - r))) + 0.5
# r = np.linspace(10, 20, num=200)
r = np.linspace(-60,60, num=200)
y = interaction_well(r, 12, 18, 1) + interaction_well(r, -18, -12, 1)
plt.plot(r, y)


Out[169]:
[<matplotlib.lines.Line2D at 0x1a1c4575c0>]

In [167]:
r = np.linspace(10, 20, num=200)
y = interaction_well(r, 12, 18, 1)
plt.plot(r, y)


Out[167]:
[<matplotlib.lines.Line2D at 0x1a1cb1b390>]

In [134]:
d.plot("z_m", "energy")


Out[134]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1d4d72b0>

In [122]:
len(energy)


Out[122]:
118

In [46]:
pre = "/Users/weilu/Research/server/jun_2019/relative_k/phis"
decoy = np.loadtxt(f"{pre}/phi_relative_k_well_5j4i_decoys_shifted_4.5_6.5_5.0_10")
native = np.loadtxt(f"{pre}/phi_relative_k_well_5j4i_native_4.5_6.5_5.0_10")
decoyQ = np.loadtxt(f"{pre}/phi_relative_k_well_5j4i_decoysQ_shifted_4.5_6.5_5.0_10")

In [54]:
def compute_energy(decoy, native, decoyQ, gamma):
    energy = [a[0]+a[1]*gamma for a in decoy]
    return np.array(energy)

In [70]:
gamma = 10
native_energy = native[0] + native[1]*gamma

In [71]:
decoy_energy = compute_energy(decoy, native, decoyQ, gamma)

In [72]:
offset_all = np.arange(-50, 50, 1)
z = native_energy/decoy_energy
plt.plot(offset_all, z)


Out[72]:
[<matplotlib.lines.Line2D at 0x1a1d9b6c50>]

In [45]:
decoyQ


Out[45]:
array([0.37528604, 0.37528604, 0.37528604, 0.37528604, 0.37528604,
       0.37528604, 0.37528604, 0.37528604, 0.37528604, 0.37299771,
       0.37185355, 0.36498856, 0.35697941, 0.34668192, 0.33524027,
       0.31922197, 0.29977117, 0.27803204, 0.25858124, 0.23913043,
       0.21395881, 0.23226545, 0.24828375, 0.27116705, 0.28832952,
       0.30778032, 0.33409611, 0.35469108, 0.37871854, 0.39702517,
       0.41533181, 0.43592677, 0.46224256, 0.48283753, 0.49885584,
       0.51716247, 0.5389016 , 0.55377574, 0.58009153, 0.60183066,
       0.62356979, 0.64988558, 0.67963387, 0.70823799, 0.74485126,
       0.78489703, 0.81922197, 0.86613272, 0.8993135 , 0.95308924,
       1.        , 0.95652174, 0.92334096, 0.87757437, 0.83524027,
       0.79290618, 0.74485126, 0.70709382, 0.66132723, 0.63501144,
       0.60526316, 0.5778032 , 0.54576659, 0.52402746, 0.50800915,
       0.48855835, 0.46338673, 0.44622426, 0.41990847, 0.40045767,
       0.3798627 , 0.3604119 , 0.33867277, 0.32036613, 0.29519451,
       0.27002288, 0.25629291, 0.23112128, 0.2173913 , 0.18306636,
       0.16132723, 0.18649886, 0.20366133, 0.22654462, 0.25171625,
       0.27459954, 0.29633867, 0.31235698, 0.33524027, 0.34324943,
       0.35354691, 0.36155606, 0.36727689, 0.36842105, 0.36842105,
       0.36956522, 0.37299771, 0.37528604, 0.37528604, 0.37528604])

In [ ]:


In [3]:
data = pd.read_csv("/Users/weilu/Research/database/membrane_training_set/proteins-2019-05-01.csv")
data.pdbid = data.pdbid.apply(lambda x: x[2:-1])

In [4]:
data.head()


Out[4]:
id ordering family_name_cache species_name_cache membrane_name_cache name description comments pdbid resolution ... species_id family_id superfamily_id classtype_id type_id secondary_representations_count structure_subunits_count citations_count created_at updated_at
0 1 2024.0 OmpA family Escherichia coli Gram-neg. outer Outer membrane protein A (OMPA), disordered loops NaN OmpA is required for the action of colicins K ... 1qjp 1.65 ... 9 34 26 2 1 3 1 2 2018-08-13 03:49:46 UTC 2018-09-21 18:14:03 UTC
1 2 2028.0 Enterobacterial Ail/Lom protein Escherichia coli Gram-neg. outer Outer membrane protein X (OMPX) NaN OmpX from Escherichia coli promotes adhesion t... 1qj8 1.90 ... 9 355 26 2 1 7 1 1 2018-08-13 03:49:46 UTC 2018-09-21 18:14:03 UTC
2 3 2033.0 Opacity porins Neisseria meningitidis Gram-neg. outer Outer membrane protein NspA NaN Pathogenic Neisseria spp. possess a repertoire... 1p4t 2.55 ... 24 337 235 2 1 0 1 0 2018-08-13 03:49:46 UTC 2018-09-21 18:14:03 UTC
3 4 1740.0 Influenza virus matrix protein 2 Influenza virus Viral M2 proton channel of Influenza A, closed state... NaN NaN 3lbw 1.65 ... 51 263 185 11 1 3 4 0 2018-08-13 03:49:46 UTC 2018-10-02 17:42:36 UTC
4 5 2045.0 OM protease omptin, OMPT Yersinia pestis Gram-neg. outer Plasminogen activator PLA (coagulase/fibrinoly... NaN NaN 2x55 1.85 ... 299 36 27 2 1 2 1 0 2018-08-13 03:49:46 UTC 2018-09-21 18:14:03 UTC

5 rows × 31 columns


In [4]:
dd = pd.read_csv("/Users/weilu/Research/database/membrane_training_set/chosen.csv", index_col=0)
pdb_list = dd.pdbid.tolist()

In [ ]:


In [5]:
filtered_list = []
for pdb in pdb_list:
    location = f"/Users/weilu/Research/database/relative_k/cleaned_pdbs/{pdb}.pdb"
    a = get_inside_or_not_table(location)
    ratio = sum(a)/len(a)
    if ratio < 0.2 or ratio > 0.8:
        print("not good", pdb, ratio)
    else:
        filtered_list.append(pdb)
        # print("good", pdb, ratio)
    # print(pdb, ratio)


not good 5lcb 0.0847457627118644
not good 6f0k 0.09178743961352658
not good 6adq 0.1346153846153846
not good 2oau 0.15748031496062992
not good 6cxh 0.1099476439790576
not good 1p49 0.09124087591240876
not good 4i0u 0.11527377521613832
not good 3bpp 0.0
not good 1kn9 0.004273504273504274
not good 4qo2 0.8186813186813187
not good 2hi7 0.0
not good 5svl 0.16516516516516516
not good 6el1 0.06267806267806268
not good 2ksf 0.8691588785046729
not good 2bhv 0.010582010582010581
not good 6idf 0.037481259370314844
not good 4jza 0.0
not good 5iji 0.18061674008810572
not good 1lv7 0.00398406374501992
not good 3b8n 0.0
not good 3pjv 0.0078125
not good 2lop 0.88
not good 2lom 0.8387096774193549
not good 2mmu 0.82
not good 5eke 0.1736111111111111
not good 5sy1 0.0
not good 5wud 0.8469387755102041
not good 5uph 0.0
not good 5v7v 0.021207177814029365
not good 6f2d 0.03940886699507389
not good 6c5w 0.14736842105263157
not good 6bug 0.0
not good 6mlu 0.0784313725490196
not good 6mct 0.8076923076923077

In [6]:
len(filtered_list)


Out[6]:
86

In [7]:
pdb_list = filtered_list

In [8]:
pdb_list


Out[8]:
['6c6l',
 '6aky',
 '1j4n',
 '5gjw',
 '1kpl',
 '4y7k',
 '6eu6',
 '5kxi',
 '4j05',
 '6ajg',
 '6fn4',
 '4nv6',
 '2zjs',
 '6gct',
 '2c3e',
 '5ksd',
 '4b4a',
 '4llh',
 '5y79',
 '5oc0',
 '3fi1',
 '4m58',
 '4a2n',
 '4uc1',
 '2q7r',
 '2yvx',
 '3h90',
 '3b4r',
 '6akg',
 '2kdc',
 '6al2',
 '3m73',
 '2m67',
 '5t77',
 '6bvg',
 '6bbg',
 '3rce',
 '6g72',
 '5vkv',
 '6d9z',
 '4r1i',
 '6nsj',
 '4p02',
 '5tcx',
 '5oyb',
 '6n28',
 '4quv',
 '4qtn',
 '5hwy',
 '3tij',
 '4av3',
 '2lor',
 '4il3',
 '3zd0',
 '5o5e',
 '4od5',
 '4o6m',
 '4pgr',
 '5jwy',
 '4o9u',
 '2mpn',
 '4q2e',
 '4x5m',
 '4rp9',
 '4zr1',
 '5a40',
 '4xu4',
 '5azb',
 '5dir',
 '5zfp',
 '5n6h',
 '5vre',
 '5mlz',
 '5tsa',
 '5xj5',
 '6bml',
 '6b87',
 '6cb2',
 '5zug',
 '6c70',
 '6bar',
 '6bhp',
 '4cad',
 '6iu3',
 '6nt6',
 '6m97']

In [23]:
with open("/Users/weilu/Research/server/may_2019/relative_k/optimization/protein_list", "w") as out:
    for pdb in pdb_list:
        # print(pdb)
        out.write(pdb+"\n")

In [28]:
for pdb in pdb_list:
    location = f"/Users/weilu/Research/server/may_2019/relative_k/database/dompdb/{pdb}.pdb"
    toLocation = f"/Users/weilu/Research/server/may_2019/relative_k/database/S20_seq/{pdb}.seq"
    seq,resseqs = getSeqFromPDB(location, considerGap=False)
    with open(toLocation, "w") as out:
        out.write(seq+'\n')

In [ ]:
0.01442
-0.31612

In [29]:
complete_proteins = "../database/cath-dataset-nonredundant-S20Clean.list"

In [33]:
a = "proteins_name_list/proteins_name_list_0.txt"

In [34]:
a.split('/')[-1].split('.')[0]


Out[34]:
'proteins_name_list_0'

In [ ]: