In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
sys.path.insert(0,'/Users/weilu/Research/opt_server/')
# from notebookFunctions import *
# from .. import notebookFunctions

from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180)    #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2

In [2]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
from Bio.PDB.Polypeptide import aa3

In [3]:
plt.rcParams['figure.figsize'] = [16.18033, 10]    #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100

In [4]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_iterative_native_with_cbd_info.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data_native = a
data_native.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")


Out[4]:
Res1 Res2 Theta
83 CYS ASP 4
64 ASP CYS 4
124 GLU CYS 7
85 CYS GLN 7
86 CYS GLU 7
... ... ... ...
199 ILE VAL 370
209 LEU ILE 424
190 ILE LEU 424
219 LEU VAL 490
390 VAL LEU 490

400 rows × 3 columns


In [5]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_iterative_with_cbd_info.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data_iterative = a
data_iterative.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")


Out[5]:
Res1 Res2 Theta
84 CYS CYS 1
242 MET ASN 5
52 ASN MET 5
105 GLN GLN 7
357 TRP TRP 7
... ... ... ...
209 LEU ILE 440
389 VAL ILE 512
199 ILE VAL 512
219 LEU VAL 614
390 VAL LEU 614

400 rows × 3 columns


In [12]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data_old = a
data_old.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")


Out[12]:
Res1 Res2 Theta
357 TRP TRP 35
168 HIS HIS 49
44 ASN CYS 54
82 CYS ASN 54
64 ASP CYS 55
... ... ... ...
390 VAL LEU 3135
219 LEU VAL 3135
190 ILE LEU 3583
209 LEU ILE 3583
210 LEU LEU 3753

400 rows × 3 columns


In [13]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath_with_cbd_info.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data2 = a
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")


Out[13]:
Res1 Res2 Theta
83 CYS ASP 31
64 ASP CYS 31
104 GLN CYS 42
44 ASN CYS 42
82 CYS ASN 42
... ... ... ...
390 VAL LEU 3564
219 LEU VAL 3564
210 LEU LEU 4051
190 ILE LEU 4107
209 LEU ILE 4107

400 rows × 3 columns


In [9]:
b = data2.query("Res1=='VAL' and Res2=='LEU'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")


Out[9]:
<seaborn.axisgrid.JointGrid at 0x101472320>

In [28]:
b = data2.query("Res1=='SER' and Res2=='ILE'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")


Out[28]:
<seaborn.axisgrid.JointGrid at 0x12141a358>

In [33]:
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")[:-100].hist("Theta", bins=50)


Out[33]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1211ddf60>]],
      dtype=object)

In [16]:
b = data2.query("Res1=='GLU' and Res2=='ALA'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")


Out[16]:
<seaborn.axisgrid.JointGrid at 0x1202deeb8>

In [14]:
data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
b = data2.query("Res1=='GLU' and Res2=='ALA'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")


Out[14]:
<seaborn.axisgrid.JointGrid at 0x120668550>

In [28]:
# data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
a = data_old.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")


Out[28]:
<seaborn.axisgrid.JointGrid at 0x11eee8f60>

In [25]:
# data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
a = data2.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
sns.jointplot("D_H", "D_P", data=a, kind="kde")


Out[25]:
<seaborn.axisgrid.JointGrid at 0x11f340860>

In [30]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
sns.jointplot("D_H", "D_P", data=a, kind="kde")


Out[30]:
<seaborn.axisgrid.JointGrid at 0x103b57eb8>

In [30]:
info = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/info_collection.csv")

In [33]:
info = info.query("Steps==2001")[["Q", "Run", "Protein", "Folder"]].reset_index(drop=True)

In [34]:
info.query("Folder")


Out[34]:
Q Run Protein Folder
0 0.36 0 1hoe iteration_3
1 0.42 1 1hoe iteration_3
2 0.43 0 1tif iteration_3
3 0.45 1 1tif iteration_3
4 0.80 0 1vcc iteration_3
... ... ... ... ...
149 0.47 1 2hbg iteration_3
150 0.81 0 1akr iteration_3
151 0.76 1 1akr iteration_3
152 0.51 0 1osa iteration_3
153 0.44 1 1osa iteration_3

154 rows × 4 columns


In [38]:
data_iterative = data_iterative.merge(info, on=["Protein", "Run"])

In [43]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU' and Q < 0.5").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")


Out[43]:
<seaborn.axisgrid.JointGrid at 0x12041e2b0>

In [44]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU' and Q > 0.5").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")


Out[44]:
<seaborn.axisgrid.JointGrid at 0x11e60ae10>

In [42]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")


Out[42]:
<seaborn.axisgrid.JointGrid at 0x128da7cc0>

In [40]:
sns.scatterplot("D_H", "D_P", hue="Q", data=data_iterative.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)


Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x11e965780>

In [153]:
sns.scatterplot("D_H", "D_P", hue="isTP", data=new_d, alpha=0.5)


Out[153]:
<matplotlib.axes._subplots.AxesSubplot at 0x12ff224e0>

In [10]:
sns.scatterplot("D_H", "D_P", hue="isTP", data=new_d, alpha=0.5)


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x1109d5e80>

In [101]:
pre = "/Users/weilu/Research/server/mar_2020/environmental_information/"
pdb = "1akr"
ii = 1
data = pd.read_csv(f"{pre}/iterative/{pdb}_{ii}.csv", index_col=0)
data_envr = pd.read_csv(f"{pre}/iterative/{pdb}_{ii}_environment.csv", index_col=0)

# data_envr["Density_H"] = data_envr["Density_H"].round()
# data_envr["Density_P"] = data_envr["Density_P"].round()

data_with_info = data.merge(data_envr, how='left', left_on="Index1", right_on="index").merge(data_envr, how='left', left_on="Index2", right_on="index")
data_ = data_with_info.query("Theta > 5e-2 and Type == 'Direct'").reset_index(drop=True)
# data_ = data_with_info

In [102]:
data_.query(f"Res1=='{res1}' and Res2=='{res2}'")


Out[102]:
Res1 Res2 Type Theta Index1 Index2 r ResId1 ResId2 index_x Density_H_x Density_P_x index_y Density_H_y Density_P_y
16 LEU LEU Direct 0.475 3 44 6.877109 4 45 3 2.000 4.751 44 2.011 1.475
83 LEU LEU Direct 1.000 24 52 6.056805 25 53 24 1.131 5.574 52 2.000 5.403
86 LEU LEU Direct 0.409 24 50 6.903615 25 51 24 1.131 5.574 50 2.465 3.409
151 LEU LEU Direct 0.207 53 76 7.001105 54 77 53 0.000 6.172 76 2.000 4.037
205 LEU LEU Direct 1.000 76 110 5.694576 77 111 76 2.000 4.037 110 1.996 4.020

In [128]:
# true positive
pdb = "1akr"
res1 = "LEU"
res2 = "LEU"
run = 0
a = data_native.query(f"Protein=='{pdb}'and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
b = data_iterative.query(f"Protein=='{pdb}' and Run=='{run}' and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)

In [119]:
contacts = set(b["Index1"].astype(str) + "_" + b["Index2"].astype(str))

In [120]:
contacts


Out[120]:
{'24_50', '24_52', '3_52', '44_72', '76_110'}

In [6]:
def isTP(contact, native_contacts):
    if contact in native_contacts:
        return "TP"
    if contact not in native_contacts:
        return "FP"


res1 = "THR"
res2 = "PRO"
run = 0
pdb = "1akr"
pdb_list = data_iterative["Protein"].unique()
new_d = []
for run in range(2):
    for pdb in pdb_list:
        a = data_native.query(f"Protein=='{pdb}'and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
        b = data_iterative.query(f"Protein=='{pdb}' and Run=='{run}' and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
        native_contacts = set(a["Index1"].astype(str) + "_" + a["Index2"].astype(str))
        b["Contact"] = b["Index1"].astype(str) + "_" + b["Index2"].astype(str)
        b["isTP"] = b["Contact"].apply(isTP, native_contacts=native_contacts)
        new_d.append(b)
new_d = pd.concat(new_d).reset_index(drop=True)

In [7]:
new_d


Out[7]:
Res1 Res2 Type Theta Index1 Index2 r ResId1 ResId2 index_x ... Density_P_x index_y Density_H_y Density_P_y Protein Run D_H D_P Contact isTP
0 THR PRO Direct 1.000 28 2 4.986226 29 3 28 ... 0.000 2 1.038 1.000 1by9 0 5.038 1.000 28_2 FP
1 THR PRO Direct 0.994 65 81 6.079741 66 82 65 ... 2.000 81 0.994 1.000 1fna 0 3.435 3.000 65_81 TP
2 THR PRO Direct 0.908 88 58 6.357295 89 59 88 ... 1.010 58 0.908 1.129 1fna 0 2.815 2.139 88_58 TP
3 THR PRO Direct 0.693 58 75 6.504425 59 76 58 ... 1.998 75 1.693 1.000 1bm8 0 3.386 2.998 58_75 TP
4 THR PRO Direct 1.000 64 51 5.319895 65 52 64 ... 2.000 51 1.246 1.001 3vub 0 3.206 3.001 64_51 TP
5 THR PRO Direct 1.000 47 19 5.083985 48 20 47 ... 0.000 19 2.000 0.177 1kte 0 5.980 0.177 47_19 TP
6 THR PRO Direct 1.000 95 77 5.553338 96 78 95 ... 1.028 77 4.153 1.000 1bkf 0 9.006 2.028 95_77 TP
7 THR PRO Direct 1.000 19 93 4.892044 20 94 19 ... 1.050 93 2.000 0.912 1sfp 0 4.000 1.962 19_93 FP
8 THR PRO Direct 1.000 30 62 5.411886 31 63 30 ... 2.020 62 1.001 1.000 1sfp 0 3.811 3.020 30_62 TP
9 THR PRO Direct 1.000 22 1 5.513818 23 2 22 ... 0.598 1 1.022 0.999 2mcm 0 4.372 1.597 22_1 FP
10 THR PRO Direct 0.953 117 20 6.284565 118 21 117 ... 0.913 20 1.198 2.071 2a0b 0 3.151 2.984 117_20 TP
11 THR PRO Direct 1.000 14 26 4.916763 15 27 14 ... 1.986 26 1.294 0.856 2sak 0 2.769 2.842 14_26 TP
12 THR PRO Direct 0.999 74 32 5.849476 75 33 74 ... 0.000 32 4.984 0.999 2sak 0 9.983 0.999 74_32 FP
13 THR PRO Direct 0.941 54 98 6.308799 55 99 54 ... 0.752 98 3.224 1.024 1bgf 0 6.262 1.776 54_98 TP
14 THR PRO Direct 0.992 35 2 6.100786 36 3 35 ... 0.000 2 2.973 0.759 1opy 0 5.966 0.759 35_2 FP
15 THR PRO Direct 0.430 69 83 6.613797 70 84 69 ... 1.000 83 3.475 2.000 1opy 0 4.835 3.000 69_83 TP
16 THR PRO Direct 1.000 12 28 4.872084 13 29 12 ... 1.102 28 2.048 1.007 1rlw 0 5.040 2.109 12_28 FP
17 THR PRO Direct 0.997 12 53 5.991587 13 54 12 ... 1.102 53 2.999 2.451 1rlw 0 5.991 3.553 12_53 FP
18 THR PRO Direct 0.956 15 26 6.278195 16 27 15 ... 0.995 26 4.104 1.000 1rlw 0 7.954 1.995 15_26 TP
19 THR PRO Direct 0.725 52 37 6.488734 53 38 52 ... 0.997 37 3.735 0.000 1hmt 0 7.908 0.997 52_37 TP
20 THR PRO Direct 0.998 52 37 5.943890 53 38 52 ... 1.001 37 2.819 1.001 1crb 0 4.817 2.002 52_37 TP
21 THR PRO Direct 1.000 54 44 5.338830 55 45 54 ... 2.156 44 3.000 0.999 1ax8 0 6.004 3.155 54_44 TP
22 THR PRO Direct 0.493 12 2 6.588423 13 3 12 ... 2.823 2 1.493 1.316 1rss 0 4.068 4.139 12_2 TP
23 THR PRO Direct 1.000 140 27 5.474146 141 28 140 ... 4.120 27 2.733 4.004 1jon 0 4.212 8.124 140_27 FP
24 THR PRO Direct 1.000 95 72 5.552165 96 73 95 ... 0.072 72 3.002 0.278 1aly 0 5.005 0.350 95_72 TP
25 THR PRO Direct 1.000 28 2 5.575712 29 3 28 ... 0.000 2 2.952 0.000 1by9 1 5.953 0.000 28_2 FP
26 THR PRO Direct 0.364 65 81 6.641636 66 82 65 ... 1.138 81 0.364 0.139 1fna 1 3.515 1.277 65_81 TP
27 THR PRO Direct 0.671 88 58 6.514388 89 59 88 ... 2.025 58 0.672 1.011 1fna 1 2.030 3.036 88_58 TP
28 THR PRO Direct 0.998 68 35 5.970904 69 36 68 ... 1.106 35 3.930 0.000 1plc 1 7.994 1.106 68_35 FP
29 THR PRO Direct 1.000 58 75 5.666871 59 76 58 ... 0.156 75 2.162 2.000 1bm8 1 3.162 2.156 58_75 TP
30 THR PRO Direct 0.782 64 51 6.458181 65 52 64 ... 1.054 51 1.782 0.237 3vub 1 2.564 1.291 64_51 TP
31 THR PRO Direct 1.000 47 19 4.833291 48 20 47 ... 0.000 19 2.000 0.157 1kte 1 5.998 0.157 47_19 TP
32 THR PRO Direct 1.000 95 77 5.206377 96 78 95 ... 0.986 77 3.997 2.000 1bkf 1 6.993 2.986 95_77 TP
33 THR PRO Direct 1.000 30 65 5.047096 31 66 30 ... 0.990 65 3.001 1.153 1sfp 1 6.001 2.143 30_65 FP
34 THR PRO Direct 0.998 86 62 5.947662 87 63 86 ... 1.076 62 1.000 1.000 1sfp 1 2.996 2.076 86_62 TP
35 THR PRO Direct 0.691 94 40 6.505275 95 41 94 ... 1.008 40 1.724 1.002 1neu 1 7.285 2.010 94_40 FP
36 THR PRO Direct 1.000 14 26 5.060741 15 27 14 ... 1.989 26 2.056 0.739 2sak 1 4.052 2.728 14_26 TP
37 THR PRO Direct 1.000 54 98 5.010488 55 99 54 ... 0.954 98 1.998 2.005 1bgf 1 5.871 2.959 54_98 TP
38 THR PRO Direct 0.957 69 83 6.274398 70 84 69 ... 1.000 83 2.776 2.332 1opy 1 3.764 3.332 69_83 TP
39 THR PRO Direct 0.154 52 37 6.756447 53 38 52 ... 0.914 37 2.203 0.000 1hmt 1 6.351 0.914 52_37 TP
40 THR PRO Direct 1.000 27 10 5.427485 28 11 27 ... 0.999 10 2.290 0.000 1htp 1 6.219 0.999 27_10 FP
41 THR PRO Direct 1.000 52 37 5.246543 53 38 52 ... 1.001 37 3.900 1.906 1crb 1 5.900 2.907 52_37 TP
42 THR PRO Direct 0.982 54 44 6.184997 55 45 54 ... 2.082 44 3.622 2.991 1ax8 1 6.604 5.073 54_44 TP
43 THR PRO Direct 1.000 95 72 4.828091 96 73 95 ... 0.000 72 2.984 0.007 1aly 1 5.984 0.007 95_72 TP
44 THR PRO Direct 0.996 16 128 6.042614 17 129 16 ... 0.610 128 2.494 1.011 1akr 1 6.476 1.621 16_128 TP

45 rows × 21 columns


In [ ]:


In [ ]:


In [125]:
native_contacts


Out[125]:
{'24_50',
 '24_52',
 '3_44',
 '44_72',
 '53_110',
 '53_72',
 '53_76',
 '76_110',
 '76_113'}

In [131]:


In [143]:


In [8]:
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_iter3/saved_gammas/iter3_z_weighted_2_cutoff400_impose_Aprime_constraint"
gamma_info = get_contact_gamma_info(gammaFile)

In [9]:
gamma_info.query("Interaction=='Direct'").sort_values("Gamma")


Out[9]:
Interaction Res1 Res2 Index Gamma
372 Direct Y P 193 -1.124
371 Direct P Y 193 -1.124
379 Direct W S 197 -1.121
378 Direct S W 197 -1.121
214 Direct K E 110 -1.080
... ... ... ... ... ...
351 Direct F F 182 1.058
243 Direct F G 125 1.091
242 Direct G F 125 1.091
277 Direct H V 143 1.524
278 Direct V H 143 1.524

400 rows × 5 columns


In [11]:
gamma_info.query("Interaction=='Direct' and Res1=='L'")


Out[11]:
Interaction Res1 Res2 Index Gamma
20 Direct L A 10 -0.388
57 Direct L R 29 0.280
92 Direct L N 47 0.045
125 Direct L D 64 0.576
156 Direct L C 80 0.056
185 Direct L Q 95 -0.039
212 Direct L E 109 -0.480
237 Direct L G 122 -0.340
260 Direct L H 134 -0.417
281 Direct L I 145 -0.322
300 Direct L L 155 -0.074
301 Direct L K 156 -0.287
303 Direct L M 157 -0.639
305 Direct L F 158 -0.365
307 Direct L P 159 -0.455
309 Direct L S 160 0.464
311 Direct L T 161 -0.118
313 Direct L W 162 -0.176
315 Direct L Y 163 -0.610
317 Direct L V 164 -0.835

In [2]:
def get_contact_gamma_info(gammaFile):
    # check the gamma.
    # read in gamma, and sort by size.
    # gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
    gamma = np.loadtxt(gammaFile)

    res_type_map_letters = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G',
                            'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

    inverse_res_type_map = dict(list(zip(list(range(20)), res_type_map_letters)))
    c = 0
    info_ = []
    for i in range(20):
        for j in range(i, 20):
            info_.append(["Direct", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
            if i != j:
                info_.append(["Direct", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
            c += 1
    for i in range(20):
        for j in range(i, 20):
            info_.append(["Protein", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
            if i != j:
                info_.append(["Protein", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
            info_.append(["Water", res_type_map_letters[i], res_type_map_letters[j], c+210, round(gamma[c+210],3)])
            if i != j:
                info_.append(["Water", res_type_map_letters[j], res_type_map_letters[i], c+210, round(gamma[c+210],3)])
            c += 1
    contact_gammas = pd.DataFrame(info_, columns=["Interaction", "Res1", "Res2", "Index", "Gamma"])
    return contact_gammas

In [59]:
data_native


Out[59]:
Res1 Res2 Type Theta Index1 Index2 r ResId1 ResId2 index_x Density_H_x Density_P_x index_y Density_H_y Density_P_y Protein D_H D_P
0 ASP ASP Direct 0.615 0 39 6.387458 1 40 0 2.615 0.000 39 2.612 0.001 1hoe 5.227 0.001
1 THR ASP Direct 1.000 1 39 3.803764 2 40 1 1.026 1.000 39 2.612 0.001 1hoe 3.638 1.001
2 ARG ASP Direct 0.997 67 39 5.408635 68 40 67 3.023 1.000 39 2.612 0.001 1hoe 5.635 1.001
3 ASP ARG Direct 1.000 0 67 4.196109 1 68 0 2.615 0.000 67 3.023 1.000 1hoe 5.638 1.000
4 VAL ARG Direct 1.000 35 67 5.400923 36 68 35 4.864 1.000 67 3.023 1.000 1hoe 7.887 2.000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20225 LEU PHE Direct 0.111 111 91 7.342279 112 92 111 0.000 2.111 91 0.986 4.953 1osa 0.986 7.064
20226 GLU ASP Direct 1.000 103 92 5.176369 104 93 103 3.997 1.000 92 5.000 2.004 1osa 8.997 3.004
20227 GLU ARG Direct 0.874 103 93 6.003034 104 94 103 3.997 1.000 93 1.874 0.076 1osa 5.871 1.076
20228 HIS ARG Direct 1.000 106 93 4.551530 107 94 106 2.970 0.000 93 1.874 0.076 1osa 4.844 0.076
20229 VAL ARG Direct 0.076 107 93 6.888728 108 94 107 0.687 3.946 93 1.874 0.076 1osa 2.561 4.022

20230 rows × 18 columns


In [54]:
data_native.query(f"Res1=='{res1}' and Res2=='{res2}'")


Out[54]:
Res1 Res2 Type Theta Index1 Index2 r ResId1 ResId2 index_x Density_H_x Density_P_x index_y Density_H_y Density_P_y Protein D_H D_P
16 LEU LEU Direct 1.0 13 69 4.486654 14 70 13 2.793 2.000 69 1.999 5.000 1hoe 4.792 7.000
212 LEU LEU Direct 1.0 15 42 6.106652 16 43 15 2.019 5.527 42 2.988 5.999 1tif 5.007 11.526
214 LEU LEU Direct 1.0 32 42 5.137943 33 43 32 3.892 1.886 42 2.988 5.999 1tif 6.880 7.885
366 LEU LEU Direct 1.0 10 30 5.106055 11 31 10 2.003 5.997 30 1.932 5.996 1vcc 3.935 11.993
492 LEU LEU Direct 1.0 6 50 5.629869 7 51 6 0.000 9.000 50 0.000 8.000 1by9 0.000 17.000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19877 LEU LEU Direct 1.0 53 110 5.940358 54 111 53 0.000 9.958 110 1.999 7.008 1akr 1.999 16.966
19878 LEU LEU Direct 1.0 76 110 5.347788 77 111 76 2.279 6.739 110 1.999 7.008 1akr 4.278 13.747
20052 LEU LEU Direct 1.0 76 113 5.790491 77 114 76 2.279 6.739 113 2.935 1.959 1akr 5.214 8.698
20083 LEU LEU Direct 1.0 3 68 5.984374 4 69 3 1.000 3.000 68 0.998 3.996 1osa 1.998 6.996
20125 LEU LEU Direct 1.0 31 47 5.299009 32 48 31 0.784 6.989 47 5.163 2.328 1osa 5.947 9.317

289 rows × 18 columns


In [24]:
data_native.query("Res1=='LEU' and Res2=='LEU' and D_H > 9")


Out[24]:
Res1 Res2 Type Theta Index1 Index2 r ResId1 ResId2 index_x Density_H_x Density_P_x index_y Density_H_y Density_P_y Protein D_H D_P
7390 LEU LEU Direct 0.999 4 44 6.195842 5 45 4 5.270 3.561 44 4.985 0.999 1tmy 10.255 4.560
11901 LEU LEU Direct 1.000 6 63 5.423016 7 64 6 5.892 1.000 63 4.004 1.000 1cpq 9.896 2.000
14426 LEU LEU Direct 0.998 10 116 6.235893 11 117 10 3.262 3.131 116 5.969 1.133 2end 9.231 4.264

In [23]:
sns.scatterplot("D_H", "D_P", data=data_native.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x11fd9dda0>

In [22]:
sns.scatterplot("D_H", "D_P", data=data_native.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)
sns.scatterplot("D_H", "D_P", data=data_iterative.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)


No handles with labels found to put in legend.
Out[22]:
<matplotlib.legend.Legend at 0x11e58e0b8>

In [39]:
a = data_native.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
sns.jointplot("D_H", "D_P", data=a, kind="reg")


Out[39]:
<seaborn.axisgrid.JointGrid at 0x120a03da0>

In [20]:
cbd_info = pd.read_csv("/Users/weilu/opt/parameters/side_chain/cbd_cbd_real_contact_symmetric.csv")

In [22]:
cbd_info.query("ResName1 == 'SER'")


Out[22]:
ResName1 ResName2 r_max r_min
19 SER SER 4.359305 3.198264
20 SER PRO 4.323022 3.109163
21 SER VAL 4.271867 3.411367
22 SER THR 4.547018 3.236274
23 SER CYS 4.243967 3.176515
24 SER ILE 4.792703 2.905342
25 SER LEU 4.725601 3.041130
26 SER ASN 4.646894 3.344635
27 SER ASP 3.718539 3.013373
28 SER GLN 4.553369 3.024015
29 SER LYS 4.350099 3.242161
30 SER GLU 4.048059 3.258734
31 SER MET 5.140508 3.441655
32 SER HIS 4.257428 3.561157
33 SER PHE 4.850533 3.393127
34 SER ARG 4.990777 2.941009
35 SER TYR 4.985624 3.275489
36 SER TRP 5.014418 3.435178

In [ ]:
def get_r_min_max(a, res1, res2, type="Direct"):
    res1_name = res1.get_resname()
    res2_name = res2.get_resname()
    if type == "Direct":
        if res1_name == "GLY" or res2_name == "GLY":
            r_min_res1_res2 = 2.5
            r_max_res1_res2 = 6.5
        else:
            b = a.query(f"ResName1=='{res1_name}' and ResName2=='{res2_name}'")
            if len(b) == 0:
                b = a.query(f"ResName1=='{res2_name}' and ResName2=='{res1_name}'")
            try:
                r_min_res1_res2 = float(b["r_min"]) - 0.5
                r_max_res1_res2 = float(b["r_max"]) + 1.5
            except:
                print("problem", b)
    else:
        if res1_name == "GLY" or res2_name == "GLY":
            r_min_res1_res2 = 6.5
            r_max_res1_res2 = 9.5
        else:
            b = a.query(f"ResName1=='{res1_name}' and ResName2=='{res2_name}'")
            if len(b) == 0:
                b = a.query(f"ResName1=='{res2_name}' and ResName2=='{res1_name}'")
            try:
                r_min_res1_res2 = float(b["r_max"]) + 1.5
                r_max_res1_res2 = float(b["r_max"]) + 4.5
            except:
                print(b)
    return r_min_res1_res2, r_max_res1_res2

def get_interaction_data_with_cbd_info(structure, cbd_info):
    # get all the pair of interaction, direct and mediated. as a dataFrame.
    res_list = get_res_list(structure)
    neighbor_list = get_neighbor_list(structure)
    sequence = get_sequence_from_structure(structure)
    # cb_density = calculate_cb_density(res_list, neighbor_list)
    cb_density = calculate_cb_density_com_wellCenter(res_list, neighbor_list, cbd_info)
    r_min_direct = 2.5
    r_max_direct = 6.5
    r_min = 6.5
    r_max = 9.5
    kappa = 5.0
    min_seq_sep = 10
    density_threshold = 2.6
    density_kappa = 7.0
    # phi_mediated_contact_well = np.zeros((2, 20,20))
    v_mediated = 0
    data_ = []
    for res1globalindex, res1 in enumerate(res_list):
        res1index = get_local_index(res1)
        res1chain = get_chain(res1)
        rho_i = cb_density[res1globalindex]
        for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+4.0):
            res2index = get_local_index(res2)
            res2chain = get_chain(res2)
            res2globalindex = get_global_index(res_list, res2)
            rho_j = cb_density[res2globalindex]

            # if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
            if abs(res2globalindex - res1globalindex) >= min_seq_sep or (res1chain != res2chain):
                if res1.resname == res2.resname:
                    if not (res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex)):
                        continue
                res1type = get_res_type(res_list, res1)
                res2type = get_res_type(res_list, res2)
                rij = get_interaction_distance(res1, res2)
                # theta = interaction_well(rij, r_min, r_max, kappa)
                r_min_res1_res2, r_max_res1_res2 = get_r_min_max(cbd_info, res1, res2, type="Mediated")
                theta = interaction_well(rij, r_min_res1_res2, r_max_res1_res2, kappa)
                
                water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
                protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
                data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
                data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
                r_min_res1_res2, r_max_res1_res2 = get_r_min_max(cbd_info, res1, res2, type="Direct")
                direct_theta = interaction_well(rij, r_min_res1_res2, r_max_res1_res2, kappa)
                data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
                # protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
                # water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
    data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r", "ResId1", "ResId2"])

    # contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
    # contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
    # contact_gammas["Type"] = contact_gammas["Interaction"]
    # a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
    # a["theta_gamma"] = a["Theta"] * a["Gamma"]
    return data


def calculate_property_density_with_cbd_info(res_list, neighbor_list, propertyTable, cbd_info, min_seq_sep=2, rmin=2.5):
    num_residues = len(res_list)
    density = np.zeros(num_residues)
    for res1globalindex, res1 in enumerate(res_list):
        res1index = get_local_index(res1)
        res1chain = get_chain(res1)
        for res2 in get_neighbors_within_radius(neighbor_list, res1, 9.0):
            res2index = get_local_index(res2)
            res2chain = get_chain(res2)
            res2globalindex = get_global_index(res_list, res2)
            if abs(res2index - res1index) >= min_seq_sep or (res1chain != res2chain):
                rij = get_interaction_distance(res1, res2)
                hasProperty = propertyTable[three_to_one(res2.resname)]
                r_min_res1_res2, r_max_res1_res2 = get_r_min_max(cbd_info, res1, res2, type="Direct")
                density[res1globalindex] += hasProperty * interaction_well(rij, r_min_res1_res2, r_max_res1_res2, 5)
    return density

def get_environment_with_cbd_info(structure, cbd_info):
    res_type_map_HP = {
        'C': 0,
        'M': 0,
        'F': 0,
        'I': 0,
        'L': 0,
        'V': 0,
        'W': 0,
        'Y': 0,
        'A': 1,
        'H': 1,
        'T': 1,
        'G': 1,
        'P': 1,
        'D': 1,
        'E': 1,
        'N': 1,
        'Q': 1,
        'R': 1,
        'K': 1,
        'S': 1
    }
    isH = {}
    isP = {}
    for i in range(20):
        isH[dindex_to_1[i]] = res_type_map_HP[dindex_to_1[i]]
        isP[dindex_to_1[i]] = 1 - res_type_map_HP[dindex_to_1[i]]
    res_list = get_res_list(structure)
    neighbor_list = get_neighbor_list(structure)
    sequence = get_sequence_from_structure(structure)
    density_H = calculate_property_density_with_cbd_info(res_list, neighbor_list, isH, cbd_info).round(3)
    density_P = calculate_property_density_with_cbd_info(res_list, neighbor_list, isP, cbd_info).round(3)
    environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
    return environment_info

In [ ]:


In [6]:
def get_interaction_data(structure):
    # get all the pair of interaction, direct and mediated. as a dataFrame.
    res_list = get_res_list(structure)
    neighbor_list = get_neighbor_list(structure)
    sequence = get_sequence_from_structure(structure)
    cb_density = calculate_cb_density(res_list, neighbor_list)
    r_min_direct = 2.5
    r_max_direct = 6.5
    r_min = 6.5
    r_max = 9.5
    kappa = 5.0
    min_seq_sep = 10
    density_threshold = 2.6
    density_kappa = 7.0
    # phi_mediated_contact_well = np.zeros((2, 20,20))
    v_mediated = 0
    data_ = []
    for res1globalindex, res1 in enumerate(res_list):
        res1index = get_local_index(res1)
        res1chain = get_chain(res1)
        rho_i = cb_density[res1globalindex]
        for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+4.0):
            res2index = get_local_index(res2)
            res2chain = get_chain(res2)
            res2globalindex = get_global_index(res_list, res2)
            rho_j = cb_density[res2globalindex]

            # if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
            if abs(res2globalindex - res1globalindex) >= min_seq_sep or (res1chain != res2chain):
                if res1.resname == res2.resname:
                    if not (res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex)):
                        continue
                res1type = get_res_type(res_list, res1)
                res2type = get_res_type(res_list, res2)
                rij = get_interaction_distance(res1, res2)
                theta = interaction_well(rij, r_min, r_max, kappa)
                water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
                protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
                data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
                data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
                direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
                data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
                # protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
                # water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
    data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r", "ResId1", "ResId2"])

    # contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
    # contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
    # contact_gammas["Type"] = contact_gammas["Interaction"]
    # a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
    # a["theta_gamma"] = a["Theta"] * a["Gamma"]
    return data


def get_contact_gamma_info(gammaFile):
    # check the gamma.
    # read in gamma, and sort by size.
    # gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
    gamma = np.loadtxt(gammaFile)

    res_type_map_letters = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G',
                            'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

    inverse_res_type_map = dict(list(zip(list(range(20)), res_type_map_letters)))
    c = 0
    info_ = []
    for i in range(20):
        for j in range(i, 20):
            info_.append(["Direct", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
            if i != j:
                info_.append(["Direct", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
            c += 1
    for i in range(20):
        for j in range(i, 20):
            info_.append(["Protein", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
            if i != j:
                info_.append(["Protein", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
            info_.append(["Water", res_type_map_letters[i], res_type_map_letters[j], c+210, round(gamma[c+210],3)])
            if i != j:
                info_.append(["Water", res_type_map_letters[j], res_type_map_letters[i], c+210, round(gamma[c+210],3)])
            c += 1
    contact_gammas = pd.DataFrame(info_, columns=["Interaction", "Res1", "Res2", "Index", "Gamma"])
    return contact_gammas

In [6]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_new_4_without_burial/1poa/1/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)

In [19]:


In [72]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1fna/cbd_1fna.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)

In [21]:


In [99]:
def calculate_property_density(res_list, neighbor_list, propertyTable, min_seq_sep=2, rmin=2.5):
    num_residues = len(res_list)
    density = np.zeros(num_residues)
    for res1globalindex, res1 in enumerate(res_list):
        res1index = get_local_index(res1)
        res1chain = get_chain(res1)
        for res2 in get_neighbors_within_radius(neighbor_list, res1, 9.0):
            res2index = get_local_index(res2)
            res2chain = get_chain(res2)
            res2globalindex = get_global_index(res_list, res2)
            if abs(res2index - res1index) >= min_seq_sep or (res1chain != res2chain):
                rij = get_interaction_distance(res1, res2)
                hasProperty = propertyTable[three_to_one(res2.resname)]
                density[res1globalindex] += hasProperty * interaction_well(rij, rmin, 6.5, 5)
    return density

def get_environment(structure):
    res_type_map_HP = {
        'C': 0,
        'M': 0,
        'F': 0,
        'I': 0,
        'L': 0,
        'V': 0,
        'W': 0,
        'Y': 0,
        'A': 1,
        'H': 1,
        'T': 1,
        'G': 1,
        'P': 1,
        'D': 1,
        'E': 1,
        'N': 1,
        'Q': 1,
        'R': 1,
        'K': 1,
        'S': 1
    }
    isH = {}
    isP = {}
    for i in range(20):
        isH[dindex_to_1[i]] = res_type_map_HP[dindex_to_1[i]]
        isP[dindex_to_1[i]] = 1 - res_type_map_HP[dindex_to_1[i]]
    res_list = get_res_list(structure)
    neighbor_list = get_neighbor_list(structure)
    sequence = get_sequence_from_structure(structure)
    density_H = calculate_property_density(res_list, neighbor_list, isH).round(3)
    density_P = calculate_property_density(res_list, neighbor_list, isP).round(3)
    environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
    return environment_info

In [118]:
data_list = []
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/1fna.csv", index_col=0)
data_envr = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/1fna_environment.csv", index_col=0)

data_envr["Density_H"] = data_envr["Density_H"].round()
data_envr["Density_P"] = data_envr["Density_P"].round()

data_with_info = data.merge(data_envr, left_on="Index1", right_on="index").merge(data_envr, left_on="Index2", right_on="index")
data_ = data_with_info.query("Theta > 1e-1 and Type == 'Direct'").reset_index(drop=True)
data_list.append(data_.assign(Protein=pdb))

In [122]:
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data.csv", index_col=0)

In [209]:


In [210]:
data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")


Out[210]:
Res1 Res2 Theta
357 TRP TRP 35
168 HIS HIS 49
44 ASN CYS 54
82 CYS ASN 54
64 ASP CYS 55
... ... ... ...
390 VAL LEU 3135
219 LEU VAL 3135
190 ILE LEU 3583
209 LEU ILE 3583
210 LEU LEU 3753

400 rows × 3 columns


In [203]:
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")


Out[203]:
Res1 Res2 Theta
357 TRP TRP 2
83 CYS ASP 5
64 ASP CYS 5
348 TRP HIS 7
177 HIS TRP 7
... ... ... ...
199 ILE VAL 291
209 LEU ILE 368
190 ILE LEU 368
219 LEU VAL 442
390 VAL LEU 442

400 rows × 3 columns


In [212]:


In [158]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1pne/cbd_1pne.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)

In [159]:
res_type_map_HP = {
    'C': 0,
    'M': 0,
    'F': 0,
    'I': 0,
    'L': 0,
    'V': 0,
    'W': 0,
    'Y': 0,
    'A': 1,
    'H': 1,
    'T': 1,
    'G': 1,
    'P': 1,
    'D': 1,
    'E': 1,
    'N': 1,
    'Q': 1,
    'R': 1,
    'K': 1,
    'S': 1
}
isH = {}
isP = {}
for i in range(20):
    isH[dindex_to_1[i]] = res_type_map_HP[dindex_to_1[i]]
    isP[dindex_to_1[i]] = 1 - res_type_map_HP[dindex_to_1[i]]
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
density_H = calculate_property_density(res_list, neighbor_list, isH).round(3)
density_P = calculate_property_density(res_list, neighbor_list, isP).round(3)
environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()

In [197]:
def calculate_property_density_debug(res_list, neighbor_list, propertyTable, min_seq_sep=2, rmin=2.5):
    num_residues = len(res_list)
    density = np.zeros(num_residues)
    for res1globalindex, res1 in enumerate(res_list):
        res1index = get_local_index(res1)
        res1chain = get_chain(res1)
        for res2 in get_neighbors_within_radius(neighbor_list, res1, 9.0):
            res2index = get_local_index(res2)
            res2chain = get_chain(res2)
            res2globalindex = get_global_index(res_list, res2)
            if abs(res2index - res1index) >= min_seq_sep or (res1chain != res2chain):

                rij = get_interaction_distance(res1, res2)
                hasProperty = propertyTable[three_to_one(res2.resname)]
                theta = interaction_well(rij, rmin, 6.5, 5)
                if res1globalindex == 64:
                    if hasProperty * theta > 0.1:
                        print(res1index, res1.resname, res2.resname, res2index, rij, hasProperty, theta)
                density[res1globalindex] += hasProperty * interaction_well(rij, rmin, 6.5, 5)
    return density

In [198]:
calculate_property_density_debug(res_list, neighbor_list, isP)[64]


65 LEU CYS 70 4.830256 1 0.9999999438976255
65 LEU LEU 50 5.5888577 1 0.9998896147329259
65 LEU PHE 39 5.879114 1 0.9979925058011887
65 LEU LEU 63 5.7178817 1 0.9995990140910096
65 LEU LEU 87 6.0440707 1 0.989639013538697
65 LEU ILE 42 5.4401717 1 0.9999750417908473
65 LEU VAL 102 6.6332526 1 0.20874180797125436
65 LEU ILE 100 5.9916315 1 0.9938411353589731
65 LEU LEU 111 5.7475533 1 0.9994605746142904
Out[198]:
8.189138664358062

In [164]:
environment_info.query("index==108")


Out[164]:
index Density_H Density_P
108 108 0.039 8.954

In [195]:
data2.query("ResId2=='65' and Protein == '1pne'")


Out[195]:
Res1 Res2 Type Theta Index1 Index2 r ResId1 ResId2 index_x Density_H_x Density_P_x index_y Density_H_y Density_P_y Protein
14367 PHE LEU Direct 0.998 38 64 5.879114 39 65 38 2.776 4.976 64 0.999 8.189 1pne
14368 ILE LEU Direct 1.000 41 64 5.440172 42 65 41 2.991 5.830 64 0.999 8.189 1pne
14369 GLU LEU Direct 0.999 45 64 5.846354 46 65 45 4.000 1.999 64 0.999 8.189 1pne
14370 LEU LEU Direct 1.000 49 64 5.588858 50 65 49 0.004 6.959 64 0.999 8.189 1pne
14371 ILE LEU Direct 0.994 99 64 5.991632 100 65 99 0.886 5.010 64 0.999 8.189 1pne
14372 VAL LEU Direct 0.209 101 64 6.633253 102 65 101 0.000 6.422 64 0.999 8.189 1pne

In [221]:
data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
b = data2.query("Res1=='GLU' and Res2=='ALA'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")


Out[221]:
<seaborn.axisgrid.JointGrid at 0x130776080>

In [13]:
a = data2.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-8f515669a5ec> in <module>()
----> 1 a = data2.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
      2 a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
      3 a["D_P"] = a["Density_P_x"] + a["Density_P_y"]

NameError: name 'data2' is not defined

In [154]:
a.sort_values("D_P")


Out[154]:
Res1 Res2 Type Theta Index1 Index2 r index_x Density_H_x Density_P_x index_y Density_H_y Density_P_y Protein D_H D_P
135 LEU LEU Direct 1.000 6 63 5.423016 6 5.211 1.000 63 3.988 1.000 1cpq 9.199 2.000
175 LEU LEU Direct 1.000 36 88 4.225886 36 3.000 2.000 88 1.698 1.000 2sns 4.698 3.000
79 LEU LEU Direct 1.000 2 70 4.998909 2 3.856 2.049 70 1.023 1.000 1a6f 4.879 3.049
154 LEU LEU Direct 0.114 90 127 6.704611 90 3.000 2.121 127 1.305 1.108 1kuh 4.305 3.229
70 LEU LEU Direct 0.935 82 102 6.232599 82 1.000 2.935 102 2.125 0.935 1a1x 3.125 3.870
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
123 LEU LEU Direct 1.000 32 71 5.142135 32 0.009 7.098 71 0.000 7.966 1rlw 0.009 15.064
168 LEU LEU Direct 1.000 49 64 5.588858 49 0.004 6.959 64 0.999 8.189 1pne 1.003 15.148
172 LEU LEU Direct 0.990 64 86 6.044071 64 0.999 8.189 86 0.008 6.990 1pne 1.007 15.179
165 LEU LEU Direct 1.000 49 108 5.543379 49 0.004 6.959 108 0.039 8.954 1pne 0.043 15.913
4 LEU LEU Direct 1.000 6 50 5.629869 6 0.000 8.373 50 0.001 7.982 1by9 0.001 16.355

258 rows × 16 columns


In [214]:
sns.jointplot("D_H", "D_P", data=a, kind="kde")


Out[214]:
<seaborn.axisgrid.JointGrid at 0x1305430b8>

In [151]:
sns.jointplot("D_H", "D_P", data=a, kind="kde")


Out[151]:
<seaborn.axisgrid.JointGrid at 0x12fb76630>

In [ ]:


In [140]:
sns.jointplot("D_H", "D_P", data=a, kind="kde")


Out[140]:
<seaborn.axisgrid.JointGrid at 0x127ec8e48>

In [152]:
sns.jointplot("D_H", "D_P", data=a, kind="hex")


Out[152]:
<seaborn.axisgrid.JointGrid at 0x12fb679e8>

In [139]:
sns.jointplot("D_H", "D_P", data=a, kind="hex")


Out[139]:
<seaborn.axisgrid.JointGrid at 0x127edfb38>

In [ ]:


In [109]:
data_with_info = data.merge(data_envr, left_on="Index1", right_on="index").merge(data_envr, left_on="Index2", right_on="index")
data_with_info.query("Theta > 1e-1 and Type == 'Direct'").reset_index(drop=True)


Out[109]:
Res1 Res2 Type Theta Index1 Index2 r index_x Density_H_x Density_P_x index_y Density_H_y Density_P_y
0 ALA SER Direct 0.985 68 78 6.081271 68 3.233 2.011 78 3.792 1.000
1 ARG TYR Direct 1.000 27 67 5.312370 27 1.678 2.179 67 4.000 0.000
2 THR TYR Direct 1.000 29 67 5.425073 29 2.678 2.000 67 4.000 0.000
3 LEU ILE Direct 0.915 2 64 6.262689 2 1.001 5.733 64 0.000 4.962
4 LEU ILE Direct 0.949 12 64 6.207278 12 1.001 5.948 64 0.000 4.962
... ... ... ... ... ... ... ... ... ... ... ... ... ...
102 VAL THR Direct 0.587 23 70 6.464955 23 4.024 1.060 70 3.616 1.587
103 ARG GLY Direct 1.000 24 71 4.846040 24 1.041 0.000 71 1.996 0.001
104 ARG GLU Direct 1.000 27 41 4.193047 27 1.678 2.179 41 2.003 0.000
105 THR GLU Direct 1.000 29 41 5.378955 29 2.678 2.000 41 2.003 0.000
106 THR VAL Direct 1.000 29 39 4.840663 29 2.678 2.000 39 2.000 0.000

107 rows × 13 columns


In [111]:
data_with_info.shape


Out[111]:
(2349, 13)

In [39]:
isH = {}
isP = {}
for i in range(20):
    isH[dindex_to_1[i]] = res_type_map_HP[dindex_to_1[i]]
    isP[dindex_to_1[i]] = 1 - res_type_map_HP[dindex_to_1[i]]

In [75]:
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
density_H = calculate_property_density(res_list, neighbor_list, isH).round()
density_P = calculate_property_density(res_list, neighbor_list, isP).round()

In [76]:
environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()

In [77]:
data_with_info = data.merge(environment_info, left_on="Index1", right_on="index").merge(environment_info, left_on="Index2", right_on="index")

In [ ]:


In [80]:
a = data_with_info.query("Type=='Direct' and Theta > 1e-1")

In [87]:
a.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")


Out[87]:
Res1 Res2 Theta
0 ALA GLU 1
31 PHE ILE 1
32 PRO ALA 1
33 PRO GLU 1
34 PRO ILE 1
... ... ... ...
1 ALA SER 3
68 VAL ILE 3
27 LEU THR 3
22 ILE ILE 6
25 LEU ILE 6

72 rows × 3 columns


In [112]:
a.shape


Out[112]:
(107, 13)

In [88]:
a.query("Res1 =='LEU' and Res2 == 'ILE'")


Out[88]:
Res1 Res2 Type Theta Index1 Index2 r index_x Density_H_x Density_P_x index_y Density_H_y Density_P_y
131 LEU ILE Direct 1.000 2 84 4.197815 2 1.0 6.0 84 0.0 2.0
791 LEU ILE Direct 1.000 2 14 5.333030 2 1.0 6.0 14 1.0 7.0
929 LEU ILE Direct 0.915 2 64 6.262689 2 1.0 6.0 64 0.0 5.0
938 LEU ILE Direct 0.949 12 64 6.207278 12 1.0 6.0 64 0.0 5.0
1142 LEU ILE Direct 1.000 2 82 5.090292 2 1.0 6.0 82 2.0 2.0
1535 LEU ILE Direct 1.000 12 53 4.717994 12 1.0 6.0 53 0.0 7.0

In [70]:
data.merge(environment_info, left_on="Index2", right_on="index")


Out[70]:
Res1 Res2 Type Theta Index1 Index2 r index_x Density_H_x Density_P_x index_y Density_H_y Density_P_y
0 ARG VAL Protein 0.0 0 69 13.319039 0 1.892 3.972 69 2.902 1.0
1 ARG VAL Water 0.0 0 69 13.319039 0 1.892 3.972 69 2.902 1.0
2 ARG VAL Direct 0.0 0 69 13.319039 0 1.892 3.972 69 2.902 1.0
3 TRP VAL Protein 0.0 16 69 12.675022 16 2.649 4.679 69 2.902 1.0
4 TRP VAL Water 0.0 16 69 12.675022 16 2.649 4.679 69 2.902 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2344 ILE VAL Water 0.0 28 39 10.884260 28 1.000 5.199 39 2.000 0.0
2345 ILE VAL Direct 0.0 28 39 10.884260 28 1.000 5.199 39 2.000 0.0
2346 THR VAL Protein 0.0 29 39 4.840663 29 2.678 2.000 39 2.000 0.0
2347 THR VAL Water 0.0 29 39 4.840663 29 2.678 2.000 39 2.000 0.0
2348 THR VAL Direct 1.0 29 39 4.840663 29 2.678 2.000 39 2.000 0.0

2349 rows × 13 columns


In [71]:
data


Out[71]:
Res1 Res2 Type Theta Index1 Index2 r index Density_H Density_P
0 ARG VAL Protein 0.0 0 69 13.319039 0 1.892 3.972
1 ARG VAL Water 0.0 0 69 13.319039 0 1.892 3.972
2 ARG VAL Direct 0.0 0 69 13.319039 0 1.892 3.972
3 ARG TYR Protein 0.0 0 67 10.726437 0 1.892 3.972
4 ARG TYR Water 0.0 0 67 10.726437 0 1.892 3.972
... ... ... ... ... ... ... ... ... ... ...
2344 VAL SER Water 0.0 69 79 8.581514 69 2.902 1.000
2345 VAL SER Direct 0.0 69 79 8.581514 69 2.902 1.000
2346 THR LYS Protein 0.0 70 80 13.351789 70 3.616 1.587
2347 THR LYS Water 0.0 70 80 13.351789 70 3.616 1.587
2348 THR LYS Direct 0.0 70 80 13.351789 70 3.616 1.587

2349 rows × 10 columns


In [66]:
environment_info


Out[66]:
index Density_H Density_P
0 0 1.892 3.972
1 1 1.443 0.000
2 2 1.001 5.733
3 3 1.443 1.000
4 4 1.000 3.294
... ... ... ...
86 86 1.959 2.000
87 87 1.000 0.686
88 88 2.552 3.960
89 89 2.956 0.000
90 90 2.007 0.000

91 rows × 3 columns


In [ ]:


In [57]:
density_H


Out[57]:
array([1.892, 1.443, 1.001, 1.443, 1.   , 2.001, 0.   , 1.021, 2.956,
       2.553, 2.954, 2.992, 1.001, 3.002, 1.   , 3.   , 2.649, 2.   ,
       1.001, 2.891, 0.   , 3.988, 0.893, 4.024, 1.041, 1.278, 3.994,
       1.678, 1.   , 2.678, 1.   , 0.993, 2.983, 3.003, 0.   , 1.995,
       0.   , 1.978, 0.029, 2.   , 0.035, 2.003, 1.   , 0.012, 1.034,
       2.009, 0.038, 1.   , 1.162, 2.   , 1.162, 0.   , 1.855, 0.436,
       1.884, 0.955, 1.203, 1.   , 2.998, 2.   , 3.004, 2.12 , 3.044,
       4.01 , 0.   , 3.009, 1.001, 4.   , 3.233, 2.902, 3.616, 1.996,
       1.889, 3.493, 1.001, 1.173, 0.   , 1.552, 3.792, 2.4  , 0.   ,
       1.263, 1.779, 2.159, 0.   , 3.   , 1.959, 1.   , 2.552, 2.956,
       2.007])

In [50]:
density_P.round(1)


Out[50]:
array([4., 0., 6., 1., 3., 1., 1., 3., 0., 0., 0., 0., 6., 1., 7., 1., 5.,
       0., 1., 3., 0., 0., 0., 1., 0., 1., 4., 2., 5., 2., 3., 2., 2., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 4., 1., 4., 0., 2., 0., 0., 1., 1.,
       5., 1., 7., 0., 0., 4., 1., 0., 1., 1., 0., 2., 0., 5., 1., 5., 0.,
       2., 1., 2., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 2., 0., 2.,
       0., 2., 1., 4., 0., 0.])

In [51]:
cb_density.round(1)


Out[51]:
array([6., 1., 7., 2., 4., 3., 1., 4., 3., 3., 3., 3., 7., 4., 8., 4., 7.,
       2., 2., 6., 0., 4., 1., 5., 1., 2., 8., 4., 6., 5., 4., 3., 5., 3.,
       0., 2., 0., 2., 0., 2., 1., 2., 5., 1., 5., 2., 2., 1., 1., 3., 2.,
       5., 3., 7., 2., 1., 5., 2., 3., 3., 4., 2., 5., 4., 5., 4., 6., 4.,
       5., 4., 5., 2., 3., 3., 2., 1., 0., 2., 5., 3., 1., 2., 4., 2., 2.,
       3., 4., 2., 7., 3., 2.])

In [31]:
# get environmental information
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
cb_density = calculate_cb_density(res_list, neighbor_list)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
    res1index = get_local_index(res1)
    res1chain = get_chain(res1)
    rho_i = cb_density[res1globalindex]
    for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+2.0):
        res2index = get_local_index(res2)
        res2chain = get_chain(res2)
        res2globalindex = get_global_index(res_list, res2)
        rho_j = cb_density[res2globalindex]

        if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
            res1type = get_res_type(res_list, res1)
            res2type = get_res_type(res_list, res2)
            rij = get_interaction_distance(res1, res2)
            theta = interaction_well(rij, r_min, r_max, kappa)
            water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
            protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
            data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij])
            data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij])
            direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
            data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij])
            # protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
            # water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r"])

In [47]:
structure = s
kappa=5.0
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
cb_density = calculate_cb_density(res_list, neighbor_list)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
# kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
    res1index = get_local_index(res1)
    res1chain = get_chain(res1)
    rho_i = cb_density[res1globalindex]
    for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+2.0):
        res2index = get_local_index(res2)
        res2chain = get_chain(res2)
        res2globalindex = get_global_index(res_list, res2)
        rho_j = cb_density[res2globalindex]

        if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
            res1type = get_res_type(res_list, res1)
            res2type = get_res_type(res_list, res2)
            rij = get_interaction_distance(res1, res2)
            theta = interaction_well(rij, r_min, r_max, kappa)
            water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
            protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
            data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex])
            data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex])
            direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
            data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex])
            # protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
            # water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2"])

In [74]:


In [4]:


In [30]:
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_larger_excl_withoutBurial/iter_larger_excl_withoutBurial_30"
contact_gammas = get_contact_gamma_info(gammaFile)

contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]

# pdb = "1sfp"
# pdb = "1who"
pdb = "1jon"
pdb = "1neu"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_1_stronger_exclude_withoutBurial/{pdb}/0/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)

a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_1_stronger_exclude_withoutBurial/{pdb}/0/crystal_structure.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data_native = get_interaction_data(structure)

a_native = data_native.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a_native["theta_gamma"] = a_native["Theta"] * a_native["Gamma"]

In [31]:
a[["Type", "theta_gamma"]].groupby("Type").sum()


Out[31]:
theta_gamma
Type
Direct -101.276373
Protein -105.200971
Water -1.311072

In [32]:
a_native[["Type", "theta_gamma"]].groupby("Type").sum()


Out[32]:
theta_gamma
Type
Direct -59.127062
Protein -66.563254
Water -4.710177

In [33]:
a[["Type", "theta_gamma"]].groupby("Type").sum().sum()


Out[33]:
theta_gamma   -207.788416
dtype: float64

In [34]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2.sort_values("theta_gamma").tail(5)


Out[34]:
Res1 Res2 Type theta_gamma
276 SER VAL Protein 7
264 SER LYS Protein 7
140 GLY VAL Protein 7
165 ILE ILE Direct 7
364 VAL SER Protein 8

In [35]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3.sort_values("theta_gamma").head(5)


Out[35]:
Res1 Res2 Type theta_gamma
165 ILE ILE Direct -5.859840
54 ASP LYS Direct -4.141928
181 ILE VAL Direct -3.560194
171 ILE PHE Direct -3.555981
29 ARG PHE Protein -2.875323

In [36]:
b2_native = a_native.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2_native.sort_values("theta_gamma").tail(5)


Out[36]:
Res1 Res2 Type theta_gamma
299 VAL SER Protein 5
142 ILE PHE Protein 5
137 ILE ILE Protein 5
307 VAL VAL Protein 5
172 PHE ASP Protein 6

In [37]:
b3_native = a_native.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3_native.sort_values("theta_gamma").head(5)


Out[37]:
Res1 Res2 Type theta_gamma
250 TRP ILE Direct -4.199720
306 VAL VAL Direct -3.221582
180 PHE ILE Direct -2.645379
136 ILE ILE Direct -2.453640
292 VAL LEU Direct -2.448768

In [60]:
gammaFile = "/Users/weilu/Research/server/mar_2020/cath_dataset_shuffle_optimization/optimization_iter0_com_density_rmin_2p5/saved_gammas/iter0_cutoff600_impose_Aprime_constraint"
contact_gammas = get_contact_gamma_info(gammaFile)

contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]

pdb = "1sfp"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_0_stronger_exclude_volume/{pdb}/0/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)

a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]

In [61]:
a[["Type", "theta_gamma"]].groupby("Type").sum()


Out[61]:
theta_gamma
Type
Direct -71.788934
Protein -11.780707
Water -2.330682

In [62]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2.sort_values("theta_gamma").tail(5)

In [63]:



Out[63]:
Res1 Res2 Type theta_gamma
111 GLY TYR Protein 5
134 ILE LEU Protein 6
132 ILE ILE Protein 6
130 ILE GLY Protein 6
146 ILE VAL Protein 6

In [20]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3.sort_values("theta_gamma").head(5)

In [23]:



Out[23]:
Res1 Res2 Type theta_gamma
19 CYS CYS Direct -25.324650
149 VAL LEU Direct -11.089641
20 CYS CYS Protein -6.585576
26 CYS LEU Protein -3.843926
104 PHE CYS Direct -3.533796

In [25]:
fastaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_0_stronger_exclude_volume/3cyr/0/crystal_structure.fasta"
seq = read_fasta(fastaFile)

In [26]:
seq.count("C")


Out[26]:
8

In [27]:
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_run/training_set.csv")
specific_decoys = data.query("Length < 150 and Length > 70").reset_index(drop=True)
pdb_list = specific_decoys["Protein"].to_list()
pdb_list = [a.lower() for a in pdb_list]
skip_pdb_list = ["1puc", "1skz"]
skip_pdb_list += ["1msc", "1fmb", "1gvp", "2tgi", "1whi", "1baj", "1rmd", "1div"]  #dimer.
skip_pdb_list += ["1aqe"]  # lots of ligand
filtered_pdb_list = [x for x in pdb_list if x not in skip_pdb_list]
pdb_list = filtered_pdb_list

In [65]:
info_ = []
for pdb in pdb_list:
    seq = np.loadtxt(f"/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/database/S20_seq/{pdb}.seq", dtype=str)
    seq = str(seq)
    info_.append([pdb, seq, seq.count("C"), len(seq)])

In [66]:
data = pd.DataFrame(info_, columns=["Protein", "Seq", "Count", "Length"])

In [67]:
data.to_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/cys_count_data.csv")

In [47]:
data["Length"] = data["Seq"].apply(lambda x:len(x))

In [70]:
print(data.sort_values("Count").query("Count >= 6")["Protein"].to_list())


['1by2', '1rcb', '1hyp', '3lzt', '3cyr', '7rsa', '1rzl', '1b6e', '1poc', '1bea', '1poa']

In [64]:
data


Out[64]:
Res1 Res2 Type Theta Index1 Index2 r
0 LEU ALA Protein 0.000 0 18 11.045428
1 LEU ALA Water 0.000 0 18 11.045428
2 LEU ALA Direct 0.000 0 18 11.045428
3 LEU ARG Protein 0.033 0 40 9.838098
4 LEU ARG Water 0.000 0 40 9.838098
... ... ... ... ... ... ... ...
3097 TYR GLU Water 0.000 89 101 15.198022
3098 TYR GLU Direct 0.000 89 101 15.198022
3099 ILE TYR Protein 0.000 90 100 10.454025
3100 ILE TYR Water 0.000 90 100 10.454025
3101 ILE TYR Direct 0.000 90 100 10.454025

3102 rows × 7 columns


In [20]:
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
contact_gammas = get_contact_gamma_info(gammaFile)

In [75]:
# contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
# contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]
a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]

In [77]:
a["theta_gamma"] = a["Theta"] * a["Gamma"]

In [78]:
a[["Type", "theta_gamma"]].groupby("Type").sum()


Out[78]:
theta_gamma
Type
Direct -949.732540
Protein -479.435681
Water -8.402658

In [79]:
a["theta_gamma"].sum()


Out[79]:
-1437.5708789999999

In [80]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()

In [83]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()

In [91]:
b3.sort_values("theta_gamma").head(200)["theta_gamma"].sum()


Out[91]:
-1483.202181

In [96]:
a.query("Res1 == 'TYR' and Res2 == 'CYS' and Theta > 0.1 and Type =='Direct'").sort_values("r")


Out[96]:
Res1 Res2 Type Theta Index1 Index2 r Interaction Index Gamma theta_gamma
890 TYR CYS Direct 0.956 23 77 2.808838 Direct 88 -4.012 -3.835472
921 TYR CYS Direct 0.999 66 84 3.204737 Direct 88 -4.012 -4.007988
876 TYR CYS Direct 1.000 2 89 3.921630 Direct 88 -4.012 -4.012000
877 TYR CYS Direct 1.000 2 84 3.969086 Direct 88 -4.012 -4.012000
926 TYR CYS Direct 1.000 68 91 4.097748 Direct 88 -4.012 -4.012000
897 TYR CYS Direct 1.000 26 91 4.154008 Direct 88 -4.012 -4.012000
912 TYR CYS Direct 1.000 62 117 4.176590 Direct 88 -4.012 -4.012000
898 TYR CYS Direct 1.000 26 70 4.354343 Direct 88 -4.012 -4.012000
906 TYR CYS Direct 1.000 50 84 4.464048 Direct 88 -4.012 -4.012000
905 TYR CYS Direct 1.000 50 117 4.704973 Direct 88 -4.012 -4.012000
873 TYR CYS Direct 1.000 2 117 4.796232 Direct 88 -4.012 -4.012000
909 TYR CYS Direct 1.000 50 89 4.796544 Direct 88 -4.012 -4.012000
879 TYR CYS Direct 1.000 2 49 4.818510 Direct 88 -4.012 -4.012000
901 TYR CYS Direct 1.000 26 77 4.863667 Direct 88 -4.012 -4.012000
923 TYR CYS Direct 1.000 66 91 4.958446 Direct 88 -4.012 -4.012000
895 TYR CYS Direct 1.000 26 84 4.969316 Direct 88 -4.012 -4.012000
929 TYR CYS Direct 1.000 104 117 4.990378 Direct 88 -4.012 -4.012000
925 TYR CYS Direct 1.000 68 84 5.031310 Direct 88 -4.012 -4.012000
870 TYR CYS Direct 1.000 2 25 5.094150 Direct 88 -4.012 -4.012000
900 TYR CYS Direct 1.000 26 89 5.218085 Direct 88 -4.012 -4.012000
913 TYR CYS Direct 1.000 62 84 5.235210 Direct 88 -4.012 -4.012000
927 TYR CYS Direct 1.000 68 89 5.246518 Direct 88 -4.012 -4.012000
882 TYR CYS Direct 1.000 2 91 5.403767 Direct 88 -4.012 -4.012000
915 TYR CYS Direct 1.000 62 89 5.464846 Direct 88 -4.012 -4.012000
920 TYR CYS Direct 1.000 66 89 5.552884 Direct 88 -4.012 -4.012000
928 TYR CYS Direct 1.000 68 98 5.720343 Direct 88 -4.012 -4.012000
903 TYR CYS Direct 1.000 26 98 5.720448 Direct 88 -4.012 -4.012000
884 TYR CYS Direct 0.999 23 84 5.763156 Direct 88 -4.012 -4.007988
907 TYR CYS Direct 0.997 50 91 5.911228 Direct 88 -4.012 -3.999964
886 TYR CYS Direct 0.996 23 91 5.938394 Direct 88 -4.012 -3.995952
893 TYR CYS Direct 0.995 23 43 5.965650 Direct 88 -4.012 -3.991940
888 TYR CYS Direct 0.760 23 59 6.384588 Direct 88 -4.012 -3.049120
922 TYR CYS Direct 0.603 66 77 6.458406 Direct 88 -4.012 -2.419236
914 TYR CYS Direct 0.318 62 91 6.576455 Direct 88 -4.012 -1.275816

In [86]:
b3.sort_values("theta_gamma").head(20)


Out[86]:
Res1 Res2 Type theta_gamma
342 TYR CYS Direct -130.883476
321 TRP CYS Direct -106.422800
113 CYS TYR Protein -90.723716
343 TYR CYS Protein -87.706208
112 CYS TYR Direct -73.202952
320 TRP ASN Protein -58.802645
226 MET CYS Direct -50.142196
246 PHE CYS Direct -32.165329
279 SER CYS Direct -30.213630
188 ILE TYR Direct -28.798753
333 TRP TYR Direct -24.746634
17 ARG ASP Direct -24.667500
59 ASN TRP Protein -22.552750
319 TRP ARG Direct -22.380120
346 TYR ILE Direct -19.988483
94 CYS CYS Protein -19.848447
324 TRP ILE Direct -18.720945
110 CYS TRP Direct -17.923650
19 ARG CYS Direct -17.749700
88 CYS ARG Direct -17.403716

In [69]:
b2.sort_values("theta_gamma").tail(10)


Out[69]:
Res1 Res2 Type theta_gamma
87 CYS ALA Protein 18
112 CYS TYR Direct 19
320 TRP ASN Protein 19
336 TYR ALA Protein 20
357 TYR TYR Direct 22
343 TYR CYS Protein 28
113 CYS TYR Protein 29
321 TRP CYS Direct 30
342 TYR CYS Direct 34
94 CYS CYS Protein 45

In [82]:
a.query("Res1 == 'CYS' and Res2 == 'CYS' and Theta > 0.1")


Out[82]:
Res1 Res2 Type Theta Index1 Index2 r Interaction Index Gamma theta_gamma
3153 CYS CYS Protein 1.000 10 25 7.411365 Protein 284 -0.477 -0.477000
3154 CYS CYS Protein 0.987 10 59 9.068994 Protein 284 -0.477 -0.470799
3155 CYS CYS Protein 1.000 10 27 7.557992 Protein 284 -0.477 -0.477000
3157 CYS CYS Protein 1.000 10 84 8.094460 Protein 284 -0.477 -0.477000
3158 CYS CYS Protein 0.938 10 77 6.771117 Protein 284 -0.477 -0.447426
3159 CYS CYS Protein 0.998 10 43 7.104249 Protein 284 -0.477 -0.476046
3160 CYS CYS Protein 0.999 10 42 8.782217 Protein 284 -0.477 -0.476523
3161 CYS CYS Protein 0.678 10 91 9.425323 Protein 284 -0.477 -0.323406
3162 CYS CYS Protein 0.288 25 59 9.590501 Protein 284 -0.477 -0.137376
3163 CYS CYS Protein 0.976 25 98 9.128296 Protein 284 -0.477 -0.465552
3164 CYS CYS Protein 1.000 25 117 8.697128 Protein 284 -0.477 -0.477000
3165 CYS CYS Protein 1.000 25 70 7.770286 Protein 284 -0.477 -0.477000
3169 CYS CYS Protein 0.859 25 49 9.318975 Protein 284 -0.477 -0.409743
3170 CYS CYS Protein 0.903 25 43 9.277326 Protein 284 -0.477 -0.430731
3174 CYS CYS Protein 1.000 27 84 8.294059 Protein 284 -0.477 -0.477000
3175 CYS CYS Protein 1.000 27 42 8.483358 Protein 284 -0.477 -0.477000
3176 CYS CYS Protein 1.000 27 91 7.991209 Protein 284 -0.477 -0.477000
3177 CYS CYS Protein 0.932 27 70 6.761860 Protein 284 -0.477 -0.444564
3178 CYS CYS Protein 0.994 27 59 7.015271 Protein 284 -0.477 -0.474138
3179 CYS CYS Protein 0.244 27 89 9.613122 Protein 284 -0.477 -0.116388
3182 CYS CYS Protein 1.000 27 98 8.172283 Protein 284 -0.477 -0.477000
3183 CYS CYS Protein 0.898 27 43 6.717986 Protein 284 -0.477 -0.428346
3186 CYS CYS Protein 0.999 42 117 7.180018 Protein 284 -0.477 -0.476523
3190 CYS CYS Protein 0.135 42 77 9.685992 Protein 284 -0.477 -0.064395
3194 CYS CYS Protein 0.999 43 117 8.810677 Protein 284 -0.477 -0.476523
3198 CYS CYS Protein 1.000 43 77 7.933357 Protein 284 -0.477 -0.477000
3200 CYS CYS Protein 0.990 49 59 6.957804 Protein 284 -0.477 -0.472230
3201 CYS CYS Protein 0.969 49 98 9.154268 Protein 284 -0.477 -0.462213
3204 CYS CYS Protein 1.000 49 89 8.122187 Protein 284 -0.477 -0.477000
3205 CYS CYS Protein 1.000 49 84 8.692106 Protein 284 -0.477 -0.477000
3207 CYS CYS Protein 1.000 59 117 7.468670 Protein 284 -0.477 -0.477000
3210 CYS CYS Protein 0.995 59 70 8.971240 Protein 284 -0.477 -0.474615
3212 CYS CYS Protein 1.000 59 77 8.543577 Protein 284 -0.477 -0.477000
3213 CYS CYS Protein 1.000 59 98 8.630322 Protein 284 -0.477 -0.477000
3215 CYS CYS Protein 1.000 70 117 8.256154 Protein 284 -0.477 -0.477000
3216 CYS CYS Protein 1.000 70 89 8.483834 Protein 284 -0.477 -0.477000
3217 CYS CYS Protein 0.986 70 84 9.071579 Protein 284 -0.477 -0.470322
3218 CYS CYS Protein 1.000 70 91 7.877710 Protein 284 -0.477 -0.477000
3219 CYS CYS Protein 0.883 77 98 9.298271 Protein 284 -0.477 -0.421191
3221 CYS CYS Protein 0.999 77 89 8.816828 Protein 284 -0.477 -0.476523
3222 CYS CYS Protein 0.964 77 91 6.828478 Protein 284 -0.477 -0.459828
3224 CYS CYS Protein 1.000 84 117 8.211732 Protein 284 -0.477 -0.477000
3225 CYS CYS Protein 0.999 89 117 7.158068 Protein 284 -0.477 -0.476523
3226 CYS CYS Protein 0.999 91 117 8.794488 Protein 284 -0.477 -0.476523
3227 CYS CYS Protein 1.000 98 117 7.307405 Protein 284 -0.477 -0.477000
3316 CYS CYS Direct 1.000 25 89 3.975121 Direct 74 2.484 2.484000
3318 CYS CYS Direct 1.000 25 77 5.051316 Direct 74 2.484 2.484000
3333 CYS CYS Direct 0.102 27 43 6.717986 Direct 74 2.484 0.253368
3342 CYS CYS Direct 0.167 43 59 2.339643 Direct 74 2.484 0.414828

In [56]:
a.sort_values("theta_gamma")


Out[56]:
Res1 Res2 Type Theta Index1 Index2 Interaction Index Gamma theta_gamma
1904 GLN HIS Direct 1.000 9 46 Direct 93 -4.963 -4.963000
2490 MET CYS Direct 1.000 7 84 Direct 82 -4.654 -4.654000
2501 MET CYS Direct 1.000 7 43 Direct 82 -4.654 -4.654000
2499 MET CYS Direct 1.000 7 25 Direct 82 -4.654 -4.654000
2491 MET CYS Direct 0.999 7 42 Direct 82 -4.654 -4.649346
... ... ... ... ... ... ... ... ... ... ...
149 ASN ASN Direct 0.960 0 112 Direct 39 1.119 1.074240
5023 SER THR Direct 1.000 32 74 Direct 196 1.075 1.075000
6450 ASP TRP Protein 0.998 19 60 Protein 281 1.097 1.094806
3316 CYS CYS Direct 1.000 25 89 Direct 74 2.484 2.484000
3318 CYS CYS Direct 1.000 25 77 Direct 74 2.484 2.484000

7521 rows × 10 columns


In [4]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1

pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
def get_contact_info(pdbFile, seq, frame_index=0):
    # frame_index = 0
    cutoff = 9.5
    MAX_OFFSET = 9
    parser = PDBParser()
    structure = parser.get_structure("x", pdbFile)
    models = list(structure.get_models())
    model = models[frame_index]
    all_residues = list(model.get_residues())
    info_ = []
    for i, res1 in enumerate(all_residues):
        if seq[i] == "G":
            a1 = res1["CA"]
        else:
            a1 = res1["CB"]
        for j, res2 in enumerate(all_residues):
            if seq[j] == "G":
                a2 = res2["CA"]
            else:
                a2 = res2["CB"]
            dis = a1 - a2
            if dis < cutoff and j - i > MAX_OFFSET:
                info_.append([pdb, i, j, seq[i], seq[j], dis])
    info = pd.DataFrame(info_, columns=["Protein", "i", "j", "Resi", "Resj", "Dis"])
    return info

In [82]:
data = pd.read_csv("training_set.csv")
specific_decoys = data.query("Length < 150 and Length > 70").reset_index(drop=True)
pdb_list = specific_decoys["Protein"].to_list()
pdb_list = [a.lower() for a in pdb_list]
skip_pdb_list = ["1puc", "1skz"]
skip_pdb_list += ["1msc", "1fmb", "1gvp", "2tgi", "1whi", "1baj", "1rmd", "1div"]  #dimer.
skip_pdb_list += ["1aqe"]  # lots of ligand
filtered_pdb_list = [x for x in pdb_list if x not in skip_pdb_list]
pdb_list = filtered_pdb_list

In [9]:
info_ = []
for pdb in pdb_list:
    pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
    seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
    info = get_contact_info(pdbFile, seq)
    info_.append(info)
info = pd.concat(info_)

In [10]:


In [14]:
direct_contact_table = np.zeros((20,20))
for i, line in info.iterrows():
    resi = line["Resi"]
    resj = line["Resj"]
    dis = line["Dis"]
    if dis < 6.5:
        direct_contact_table[d1_to_index[resi]][d1_to_index[resj]] += 1
        if resi != resj:
            direct_contact_table[d1_to_index[resj]][d1_to_index[resi]] += 1

In [16]:
direct_contact_table.min()


Out[16]:
3.0

In [15]:
plt.imshow(direct_contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)



In [11]:
contact_table = np.zeros((20,20))
for i, line in info.iterrows():
    resi = line["Resi"]
    resj = line["Resj"]
    dis = line["Dis"]
    contact_table[d1_to_index[resi]][d1_to_index[resj]] += 1
    if resi != resj:
        contact_table[d1_to_index[resj]][d1_to_index[resi]] += 1

In [12]:
len(info)


Out[12]:
32267

In [13]:
contact_table.min()


Out[13]:
12.0

In [115]:
plt.imshow(contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)



In [109]:
plt.imshow(contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)



In [88]:
info.groupby(["Resi", "Resj"])["Dis"].count()


Out[88]:
Resi  Resj
A     A       297
      C        60
      D        87
      E        88
      F       105
             ... 
Y     S        55
      T        61
      V       122
      W        24
      Y        46
Name: Dis, Length: 400, dtype: int64

In [81]:
pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
get_contact_info(pdbFile, seq)


Out[81]:
Protein i j Resi Resj Dis
0 2a0b 1 104 K H 5.800971
1 2a0b 1 105 K D 8.981970
2 2a0b 1 108 K V 4.504978
3 2a0b 1 111 K A 8.126446
4 2a0b 2 108 S V 5.637322
... ... ... ... ... ... ...
267 2a0b 79 94 I W 6.115420
268 2a0b 79 95 I I 5.385581
269 2a0b 79 98 I M 4.980760
270 2a0b 81 94 S W 8.732571
271 2a0b 84 94 L W 7.445224

272 rows × 6 columns


In [78]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1

pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")

frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
info_ = []
for i, res1 in enumerate(all_residues):
    if seq[i] == "G":
        a1 = res1["CA"]
    else:
        a1 = res1["CB"]
    for j, res2 in enumerate(all_residues):
        if seq[j] == "G":
            a2 = res2["CA"]
        else:
            a2 = res2["CB"]
        dis = a1 - a2
        if dis < cutoff and j - i > MAX_OFFSET:
            info_.append([pdb, i, j, seq[i], seq[j], dis])

In [74]:


In [6]:
def getContactMapFromPDB(pdbFile, seq, frame_index=0):
    cutoff = 9.5
    MAX_OFFSET = 9
    parser = PDBParser()
    structure = parser.get_structure("x", pdbFile)
    models = list(structure.get_models())
    model = models[frame_index]
    all_residues = list(model.get_residues())
    n = len(all_residues)
    contact_table = np.zeros((n,n))
    # print(pdb, n)#
    for i, res1 in enumerate(all_residues):
        if seq[i] == "G":
            a1 = res1["CA"]
        else:
            a1 = res1["CB"]
        for j, res2 in enumerate(all_residues):
            if seq[j] == "G":
                a2 = res2["CA"]
            else:
                a2 = res2["CB"]
            contact_table[i][j] = a1 - a2

    data = (contact_table < cutoff)
    remove_band = np.eye(n)
    for i in range(1, MAX_OFFSET):
        remove_band += np.eye(n, k=i)
        remove_band += np.eye(n, k=-i)
    data[remove_band==1] = 0
    return data

In [81]:
pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
# pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/movie.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq, frame_index=-1)

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)

combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)


Out[81]:
<matplotlib.image.AxesImage at 0x11f8780b8>

In [7]:
pdb = "3lzt"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)

combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)


Out[7]:
<matplotlib.image.AxesImage at 0x11cffc208>

In [66]:
pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)

combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)


Out[66]:
<matplotlib.image.AxesImage at 0x120a5f240>

In [58]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/cbd-openmmawsem.pdb"
seq = read_fasta("/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)

In [63]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/1baj/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)

In [64]:
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)


Out[64]:
<matplotlib.image.AxesImage at 0x122e16278>

In [56]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/1baj/0/native.pdb"
seq = read_fasta("/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/crystal_structure.fasta")
frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
    if seq[i] == "G":
        a1 = res1["CA"]
    else:
        a1 = res1["CB"]
    for j, res2 in enumerate(all_residues):
        if seq[j] == "G":
            a2 = res2["CA"]
        else:
            a2 = res2["CB"]
        contact_table[i][j] = a1 - a2

data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
    remove_band += np.eye(n, k=i)
    remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0

In [55]:



Out[55]:
[<Model id=0>, <Model id=1>]

In [23]:
three_to_one("TRP")


Out[23]:
'W'

In [25]:
seq.count("W")


Out[25]:
1

In [ ]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/cbd-openmmawsem.pdb"

cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
all_residues = list(structure.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
    for j, res2 in enumerate(all_residues):
        contact_table[i][j] = res1["CA"]-res2["CA"]

data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
    remove_band += np.eye(n, k=i)
    remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
data_ca = data

In [11]:
fileLocation = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/tmp.spotcon"
data = pd.read_csv(fileLocation, skiprows=5, sep="\s+", names=["i","j","p"]).dropna().reset_index(drop=True)

data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_A = data

In [12]:
fileLocation = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/original.spotcon"
data = pd.read_csv(fileLocation, skiprows=5, sep="\s+", names=["i","j","p"]).dropna().reset_index(drop=True)

data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_original = data


/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3058: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [14]:
data_original.dtypes


Out[14]:
i      int64
j      int64
p    float64
dtype: object

In [15]:
data_A.dtypes


Out[15]:
i      int64
j      int64
p    float64
dtype: object

In [17]:
seq = "MSKLTTGSFSIEDLESVQITINNIVGAAKEAAEEKEKELVNAGPTLFPGLEGYRDDWNFKLLDRYEPVITPMCDQCCYCTYGPCDLSGNKRGACGIDMKGHNGREFFLRVITGTACHAAHGRHLLDHLIEKYGEDLPLTLGQSNVLTPNITISTGLSPKTLGEVKPAMEYVEEQLTQLLATVHAGQESAEIDYDSKALFSGSLDHVGMEISDIVQVAAYDFPKADPEAPLVEIGMGTIDKSKPFLCVIGHNVAGVTYMMDYMEDNNLTDKMEIAGLCCTAIDLTRYKEADRRPPYAKVIGSMSKELKVIRSGMPDVIVVDEQCVRGDIVPEAQKLKIPVIASNPKIMYGLPNRTDADVDETMEELKSGKIPGCVMLDYDKLGELCVRLTMEMAPIRDAAGITALPTDEELVNMVAKCADCGACLLACPEEIDIPEAMGFAKKGDFSYFEEIHDTCIGCRRCEQVCKKEIPILNVIEKIAQKQIAEEKGLMRAGRGQVSDAEIRAEGLNLVMGTTPGIIAIIGCPNYAGGTKDVYYIAEEFLKRNFIVVTTGCGAMDIGMFKDADGKTLYERFPGGFQCGGLANIGSCVSNAHITGAAEKVAAIFAQRTLEGNLAEIGDYILNRVGACGLAWGAFSQKASSIGTGCNIFGIPAVLGPHSSKYRRALIAKTYEEDKWKVYDARNGQEMPIPPAPEFLLTTAETWQEAIPMMAKACIRPSDNSMGRAIKLTHWMELHKKYLGGKEPEDWWKFVRTEADLPLATREALLKELEKEHGWEIDWKRKKIISGPKIKFDVSAQPTNLKRLCKEA"

In [18]:
len(seq)


Out[18]:
807

In [33]:
seq


Out[33]:
'MSKLTTGSFSIEDLESVQITINNIVGAAKEAAEEKEKELVNAGPTLFPGLEGYRDDWNFKLLDRYEPVITPMCDQCCYCTYGPCDLSGNKRGACGIDMKGHNGREFFLRVITGTACHAAHGRHLLDHLIEKYGEDLPLTLGQSNVLTPNITISTGLSPKTLGEVKPAMEYVEEQLTQLLATVHAGQESAEIDYDSKALFSGSLDHVGMEISDIVQVAAYDFPKADPEAPLVEIGMGTIDKSKPFLCVIGHNVAGVTYMMDYMEDNNLTDKMEIAGLCCTAIDLTRYKEADRRPPYAKVIGSMSKELKVIRSGMPDVIVVDEQCVRGDIVPEAQKLKIPVIASNPKIMYGLPNRTDADVDETMEELKSGKIPGCVMLDYDKLGELCVRLTMEMAPIRDAAGITALPTDEELVNMVAKCADCGACLLACPEEIDIPEAMGFAKKGDFSYFEEIHDTCIGCRRCEQVCKKEIPILNVIEKIAQKQIAEEKGLMRAGRGQVSDAEIRAEGLNLVMGTTPGIIAIIGCPNYAGGTKDVYYIAEEFLKRNFIVVTTGCGAMDIGMFKDADGKTLYERFPGGFQCGGLANIGSCVSNAHITGAAEKVAAIFAQRTLEGNLAEIGDYILNRVGACGLAWGAFSQKASSIGTGCNIFGIPAVLGPHSSKYRRALIAKTYEEDKWKVYDARNGQEMPIPPAPEFLLTTAETWQEAIPMMAKACIRPSDNSMGRAIKLTHWMELHKKYLGGKEPEDWWKFVRTEADLPLATREALLKELEKEHGWEIDWKRKKIISGPKIKFDVSAQPTNLKRLCKEA'

In [61]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_A.iterrows():
#     print(index)
    i = int(d["i"]) - 1
    j = int(d["j"]) - 1 
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p

In [72]:
data_s = data.astype(int)
t_s = (t>0.08).astype(float)
combined = data_s + t_s * 2
from matplotlib import colors
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-72-7d542ff5db06> in <module>
      1 data_s = data.astype(int)
      2 t_s = (t>0.08).astype(float)
----> 3 combined = data_s + t_s * 2
      4 from matplotlib import colors
      5 cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])

ValueError: operands could not be broadcast together with shapes (766,766) (807,807) 

In [71]:
plt.imshow((t>0.08).astype(float), origin="bottom")
plt.colorbar()


Out[71]:
<matplotlib.colorbar.Colorbar at 0x1a198c42b0>

In [70]:
plt.imshow(data, origin="bottom")


Out[70]:
<matplotlib.image.AxesImage at 0x1a195e3fd0>

In [20]:
plt.imshow((t>0.5).astype(float), origin="bottom")
plt.colorbar()


Out[20]:
<matplotlib.colorbar.Colorbar at 0x1a176c9fd0>

In [21]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_original.iterrows():
#     print(index)
    i = int(d["i"]) - 1
    j = int(d["j"]) - 1 
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p

In [26]:
plt.imshow((t>0.1).astype(float), origin="bottom")
plt.colorbar()


Out[26]:
<matplotlib.colorbar.Colorbar at 0x1a184b3cc0>

In [ ]:


In [ ]:


In [7]:
len(a[5])


Out[7]:
51

In [22]:
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_A = data
seq_A = getSeqFromRaptorXContact(fileLocation)

fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_B = data
seq_B = getSeqFromRaptorXContact(fileLocation)

In [86]:
import textwrap
header = '''\
PFRMAT RR
TARGET {}
AUTHOR RaptorX-Contact
METHOD deep dilated residual networks (one variant of deep CNN). Consult jinboxu@gmail.com for details.
MODEL 1
'''
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/{pdb}.txt"
with open(fileLocation, "w") as out:
    out.write(header.format(pdb))
    out.write("\n".join(textwrap.wrap(seq, width=50))+"\n")
    for index, d in data_A.iterrows():
    #     print(index)
        i = int(d["i"])
        j = int(d["j"])
        p = round(d["p"], 8)
        s = int(d["s"])
        ss = int(d["ss"])
        out.write(f"{i} {j} {s} {ss} {p}\n")
    for index, d in data_B.iterrows():
    #     print(index)
        i = int(d["i"]) + len(seq_A)
        j = int(d["j"]) + len(seq_A)
        p = round(d["p"], 8)
        s = int(d["s"])
        ss = int(d["ss"])
        out.write(f"{i} {j} {s} {ss} {p}\n")
    out.write("END\n")

In [80]:
for index, d in data_B.iterrows():
#     print(index)
    i = int(d["i"]) - 1 + len(seq_A)
    j = int(d["j"]) - 1 + len(seq_A)
    p = round(d["p"], 8)
    s = int(d["s"])
    ss = int(d["ss"])

In [84]:
data_A


Out[84]:
i j s ss p
0 34 59 0.0 8.0 0.997731
1 38 56 0.0 8.0 0.980999
2 41 52 0.0 8.0 0.973823
3 38 59 0.0 8.0 0.966630
4 27 66 0.0 8.0 0.962626
5 34 62 0.0 8.0 0.961573
6 35 59 0.0 8.0 0.916884
7 41 55 0.0 8.0 0.895539
8 31 63 0.0 8.0 0.874988
9 30 62 0.0 8.0 0.842116
10 20 73 0.0 8.0 0.814816
11 27 69 0.0 8.0 0.767721
12 37 55 0.0 8.0 0.758386
13 83 160 0.0 8.0 0.728676
14 42 52 0.0 8.0 0.727045
15 23 69 0.0 8.0 0.709704
16 83 161 0.0 8.0 0.701026
17 37 276 0.0 8.0 0.693589
18 31 62 0.0 8.0 0.680381
19 31 66 0.0 8.0 0.678132
20 37 275 0.0 8.0 0.675751
21 41 48 0.0 8.0 0.668435
22 109 195 0.0 8.0 0.657830
23 217 274 0.0 8.0 0.654746
24 113 199 0.0 8.0 0.654502
25 105 192 0.0 8.0 0.653025
26 109 196 0.0 8.0 0.652981
27 102 188 0.0 8.0 0.652657
28 112 196 0.0 8.0 0.650273
29 106 192 0.0 8.0 0.650126
... ... ... ... ... ...
36555 32 53 0.0 8.0 0.000235
36556 40 63 0.0 8.0 0.000228
36557 35 53 0.0 8.0 0.000224
36558 30 56 0.0 8.0 0.000219
36559 39 54 0.0 8.0 0.000218
36560 42 62 0.0 8.0 0.000209
36561 39 53 0.0 8.0 0.000201
36562 36 65 0.0 8.0 0.000194
36563 40 58 0.0 8.0 0.000191
36564 36 56 0.0 8.0 0.000183
36565 41 61 0.0 8.0 0.000179
36566 36 64 0.0 8.0 0.000146
36567 28 60 0.0 8.0 0.000140
36568 42 61 0.0 8.0 0.000136
36569 40 60 0.0 8.0 0.000136
36570 40 62 0.0 8.0 0.000133
36571 36 53 0.0 8.0 0.000128
36572 32 61 0.0 8.0 0.000121
36573 40 57 0.0 8.0 0.000088
36574 36 60 0.0 8.0 0.000080
36575 32 57 0.0 8.0 0.000078
36576 39 62 0.0 8.0 0.000075
36577 36 57 0.0 8.0 0.000066
36578 40 61 0.0 8.0 0.000064
36579 39 57 0.0 8.0 0.000064
36580 32 60 0.0 8.0 0.000058
36581 32 56 0.0 8.0 0.000049
36582 39 58 0.0 8.0 0.000049
36583 39 61 0.0 8.0 0.000045
36584 36 61 0.0 8.0 0.000044

36585 rows × 5 columns


In [79]:
round(p, 8)


Out[79]:
0.0023355

In [23]:
seq = seq_A + seq_B

In [26]:
seq


Out[26]:
'SMKEPCFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQSMTDKYRLHLSVADLLFVITLPFWAVDAVANWYFGNFLCKAVHVIYTVNLYSSVWILAFISLDRYLAIVHATNSQRPRKLLAEKVVYVGVWIPALLLTIPDFIFANVSEADDRYICCRFYPNDLWVVVFQFQHIMVGLILPGIVILSCYCIIISKLSHSGHQKRKALKPTVILILAFFACWLPYYIGISIDSFILLEIIKQGCEFENTVHKWISITEALAFFHCCLNPILYALGASCHRPDKCCLGYQKRPLPQVLLSSWYPTSQLCSKPGVIFLTKRGRQVCADKSKDWVKKLMQQLPVTA'

In [27]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_A.iterrows():
#     print(index)
    i = int(d["i"]) - 1
    j = int(d["j"]) - 1 
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p
for index, d in data_B.iterrows():
#     print(index)
    i = int(d["i"]) - 1 + len(seq_A)
    j = int(d["j"]) - 1 + len(seq_A)
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p

In [28]:
plt.imshow((t>0.5).astype(float), origin="bottom")
plt.colorbar()


Out[28]:
<matplotlib.colorbar.Colorbar at 0x1a184bfb38>

In [29]:
def getSeqFromRaptorXContact(fileLocation):
    with open(fileLocation) as f:
        a = f.readlines()
    i = 4
    seq = ""
    assert a[i] == "MODEL 1\n"
    i += 1
    while True:
        line = a[i]
        if line[0].isdigit():
            break
        i += 1
        seq += line.strip()
    # print(i)
    return seq
def getContactMapFromPDB(pdbFile):
    cutoff = 9.5
    MAX_OFFSET = 6
    parser = PDBParser()
    structure = parser.get_structure('target', pdbFile)
    all_residues = list(structure.get_residues())
    n = len(all_residues)
    contact_table = np.zeros((n,n))
    # print(pdb, n)#
    for i, res1 in enumerate(all_residues):
        for j, res2 in enumerate(all_residues):
            contact_table[i][j] = res1["CA"]-res2["CA"]

    data = (contact_table < cutoff)
    remove_band = np.eye(n)
    for i in range(1, MAX_OFFSET):
        remove_band += np.eye(n, k=i)
        remove_band += np.eye(n, k=-i)
    data[remove_band==1] = 0
    return data

In [53]:
pdbFile = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4_A.pdb"
cutoff = 9.5
MAX_OFFSET = 6
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)
# chainA = structure[0]["A"]
# all_residues = list(chainA.get_residues())
all_residues = []
for res in structure.get_residues():
    if res.get_id()[0] == " ":
        all_residues.append(res)
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
    for j, res2 in enumerate(all_residues):
        contact_table[i][j] = res1["CA"]-res2["CA"]

data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
    remove_band += np.eye(n, k=i)
    remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0

In [60]:
plt.imshow(data, origin="bottom")


Out[60]:
<matplotlib.image.AxesImage at 0x1a18819128>

In [58]:
data_s = data.astype(int)
t_s = (t>0.55).astype(float)

In [30]:
data = getContactMapFromPDB("/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4.pdb")


/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/StructureBuilder.py:92: PDBConstructionWarning: WARNING: Chain A is discontinuous at line 8106.
  PDBConstructionWarning,
/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/StructureBuilder.py:92: PDBConstructionWarning: WARNING: Chain G is discontinuous at line 8197.
  PDBConstructionWarning,
/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/StructureBuilder.py:92: PDBConstructionWarning: WARNING: Chain A is discontinuous at line 8214.
  PDBConstructionWarning,
/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/StructureBuilder.py:92: PDBConstructionWarning: WARNING: Chain G is discontinuous at line 8653.
  PDBConstructionWarning,
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-30-304a6c71b370> in <module>
----> 1 data = getContactMapFromPDB("/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4.pdb")

<ipython-input-29-efa1942577c9> in getContactMapFromPDB(pdbFile)
     25     for i, res1 in enumerate(all_residues):
     26         for j, res2 in enumerate(all_residues):
---> 27             contact_table[i][j] = res1["CA"]-res2["CA"]
     28 
     29     data = (contact_table < cutoff)

~/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/Entity.py in __getitem__(self, id)
     40     def __getitem__(self, id):
     41         """Return the child with given id."""
---> 42         return self.child_dict[id]
     43 
     44     def __delitem__(self, id):

KeyError: 'CA'

In [ ]:


In [30]:
data = getContactMapFromPDB("/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb")


4rws 346

In [70]:
data_s = data.astype(int)
t_s = (t>0.55).astype(float)

In [72]:
fig, ax = plt.subplots()
# ax.imshow(data_s+1, origin="bottom")
ax.imshow(t, origin="bottom")


Out[72]:
<matplotlib.image.AxesImage at 0x1a24ae82b0>

In [60]:
combined = data_s + t_s * 2

In [63]:
combined = data_s + t_s * 2
from matplotlib import colors
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)


Out[63]:
<matplotlib.image.AxesImage at 0x1a235698d0>

In [87]:
pdbFile = "/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb"
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)

In [88]:
model = structure[0]

In [90]:
a = list(model.get_chains())

In [92]:
c = a[0]

In [94]:
c.id


Out[94]:
'A'

In [95]:
c = "ALL"

In [99]:
"ABC" is not "ALL"


Out[99]:
True

In [93]:
c.get_id()


Out[93]:
'A'

In [55]:
fig, ax = plt.subplots()
ax.imshow(data_s+1, origin="bottom")
ax.imshow(t_s, origin="bottom")


Out[55]:
<matplotlib.image.AxesImage at 0x1a21a9a9b0>

In [43]:
plt.imshow(data_s, origin="bottom")


Out[43]:
<matplotlib.image.AxesImage at 0x1a1cd67828>

In [44]:
# plt.imshow(-data_s, origin="bottom")
plt.imshow(t_s, origin="bottom")
# plt.colorbar()


Out[44]:
<matplotlib.image.AxesImage at 0x1a1d144e48>

In [16]:
346


Out[16]:
70

In [17]:
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"

In [20]:
len(seq)


Out[20]:
276

In [18]:
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
n = int(info.query(f"Protein =='{pdb}'")["Length"])
t = np.zeros((n,n))
for index, d in data.iterrows():
#     print(index)
    i = int(d["i"]) - 1
    j = int(d["j"]) - 1 
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-f0b1e6216f4f> in <module>
      4 data["i"] = data["i"].astype(int)
      5 data["j"] = data["j"].astype(int)
----> 6 n = int(info.query(f"Protein =='{pdb}'")["Length"])
      7 t = np.zeros((n,n))
      8 for index, d in data.iterrows():

NameError: name 'info' is not defined

In [ ]:
def getContactMap(pdb):
    fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/contactmap.txt"
    data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
    data["i"] = data["i"].astype(int)
    data["j"] = data["j"].astype(int)
    n = int(info.query(f"Protein =='{pdb}'")["Length"])
    t = np.zeros((n,n))
    for index, d in data.iterrows():
    #     print(index)
        i = int(d["i"]) - 1
        j = int(d["j"]) - 1 
        p = d["p"]
    #     print(i,j,p)
        t[i,j] = p
        t[j,i] = p