notebook.community

Edit and run



In [1]:

    
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
sys.path.insert(0,'/Users/weilu/Research/opt_server/')
# from notebookFunctions import *
# from .. import notebookFunctions

from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180)    #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2



In [2]:

    
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
from Bio.PDB.Polypeptide import aa3



In [3]:

    
plt.rcParams['figure.figsize'] = [16.18033, 10]    #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100



In [20]:

    
def get_cb_atom(res, resType):
    if resType == "G":
        atom = res["CA"]
    else:
        atom = res["CB"]
    return atom



In [40]:

    
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1

pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
movieFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/movie.pdb"
def get_native_and_decoy_contact_info(pdbFile, movieFile, seq, frame_index=0):
    # frame_index = 0
    cutoff = 9.5
    MAX_OFFSET = 9
    parser = PDBParser()
    structure = parser.get_structure("x", pdbFile)
    models = list(structure.get_models())
    model = models[frame_index]
    all_residues = list(model.get_residues())
    
    decoy_structure = parser.get_structure("x", movieFile)
    decoy_models = list(decoy_structure.get_models())

    info_ = []
    for i, res1 in enumerate(all_residues):
        a1 = get_cb_atom(res1, seq[i])
        for j, res2 in enumerate(all_residues):
            a2 = get_cb_atom(res2, seq[j])
            dis = a1 - a2
            dis_list = [dis]
            for decoy_frame_index in range(-50, 0): 
                decoy_model = decoy_models[decoy_frame_index]
                decoy_all_residues = list(decoy_model.get_residues())
                assert len(all_residues) == len(decoy_all_residues)
                decoy_a1 = get_cb_atom(decoy_all_residues[i], seq[i])
                decoy_a2 = get_cb_atom(decoy_all_residues[j], seq[j])
                decoy_dis = decoy_a1 - decoy_a2
                dis_list.append(decoy_dis)
            with_in_cutoff = any(dis<cutoff for dis in dis_list)
            if j - i > MAX_OFFSET and with_in_cutoff:
                info_.append([pdb, i, j, seq[i], seq[j]] + dis_list)
    
    info = pd.DataFrame(info_, columns=["Protein", "i", "j", "Resi", "Resj"] + ["dis_native"] + [f"dis{i}" for i in range(0,50)])
    return info



In [41]:

    
pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
movieFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/movie.pdb"
info = get_native_and_decoy_contact_info(pdbFile, movieFile, seq, frame_index=0)



In [83]:

    
info_ = []
for i, pdb in enumerate(pdb_list):
    print(i)
    pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
    seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
    movieFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_new_2/{pdb}/0/movie.pdb"
    info = get_native_and_decoy_contact_info(pdbFile, movieFile, seq, frame_index=0)
    info_.append(info)
    
    
    
    
info = pd.concat(info_)



In [85]:

    
info.to_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_run/dis_info.csv")



In [87]:

    
info = pd.read_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_run/dis_info.csv", index_col=0)



In [180]:

    
a = pd.read_csv("/Users/weilu/opt/parameters/side_chain/cbd_cbd_real_contact_symmetric.csv")



In [38]:

    
a









    Out[38]:







  
    
      
      Res1
      Res2
      Type
      Theta
      Index1
      Index2
      r
      Interaction
      Index
      Gamma
      theta_gamma
    
  
  
    
      0
      ILE
      ASN
      Protein
      0.509
      0
      101
      9.496481
      Protein
      256
      -0.463
      -0.235667
    
    
      1
      ILE
      ASN
      Protein
      0.981
      32
      101
      9.105055
      Protein
      256
      -0.463
      -0.454203
    
    
      2
      ILE
      ASN
      Protein
      0.000
      47
      86
      11.481540
      Protein
      256
      -0.463
      -0.000000
    
    
      3
      ILE
      ASN
      Protein
      0.548
      47
      92
      9.480688
      Protein
      256
      -0.463
      -0.253724
    
    
      4
      ILE
      ASN
      Protein
      0.000
      49
      86
      14.068905
      Protein
      256
      -0.463
      -0.000000
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      4129
      ASN
      PHE
      Water
      0.000
      92
      117
      8.285746
      Water
      470
      0.174
      0.000000
    
    
      4130
      ASN
      PHE
      Direct
      0.000
      92
      117
      8.285746
      Direct
      50
      0.113
      0.000000
    
    
      4131
      ASN
      GLU
      Protein
      0.000
      92
      118
      10.309171
      Protein
      253
      -0.033
      -0.000000
    
    
      4132
      ASN
      GLU
      Water
      0.000
      92
      118
      10.309171
      Water
      463
      -0.182
      -0.000000
    
    
      4133
      ASN
      GLU
      Direct
      0.000
      92
      118
      10.309171
      Direct
      43
      -0.151
      -0.000000
    
  

4134 rows × 11 columns



In [187]:

    
res1_name = "ARG"
res2_name = "VAL"
a.query(f"ResName1=='{res1_name}' and ResName2=='{res2_name}'")









    Out[187]:







  
    
      
      ResName1
      ResName2
      r_max
      r_min
      r_diff



In [188]:

    
a.query(f"ResName1=='{res2_name}' and ResName2=='{res1_name}'")









    Out[188]:







  
    
      
      ResName1
      ResName2
      r_max
      r_min
      r_diff
    
  
  
    
      67
      VAL
      ARG
      5.138564
      3.673704
      1.46486



In [189]:

    
b = a.query(f"ResName1=='{res1_name}' and ResName2=='{res2_name}'")
if len(b) == 0:
    b = a.query(f"ResName1=='{res2_name}' and ResName2=='{res1_name}'")
r_min = float(b["r_min"])
r_max = float(b["r_max"])



In [199]:



In [202]:

    
from Bio.PDB.Polypeptide import three_to_one



In [204]:

    
three_to_one("SER")









    Out[204]:





'S'



In [201]:



In [205]:

    
gamma_se_map_1_letter = {   'A': 0,  'R': 1,  'N': 2,  'D': 3,  'C': 4,
                            'Q': 5,  'E': 6,  'G': 7,  'H': 8,  'I': 9,
                            'L': 10, 'K': 11, 'M': 12, 'F': 13, 'P': 14,
                            'S': 15, 'T': 16, 'W': 17, 'Y': 18, 'V': 19}
def three_to_index(resName):
    return gamma_se_map_1_letter[three_to_one(resName)]



In [207]:

    
three_to_index("GLY")









    Out[207]:





7



In [208]:

    
r_min_direct_table = np.zeros((20, 20))
r_min_direct_table[three_to_index("GLY"),:] = 2.5
r_min_direct_table[:,three_to_index("GLY")] = 2.5
r_max_direct_table = np.zeros((20, 20))
r_max_direct_table[three_to_index("GLY"),:] = 6.5
r_max_direct_table[:,three_to_index("GLY")] = 6.5

r_min_mediated_table = np.zeros((20, 20))
r_min_mediated_table[three_to_index("GLY"),:] = 6.5
r_min_mediated_table[:,three_to_index("GLY")] = 6.5
r_max_mediated_table = np.zeros((20, 20))
r_max_mediated_table[three_to_index("GLY"),:] = 9.5
r_max_mediated_table[:,three_to_index("GLY")] = 9.5



for i, line in a.iterrows():
    res1 = line["ResName1"]
    res2 = line["ResName2"]
    r_min = line["r_min"]
    r_max = line["r_max"]
    r_min_direct_table[three_to_index(res1)][three_to_index(res2)] = r_min - 0.5
    r_min_direct_table[three_to_index(res2)][three_to_index(res1)] = r_min - 0.5
    
    r_max_direct_table[three_to_index(res1)][three_to_index(res2)] = r_max + 1.5
    r_max_direct_table[three_to_index(res2)][three_to_index(res1)] = r_max + 1.5
    
    r_min_mediated_table[three_to_index(res1)][three_to_index(res2)] = r_max + 1.5
    r_min_mediated_table[three_to_index(res2)][three_to_index(res1)] = r_max + 1.5
    
    r_max_mediated_table[three_to_index(res1)][three_to_index(res2)] = r_max + 4.5
    r_max_mediated_table[three_to_index(res2)][three_to_index(res1)] = r_max + 4.5



In [194]:

    
a.query(f"ResName1=='GLY'")









    Out[194]:







  
    
      
      ResName1
      ResName2
      r_max
      r_min
      r_diff



In [182]:

    
a["r_diff"] = a["r_max"] - a["r_min"]



In [184]:

    
a["r_min"].hist()









    Out[184]:





<matplotlib.axes._subplots.AxesSubplot at 0x16805a9b0>



In [183]:

    
a["r_diff"].hist()









    Out[183]:





<matplotlib.axes._subplots.AxesSubplot at 0x13751fa90>



In [88]:

    
info["with_in_cutoff"] = info.apply(lambda x: sum(x[5:]<6.5), axis=1)



In [89]:

    
cutoff = 6.5
info["correct"] = info.apply(lambda x: sum((x[5]<cutoff) & (x[6:]<cutoff)), axis=1)



In [90]:

    
info["n_decoy"] = 50



In [91]:

    
result = info.query("with_in_cutoff > 0").groupby(["Resi", "Resj"])[["with_in_cutoff", "correct", "n_decoy"]].sum().reset_index()



In [92]:

    
result["correct_ratio"] = result["correct"] / result["n_decoy"]



In [134]:

    
a = -1000.123
b = -10.1234
c = -1000.12
print(f"www{a:>8.2f}ww{b:>8.3f}ww{c:>8.3f}ww")
print('{:.8s}'.format('{:0.4f}'.format(a)))









    



www-1000.12ww -10.123ww-1000.120ww
-1000.12



In [ ]:

    
def prepare_virtual_sites(pdb_file, use_cis_proline=False):
    parser = PDBParser(QUIET=True)
    structure=parser.get_structure('X',pdb_file,)
    for model in structure:
        for chain in model:
            r_im={}
            r_i={}
            for residue in chain:
                r_im=r_i
                r_i={}
                for atom in residue:
                    r_i[atom.get_name()]=atom
                if use_cis_proline and residue.get_resname() == "IPR":
                    if 'N' in r_i:
                        r_i['N'].set_coord(-0.2094*r_im['CA'].get_coord()+ 0.6908*r_i['CA'].get_coord() + 0.5190*r_im['O'].get_coord())
                    if 'C' in r_im:
                        r_im['C'].set_coord(0.2196*r_im['CA'].get_coord()+ 0.2300*r_i['CA'].get_coord() + 0.5507*r_im['O'].get_coord())
                    if 'H' in r_i:
                        r_i['H'].set_coord(-0.9871*r_im['CA'].get_coord()+ 0.9326*r_i['CA'].get_coord() + 1.0604*r_im['O'].get_coord())
                else:
                    if 'N' in r_i:
                        r_i['N'].set_coord(0.48318*r_im['CA'].get_coord()+ 0.70328*r_i['CA'].get_coord()- 0.18643 *r_im['O'].get_coord())
                    if 'C' in r_im:
                        r_im['C'].set_coord(0.44365*r_im['CA'].get_coord()+ 0.23520*r_i['CA'].get_coord()+ 0.32115 *r_im['O'].get_coord())
                    if 'H' in r_i:
                        r_i['H'].set_coord(0.84100*r_im['CA'].get_coord()+ 0.89296*r_i['CA'].get_coord()- 0.73389 *r_im['O'].get_coord())
    io = PDBIO()
    io.set_structure(structure)
    io.save(pdb_file)



In [171]:

    
pdb_file = "/Users/weilu/Research/server/mar_2020/C1903/test2/C1903-openmmawsem.pdb"
parser = PDBParser(QUIET=True)
structure=parser.get_structure('X',pdb_file,)
res = list(structure.get_residues())



In [172]:



In [ ]:

    
def prepare_virtual_sites(pdb_file, use_cis_proline=False):
    output_file = "/Users/weilu/Research/server/mar_2020/C1903/test2/new.pdb"
    output = open(output_file, "w")
    with open(pdb_file) as f:
        index = 0
        for line in f:
            # print(line)
            splitline = line.split()
            if len(line)>4 and line[0:4] == "ATOM":
                try:
                    atom_index=line[6:11].strip()
                    atom_type=line[12:16].strip()
                    res_type=line[17:20].strip()
                    chain=line[21].strip()
                    res_index=line[22:26].strip()
                    x=line[30:38].strip()
                    y=line[38:46].strip()
                    z=line[46:54].strip()
                    element = line[76:78].strip()
                except ValueError:
                    print(line)
                    raise
            else:
                continue
            res_index_zero_base = int(res_index) - 1

            r_im = res[res_index_zero_base-1]
            r_i = res[res_index_zero_base]
            try:
                r_ip = res[res_index_zero_base+1]
            except:
                r_ip = res[res_index_zero_base]  # won't be used
            if use_cis_proline and res_type == "IPR":
                n_coord = -0.2094*r_im['CA'].get_coord()+ 0.6908*r_i['CA'].get_coord() + 0.5190*r_im['O'].get_coord()
                c_coord = 0.2196*r_i['CA'].get_coord()+ 0.2300*r_ip['CA'].get_coord() + 0.5507*r_i['O'].get_coord()
                h_coord = -0.9871*r_im['CA'].get_coord()+ 0.9326*r_i['CA'].get_coord() + 1.0604*r_im['O'].get_coord()
            else:
                n_coord = 0.48318*r_im['CA'].get_coord()+ 0.70328*r_i['CA'].get_coord()- 0.18643 *r_im['O'].get_coord()
                c_coord = 0.44365*r_i['CA'].get_coord()+ 0.23520*r_ip['CA'].get_coord()+ 0.32115 *r_i['O'].get_coord()
                h_coord = 0.84100*r_im['CA'].get_coord()+ 0.89296*r_i['CA'].get_coord()- 0.73389 *r_im['O'].get_coord()

            line_list=list(line)
            index += 1
            line_list[6:11] = "{:5d}".format(index)
            if atom_type == "N":
                line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(n_coord[0]))
                line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(n_coord[1]))
                line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(n_coord[2])))
            if atom_type == "C":
                line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(c_coord[0]))
                line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(c_coord[1]))
                line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(c_coord[2])))
            if atom_type == "H":
                line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(h_coord[0]))
                line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(h_coord[1]))
                line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(h_coord[2])))
            new_line=''.join(line_list)
            output.write(new_line)
    index += 1
    line_list[6:11] = "{:5d}".format(index)
    line_list[0:4] = "TER "
    line_list[30:78] = " " * 48
    new_line=''.join(line_list)
    output.write(new_line)
    output.write("END\n")
    output.close()



In [177]:

    
use_cis_proline = False
output_file = "/Users/weilu/Research/server/mar_2020/C1903/test2/new.pdb"
output = open(output_file, "w")
with open(pdb_file) as f:
    index = 0
    for line in f:
        # print(line)
        splitline = line.split()
        if len(line)>4 and line[0:4] == "ATOM":
            try:
                atom_index=line[6:11].strip()
                atom_type=line[12:16].strip()
                res_type=line[17:20].strip()
                chain=line[21].strip()
                res_index=line[22:26].strip()
                x=line[30:38].strip()
                y=line[38:46].strip()
                z=line[46:54].strip()
                element = line[76:78].strip()
            except ValueError:
                print(line)
                raise
        else:
            continue
#         if int(res_index) == terminal_residues[chain][0]:
#             awsem_atoms.remove("N")
#             awsem_atoms.remove("H")
#         if int(res_index) == terminal_residues[chain][1]:
#             awsem_atoms.remove("C")
        res_index_zero_base = int(res_index) - 1
        
        r_im = res[res_index_zero_base-1]
        r_i = res[res_index_zero_base]
        try:
            r_ip = res[res_index_zero_base+1]
        except:
            r_ip = res[res_index_zero_base]  # won't be used
        if use_cis_proline and res_type == "IPR":
            n_coord = -0.2094*r_im['CA'].get_coord()+ 0.6908*r_i['CA'].get_coord() + 0.5190*r_im['O'].get_coord()
            c_coord = 0.2196*r_i['CA'].get_coord()+ 0.2300*r_ip['CA'].get_coord() + 0.5507*r_i['O'].get_coord()
            h_coord = -0.9871*r_im['CA'].get_coord()+ 0.9326*r_i['CA'].get_coord() + 1.0604*r_im['O'].get_coord()
        else:
            n_coord = 0.48318*r_im['CA'].get_coord()+ 0.70328*r_i['CA'].get_coord()- 0.18643 *r_im['O'].get_coord()
            c_coord = 0.44365*r_i['CA'].get_coord()+ 0.23520*r_ip['CA'].get_coord()+ 0.32115 *r_i['O'].get_coord()
            h_coord = 0.84100*r_im['CA'].get_coord()+ 0.89296*r_i['CA'].get_coord()- 0.73389 *r_im['O'].get_coord()

        line_list=list(line)
        index += 1
        line_list[6:11] = "{:5d}".format(index)
        if atom_type == "N":
            line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(n_coord[0]))
            line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(n_coord[1]))
            line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(n_coord[2])))
        if atom_type == "C":
            line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(c_coord[0]))
            line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(c_coord[1]))
            line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(c_coord[2])))
        if atom_type == "H":
            line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(h_coord[0]))
            line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(h_coord[1]))
            line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(h_coord[2])))
        new_line=''.join(line_list)
        output.write(new_line)
index += 1
line_list[6:11] = "{:5d}".format(index)
line_list[0:4] = "TER "
line_list[30:78] = " " * 48
new_line=''.join(line_list)
output.write(new_line)
output.write("END\n")
output.close()



In [154]:

    
n_coord









    Out[154]:





array([  753.12573,   619.23346, -1735.6355 ], dtype=float32)



In [149]:

    
new_line









    Out[149]:





'ATOM  11109  N   IGL A 685    753.126 619.2331-1735.631.00  0.00           N  \n'



In [162]:

    
line









    Out[162]:





'ATOM  11113  O   IGL A 685     755.733 620.303-1734.37  1.00  0.00           O  \n'



In [ ]:



In [159]:

    
'{:.8s}'.format('{:8.3f}'.format(n_coord[2]))









    Out[159]:





'-1735.63'



In [135]:

    
print(f"www{str(a):.8s}ww{b:>8.3f}ww{c:>8.3f}ww")









    



www-1000.12ww -10.123ww-1000.120ww



In [93]:

    
direct_contact_table = np.zeros((20,20)) - 1 
for i, line in result.iterrows():
    resi = line["Resi"]
    resj = line["Resj"]
    correct_ratio = line["correct_ratio"]
    direct_contact_table[d1_to_index[resi]][d1_to_index[resj]] = correct_ratio
    if resi != resj:
        direct_contact_table[d1_to_index[resj]][d1_to_index[resi]] = correct_ratio
plt.imshow(direct_contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)



In [80]:

    
direct_contact_table = np.zeros((20,20)) - 1 
for i, line in result.iterrows():
    resi = line["Resi"]
    resj = line["Resj"]
    correct_ratio = line["correct_ratio"]
    direct_contact_table[d1_to_index[resi]][d1_to_index[resj]] = correct_ratio
    if resi != resj:
        direct_contact_table[d1_to_index[resj]][d1_to_index[resi]] = correct_ratio
plt.imshow(direct_contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)



In [72]:

    
result.reset_index()









    Out[72]:







  
    
      
      Resi
      Resj
      with_in_cutoff
      correct
      n_decoy
    
  
  
    
      0
      A
      A
      0
      0
      50
    
    
      1
      A
      D
      0
      0
      100
    
    
      2
      A
      E
      0
      0
      250
    
    
      3
      A
      G
      5
      0
      100
    
    
      4
      A
      H
      0
      0
      100
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      222
      Y
      Q
      2
      0
      300
    
    
      223
      Y
      R
      16
      0
      50
    
    
      224
      Y
      S
      0
      0
      50
    
    
      225
      Y
      V
      4
      0
      250
    
    
      226
      Y
      W
      0
      0
      150
    
  

227 rows × 5 columns



In [63]:

    
info.query("with_in_cutoff > 0")









    Out[63]:







  
    
      
      Protein
      i
      j
      Resi
      Resj
      dis_native
      dis0
      dis1
      dis2
      dis3
      ...
      dis42
      dis43
      dis44
      dis45
      dis46
      dis47
      dis48
      dis49
      with_in_cutoff
      correct
    
  
  
    
      1
      2a0b
      0
      96
      S
      E
      24.540379
      8.589619
      11.612119
      9.520317
      11.036819
      ...
      10.001908
      13.052809
      15.719584
      7.920152
      8.285571
      9.403168
      8.631104
      9.058750
      3
      0
    
    
      2
      2a0b
      0
      97
      S
      E
      17.403204
      8.989487
      11.158300
      9.647176
      11.726250
      ...
      7.322862
      10.191383
      9.798911
      8.174930
      8.526396
      11.080670
      11.505523
      9.939404
      2
      0
    
    
      4
      2a0b
      0
      100
      S
      E
      21.171255
      4.933434
      7.880610
      5.016576
      7.941772
      ...
      4.796510
      7.840240
      10.343412
      5.293333
      5.680943
      6.304268
      5.815565
      4.677626
      33
      0
    
    
      6
      2a0b
      0
      103
      S
      R
      22.197365
      8.802500
      4.923718
      7.762784
      6.222366
      ...
      8.946397
      5.851907
      11.568497
      7.744026
      7.826056
      8.860541
      5.331914
      6.970852
      15
      0
    
    
      7
      2a0b
      0
      104
      S
      H
      13.256738
      7.489680
      9.312119
      7.424554
      9.970190
      ...
      6.583136
      7.799269
      9.691411
      8.463796
      7.372634
      10.546615
      9.840426
      8.611886
      3
      0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      809
      2a0b
      79
      91
      I
      V
      5.729116
      6.418045
      8.555470
      6.038908
      8.960972
      ...
      5.884872
      5.512733
      5.969506
      6.249822
      7.903872
      5.836083
      6.846112
      5.806306
      21
      20
    
    
      811
      2a0b
      79
      94
      I
      W
      6.115420
      6.053108
      6.264494
      5.522557
      9.787152
      ...
      5.877768
      5.688467
      6.387781
      6.113961
      4.718327
      5.912821
      5.814579
      6.234229
      41
      40
    
    
      812
      2a0b
      79
      95
      I
      I
      5.385581
      7.151258
      8.017377
      7.467034
      7.508904
      ...
      5.204203
      6.583248
      5.792931
      7.581868
      9.660872
      4.388875
      8.686253
      5.543818
      11
      10
    
    
      814
      2a0b
      79
      98
      I
      M
      4.980760
      5.208953
      6.010066
      6.671427
      6.100369
      ...
      5.462386
      9.703018
      4.854737
      7.726511
      6.881110
      6.241440
      5.867614
      6.434775
      27
      26
    
    
      821
      2a0b
      84
      94
      L
      W
      7.445224
      7.125351
      5.421778
      6.440860
      5.713799
      ...
      6.226047
      5.620839
      5.281493
      7.350391
      7.135801
      6.470656
      6.404382
      6.633040
      28
      0
    
  

314 rows × 58 columns



In [5]:

    
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_new_4_without_burial/1poa/1/lastFrame.pdb"
parser = PDBParser()
s = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)



In [47]:

    
structure = s
kappa=5.0
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
cb_density = calculate_cb_density(res_list, neighbor_list)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
# kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
    res1index = get_local_index(res1)
    res1chain = get_chain(res1)
    rho_i = cb_density[res1globalindex]
    for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+2.0):
        res2index = get_local_index(res2)
        res2chain = get_chain(res2)
        res2globalindex = get_global_index(res_list, res2)
        rho_j = cb_density[res2globalindex]

        if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
            res1type = get_res_type(res_list, res1)
            res2type = get_res_type(res_list, res2)
            rij = get_interaction_distance(res1, res2)
            theta = interaction_well(rij, r_min, r_max, kappa)
            water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
            protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
            data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex])
            data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex])
            direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
            data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex])
            # protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
            # water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2"])



In [74]:



In [4]:

    
def get_interaction_data(structure):
    # get all the pair of interaction, direct and mediated. as a dataFrame.
    res_list = get_res_list(structure)
    neighbor_list = get_neighbor_list(structure)
    sequence = get_sequence_from_structure(structure)
    cb_density = calculate_cb_density(res_list, neighbor_list)
    r_min_direct = 2.5
    r_max_direct = 6.5
    r_min = 6.5
    r_max = 9.5
    kappa = 5.0
    min_seq_sep = 10
    density_threshold = 2.6
    density_kappa = 7.0
    # phi_mediated_contact_well = np.zeros((2, 20,20))
    v_mediated = 0
    data_ = []
    for res1globalindex, res1 in enumerate(res_list):
        res1index = get_local_index(res1)
        res1chain = get_chain(res1)
        rho_i = cb_density[res1globalindex]
        for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+2.0):
            res2index = get_local_index(res2)
            res2chain = get_chain(res2)
            res2globalindex = get_global_index(res_list, res2)
            rho_j = cb_density[res2globalindex]

            if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
                res1type = get_res_type(res_list, res1)
                res2type = get_res_type(res_list, res2)
                rij = get_interaction_distance(res1, res2)
                theta = interaction_well(rij, r_min, r_max, kappa)
                water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
                protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
                data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij])
                data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij])
                direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
                data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij])
                # protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
                # water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
    data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r"])

    # contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
    # contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
    # contact_gammas["Type"] = contact_gammas["Interaction"]
    # a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
    # a["theta_gamma"] = a["Theta"] * a["Gamma"]
    return data


def get_contact_gamma_info(gammaFile):
    # check the gamma.
    # read in gamma, and sort by size.
    # gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
    gamma = np.loadtxt(gammaFile)

    res_type_map_letters = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G',
                            'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

    inverse_res_type_map = dict(list(zip(list(range(20)), res_type_map_letters)))
    c = 0
    info_ = []
    for i in range(20):
        for j in range(i, 20):
            info_.append(["Direct", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
            if i != j:
                info_.append(["Direct", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
            c += 1
    for i in range(20):
        for j in range(i, 20):
            info_.append(["Protein", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
            if i != j:
                info_.append(["Protein", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
            info_.append(["Water", res_type_map_letters[i], res_type_map_letters[j], c+210, round(gamma[c+210],3)])
            if i != j:
                info_.append(["Water", res_type_map_letters[j], res_type_map_letters[i], c+210, round(gamma[c+210],3)])
            c += 1
    contact_gammas = pd.DataFrame(info_, columns=["Interaction", "Res1", "Res2", "Index", "Gamma"])
    return contact_gammas



In [30]:

    
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_larger_excl_withoutBurial/iter_larger_excl_withoutBurial_30"
contact_gammas = get_contact_gamma_info(gammaFile)

contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]

# pdb = "1sfp"
# pdb = "1who"
pdb = "1jon"
pdb = "1neu"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_1_stronger_exclude_withoutBurial/{pdb}/0/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)

a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_1_stronger_exclude_withoutBurial/{pdb}/0/crystal_structure.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data_native = get_interaction_data(structure)

a_native = data_native.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a_native["theta_gamma"] = a_native["Theta"] * a_native["Gamma"]



In [31]:

    
a[["Type", "theta_gamma"]].groupby("Type").sum()









    Out[31]:







  
    
      
      theta_gamma
    
    
      Type
      
    
  
  
    
      Direct
      -101.276373
    
    
      Protein
      -105.200971
    
    
      Water
      -1.311072



In [32]:

    
a_native[["Type", "theta_gamma"]].groupby("Type").sum()









    Out[32]:







  
    
      
      theta_gamma
    
    
      Type
      
    
  
  
    
      Direct
      -59.127062
    
    
      Protein
      -66.563254
    
    
      Water
      -4.710177



In [33]:

    
a[["Type", "theta_gamma"]].groupby("Type").sum().sum()









    Out[33]:





theta_gamma   -207.788416
dtype: float64



In [34]:

    
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2.sort_values("theta_gamma").tail(5)









    Out[34]:







  
    
      
      Res1
      Res2
      Type
      theta_gamma
    
  
  
    
      276
      SER
      VAL
      Protein
      7
    
    
      264
      SER
      LYS
      Protein
      7
    
    
      140
      GLY
      VAL
      Protein
      7
    
    
      165
      ILE
      ILE
      Direct
      7
    
    
      364
      VAL
      SER
      Protein
      8



In [35]:

    
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3.sort_values("theta_gamma").head(5)









    Out[35]:







  
    
      
      Res1
      Res2
      Type
      theta_gamma
    
  
  
    
      165
      ILE
      ILE
      Direct
      -5.859840
    
    
      54
      ASP
      LYS
      Direct
      -4.141928
    
    
      181
      ILE
      VAL
      Direct
      -3.560194
    
    
      171
      ILE
      PHE
      Direct
      -3.555981
    
    
      29
      ARG
      PHE
      Protein
      -2.875323



In [36]:

    
b2_native = a_native.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2_native.sort_values("theta_gamma").tail(5)









    Out[36]:







  
    
      
      Res1
      Res2
      Type
      theta_gamma
    
  
  
    
      299
      VAL
      SER
      Protein
      5
    
    
      142
      ILE
      PHE
      Protein
      5
    
    
      137
      ILE
      ILE
      Protein
      5
    
    
      307
      VAL
      VAL
      Protein
      5
    
    
      172
      PHE
      ASP
      Protein
      6



In [37]:

    
b3_native = a_native.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3_native.sort_values("theta_gamma").head(5)









    Out[37]:







  
    
      
      Res1
      Res2
      Type
      theta_gamma
    
  
  
    
      250
      TRP
      ILE
      Direct
      -4.199720
    
    
      306
      VAL
      VAL
      Direct
      -3.221582
    
    
      180
      PHE
      ILE
      Direct
      -2.645379
    
    
      136
      ILE
      ILE
      Direct
      -2.453640
    
    
      292
      VAL
      LEU
      Direct
      -2.448768



In [60]:

    
gammaFile = "/Users/weilu/Research/server/mar_2020/cath_dataset_shuffle_optimization/optimization_iter0_com_density_rmin_2p5/saved_gammas/iter0_cutoff600_impose_Aprime_constraint"
contact_gammas = get_contact_gamma_info(gammaFile)

contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]

pdb = "1sfp"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_0_stronger_exclude_volume/{pdb}/0/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)

a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]



In [61]:

    
a[["Type", "theta_gamma"]].groupby("Type").sum()









    Out[61]:







  
    
      
      theta_gamma
    
    
      Type
      
    
  
  
    
      Direct
      -71.788934
    
    
      Protein
      -11.780707
    
    
      Water
      -2.330682



In [62]:

    
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2.sort_values("theta_gamma").tail(5)



In [63]:









    Out[63]:







  
    
      
      Res1
      Res2
      Type
      theta_gamma
    
  
  
    
      111
      GLY
      TYR
      Protein
      5
    
    
      134
      ILE
      LEU
      Protein
      6
    
    
      132
      ILE
      ILE
      Protein
      6
    
    
      130
      ILE
      GLY
      Protein
      6
    
    
      146
      ILE
      VAL
      Protein
      6



In [20]:

    
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3.sort_values("theta_gamma").head(5)



In [23]:









    Out[23]:







  
    
      
      Res1
      Res2
      Type
      theta_gamma
    
  
  
    
      19
      CYS
      CYS
      Direct
      -25.324650
    
    
      149
      VAL
      LEU
      Direct
      -11.089641
    
    
      20
      CYS
      CYS
      Protein
      -6.585576
    
    
      26
      CYS
      LEU
      Protein
      -3.843926
    
    
      104
      PHE
      CYS
      Direct
      -3.533796



In [25]:

    
fastaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_0_stronger_exclude_volume/3cyr/0/crystal_structure.fasta"
seq = read_fasta(fastaFile)



In [26]:

    
seq.count("C")









    Out[26]:





8



In [27]:

    
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_run/training_set.csv")
specific_decoys = data.query("Length < 150 and Length > 70").reset_index(drop=True)
pdb_list = specific_decoys["Protein"].to_list()
pdb_list = [a.lower() for a in pdb_list]
skip_pdb_list = ["1puc", "1skz"]
skip_pdb_list += ["1msc", "1fmb", "1gvp", "2tgi", "1whi", "1baj", "1rmd", "1div"]  #dimer.
skip_pdb_list += ["1aqe"]  # lots of ligand
filtered_pdb_list = [x for x in pdb_list if x not in skip_pdb_list]
pdb_list = filtered_pdb_list



In [65]:

    
info_ = []
for pdb in pdb_list:
    seq = np.loadtxt(f"/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/database/S20_seq/{pdb}.seq", dtype=str)
    seq = str(seq)
    info_.append([pdb, seq, seq.count("C"), len(seq)])



In [66]:

    
data = pd.DataFrame(info_, columns=["Protein", "Seq", "Count", "Length"])



In [67]:

    
data.to_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/cys_count_data.csv")



In [47]:

    
data["Length"] = data["Seq"].apply(lambda x:len(x))



In [70]:

    
print(data.sort_values("Count").query("Count >= 6")["Protein"].to_list())









    



['1by2', '1rcb', '1hyp', '3lzt', '3cyr', '7rsa', '1rzl', '1b6e', '1poc', '1bea', '1poa']



In [64]:

    
data









    Out[64]:







  
    
      
      Res1
      Res2
      Type
      Theta
      Index1
      Index2
      r
    
  
  
    
      0
      LEU
      ALA
      Protein
      0.000
      0
      18
      11.045428
    
    
      1
      LEU
      ALA
      Water
      0.000
      0
      18
      11.045428
    
    
      2
      LEU
      ALA
      Direct
      0.000
      0
      18
      11.045428
    
    
      3
      LEU
      ARG
      Protein
      0.033
      0
      40
      9.838098
    
    
      4
      LEU
      ARG
      Water
      0.000
      0
      40
      9.838098
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      3097
      TYR
      GLU
      Water
      0.000
      89
      101
      15.198022
    
    
      3098
      TYR
      GLU
      Direct
      0.000
      89
      101
      15.198022
    
    
      3099
      ILE
      TYR
      Protein
      0.000
      90
      100
      10.454025
    
    
      3100
      ILE
      TYR
      Water
      0.000
      90
      100
      10.454025
    
    
      3101
      ILE
      TYR
      Direct
      0.000
      90
      100
      10.454025
    
  

3102 rows × 7 columns



In [20]:

    
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
contact_gammas = get_contact_gamma_info(gammaFile)



In [75]:

    
# contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
# contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]
a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]



In [77]:

    
a["theta_gamma"] = a["Theta"] * a["Gamma"]



In [78]:

    
a[["Type", "theta_gamma"]].groupby("Type").sum()









    Out[78]:







  
    
      
      theta_gamma
    
    
      Type
      
    
  
  
    
      Direct
      -949.732540
    
    
      Protein
      -479.435681
    
    
      Water
      -8.402658



In [79]:

    
a["theta_gamma"].sum()









    Out[79]:





-1437.5708789999999



In [80]:

    
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()



In [83]:

    
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()



In [91]:

    
b3.sort_values("theta_gamma").head(200)["theta_gamma"].sum()









    Out[91]:





-1483.202181



In [96]:

    
a.query("Res1 == 'TYR' and Res2 == 'CYS' and Theta > 0.1 and Type =='Direct'").sort_values("r")









    Out[96]:







  
    
      
      Res1
      Res2
      Type
      Theta
      Index1
      Index2
      r
      Interaction
      Index
      Gamma
      theta_gamma
    
  
  
    
      890
      TYR
      CYS
      Direct
      0.956
      23
      77
      2.808838
      Direct
      88
      -4.012
      -3.835472
    
    
      921
      TYR
      CYS
      Direct
      0.999
      66
      84
      3.204737
      Direct
      88
      -4.012
      -4.007988
    
    
      876
      TYR
      CYS
      Direct
      1.000
      2
      89
      3.921630
      Direct
      88
      -4.012
      -4.012000
    
    
      877
      TYR
      CYS
      Direct
      1.000
      2
      84
      3.969086
      Direct
      88
      -4.012
      -4.012000
    
    
      926
      TYR
      CYS
      Direct
      1.000
      68
      91
      4.097748
      Direct
      88
      -4.012
      -4.012000
    
    
      897
      TYR
      CYS
      Direct
      1.000
      26
      91
      4.154008
      Direct
      88
      -4.012
      -4.012000
    
    
      912
      TYR
      CYS
      Direct
      1.000
      62
      117
      4.176590
      Direct
      88
      -4.012
      -4.012000
    
    
      898
      TYR
      CYS
      Direct
      1.000
      26
      70
      4.354343
      Direct
      88
      -4.012
      -4.012000
    
    
      906
      TYR
      CYS
      Direct
      1.000
      50
      84
      4.464048
      Direct
      88
      -4.012
      -4.012000
    
    
      905
      TYR
      CYS
      Direct
      1.000
      50
      117
      4.704973
      Direct
      88
      -4.012
      -4.012000
    
    
      873
      TYR
      CYS
      Direct
      1.000
      2
      117
      4.796232
      Direct
      88
      -4.012
      -4.012000
    
    
      909
      TYR
      CYS
      Direct
      1.000
      50
      89
      4.796544
      Direct
      88
      -4.012
      -4.012000
    
    
      879
      TYR
      CYS
      Direct
      1.000
      2
      49
      4.818510
      Direct
      88
      -4.012
      -4.012000
    
    
      901
      TYR
      CYS
      Direct
      1.000
      26
      77
      4.863667
      Direct
      88
      -4.012
      -4.012000
    
    
      923
      TYR
      CYS
      Direct
      1.000
      66
      91
      4.958446
      Direct
      88
      -4.012
      -4.012000
    
    
      895
      TYR
      CYS
      Direct
      1.000
      26
      84
      4.969316
      Direct
      88
      -4.012
      -4.012000
    
    
      929
      TYR
      CYS
      Direct
      1.000
      104
      117
      4.990378
      Direct
      88
      -4.012
      -4.012000
    
    
      925
      TYR
      CYS
      Direct
      1.000
      68
      84
      5.031310
      Direct
      88
      -4.012
      -4.012000
    
    
      870
      TYR
      CYS
      Direct
      1.000
      2
      25
      5.094150
      Direct
      88
      -4.012
      -4.012000
    
    
      900
      TYR
      CYS
      Direct
      1.000
      26
      89
      5.218085
      Direct
      88
      -4.012
      -4.012000
    
    
      913
      TYR
      CYS
      Direct
      1.000
      62
      84
      5.235210
      Direct
      88
      -4.012
      -4.012000
    
    
      927
      TYR
      CYS
      Direct
      1.000
      68
      89
      5.246518
      Direct
      88
      -4.012
      -4.012000
    
    
      882
      TYR
      CYS
      Direct
      1.000
      2
      91
      5.403767
      Direct
      88
      -4.012
      -4.012000
    
    
      915
      TYR
      CYS
      Direct
      1.000
      62
      89
      5.464846
      Direct
      88
      -4.012
      -4.012000
    
    
      920
      TYR
      CYS
      Direct
      1.000
      66
      89
      5.552884
      Direct
      88
      -4.012
      -4.012000
    
    
      928
      TYR
      CYS
      Direct
      1.000
      68
      98
      5.720343
      Direct
      88
      -4.012
      -4.012000
    
    
      903
      TYR
      CYS
      Direct
      1.000
      26
      98
      5.720448
      Direct
      88
      -4.012
      -4.012000
    
    
      884
      TYR
      CYS
      Direct
      0.999
      23
      84
      5.763156
      Direct
      88
      -4.012
      -4.007988
    
    
      907
      TYR
      CYS
      Direct
      0.997
      50
      91
      5.911228
      Direct
      88
      -4.012
      -3.999964
    
    
      886
      TYR
      CYS
      Direct
      0.996
      23
      91
      5.938394
      Direct
      88
      -4.012
      -3.995952
    
    
      893
      TYR
      CYS
      Direct
      0.995
      23
      43
      5.965650
      Direct
      88
      -4.012
      -3.991940
    
    
      888
      TYR
      CYS
      Direct
      0.760
      23
      59
      6.384588
      Direct
      88
      -4.012
      -3.049120
    
    
      922
      TYR
      CYS
      Direct
      0.603
      66
      77
      6.458406
      Direct
      88
      -4.012
      -2.419236
    
    
      914
      TYR
      CYS
      Direct
      0.318
      62
      91
      6.576455
      Direct
      88
      -4.012
      -1.275816



In [86]:

    
b3.sort_values("theta_gamma").head(20)









    Out[86]:







  
    
      
      Res1
      Res2
      Type
      theta_gamma
    
  
  
    
      342
      TYR
      CYS
      Direct
      -130.883476
    
    
      321
      TRP
      CYS
      Direct
      -106.422800
    
    
      113
      CYS
      TYR
      Protein
      -90.723716
    
    
      343
      TYR
      CYS
      Protein
      -87.706208
    
    
      112
      CYS
      TYR
      Direct
      -73.202952
    
    
      320
      TRP
      ASN
      Protein
      -58.802645
    
    
      226
      MET
      CYS
      Direct
      -50.142196
    
    
      246
      PHE
      CYS
      Direct
      -32.165329
    
    
      279
      SER
      CYS
      Direct
      -30.213630
    
    
      188
      ILE
      TYR
      Direct
      -28.798753
    
    
      333
      TRP
      TYR
      Direct
      -24.746634
    
    
      17
      ARG
      ASP
      Direct
      -24.667500
    
    
      59
      ASN
      TRP
      Protein
      -22.552750
    
    
      319
      TRP
      ARG
      Direct
      -22.380120
    
    
      346
      TYR
      ILE
      Direct
      -19.988483
    
    
      94
      CYS
      CYS
      Protein
      -19.848447
    
    
      324
      TRP
      ILE
      Direct
      -18.720945
    
    
      110
      CYS
      TRP
      Direct
      -17.923650
    
    
      19
      ARG
      CYS
      Direct
      -17.749700
    
    
      88
      CYS
      ARG
      Direct
      -17.403716



In [69]:

    
b2.sort_values("theta_gamma").tail(10)









    Out[69]:







  
    
      
      Res1
      Res2
      Type
      theta_gamma
    
  
  
    
      87
      CYS
      ALA
      Protein
      18
    
    
      112
      CYS
      TYR
      Direct
      19
    
    
      320
      TRP
      ASN
      Protein
      19
    
    
      336
      TYR
      ALA
      Protein
      20
    
    
      357
      TYR
      TYR
      Direct
      22
    
    
      343
      TYR
      CYS
      Protein
      28
    
    
      113
      CYS
      TYR
      Protein
      29
    
    
      321
      TRP
      CYS
      Direct
      30
    
    
      342
      TYR
      CYS
      Direct
      34
    
    
      94
      CYS
      CYS
      Protein
      45



In [82]:

    
a.query("Res1 == 'CYS' and Res2 == 'CYS' and Theta > 0.1")









    Out[82]:







  
    
      
      Res1
      Res2
      Type
      Theta
      Index1
      Index2
      r
      Interaction
      Index
      Gamma
      theta_gamma
    
  
  
    
      3153
      CYS
      CYS
      Protein
      1.000
      10
      25
      7.411365
      Protein
      284
      -0.477
      -0.477000
    
    
      3154
      CYS
      CYS
      Protein
      0.987
      10
      59
      9.068994
      Protein
      284
      -0.477
      -0.470799
    
    
      3155
      CYS
      CYS
      Protein
      1.000
      10
      27
      7.557992
      Protein
      284
      -0.477
      -0.477000
    
    
      3157
      CYS
      CYS
      Protein
      1.000
      10
      84
      8.094460
      Protein
      284
      -0.477
      -0.477000
    
    
      3158
      CYS
      CYS
      Protein
      0.938
      10
      77
      6.771117
      Protein
      284
      -0.477
      -0.447426
    
    
      3159
      CYS
      CYS
      Protein
      0.998
      10
      43
      7.104249
      Protein
      284
      -0.477
      -0.476046
    
    
      3160
      CYS
      CYS
      Protein
      0.999
      10
      42
      8.782217
      Protein
      284
      -0.477
      -0.476523
    
    
      3161
      CYS
      CYS
      Protein
      0.678
      10
      91
      9.425323
      Protein
      284
      -0.477
      -0.323406
    
    
      3162
      CYS
      CYS
      Protein
      0.288
      25
      59
      9.590501
      Protein
      284
      -0.477
      -0.137376
    
    
      3163
      CYS
      CYS
      Protein
      0.976
      25
      98
      9.128296
      Protein
      284
      -0.477
      -0.465552
    
    
      3164
      CYS
      CYS
      Protein
      1.000
      25
      117
      8.697128
      Protein
      284
      -0.477
      -0.477000
    
    
      3165
      CYS
      CYS
      Protein
      1.000
      25
      70
      7.770286
      Protein
      284
      -0.477
      -0.477000
    
    
      3169
      CYS
      CYS
      Protein
      0.859
      25
      49
      9.318975
      Protein
      284
      -0.477
      -0.409743
    
    
      3170
      CYS
      CYS
      Protein
      0.903
      25
      43
      9.277326
      Protein
      284
      -0.477
      -0.430731
    
    
      3174
      CYS
      CYS
      Protein
      1.000
      27
      84
      8.294059
      Protein
      284
      -0.477
      -0.477000
    
    
      3175
      CYS
      CYS
      Protein
      1.000
      27
      42
      8.483358
      Protein
      284
      -0.477
      -0.477000
    
    
      3176
      CYS
      CYS
      Protein
      1.000
      27
      91
      7.991209
      Protein
      284
      -0.477
      -0.477000
    
    
      3177
      CYS
      CYS
      Protein
      0.932
      27
      70
      6.761860
      Protein
      284
      -0.477
      -0.444564
    
    
      3178
      CYS
      CYS
      Protein
      0.994
      27
      59
      7.015271
      Protein
      284
      -0.477
      -0.474138
    
    
      3179
      CYS
      CYS
      Protein
      0.244
      27
      89
      9.613122
      Protein
      284
      -0.477
      -0.116388
    
    
      3182
      CYS
      CYS
      Protein
      1.000
      27
      98
      8.172283
      Protein
      284
      -0.477
      -0.477000
    
    
      3183
      CYS
      CYS
      Protein
      0.898
      27
      43
      6.717986
      Protein
      284
      -0.477
      -0.428346
    
    
      3186
      CYS
      CYS
      Protein
      0.999
      42
      117
      7.180018
      Protein
      284
      -0.477
      -0.476523
    
    
      3190
      CYS
      CYS
      Protein
      0.135
      42
      77
      9.685992
      Protein
      284
      -0.477
      -0.064395
    
    
      3194
      CYS
      CYS
      Protein
      0.999
      43
      117
      8.810677
      Protein
      284
      -0.477
      -0.476523
    
    
      3198
      CYS
      CYS
      Protein
      1.000
      43
      77
      7.933357
      Protein
      284
      -0.477
      -0.477000
    
    
      3200
      CYS
      CYS
      Protein
      0.990
      49
      59
      6.957804
      Protein
      284
      -0.477
      -0.472230
    
    
      3201
      CYS
      CYS
      Protein
      0.969
      49
      98
      9.154268
      Protein
      284
      -0.477
      -0.462213
    
    
      3204
      CYS
      CYS
      Protein
      1.000
      49
      89
      8.122187
      Protein
      284
      -0.477
      -0.477000
    
    
      3205
      CYS
      CYS
      Protein
      1.000
      49
      84
      8.692106
      Protein
      284
      -0.477
      -0.477000
    
    
      3207
      CYS
      CYS
      Protein
      1.000
      59
      117
      7.468670
      Protein
      284
      -0.477
      -0.477000
    
    
      3210
      CYS
      CYS
      Protein
      0.995
      59
      70
      8.971240
      Protein
      284
      -0.477
      -0.474615
    
    
      3212
      CYS
      CYS
      Protein
      1.000
      59
      77
      8.543577
      Protein
      284
      -0.477
      -0.477000
    
    
      3213
      CYS
      CYS
      Protein
      1.000
      59
      98
      8.630322
      Protein
      284
      -0.477
      -0.477000
    
    
      3215
      CYS
      CYS
      Protein
      1.000
      70
      117
      8.256154
      Protein
      284
      -0.477
      -0.477000
    
    
      3216
      CYS
      CYS
      Protein
      1.000
      70
      89
      8.483834
      Protein
      284
      -0.477
      -0.477000
    
    
      3217
      CYS
      CYS
      Protein
      0.986
      70
      84
      9.071579
      Protein
      284
      -0.477
      -0.470322
    
    
      3218
      CYS
      CYS
      Protein
      1.000
      70
      91
      7.877710
      Protein
      284
      -0.477
      -0.477000
    
    
      3219
      CYS
      CYS
      Protein
      0.883
      77
      98
      9.298271
      Protein
      284
      -0.477
      -0.421191
    
    
      3221
      CYS
      CYS
      Protein
      0.999
      77
      89
      8.816828
      Protein
      284
      -0.477
      -0.476523
    
    
      3222
      CYS
      CYS
      Protein
      0.964
      77
      91
      6.828478
      Protein
      284
      -0.477
      -0.459828
    
    
      3224
      CYS
      CYS
      Protein
      1.000
      84
      117
      8.211732
      Protein
      284
      -0.477
      -0.477000
    
    
      3225
      CYS
      CYS
      Protein
      0.999
      89
      117
      7.158068
      Protein
      284
      -0.477
      -0.476523
    
    
      3226
      CYS
      CYS
      Protein
      0.999
      91
      117
      8.794488
      Protein
      284
      -0.477
      -0.476523
    
    
      3227
      CYS
      CYS
      Protein
      1.000
      98
      117
      7.307405
      Protein
      284
      -0.477
      -0.477000
    
    
      3316
      CYS
      CYS
      Direct
      1.000
      25
      89
      3.975121
      Direct
      74
      2.484
      2.484000
    
    
      3318
      CYS
      CYS
      Direct
      1.000
      25
      77
      5.051316
      Direct
      74
      2.484
      2.484000
    
    
      3333
      CYS
      CYS
      Direct
      0.102
      27
      43
      6.717986
      Direct
      74
      2.484
      0.253368
    
    
      3342
      CYS
      CYS
      Direct
      0.167
      43
      59
      2.339643
      Direct
      74
      2.484
      0.414828



In [56]:

    
a.sort_values("theta_gamma")









    Out[56]:







  
    
      
      Res1
      Res2
      Type
      Theta
      Index1
      Index2
      Interaction
      Index
      Gamma
      theta_gamma
    
  
  
    
      1904
      GLN
      HIS
      Direct
      1.000
      9
      46
      Direct
      93
      -4.963
      -4.963000
    
    
      2490
      MET
      CYS
      Direct
      1.000
      7
      84
      Direct
      82
      -4.654
      -4.654000
    
    
      2501
      MET
      CYS
      Direct
      1.000
      7
      43
      Direct
      82
      -4.654
      -4.654000
    
    
      2499
      MET
      CYS
      Direct
      1.000
      7
      25
      Direct
      82
      -4.654
      -4.654000
    
    
      2491
      MET
      CYS
      Direct
      0.999
      7
      42
      Direct
      82
      -4.654
      -4.649346
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      149
      ASN
      ASN
      Direct
      0.960
      0
      112
      Direct
      39
      1.119
      1.074240
    
    
      5023
      SER
      THR
      Direct
      1.000
      32
      74
      Direct
      196
      1.075
      1.075000
    
    
      6450
      ASP
      TRP
      Protein
      0.998
      19
      60
      Protein
      281
      1.097
      1.094806
    
    
      3316
      CYS
      CYS
      Direct
      1.000
      25
      89
      Direct
      74
      2.484
      2.484000
    
    
      3318
      CYS
      CYS
      Direct
      1.000
      25
      77
      Direct
      74
      2.484
      2.484000
    
  

7521 rows × 10 columns



In [4]:

    
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1

pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
def get_contact_info(pdbFile, seq, frame_index=0):
    # frame_index = 0
    cutoff = 9.5
    MAX_OFFSET = 9
    parser = PDBParser()
    structure = parser.get_structure("x", pdbFile)
    models = list(structure.get_models())
    model = models[frame_index]
    all_residues = list(model.get_residues())
    info_ = []
    for i, res1 in enumerate(all_residues):
        if seq[i] == "G":
            a1 = res1["CA"]
        else:
            a1 = res1["CB"]
        for j, res2 in enumerate(all_residues):
            if seq[j] == "G":
                a2 = res2["CA"]
            else:
                a2 = res2["CB"]
            dis = a1 - a2
            if dis < cutoff and j - i > MAX_OFFSET:
                info_.append([pdb, i, j, seq[i], seq[j], dis])
    info = pd.DataFrame(info_, columns=["Protein", "i", "j", "Resi", "Resj", "Dis"])
    return info



In [82]:

    
data = pd.read_csv("training_set.csv")
specific_decoys = data.query("Length < 150 and Length > 70").reset_index(drop=True)
pdb_list = specific_decoys["Protein"].to_list()
pdb_list = [a.lower() for a in pdb_list]
skip_pdb_list = ["1puc", "1skz"]
skip_pdb_list += ["1msc", "1fmb", "1gvp", "2tgi", "1whi", "1baj", "1rmd", "1div"]  #dimer.
skip_pdb_list += ["1aqe"]  # lots of ligand
filtered_pdb_list = [x for x in pdb_list if x not in skip_pdb_list]
pdb_list = filtered_pdb_list



In [9]:

    
info_ = []
for pdb in pdb_list:
    pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
    seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
    info = get_contact_info(pdbFile, seq)
    info_.append(info)
info = pd.concat(info_)



In [10]:



In [14]:

    
direct_contact_table = np.zeros((20,20))
for i, line in info.iterrows():
    resi = line["Resi"]
    resj = line["Resj"]
    dis = line["Dis"]
    if dis < 6.5:
        direct_contact_table[d1_to_index[resi]][d1_to_index[resj]] += 1
        if resi != resj:
            direct_contact_table[d1_to_index[resj]][d1_to_index[resi]] += 1



In [16]:

    
direct_contact_table.min()









    Out[16]:





3.0



In [15]:

    
plt.imshow(direct_contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)



In [11]:

    
contact_table = np.zeros((20,20))
for i, line in info.iterrows():
    resi = line["Resi"]
    resj = line["Resj"]
    dis = line["Dis"]
    contact_table[d1_to_index[resi]][d1_to_index[resj]] += 1
    if resi != resj:
        contact_table[d1_to_index[resj]][d1_to_index[resi]] += 1



In [12]:

    
len(info)









    Out[12]:





32267



In [13]:

    
contact_table.min()









    Out[13]:





12.0



In [115]:

    
plt.imshow(contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)



In [109]:

    
plt.imshow(contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)



In [88]:

    
info.groupby(["Resi", "Resj"])["Dis"].count()









    Out[88]:





Resi  Resj
A     A       297
      C        60
      D        87
      E        88
      F       105
             ... 
Y     S        55
      T        61
      V       122
      W        24
      Y        46
Name: Dis, Length: 400, dtype: int64



In [81]:

    
pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
get_contact_info(pdbFile, seq)









    Out[81]:







  
    
      
      Protein
      i
      j
      Resi
      Resj
      Dis
    
  
  
    
      0
      2a0b
      1
      104
      K
      H
      5.800971
    
    
      1
      2a0b
      1
      105
      K
      D
      8.981970
    
    
      2
      2a0b
      1
      108
      K
      V
      4.504978
    
    
      3
      2a0b
      1
      111
      K
      A
      8.126446
    
    
      4
      2a0b
      2
      108
      S
      V
      5.637322
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      267
      2a0b
      79
      94
      I
      W
      6.115420
    
    
      268
      2a0b
      79
      95
      I
      I
      5.385581
    
    
      269
      2a0b
      79
      98
      I
      M
      4.980760
    
    
      270
      2a0b
      81
      94
      S
      W
      8.732571
    
    
      271
      2a0b
      84
      94
      L
      W
      7.445224
    
  

272 rows × 6 columns



In [78]:

    
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1

pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")

frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
info_ = []
for i, res1 in enumerate(all_residues):
    if seq[i] == "G":
        a1 = res1["CA"]
    else:
        a1 = res1["CB"]
    for j, res2 in enumerate(all_residues):
        if seq[j] == "G":
            a2 = res2["CA"]
        else:
            a2 = res2["CB"]
        dis = a1 - a2
        if dis < cutoff and j - i > MAX_OFFSET:
            info_.append([pdb, i, j, seq[i], seq[j], dis])



In [74]:



In [6]:

    
def getContactMapFromPDB(pdbFile, seq, frame_index=0):
    cutoff = 9.5
    MAX_OFFSET = 9
    parser = PDBParser()
    structure = parser.get_structure("x", pdbFile)
    models = list(structure.get_models())
    model = models[frame_index]
    all_residues = list(model.get_residues())
    n = len(all_residues)
    contact_table = np.zeros((n,n))
    # print(pdb, n)#
    for i, res1 in enumerate(all_residues):
        if seq[i] == "G":
            a1 = res1["CA"]
        else:
            a1 = res1["CB"]
        for j, res2 in enumerate(all_residues):
            if seq[j] == "G":
                a2 = res2["CA"]
            else:
                a2 = res2["CB"]
            contact_table[i][j] = a1 - a2

    data = (contact_table < cutoff)
    remove_band = np.eye(n)
    for i in range(1, MAX_OFFSET):
        remove_band += np.eye(n, k=i)
        remove_band += np.eye(n, k=-i)
    data[remove_band==1] = 0
    return data



In [81]:

    
pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
# pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/movie.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq, frame_index=-1)

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)

combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)









    Out[81]:





<matplotlib.image.AxesImage at 0x11f8780b8>



In [7]:

    
pdb = "3lzt"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)

combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)









    Out[7]:





<matplotlib.image.AxesImage at 0x11cffc208>



In [66]:

    
pdb = "2a0b"

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)

pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)

combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)









    Out[66]:





<matplotlib.image.AxesImage at 0x120a5f240>



In [58]:

    
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/cbd-openmmawsem.pdb"
seq = read_fasta("/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)



In [63]:

    
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/1baj/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)



In [64]:

    
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)









    Out[64]:





<matplotlib.image.AxesImage at 0x122e16278>



In [56]:

    
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/1baj/0/native.pdb"
seq = read_fasta("/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/crystal_structure.fasta")
frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
    if seq[i] == "G":
        a1 = res1["CA"]
    else:
        a1 = res1["CB"]
    for j, res2 in enumerate(all_residues):
        if seq[j] == "G":
            a2 = res2["CA"]
        else:
            a2 = res2["CB"]
        contact_table[i][j] = a1 - a2

data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
    remove_band += np.eye(n, k=i)
    remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0



In [55]:









    Out[55]:





[<Model id=0>, <Model id=1>]



In [23]:

    
three_to_one("TRP")









    Out[23]:





'W'



In [25]:

    
seq.count("W")









    Out[25]:





1



In [ ]:

    
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/cbd-openmmawsem.pdb"

cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
all_residues = list(structure.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
    for j, res2 in enumerate(all_residues):
        contact_table[i][j] = res1["CA"]-res2["CA"]

data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
    remove_band += np.eye(n, k=i)
    remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
data_ca = data



In [11]:

    
fileLocation = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/tmp.spotcon"
data = pd.read_csv(fileLocation, skiprows=5, sep="\s+", names=["i","j","p"]).dropna().reset_index(drop=True)

data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_A = data



In [12]:

    
fileLocation = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/original.spotcon"
data = pd.read_csv(fileLocation, skiprows=5, sep="\s+", names=["i","j","p"]).dropna().reset_index(drop=True)

data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_original = data









    



/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3058: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)



In [14]:

    
data_original.dtypes









    Out[14]:





i      int64
j      int64
p    float64
dtype: object



In [15]:

    
data_A.dtypes









    Out[15]:





i      int64
j      int64
p    float64
dtype: object



In [17]:

    
seq = "MSKLTTGSFSIEDLESVQITINNIVGAAKEAAEEKEKELVNAGPTLFPGLEGYRDDWNFKLLDRYEPVITPMCDQCCYCTYGPCDLSGNKRGACGIDMKGHNGREFFLRVITGTACHAAHGRHLLDHLIEKYGEDLPLTLGQSNVLTPNITISTGLSPKTLGEVKPAMEYVEEQLTQLLATVHAGQESAEIDYDSKALFSGSLDHVGMEISDIVQVAAYDFPKADPEAPLVEIGMGTIDKSKPFLCVIGHNVAGVTYMMDYMEDNNLTDKMEIAGLCCTAIDLTRYKEADRRPPYAKVIGSMSKELKVIRSGMPDVIVVDEQCVRGDIVPEAQKLKIPVIASNPKIMYGLPNRTDADVDETMEELKSGKIPGCVMLDYDKLGELCVRLTMEMAPIRDAAGITALPTDEELVNMVAKCADCGACLLACPEEIDIPEAMGFAKKGDFSYFEEIHDTCIGCRRCEQVCKKEIPILNVIEKIAQKQIAEEKGLMRAGRGQVSDAEIRAEGLNLVMGTTPGIIAIIGCPNYAGGTKDVYYIAEEFLKRNFIVVTTGCGAMDIGMFKDADGKTLYERFPGGFQCGGLANIGSCVSNAHITGAAEKVAAIFAQRTLEGNLAEIGDYILNRVGACGLAWGAFSQKASSIGTGCNIFGIPAVLGPHSSKYRRALIAKTYEEDKWKVYDARNGQEMPIPPAPEFLLTTAETWQEAIPMMAKACIRPSDNSMGRAIKLTHWMELHKKYLGGKEPEDWWKFVRTEADLPLATREALLKELEKEHGWEIDWKRKKIISGPKIKFDVSAQPTNLKRLCKEA"



In [18]:

    
len(seq)









    Out[18]:





807



In [33]:

    
seq









    Out[33]:





'MSKLTTGSFSIEDLESVQITINNIVGAAKEAAEEKEKELVNAGPTLFPGLEGYRDDWNFKLLDRYEPVITPMCDQCCYCTYGPCDLSGNKRGACGIDMKGHNGREFFLRVITGTACHAAHGRHLLDHLIEKYGEDLPLTLGQSNVLTPNITISTGLSPKTLGEVKPAMEYVEEQLTQLLATVHAGQESAEIDYDSKALFSGSLDHVGMEISDIVQVAAYDFPKADPEAPLVEIGMGTIDKSKPFLCVIGHNVAGVTYMMDYMEDNNLTDKMEIAGLCCTAIDLTRYKEADRRPPYAKVIGSMSKELKVIRSGMPDVIVVDEQCVRGDIVPEAQKLKIPVIASNPKIMYGLPNRTDADVDETMEELKSGKIPGCVMLDYDKLGELCVRLTMEMAPIRDAAGITALPTDEELVNMVAKCADCGACLLACPEEIDIPEAMGFAKKGDFSYFEEIHDTCIGCRRCEQVCKKEIPILNVIEKIAQKQIAEEKGLMRAGRGQVSDAEIRAEGLNLVMGTTPGIIAIIGCPNYAGGTKDVYYIAEEFLKRNFIVVTTGCGAMDIGMFKDADGKTLYERFPGGFQCGGLANIGSCVSNAHITGAAEKVAAIFAQRTLEGNLAEIGDYILNRVGACGLAWGAFSQKASSIGTGCNIFGIPAVLGPHSSKYRRALIAKTYEEDKWKVYDARNGQEMPIPPAPEFLLTTAETWQEAIPMMAKACIRPSDNSMGRAIKLTHWMELHKKYLGGKEPEDWWKFVRTEADLPLATREALLKELEKEHGWEIDWKRKKIISGPKIKFDVSAQPTNLKRLCKEA'



In [61]:

    
n = len(seq)
t = np.zeros((n,n))
for index, d in data_A.iterrows():
#     print(index)
    i = int(d["i"]) - 1
    j = int(d["j"]) - 1 
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p



In [72]:

    
data_s = data.astype(int)
t_s = (t>0.08).astype(float)
combined = data_s + t_s * 2
from matplotlib import colors
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-72-7d542ff5db06> in <module>
      1 data_s = data.astype(int)
      2 t_s = (t>0.08).astype(float)
----> 3 combined = data_s + t_s * 2
      4 from matplotlib import colors
      5 cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])

ValueError: operands could not be broadcast together with shapes (766,766) (807,807)



In [71]:

    
plt.imshow((t>0.08).astype(float), origin="bottom")
plt.colorbar()









    Out[71]:





<matplotlib.colorbar.Colorbar at 0x1a198c42b0>



In [70]:

    
plt.imshow(data, origin="bottom")









    Out[70]:





<matplotlib.image.AxesImage at 0x1a195e3fd0>



In [20]:

    
plt.imshow((t>0.5).astype(float), origin="bottom")
plt.colorbar()









    Out[20]:





<matplotlib.colorbar.Colorbar at 0x1a176c9fd0>



In [21]:

    
n = len(seq)
t = np.zeros((n,n))
for index, d in data_original.iterrows():
#     print(index)
    i = int(d["i"]) - 1
    j = int(d["j"]) - 1 
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p



In [26]:

    
plt.imshow((t>0.1).astype(float), origin="bottom")
plt.colorbar()









    Out[26]:





<matplotlib.colorbar.Colorbar at 0x1a184b3cc0>



In [ ]:



In [ ]:



In [7]:

    
len(a[5])









    Out[7]:





51



In [22]:

    
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_A = data
seq_A = getSeqFromRaptorXContact(fileLocation)

fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_B = data
seq_B = getSeqFromRaptorXContact(fileLocation)



In [86]:

    
import textwrap
header = '''\
PFRMAT RR
TARGET {}
AUTHOR RaptorX-Contact
METHOD deep dilated residual networks (one variant of deep CNN). Consult jinboxu@gmail.com for details.
MODEL 1
'''
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/{pdb}.txt"
with open(fileLocation, "w") as out:
    out.write(header.format(pdb))
    out.write("\n".join(textwrap.wrap(seq, width=50))+"\n")
    for index, d in data_A.iterrows():
    #     print(index)
        i = int(d["i"])
        j = int(d["j"])
        p = round(d["p"], 8)
        s = int(d["s"])
        ss = int(d["ss"])
        out.write(f"{i} {j} {s} {ss} {p}\n")
    for index, d in data_B.iterrows():
    #     print(index)
        i = int(d["i"]) + len(seq_A)
        j = int(d["j"]) + len(seq_A)
        p = round(d["p"], 8)
        s = int(d["s"])
        ss = int(d["ss"])
        out.write(f"{i} {j} {s} {ss} {p}\n")
    out.write("END\n")



In [80]:

    
for index, d in data_B.iterrows():
#     print(index)
    i = int(d["i"]) - 1 + len(seq_A)
    j = int(d["j"]) - 1 + len(seq_A)
    p = round(d["p"], 8)
    s = int(d["s"])
    ss = int(d["ss"])



In [84]:

    
data_A









    Out[84]:







  
    
      
      i
      j
      s
      ss
      p
    
  
  
    
      0
      34
      59
      0.0
      8.0
      0.997731
    
    
      1
      38
      56
      0.0
      8.0
      0.980999
    
    
      2
      41
      52
      0.0
      8.0
      0.973823
    
    
      3
      38
      59
      0.0
      8.0
      0.966630
    
    
      4
      27
      66
      0.0
      8.0
      0.962626
    
    
      5
      34
      62
      0.0
      8.0
      0.961573
    
    
      6
      35
      59
      0.0
      8.0
      0.916884
    
    
      7
      41
      55
      0.0
      8.0
      0.895539
    
    
      8
      31
      63
      0.0
      8.0
      0.874988
    
    
      9
      30
      62
      0.0
      8.0
      0.842116
    
    
      10
      20
      73
      0.0
      8.0
      0.814816
    
    
      11
      27
      69
      0.0
      8.0
      0.767721
    
    
      12
      37
      55
      0.0
      8.0
      0.758386
    
    
      13
      83
      160
      0.0
      8.0
      0.728676
    
    
      14
      42
      52
      0.0
      8.0
      0.727045
    
    
      15
      23
      69
      0.0
      8.0
      0.709704
    
    
      16
      83
      161
      0.0
      8.0
      0.701026
    
    
      17
      37
      276
      0.0
      8.0
      0.693589
    
    
      18
      31
      62
      0.0
      8.0
      0.680381
    
    
      19
      31
      66
      0.0
      8.0
      0.678132
    
    
      20
      37
      275
      0.0
      8.0
      0.675751
    
    
      21
      41
      48
      0.0
      8.0
      0.668435
    
    
      22
      109
      195
      0.0
      8.0
      0.657830
    
    
      23
      217
      274
      0.0
      8.0
      0.654746
    
    
      24
      113
      199
      0.0
      8.0
      0.654502
    
    
      25
      105
      192
      0.0
      8.0
      0.653025
    
    
      26
      109
      196
      0.0
      8.0
      0.652981
    
    
      27
      102
      188
      0.0
      8.0
      0.652657
    
    
      28
      112
      196
      0.0
      8.0
      0.650273
    
    
      29
      106
      192
      0.0
      8.0
      0.650126
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      36555
      32
      53
      0.0
      8.0
      0.000235
    
    
      36556
      40
      63
      0.0
      8.0
      0.000228
    
    
      36557
      35
      53
      0.0
      8.0
      0.000224
    
    
      36558
      30
      56
      0.0
      8.0
      0.000219
    
    
      36559
      39
      54
      0.0
      8.0
      0.000218
    
    
      36560
      42
      62
      0.0
      8.0
      0.000209
    
    
      36561
      39
      53
      0.0
      8.0
      0.000201
    
    
      36562
      36
      65
      0.0
      8.0
      0.000194
    
    
      36563
      40
      58
      0.0
      8.0
      0.000191
    
    
      36564
      36
      56
      0.0
      8.0
      0.000183
    
    
      36565
      41
      61
      0.0
      8.0
      0.000179
    
    
      36566
      36
      64
      0.0
      8.0
      0.000146
    
    
      36567
      28
      60
      0.0
      8.0
      0.000140
    
    
      36568
      42
      61
      0.0
      8.0
      0.000136
    
    
      36569
      40
      60
      0.0
      8.0
      0.000136
    
    
      36570
      40
      62
      0.0
      8.0
      0.000133
    
    
      36571
      36
      53
      0.0
      8.0
      0.000128
    
    
      36572
      32
      61
      0.0
      8.0
      0.000121
    
    
      36573
      40
      57
      0.0
      8.0
      0.000088
    
    
      36574
      36
      60
      0.0
      8.0
      0.000080
    
    
      36575
      32
      57
      0.0
      8.0
      0.000078
    
    
      36576
      39
      62
      0.0
      8.0
      0.000075
    
    
      36577
      36
      57
      0.0
      8.0
      0.000066
    
    
      36578
      40
      61
      0.0
      8.0
      0.000064
    
    
      36579
      39
      57
      0.0
      8.0
      0.000064
    
    
      36580
      32
      60
      0.0
      8.0
      0.000058
    
    
      36581
      32
      56
      0.0
      8.0
      0.000049
    
    
      36582
      39
      58
      0.0
      8.0
      0.000049
    
    
      36583
      39
      61
      0.0
      8.0
      0.000045
    
    
      36584
      36
      61
      0.0
      8.0
      0.000044
    
  

36585 rows × 5 columns



In [79]:

    
round(p, 8)









    Out[79]:





0.0023355



In [23]:

    
seq = seq_A + seq_B



In [26]:

    
seq









    Out[26]:





'SMKEPCFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQSMTDKYRLHLSVADLLFVITLPFWAVDAVANWYFGNFLCKAVHVIYTVNLYSSVWILAFISLDRYLAIVHATNSQRPRKLLAEKVVYVGVWIPALLLTIPDFIFANVSEADDRYICCRFYPNDLWVVVFQFQHIMVGLILPGIVILSCYCIIISKLSHSGHQKRKALKPTVILILAFFACWLPYYIGISIDSFILLEIIKQGCEFENTVHKWISITEALAFFHCCLNPILYALGASCHRPDKCCLGYQKRPLPQVLLSSWYPTSQLCSKPGVIFLTKRGRQVCADKSKDWVKKLMQQLPVTA'



In [27]:

    
n = len(seq)
t = np.zeros((n,n))
for index, d in data_A.iterrows():
#     print(index)
    i = int(d["i"]) - 1
    j = int(d["j"]) - 1 
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p
for index, d in data_B.iterrows():
#     print(index)
    i = int(d["i"]) - 1 + len(seq_A)
    j = int(d["j"]) - 1 + len(seq_A)
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p



In [28]:

    
plt.imshow((t>0.5).astype(float), origin="bottom")
plt.colorbar()









    Out[28]:





<matplotlib.colorbar.Colorbar at 0x1a184bfb38>



In [29]:

    
def getSeqFromRaptorXContact(fileLocation):
    with open(fileLocation) as f:
        a = f.readlines()
    i = 4
    seq = ""
    assert a[i] == "MODEL 1\n"
    i += 1
    while True:
        line = a[i]
        if line[0].isdigit():
            break
        i += 1
        seq += line.strip()
    # print(i)
    return seq
def getContactMapFromPDB(pdbFile):
    cutoff = 9.5
    MAX_OFFSET = 6
    parser = PDBParser()
    structure = parser.get_structure('target', pdbFile)
    all_residues = list(structure.get_residues())
    n = len(all_residues)
    contact_table = np.zeros((n,n))
    # print(pdb, n)#
    for i, res1 in enumerate(all_residues):
        for j, res2 in enumerate(all_residues):
            contact_table[i][j] = res1["CA"]-res2["CA"]

    data = (contact_table < cutoff)
    remove_band = np.eye(n)
    for i in range(1, MAX_OFFSET):
        remove_band += np.eye(n, k=i)
        remove_band += np.eye(n, k=-i)
    data[remove_band==1] = 0
    return data



In [53]:

    
pdbFile = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4_A.pdb"
cutoff = 9.5
MAX_OFFSET = 6
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)
# chainA = structure[0]["A"]
# all_residues = list(chainA.get_residues())
all_residues = []
for res in structure.get_residues():
    if res.get_id()[0] == " ":
        all_residues.append(res)
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
    for j, res2 in enumerate(all_residues):
        contact_table[i][j] = res1["CA"]-res2["CA"]

data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
    remove_band += np.eye(n, k=i)
    remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0



In [60]:

    
plt.imshow(data, origin="bottom")









    Out[60]:





<matplotlib.image.AxesImage at 0x1a18819128>



In [58]:

    
data_s = data.astype(int)
t_s = (t>0.55).astype(float)



In [30]:

    
data = getContactMapFromPDB("/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4.pdb")









    



/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/StructureBuilder.py:92: PDBConstructionWarning: WARNING: Chain A is discontinuous at line 8106.
  PDBConstructionWarning,
/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/StructureBuilder.py:92: PDBConstructionWarning: WARNING: Chain G is discontinuous at line 8197.
  PDBConstructionWarning,
/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/StructureBuilder.py:92: PDBConstructionWarning: WARNING: Chain A is discontinuous at line 8214.
  PDBConstructionWarning,
/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/StructureBuilder.py:92: PDBConstructionWarning: WARNING: Chain G is discontinuous at line 8653.
  PDBConstructionWarning,






    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-30-304a6c71b370> in <module>
----> 1 data = getContactMapFromPDB("/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4.pdb")

<ipython-input-29-efa1942577c9> in getContactMapFromPDB(pdbFile)
     25     for i, res1 in enumerate(all_residues):
     26         for j, res2 in enumerate(all_residues):
---> 27             contact_table[i][j] = res1["CA"]-res2["CA"]
     28 
     29     data = (contact_table < cutoff)

~/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/Entity.py in __getitem__(self, id)
     40     def __getitem__(self, id):
     41         """Return the child with given id."""
---> 42         return self.child_dict[id]
     43 
     44     def __delitem__(self, id):

KeyError: 'CA'



In [ ]:



In [30]:

    
data = getContactMapFromPDB("/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb")



In [70]:

    
data_s = data.astype(int)
t_s = (t>0.55).astype(float)



In [72]:

    
fig, ax = plt.subplots()
# ax.imshow(data_s+1, origin="bottom")
ax.imshow(t, origin="bottom")









    Out[72]:





<matplotlib.image.AxesImage at 0x1a24ae82b0>



In [60]:

    
combined = data_s + t_s * 2



In [63]:

    
combined = data_s + t_s * 2
from matplotlib import colors
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)









    Out[63]:





<matplotlib.image.AxesImage at 0x1a235698d0>



In [87]:

    
pdbFile = "/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb"
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)



In [88]:

    
model = structure[0]



In [90]:

    
a = list(model.get_chains())



In [92]:

    
c = a[0]



In [94]:

    
c.id









    Out[94]:





'A'



In [95]:

    
c = "ALL"



In [99]:

    
"ABC" is not "ALL"









    Out[99]:





True



In [93]:

    
c.get_id()









    Out[93]:





'A'



In [55]:

    
fig, ax = plt.subplots()
ax.imshow(data_s+1, origin="bottom")
ax.imshow(t_s, origin="bottom")









    Out[55]:





<matplotlib.image.AxesImage at 0x1a21a9a9b0>



In [43]:

    
plt.imshow(data_s, origin="bottom")









    Out[43]:





<matplotlib.image.AxesImage at 0x1a1cd67828>



In [44]:

    
# plt.imshow(-data_s, origin="bottom")
plt.imshow(t_s, origin="bottom")
# plt.colorbar()









    Out[44]:





<matplotlib.image.AxesImage at 0x1a1d144e48>



In [16]:

    
346









    Out[16]:





70



In [17]:

    
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"



In [20]:

    
len(seq)









    Out[20]:





276



In [18]:

    
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
n = int(info.query(f"Protein =='{pdb}'")["Length"])
t = np.zeros((n,n))
for index, d in data.iterrows():
#     print(index)
    i = int(d["i"]) - 1
    j = int(d["j"]) - 1 
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-f0b1e6216f4f> in <module>
      4 data["i"] = data["i"].astype(int)
      5 data["j"] = data["j"].astype(int)
----> 6 n = int(info.query(f"Protein =='{pdb}'")["Length"])
      7 t = np.zeros((n,n))
      8 for index, d in data.iterrows():

NameError: name 'info' is not defined



In [ ]:

    
def getContactMap(pdb):
    fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/contactmap.txt"
    data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
    data["i"] = data["i"].astype(int)
    data["j"] = data["j"].astype(int)
    n = int(info.query(f"Protein =='{pdb}'")["Length"])
    t = np.zeros((n,n))
    for index, d in data.iterrows():
    #     print(index)
        i = int(d["i"]) - 1
        j = int(d["j"]) - 1 
        p = d["p"]
    #     print(i,j,p)
        t[i,j] = p
        t[j,i] = p

	Res1	Res2	Type	Theta	Index1	Index2	r	Interaction	Index	Gamma	theta_gamma
0	ILE	ASN	Protein	0.509	0	101	9.496481	Protein	256	-0.463	-0.235667
1	ILE	ASN	Protein	0.981	32	101	9.105055	Protein	256	-0.463	-0.454203
2	ILE	ASN	Protein	0.000	47	86	11.481540	Protein	256	-0.463	-0.000000
3	ILE	ASN	Protein	0.548	47	92	9.480688	Protein	256	-0.463	-0.253724
4	ILE	ASN	Protein	0.000	49	86	14.068905	Protein	256	-0.463	-0.000000
...	...	...	...	...	...	...	...	...	...	...	...
4129	ASN	PHE	Water	0.000	92	117	8.285746	Water	470	0.174	0.000000
4130	ASN	PHE	Direct	0.000	92	117	8.285746	Direct	50	0.113	0.000000
4131	ASN	GLU	Protein	0.000	92	118	10.309171	Protein	253	-0.033	-0.000000
4132	ASN	GLU	Water	0.000	92	118	10.309171	Water	463	-0.182	-0.000000
4133	ASN	GLU	Direct	0.000	92	118	10.309171	Direct	43	-0.151	-0.000000

	Resi	Resj	with_in_cutoff	correct	n_decoy
0	A	A	0	0	50
1	A	D	0	0	100
2	A	E	0	0	250
3	A	G	5	0	100
4	A	H	0	0	100
...	...	...	...	...	...
222	Y	Q	2	0	300
223	Y	R	16	0	50
224	Y	S	0	0	50
225	Y	V	4	0	250
226	Y	W	0	0	150

	Protein	i	j	Resi	Resj	dis_native	dis0	dis1	dis2	dis3	...	dis42	dis43	dis44	dis45	dis46	dis47	dis48	dis49	with_in_cutoff	correct
1	2a0b	0	96	S	E	24.540379	8.589619	11.612119	9.520317	11.036819	...	10.001908	13.052809	15.719584	7.920152	8.285571	9.403168	8.631104	9.058750	3	0
2	2a0b	0	97	S	E	17.403204	8.989487	11.158300	9.647176	11.726250	...	7.322862	10.191383	9.798911	8.174930	8.526396	11.080670	11.505523	9.939404	2	0
4	2a0b	0	100	S	E	21.171255	4.933434	7.880610	5.016576	7.941772	...	4.796510	7.840240	10.343412	5.293333	5.680943	6.304268	5.815565	4.677626	33	0
6	2a0b	0	103	S	R	22.197365	8.802500	4.923718	7.762784	6.222366	...	8.946397	5.851907	11.568497	7.744026	7.826056	8.860541	5.331914	6.970852	15	0
7	2a0b	0	104	S	H	13.256738	7.489680	9.312119	7.424554	9.970190	...	6.583136	7.799269	9.691411	8.463796	7.372634	10.546615	9.840426	8.611886	3	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
809	2a0b	79	91	I	V	5.729116	6.418045	8.555470	6.038908	8.960972	...	5.884872	5.512733	5.969506	6.249822	7.903872	5.836083	6.846112	5.806306	21	20
811	2a0b	79	94	I	W	6.115420	6.053108	6.264494	5.522557	9.787152	...	5.877768	5.688467	6.387781	6.113961	4.718327	5.912821	5.814579	6.234229	41	40
812	2a0b	79	95	I	I	5.385581	7.151258	8.017377	7.467034	7.508904	...	5.204203	6.583248	5.792931	7.581868	9.660872	4.388875	8.686253	5.543818	11	10
814	2a0b	79	98	I	M	4.980760	5.208953	6.010066	6.671427	6.100369	...	5.462386	9.703018	4.854737	7.726511	6.881110	6.241440	5.867614	6.434775	27	26
821	2a0b	84	94	L	W	7.445224	7.125351	5.421778	6.440860	5.713799	...	6.226047	5.620839	5.281493	7.350391	7.135801	6.470656	6.404382	6.633040	28	0

	Res1	Res2	Type	theta_gamma
165	ILE	ILE	Direct	-5.859840
54	ASP	LYS	Direct	-4.141928
181	ILE	VAL	Direct	-3.560194
171	ILE	PHE	Direct	-3.555981
29	ARG	PHE	Protein	-2.875323

	Res1	Res2	Type	theta_gamma
250	TRP	ILE	Direct	-4.199720
306	VAL	VAL	Direct	-3.221582
180	PHE	ILE	Direct	-2.645379
136	ILE	ILE	Direct	-2.453640
292	VAL	LEU	Direct	-2.448768

	Res1	Res2	Type	theta_gamma
19	CYS	CYS	Direct	-25.324650
149	VAL	LEU	Direct	-11.089641
20	CYS	CYS	Protein	-6.585576
26	CYS	LEU	Protein	-3.843926
104	PHE	CYS	Direct	-3.533796

	Res1	Res2	Type	Theta	Index1	Index2	r
0	LEU	ALA	Protein	0.000	0	18	11.045428
1	LEU	ALA	Water	0.000	0	18	11.045428
2	LEU	ALA	Direct	0.000	0	18	11.045428
3	LEU	ARG	Protein	0.033	0	40	9.838098
4	LEU	ARG	Water	0.000	0	40	9.838098
...	...	...	...	...	...	...	...
3097	TYR	GLU	Water	0.000	89	101	15.198022
3098	TYR	GLU	Direct	0.000	89	101	15.198022
3099	ILE	TYR	Protein	0.000	90	100	10.454025
3100	ILE	TYR	Water	0.000	90	100	10.454025
3101	ILE	TYR	Direct	0.000	90	100	10.454025

	Res1	Res2	Type	Theta	Index1	Index2	r	Interaction	Index	Gamma	theta_gamma
890	TYR	CYS	Direct	0.956	23	77	2.808838	Direct	88	-4.012	-3.835472
921	TYR	CYS	Direct	0.999	66	84	3.204737	Direct	88	-4.012	-4.007988
876	TYR	CYS	Direct	1.000	2	89	3.921630	Direct	88	-4.012	-4.012000
877	TYR	CYS	Direct	1.000	2	84	3.969086	Direct	88	-4.012	-4.012000
926	TYR	CYS	Direct	1.000	68	91	4.097748	Direct	88	-4.012	-4.012000
897	TYR	CYS	Direct	1.000	26	91	4.154008	Direct	88	-4.012	-4.012000
912	TYR	CYS	Direct	1.000	62	117	4.176590	Direct	88	-4.012	-4.012000
898	TYR	CYS	Direct	1.000	26	70	4.354343	Direct	88	-4.012	-4.012000
906	TYR	CYS	Direct	1.000	50	84	4.464048	Direct	88	-4.012	-4.012000
905	TYR	CYS	Direct	1.000	50	117	4.704973	Direct	88	-4.012	-4.012000
873	TYR	CYS	Direct	1.000	2	117	4.796232	Direct	88	-4.012	-4.012000
909	TYR	CYS	Direct	1.000	50	89	4.796544	Direct	88	-4.012	-4.012000
879	TYR	CYS	Direct	1.000	2	49	4.818510	Direct	88	-4.012	-4.012000
901	TYR	CYS	Direct	1.000	26	77	4.863667	Direct	88	-4.012	-4.012000
923	TYR	CYS	Direct	1.000	66	91	4.958446	Direct	88	-4.012	-4.012000
895	TYR	CYS	Direct	1.000	26	84	4.969316	Direct	88	-4.012	-4.012000
929	TYR	CYS	Direct	1.000	104	117	4.990378	Direct	88	-4.012	-4.012000
925	TYR	CYS	Direct	1.000	68	84	5.031310	Direct	88	-4.012	-4.012000
870	TYR	CYS	Direct	1.000	2	25	5.094150	Direct	88	-4.012	-4.012000
900	TYR	CYS	Direct	1.000	26	89	5.218085	Direct	88	-4.012	-4.012000
913	TYR	CYS	Direct	1.000	62	84	5.235210	Direct	88	-4.012	-4.012000
927	TYR	CYS	Direct	1.000	68	89	5.246518	Direct	88	-4.012	-4.012000
882	TYR	CYS	Direct	1.000	2	91	5.403767	Direct	88	-4.012	-4.012000
915	TYR	CYS	Direct	1.000	62	89	5.464846	Direct	88	-4.012	-4.012000
920	TYR	CYS	Direct	1.000	66	89	5.552884	Direct	88	-4.012	-4.012000
928	TYR	CYS	Direct	1.000	68	98	5.720343	Direct	88	-4.012	-4.012000
903	TYR	CYS	Direct	1.000	26	98	5.720448	Direct	88	-4.012	-4.012000
884	TYR	CYS	Direct	0.999	23	84	5.763156	Direct	88	-4.012	-4.007988
907	TYR	CYS	Direct	0.997	50	91	5.911228	Direct	88	-4.012	-3.999964
886	TYR	CYS	Direct	0.996	23	91	5.938394	Direct	88	-4.012	-3.995952
893	TYR	CYS	Direct	0.995	23	43	5.965650	Direct	88	-4.012	-3.991940
888	TYR	CYS	Direct	0.760	23	59	6.384588	Direct	88	-4.012	-3.049120
922	TYR	CYS	Direct	0.603	66	77	6.458406	Direct	88	-4.012	-2.419236
914	TYR	CYS	Direct	0.318	62	91	6.576455	Direct	88	-4.012	-1.275816

	Res1	Res2	Type	theta_gamma
342	TYR	CYS	Direct	-130.883476
321	TRP	CYS	Direct	-106.422800
113	CYS	TYR	Protein	-90.723716
343	TYR	CYS	Protein	-87.706208
112	CYS	TYR	Direct	-73.202952
320	TRP	ASN	Protein	-58.802645
226	MET	CYS	Direct	-50.142196
246	PHE	CYS	Direct	-32.165329
279	SER	CYS	Direct	-30.213630
188	ILE	TYR	Direct	-28.798753
333	TRP	TYR	Direct	-24.746634
17	ARG	ASP	Direct	-24.667500
59	ASN	TRP	Protein	-22.552750
319	TRP	ARG	Direct	-22.380120
346	TYR	ILE	Direct	-19.988483
94	CYS	CYS	Protein	-19.848447
324	TRP	ILE	Direct	-18.720945
110	CYS	TRP	Direct	-17.923650
19	ARG	CYS	Direct	-17.749700
88	CYS	ARG	Direct	-17.403716

	Res1	Res2	Type	Theta	Index1	Index2	r	Interaction	Index	Gamma	theta_gamma
3153	CYS	CYS	Protein	1.000	10	25	7.411365	Protein	284	-0.477	-0.477000
3154	CYS	CYS	Protein	0.987	10	59	9.068994	Protein	284	-0.477	-0.470799
3155	CYS	CYS	Protein	1.000	10	27	7.557992	Protein	284	-0.477	-0.477000
3157	CYS	CYS	Protein	1.000	10	84	8.094460	Protein	284	-0.477	-0.477000
3158	CYS	CYS	Protein	0.938	10	77	6.771117	Protein	284	-0.477	-0.447426
3159	CYS	CYS	Protein	0.998	10	43	7.104249	Protein	284	-0.477	-0.476046
3160	CYS	CYS	Protein	0.999	10	42	8.782217	Protein	284	-0.477	-0.476523
3161	CYS	CYS	Protein	0.678	10	91	9.425323	Protein	284	-0.477	-0.323406
3162	CYS	CYS	Protein	0.288	25	59	9.590501	Protein	284	-0.477	-0.137376
3163	CYS	CYS	Protein	0.976	25	98	9.128296	Protein	284	-0.477	-0.465552
3164	CYS	CYS	Protein	1.000	25	117	8.697128	Protein	284	-0.477	-0.477000
3165	CYS	CYS	Protein	1.000	25	70	7.770286	Protein	284	-0.477	-0.477000
3169	CYS	CYS	Protein	0.859	25	49	9.318975	Protein	284	-0.477	-0.409743
3170	CYS	CYS	Protein	0.903	25	43	9.277326	Protein	284	-0.477	-0.430731
3174	CYS	CYS	Protein	1.000	27	84	8.294059	Protein	284	-0.477	-0.477000
3175	CYS	CYS	Protein	1.000	27	42	8.483358	Protein	284	-0.477	-0.477000
3176	CYS	CYS	Protein	1.000	27	91	7.991209	Protein	284	-0.477	-0.477000
3177	CYS	CYS	Protein	0.932	27	70	6.761860	Protein	284	-0.477	-0.444564
3178	CYS	CYS	Protein	0.994	27	59	7.015271	Protein	284	-0.477	-0.474138
3179	CYS	CYS	Protein	0.244	27	89	9.613122	Protein	284	-0.477	-0.116388
3182	CYS	CYS	Protein	1.000	27	98	8.172283	Protein	284	-0.477	-0.477000
3183	CYS	CYS	Protein	0.898	27	43	6.717986	Protein	284	-0.477	-0.428346
3186	CYS	CYS	Protein	0.999	42	117	7.180018	Protein	284	-0.477	-0.476523
3190	CYS	CYS	Protein	0.135	42	77	9.685992	Protein	284	-0.477	-0.064395
3194	CYS	CYS	Protein	0.999	43	117	8.810677	Protein	284	-0.477	-0.476523
3198	CYS	CYS	Protein	1.000	43	77	7.933357	Protein	284	-0.477	-0.477000
3200	CYS	CYS	Protein	0.990	49	59	6.957804	Protein	284	-0.477	-0.472230
3201	CYS	CYS	Protein	0.969	49	98	9.154268	Protein	284	-0.477	-0.462213
3204	CYS	CYS	Protein	1.000	49	89	8.122187	Protein	284	-0.477	-0.477000
3205	CYS	CYS	Protein	1.000	49	84	8.692106	Protein	284	-0.477	-0.477000
3207	CYS	CYS	Protein	1.000	59	117	7.468670	Protein	284	-0.477	-0.477000
3210	CYS	CYS	Protein	0.995	59	70	8.971240	Protein	284	-0.477	-0.474615
3212	CYS	CYS	Protein	1.000	59	77	8.543577	Protein	284	-0.477	-0.477000
3213	CYS	CYS	Protein	1.000	59	98	8.630322	Protein	284	-0.477	-0.477000
3215	CYS	CYS	Protein	1.000	70	117	8.256154	Protein	284	-0.477	-0.477000
3216	CYS	CYS	Protein	1.000	70	89	8.483834	Protein	284	-0.477	-0.477000
3217	CYS	CYS	Protein	0.986	70	84	9.071579	Protein	284	-0.477	-0.470322
3218	CYS	CYS	Protein	1.000	70	91	7.877710	Protein	284	-0.477	-0.477000
3219	CYS	CYS	Protein	0.883	77	98	9.298271	Protein	284	-0.477	-0.421191
3221	CYS	CYS	Protein	0.999	77	89	8.816828	Protein	284	-0.477	-0.476523
3222	CYS	CYS	Protein	0.964	77	91	6.828478	Protein	284	-0.477	-0.459828
3224	CYS	CYS	Protein	1.000	84	117	8.211732	Protein	284	-0.477	-0.477000
3225	CYS	CYS	Protein	0.999	89	117	7.158068	Protein	284	-0.477	-0.476523
3226	CYS	CYS	Protein	0.999	91	117	8.794488	Protein	284	-0.477	-0.476523
3227	CYS	CYS	Protein	1.000	98	117	7.307405	Protein	284	-0.477	-0.477000
3316	CYS	CYS	Direct	1.000	25	89	3.975121	Direct	74	2.484	2.484000
3318	CYS	CYS	Direct	1.000	25	77	5.051316	Direct	74	2.484	2.484000
3333	CYS	CYS	Direct	0.102	27	43	6.717986	Direct	74	2.484	0.253368
3342	CYS	CYS	Direct	0.167	43	59	2.339643	Direct	74	2.484	0.414828

	Resi	Resj	with_in_cutoff	correct	n_decoy
0	A	A	0	0	50
1	A	D	0	0	100
2	A	E	0	0	250
3	A	G	5	0	100
4	A	H	0	0	100
...	...	...	...	...	...
222	Y	Q	2	0	300
223	Y	R	16	0	50
224	Y	S	0	0	50
225	Y	V	4	0	250
226	Y	W	0	0	150

	Res1	Res2	Type	Theta	Index1	Index2	Interaction	Index	Gamma	theta_gamma
1904	GLN	HIS	Direct	1.000	9	46	Direct	93	-4.963	-4.963000
2490	MET	CYS	Direct	1.000	7	84	Direct	82	-4.654	-4.654000
2501	MET	CYS	Direct	1.000	7	43	Direct	82	-4.654	-4.654000
2499	MET	CYS	Direct	1.000	7	25	Direct	82	-4.654	-4.654000
2491	MET	CYS	Direct	0.999	7	42	Direct	82	-4.654	-4.649346
...	...	...	...	...	...	...	...	...	...	...
149	ASN	ASN	Direct	0.960	0	112	Direct	39	1.119	1.074240
5023	SER	THR	Direct	1.000	32	74	Direct	196	1.075	1.075000
6450	ASP	TRP	Protein	0.998	19	60	Protein	281	1.097	1.094806
3316	CYS	CYS	Direct	1.000	25	89	Direct	74	2.484	2.484000
3318	CYS	CYS	Direct	1.000	25	77	Direct	74	2.484	2.484000

	Protein	i	j	Resi	Resj	Dis
0	2a0b	1	104	K	H	5.800971
1	2a0b	1	105	K	D	8.981970
2	2a0b	1	108	K	V	4.504978
3	2a0b	1	111	K	A	8.126446
4	2a0b	2	108	S	V	5.637322
...	...	...	...	...	...	...
267	2a0b	79	94	I	W	6.115420
268	2a0b	79	95	I	I	5.385581
269	2a0b	79	98	I	M	4.980760
270	2a0b	81	94	S	W	8.732571
271	2a0b	84	94	L	W	7.445224

	i	j	s	ss	p
0	34	59	0.0	8.0	0.997731
1	38	56	0.0	8.0	0.980999
2	41	52	0.0	8.0	0.973823
3	38	59	0.0	8.0	0.966630
4	27	66	0.0	8.0	0.962626
5	34	62	0.0	8.0	0.961573
6	35	59	0.0	8.0	0.916884
7	41	55	0.0	8.0	0.895539
8	31	63	0.0	8.0	0.874988
9	30	62	0.0	8.0	0.842116
10	20	73	0.0	8.0	0.814816
11	27	69	0.0	8.0	0.767721
12	37	55	0.0	8.0	0.758386
13	83	160	0.0	8.0	0.728676
14	42	52	0.0	8.0	0.727045
15	23	69	0.0	8.0	0.709704
16	83	161	0.0	8.0	0.701026
17	37	276	0.0	8.0	0.693589
18	31	62	0.0	8.0	0.680381
19	31	66	0.0	8.0	0.678132
20	37	275	0.0	8.0	0.675751
21	41	48	0.0	8.0	0.668435
22	109	195	0.0	8.0	0.657830
23	217	274	0.0	8.0	0.654746
24	113	199	0.0	8.0	0.654502
25	105	192	0.0	8.0	0.653025
26	109	196	0.0	8.0	0.652981
27	102	188	0.0	8.0	0.652657
28	112	196	0.0	8.0	0.650273
29	106	192	0.0	8.0	0.650126
...	...	...	...	...	...
36555	32	53	0.0	8.0	0.000235
36556	40	63	0.0	8.0	0.000228
36557	35	53	0.0	8.0	0.000224
36558	30	56	0.0	8.0	0.000219
36559	39	54	0.0	8.0	0.000218
36560	42	62	0.0	8.0	0.000209
36561	39	53	0.0	8.0	0.000201
36562	36	65	0.0	8.0	0.000194
36563	40	58	0.0	8.0	0.000191
36564	36	56	0.0	8.0	0.000183
36565	41	61	0.0	8.0	0.000179
36566	36	64	0.0	8.0	0.000146
36567	28	60	0.0	8.0	0.000140
36568	42	61	0.0	8.0	0.000136
36569	40	60	0.0	8.0	0.000136
36570	40	62	0.0	8.0	0.000133
36571	36	53	0.0	8.0	0.000128
36572	32	61	0.0	8.0	0.000121
36573	40	57	0.0	8.0	0.000088
36574	36	60	0.0	8.0	0.000080
36575	32	57	0.0	8.0	0.000078
36576	39	62	0.0	8.0	0.000075
36577	36	57	0.0	8.0	0.000066
36578	40	61	0.0	8.0	0.000064
36579	39	57	0.0	8.0	0.000064
36580	32	60	0.0	8.0	0.000058
36581	32	56	0.0	8.0	0.000049
36582	39	58	0.0	8.0	0.000049
36583	39	61	0.0	8.0	0.000045
36584	36	61	0.0	8.0	0.000044

	Resi	Resj	with_in_cutoff	correct	n_decoy
0	A	A	0	0	50
1	A	D	0	0	100
2	A	E	0	0	250
3	A	G	5	0	100
4	A	H	0	0	100
...	...	...	...	...	...
222	Y	Q	2	0	300
223	Y	R	16	0	50
224	Y	S	0	0	50
225	Y	V	4	0	250
226	Y	W	0	0	150