In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
sys.path.insert(0,'/Users/weilu/Research/opt_server/')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2
In [2]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
from Bio.PDB.Polypeptide import aa3
In [3]:
plt.rcParams['figure.figsize'] = [16.18033, 10] #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100
In [20]:
def get_cb_atom(res, resType):
if resType == "G":
atom = res["CA"]
else:
atom = res["CB"]
return atom
In [40]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
movieFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/movie.pdb"
def get_native_and_decoy_contact_info(pdbFile, movieFile, seq, frame_index=0):
# frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
decoy_structure = parser.get_structure("x", movieFile)
decoy_models = list(decoy_structure.get_models())
info_ = []
for i, res1 in enumerate(all_residues):
a1 = get_cb_atom(res1, seq[i])
for j, res2 in enumerate(all_residues):
a2 = get_cb_atom(res2, seq[j])
dis = a1 - a2
dis_list = [dis]
for decoy_frame_index in range(-50, 0):
decoy_model = decoy_models[decoy_frame_index]
decoy_all_residues = list(decoy_model.get_residues())
assert len(all_residues) == len(decoy_all_residues)
decoy_a1 = get_cb_atom(decoy_all_residues[i], seq[i])
decoy_a2 = get_cb_atom(decoy_all_residues[j], seq[j])
decoy_dis = decoy_a1 - decoy_a2
dis_list.append(decoy_dis)
with_in_cutoff = any(dis<cutoff for dis in dis_list)
if j - i > MAX_OFFSET and with_in_cutoff:
info_.append([pdb, i, j, seq[i], seq[j]] + dis_list)
info = pd.DataFrame(info_, columns=["Protein", "i", "j", "Resi", "Resj"] + ["dis_native"] + [f"dis{i}" for i in range(0,50)])
return info
In [41]:
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
movieFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/movie.pdb"
info = get_native_and_decoy_contact_info(pdbFile, movieFile, seq, frame_index=0)
In [83]:
info_ = []
for i, pdb in enumerate(pdb_list):
print(i)
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
movieFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_new_2/{pdb}/0/movie.pdb"
info = get_native_and_decoy_contact_info(pdbFile, movieFile, seq, frame_index=0)
info_.append(info)
info = pd.concat(info_)
In [85]:
info.to_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_run/dis_info.csv")
In [87]:
info = pd.read_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_run/dis_info.csv", index_col=0)
In [180]:
a = pd.read_csv("/Users/weilu/opt/parameters/side_chain/cbd_cbd_real_contact_symmetric.csv")
In [38]:
a
Out[38]:
In [187]:
res1_name = "ARG"
res2_name = "VAL"
a.query(f"ResName1=='{res1_name}' and ResName2=='{res2_name}'")
Out[187]:
In [188]:
a.query(f"ResName1=='{res2_name}' and ResName2=='{res1_name}'")
Out[188]:
In [189]:
b = a.query(f"ResName1=='{res1_name}' and ResName2=='{res2_name}'")
if len(b) == 0:
b = a.query(f"ResName1=='{res2_name}' and ResName2=='{res1_name}'")
r_min = float(b["r_min"])
r_max = float(b["r_max"])
In [199]:
In [202]:
from Bio.PDB.Polypeptide import three_to_one
In [204]:
three_to_one("SER")
Out[204]:
In [201]:
In [205]:
gamma_se_map_1_letter = { 'A': 0, 'R': 1, 'N': 2, 'D': 3, 'C': 4,
'Q': 5, 'E': 6, 'G': 7, 'H': 8, 'I': 9,
'L': 10, 'K': 11, 'M': 12, 'F': 13, 'P': 14,
'S': 15, 'T': 16, 'W': 17, 'Y': 18, 'V': 19}
def three_to_index(resName):
return gamma_se_map_1_letter[three_to_one(resName)]
In [207]:
three_to_index("GLY")
Out[207]:
In [208]:
r_min_direct_table = np.zeros((20, 20))
r_min_direct_table[three_to_index("GLY"),:] = 2.5
r_min_direct_table[:,three_to_index("GLY")] = 2.5
r_max_direct_table = np.zeros((20, 20))
r_max_direct_table[three_to_index("GLY"),:] = 6.5
r_max_direct_table[:,three_to_index("GLY")] = 6.5
r_min_mediated_table = np.zeros((20, 20))
r_min_mediated_table[three_to_index("GLY"),:] = 6.5
r_min_mediated_table[:,three_to_index("GLY")] = 6.5
r_max_mediated_table = np.zeros((20, 20))
r_max_mediated_table[three_to_index("GLY"),:] = 9.5
r_max_mediated_table[:,three_to_index("GLY")] = 9.5
for i, line in a.iterrows():
res1 = line["ResName1"]
res2 = line["ResName2"]
r_min = line["r_min"]
r_max = line["r_max"]
r_min_direct_table[three_to_index(res1)][three_to_index(res2)] = r_min - 0.5
r_min_direct_table[three_to_index(res2)][three_to_index(res1)] = r_min - 0.5
r_max_direct_table[three_to_index(res1)][three_to_index(res2)] = r_max + 1.5
r_max_direct_table[three_to_index(res2)][three_to_index(res1)] = r_max + 1.5
r_min_mediated_table[three_to_index(res1)][three_to_index(res2)] = r_max + 1.5
r_min_mediated_table[three_to_index(res2)][three_to_index(res1)] = r_max + 1.5
r_max_mediated_table[three_to_index(res1)][three_to_index(res2)] = r_max + 4.5
r_max_mediated_table[three_to_index(res2)][three_to_index(res1)] = r_max + 4.5
In [194]:
a.query(f"ResName1=='GLY'")
Out[194]:
In [182]:
a["r_diff"] = a["r_max"] - a["r_min"]
In [184]:
a["r_min"].hist()
Out[184]:
In [183]:
a["r_diff"].hist()
Out[183]:
In [88]:
info["with_in_cutoff"] = info.apply(lambda x: sum(x[5:]<6.5), axis=1)
In [89]:
cutoff = 6.5
info["correct"] = info.apply(lambda x: sum((x[5]<cutoff) & (x[6:]<cutoff)), axis=1)
In [90]:
info["n_decoy"] = 50
In [91]:
result = info.query("with_in_cutoff > 0").groupby(["Resi", "Resj"])[["with_in_cutoff", "correct", "n_decoy"]].sum().reset_index()
In [92]:
result["correct_ratio"] = result["correct"] / result["n_decoy"]
In [134]:
a = -1000.123
b = -10.1234
c = -1000.12
print(f"www{a:>8.2f}ww{b:>8.3f}ww{c:>8.3f}ww")
print('{:.8s}'.format('{:0.4f}'.format(a)))
In [ ]:
def prepare_virtual_sites(pdb_file, use_cis_proline=False):
parser = PDBParser(QUIET=True)
structure=parser.get_structure('X',pdb_file,)
for model in structure:
for chain in model:
r_im={}
r_i={}
for residue in chain:
r_im=r_i
r_i={}
for atom in residue:
r_i[atom.get_name()]=atom
if use_cis_proline and residue.get_resname() == "IPR":
if 'N' in r_i:
r_i['N'].set_coord(-0.2094*r_im['CA'].get_coord()+ 0.6908*r_i['CA'].get_coord() + 0.5190*r_im['O'].get_coord())
if 'C' in r_im:
r_im['C'].set_coord(0.2196*r_im['CA'].get_coord()+ 0.2300*r_i['CA'].get_coord() + 0.5507*r_im['O'].get_coord())
if 'H' in r_i:
r_i['H'].set_coord(-0.9871*r_im['CA'].get_coord()+ 0.9326*r_i['CA'].get_coord() + 1.0604*r_im['O'].get_coord())
else:
if 'N' in r_i:
r_i['N'].set_coord(0.48318*r_im['CA'].get_coord()+ 0.70328*r_i['CA'].get_coord()- 0.18643 *r_im['O'].get_coord())
if 'C' in r_im:
r_im['C'].set_coord(0.44365*r_im['CA'].get_coord()+ 0.23520*r_i['CA'].get_coord()+ 0.32115 *r_im['O'].get_coord())
if 'H' in r_i:
r_i['H'].set_coord(0.84100*r_im['CA'].get_coord()+ 0.89296*r_i['CA'].get_coord()- 0.73389 *r_im['O'].get_coord())
io = PDBIO()
io.set_structure(structure)
io.save(pdb_file)
In [171]:
pdb_file = "/Users/weilu/Research/server/mar_2020/C1903/test2/C1903-openmmawsem.pdb"
parser = PDBParser(QUIET=True)
structure=parser.get_structure('X',pdb_file,)
res = list(structure.get_residues())
In [172]:
In [ ]:
def prepare_virtual_sites(pdb_file, use_cis_proline=False):
output_file = "/Users/weilu/Research/server/mar_2020/C1903/test2/new.pdb"
output = open(output_file, "w")
with open(pdb_file) as f:
index = 0
for line in f:
# print(line)
splitline = line.split()
if len(line)>4 and line[0:4] == "ATOM":
try:
atom_index=line[6:11].strip()
atom_type=line[12:16].strip()
res_type=line[17:20].strip()
chain=line[21].strip()
res_index=line[22:26].strip()
x=line[30:38].strip()
y=line[38:46].strip()
z=line[46:54].strip()
element = line[76:78].strip()
except ValueError:
print(line)
raise
else:
continue
res_index_zero_base = int(res_index) - 1
r_im = res[res_index_zero_base-1]
r_i = res[res_index_zero_base]
try:
r_ip = res[res_index_zero_base+1]
except:
r_ip = res[res_index_zero_base] # won't be used
if use_cis_proline and res_type == "IPR":
n_coord = -0.2094*r_im['CA'].get_coord()+ 0.6908*r_i['CA'].get_coord() + 0.5190*r_im['O'].get_coord()
c_coord = 0.2196*r_i['CA'].get_coord()+ 0.2300*r_ip['CA'].get_coord() + 0.5507*r_i['O'].get_coord()
h_coord = -0.9871*r_im['CA'].get_coord()+ 0.9326*r_i['CA'].get_coord() + 1.0604*r_im['O'].get_coord()
else:
n_coord = 0.48318*r_im['CA'].get_coord()+ 0.70328*r_i['CA'].get_coord()- 0.18643 *r_im['O'].get_coord()
c_coord = 0.44365*r_i['CA'].get_coord()+ 0.23520*r_ip['CA'].get_coord()+ 0.32115 *r_i['O'].get_coord()
h_coord = 0.84100*r_im['CA'].get_coord()+ 0.89296*r_i['CA'].get_coord()- 0.73389 *r_im['O'].get_coord()
line_list=list(line)
index += 1
line_list[6:11] = "{:5d}".format(index)
if atom_type == "N":
line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(n_coord[0]))
line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(n_coord[1]))
line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(n_coord[2])))
if atom_type == "C":
line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(c_coord[0]))
line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(c_coord[1]))
line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(c_coord[2])))
if atom_type == "H":
line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(h_coord[0]))
line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(h_coord[1]))
line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(h_coord[2])))
new_line=''.join(line_list)
output.write(new_line)
index += 1
line_list[6:11] = "{:5d}".format(index)
line_list[0:4] = "TER "
line_list[30:78] = " " * 48
new_line=''.join(line_list)
output.write(new_line)
output.write("END\n")
output.close()
In [177]:
use_cis_proline = False
output_file = "/Users/weilu/Research/server/mar_2020/C1903/test2/new.pdb"
output = open(output_file, "w")
with open(pdb_file) as f:
index = 0
for line in f:
# print(line)
splitline = line.split()
if len(line)>4 and line[0:4] == "ATOM":
try:
atom_index=line[6:11].strip()
atom_type=line[12:16].strip()
res_type=line[17:20].strip()
chain=line[21].strip()
res_index=line[22:26].strip()
x=line[30:38].strip()
y=line[38:46].strip()
z=line[46:54].strip()
element = line[76:78].strip()
except ValueError:
print(line)
raise
else:
continue
# if int(res_index) == terminal_residues[chain][0]:
# awsem_atoms.remove("N")
# awsem_atoms.remove("H")
# if int(res_index) == terminal_residues[chain][1]:
# awsem_atoms.remove("C")
res_index_zero_base = int(res_index) - 1
r_im = res[res_index_zero_base-1]
r_i = res[res_index_zero_base]
try:
r_ip = res[res_index_zero_base+1]
except:
r_ip = res[res_index_zero_base] # won't be used
if use_cis_proline and res_type == "IPR":
n_coord = -0.2094*r_im['CA'].get_coord()+ 0.6908*r_i['CA'].get_coord() + 0.5190*r_im['O'].get_coord()
c_coord = 0.2196*r_i['CA'].get_coord()+ 0.2300*r_ip['CA'].get_coord() + 0.5507*r_i['O'].get_coord()
h_coord = -0.9871*r_im['CA'].get_coord()+ 0.9326*r_i['CA'].get_coord() + 1.0604*r_im['O'].get_coord()
else:
n_coord = 0.48318*r_im['CA'].get_coord()+ 0.70328*r_i['CA'].get_coord()- 0.18643 *r_im['O'].get_coord()
c_coord = 0.44365*r_i['CA'].get_coord()+ 0.23520*r_ip['CA'].get_coord()+ 0.32115 *r_i['O'].get_coord()
h_coord = 0.84100*r_im['CA'].get_coord()+ 0.89296*r_i['CA'].get_coord()- 0.73389 *r_im['O'].get_coord()
line_list=list(line)
index += 1
line_list[6:11] = "{:5d}".format(index)
if atom_type == "N":
line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(n_coord[0]))
line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(n_coord[1]))
line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(n_coord[2])))
if atom_type == "C":
line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(c_coord[0]))
line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(c_coord[1]))
line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(c_coord[2])))
if atom_type == "H":
line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(h_coord[0]))
line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(h_coord[1]))
line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(h_coord[2])))
new_line=''.join(line_list)
output.write(new_line)
index += 1
line_list[6:11] = "{:5d}".format(index)
line_list[0:4] = "TER "
line_list[30:78] = " " * 48
new_line=''.join(line_list)
output.write(new_line)
output.write("END\n")
output.close()
In [154]:
n_coord
Out[154]:
In [149]:
new_line
Out[149]:
In [162]:
line
Out[162]:
In [ ]:
In [159]:
'{:.8s}'.format('{:8.3f}'.format(n_coord[2]))
Out[159]:
In [135]:
print(f"www{str(a):.8s}ww{b:>8.3f}ww{c:>8.3f}ww")
In [93]:
direct_contact_table = np.zeros((20,20)) - 1
for i, line in result.iterrows():
resi = line["Resi"]
resj = line["Resj"]
correct_ratio = line["correct_ratio"]
direct_contact_table[d1_to_index[resi]][d1_to_index[resj]] = correct_ratio
if resi != resj:
direct_contact_table[d1_to_index[resj]][d1_to_index[resi]] = correct_ratio
plt.imshow(direct_contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)
In [80]:
direct_contact_table = np.zeros((20,20)) - 1
for i, line in result.iterrows():
resi = line["Resi"]
resj = line["Resj"]
correct_ratio = line["correct_ratio"]
direct_contact_table[d1_to_index[resi]][d1_to_index[resj]] = correct_ratio
if resi != resj:
direct_contact_table[d1_to_index[resj]][d1_to_index[resi]] = correct_ratio
plt.imshow(direct_contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)
In [72]:
result.reset_index()
Out[72]:
In [63]:
info.query("with_in_cutoff > 0")
Out[63]:
In [5]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_new_4_without_burial/1poa/1/lastFrame.pdb"
parser = PDBParser()
s = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
In [47]:
structure = s
kappa=5.0
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
cb_density = calculate_cb_density(res_list, neighbor_list)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
# kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
rho_i = cb_density[res1globalindex]
for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+2.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
rho_j = cb_density[res2globalindex]
if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
res1type = get_res_type(res_list, res1)
res2type = get_res_type(res_list, res2)
rij = get_interaction_distance(res1, res2)
theta = interaction_well(rij, r_min, r_max, kappa)
water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex])
data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex])
direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex])
# protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
# water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2"])
In [74]:
In [4]:
def get_interaction_data(structure):
# get all the pair of interaction, direct and mediated. as a dataFrame.
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
cb_density = calculate_cb_density(res_list, neighbor_list)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
rho_i = cb_density[res1globalindex]
for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+2.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
rho_j = cb_density[res2globalindex]
if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
res1type = get_res_type(res_list, res1)
res2type = get_res_type(res_list, res2)
rij = get_interaction_distance(res1, res2)
theta = interaction_well(rij, r_min, r_max, kappa)
water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij])
data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij])
direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij])
# protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
# water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r"])
# contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
# contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
# contact_gammas["Type"] = contact_gammas["Interaction"]
# a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
# a["theta_gamma"] = a["Theta"] * a["Gamma"]
return data
def get_contact_gamma_info(gammaFile):
# check the gamma.
# read in gamma, and sort by size.
# gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
gamma = np.loadtxt(gammaFile)
res_type_map_letters = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G',
'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
inverse_res_type_map = dict(list(zip(list(range(20)), res_type_map_letters)))
c = 0
info_ = []
for i in range(20):
for j in range(i, 20):
info_.append(["Direct", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
if i != j:
info_.append(["Direct", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
c += 1
for i in range(20):
for j in range(i, 20):
info_.append(["Protein", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
if i != j:
info_.append(["Protein", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
info_.append(["Water", res_type_map_letters[i], res_type_map_letters[j], c+210, round(gamma[c+210],3)])
if i != j:
info_.append(["Water", res_type_map_letters[j], res_type_map_letters[i], c+210, round(gamma[c+210],3)])
c += 1
contact_gammas = pd.DataFrame(info_, columns=["Interaction", "Res1", "Res2", "Index", "Gamma"])
return contact_gammas
In [30]:
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_larger_excl_withoutBurial/iter_larger_excl_withoutBurial_30"
contact_gammas = get_contact_gamma_info(gammaFile)
contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]
# pdb = "1sfp"
# pdb = "1who"
pdb = "1jon"
pdb = "1neu"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_1_stronger_exclude_withoutBurial/{pdb}/0/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_1_stronger_exclude_withoutBurial/{pdb}/0/crystal_structure.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data_native = get_interaction_data(structure)
a_native = data_native.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a_native["theta_gamma"] = a_native["Theta"] * a_native["Gamma"]
In [31]:
a[["Type", "theta_gamma"]].groupby("Type").sum()
Out[31]:
In [32]:
a_native[["Type", "theta_gamma"]].groupby("Type").sum()
Out[32]:
In [33]:
a[["Type", "theta_gamma"]].groupby("Type").sum().sum()
Out[33]:
In [34]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2.sort_values("theta_gamma").tail(5)
Out[34]:
In [35]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3.sort_values("theta_gamma").head(5)
Out[35]:
In [36]:
b2_native = a_native.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2_native.sort_values("theta_gamma").tail(5)
Out[36]:
In [37]:
b3_native = a_native.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3_native.sort_values("theta_gamma").head(5)
Out[37]:
In [60]:
gammaFile = "/Users/weilu/Research/server/mar_2020/cath_dataset_shuffle_optimization/optimization_iter0_com_density_rmin_2p5/saved_gammas/iter0_cutoff600_impose_Aprime_constraint"
contact_gammas = get_contact_gamma_info(gammaFile)
contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]
pdb = "1sfp"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_0_stronger_exclude_volume/{pdb}/0/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]
In [61]:
a[["Type", "theta_gamma"]].groupby("Type").sum()
Out[61]:
In [62]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2.sort_values("theta_gamma").tail(5)
In [63]:
Out[63]:
In [20]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3.sort_values("theta_gamma").head(5)
In [23]:
Out[23]:
In [25]:
fastaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_0_stronger_exclude_volume/3cyr/0/crystal_structure.fasta"
seq = read_fasta(fastaFile)
In [26]:
seq.count("C")
Out[26]:
In [27]:
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_run/training_set.csv")
specific_decoys = data.query("Length < 150 and Length > 70").reset_index(drop=True)
pdb_list = specific_decoys["Protein"].to_list()
pdb_list = [a.lower() for a in pdb_list]
skip_pdb_list = ["1puc", "1skz"]
skip_pdb_list += ["1msc", "1fmb", "1gvp", "2tgi", "1whi", "1baj", "1rmd", "1div"] #dimer.
skip_pdb_list += ["1aqe"] # lots of ligand
filtered_pdb_list = [x for x in pdb_list if x not in skip_pdb_list]
pdb_list = filtered_pdb_list
In [65]:
info_ = []
for pdb in pdb_list:
seq = np.loadtxt(f"/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/database/S20_seq/{pdb}.seq", dtype=str)
seq = str(seq)
info_.append([pdb, seq, seq.count("C"), len(seq)])
In [66]:
data = pd.DataFrame(info_, columns=["Protein", "Seq", "Count", "Length"])
In [67]:
data.to_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/cys_count_data.csv")
In [47]:
data["Length"] = data["Seq"].apply(lambda x:len(x))
In [70]:
print(data.sort_values("Count").query("Count >= 6")["Protein"].to_list())
In [64]:
data
Out[64]:
In [20]:
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
contact_gammas = get_contact_gamma_info(gammaFile)
In [75]:
# contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
# contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]
a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]
In [77]:
a["theta_gamma"] = a["Theta"] * a["Gamma"]
In [78]:
a[["Type", "theta_gamma"]].groupby("Type").sum()
Out[78]:
In [79]:
a["theta_gamma"].sum()
Out[79]:
In [80]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
In [83]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
In [91]:
b3.sort_values("theta_gamma").head(200)["theta_gamma"].sum()
Out[91]:
In [96]:
a.query("Res1 == 'TYR' and Res2 == 'CYS' and Theta > 0.1 and Type =='Direct'").sort_values("r")
Out[96]:
In [86]:
b3.sort_values("theta_gamma").head(20)
Out[86]:
In [69]:
b2.sort_values("theta_gamma").tail(10)
Out[69]:
In [82]:
a.query("Res1 == 'CYS' and Res2 == 'CYS' and Theta > 0.1")
Out[82]:
In [56]:
a.sort_values("theta_gamma")
Out[56]:
In [4]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
def get_contact_info(pdbFile, seq, frame_index=0):
# frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
info_ = []
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
dis = a1 - a2
if dis < cutoff and j - i > MAX_OFFSET:
info_.append([pdb, i, j, seq[i], seq[j], dis])
info = pd.DataFrame(info_, columns=["Protein", "i", "j", "Resi", "Resj", "Dis"])
return info
In [82]:
data = pd.read_csv("training_set.csv")
specific_decoys = data.query("Length < 150 and Length > 70").reset_index(drop=True)
pdb_list = specific_decoys["Protein"].to_list()
pdb_list = [a.lower() for a in pdb_list]
skip_pdb_list = ["1puc", "1skz"]
skip_pdb_list += ["1msc", "1fmb", "1gvp", "2tgi", "1whi", "1baj", "1rmd", "1div"] #dimer.
skip_pdb_list += ["1aqe"] # lots of ligand
filtered_pdb_list = [x for x in pdb_list if x not in skip_pdb_list]
pdb_list = filtered_pdb_list
In [9]:
info_ = []
for pdb in pdb_list:
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
info = get_contact_info(pdbFile, seq)
info_.append(info)
info = pd.concat(info_)
In [10]:
In [14]:
direct_contact_table = np.zeros((20,20))
for i, line in info.iterrows():
resi = line["Resi"]
resj = line["Resj"]
dis = line["Dis"]
if dis < 6.5:
direct_contact_table[d1_to_index[resi]][d1_to_index[resj]] += 1
if resi != resj:
direct_contact_table[d1_to_index[resj]][d1_to_index[resi]] += 1
In [16]:
direct_contact_table.min()
Out[16]:
In [15]:
plt.imshow(direct_contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)
In [11]:
contact_table = np.zeros((20,20))
for i, line in info.iterrows():
resi = line["Resi"]
resj = line["Resj"]
dis = line["Dis"]
contact_table[d1_to_index[resi]][d1_to_index[resj]] += 1
if resi != resj:
contact_table[d1_to_index[resj]][d1_to_index[resi]] += 1
In [12]:
len(info)
Out[12]:
In [13]:
contact_table.min()
Out[13]:
In [115]:
plt.imshow(contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)
In [109]:
plt.imshow(contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)
In [88]:
info.groupby(["Resi", "Resj"])["Dis"].count()
Out[88]:
In [81]:
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
get_contact_info(pdbFile, seq)
Out[81]:
In [78]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
info_ = []
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
dis = a1 - a2
if dis < cutoff and j - i > MAX_OFFSET:
info_.append([pdb, i, j, seq[i], seq[j], dis])
In [74]:
In [6]:
def getContactMapFromPDB(pdbFile, seq, frame_index=0):
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
contact_table[i][j] = a1 - a2
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
return data
In [81]:
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
# pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/movie.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[81]:
In [7]:
pdb = "3lzt"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[7]:
In [66]:
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[66]:
In [58]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/cbd-openmmawsem.pdb"
seq = read_fasta("/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)
In [63]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/1baj/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
In [64]:
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[64]:
In [56]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/1baj/0/native.pdb"
seq = read_fasta("/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/crystal_structure.fasta")
frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
contact_table[i][j] = a1 - a2
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
In [55]:
Out[55]:
In [23]:
three_to_one("TRP")
Out[23]:
In [25]:
seq.count("W")
Out[25]:
In [ ]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/cbd-openmmawsem.pdb"
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
all_residues = list(structure.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
for j, res2 in enumerate(all_residues):
contact_table[i][j] = res1["CA"]-res2["CA"]
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
data_ca = data
In [11]:
fileLocation = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/tmp.spotcon"
data = pd.read_csv(fileLocation, skiprows=5, sep="\s+", names=["i","j","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_A = data
In [12]:
fileLocation = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/original.spotcon"
data = pd.read_csv(fileLocation, skiprows=5, sep="\s+", names=["i","j","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_original = data
In [14]:
data_original.dtypes
Out[14]:
In [15]:
data_A.dtypes
Out[15]:
In [17]:
seq = "MSKLTTGSFSIEDLESVQITINNIVGAAKEAAEEKEKELVNAGPTLFPGLEGYRDDWNFKLLDRYEPVITPMCDQCCYCTYGPCDLSGNKRGACGIDMKGHNGREFFLRVITGTACHAAHGRHLLDHLIEKYGEDLPLTLGQSNVLTPNITISTGLSPKTLGEVKPAMEYVEEQLTQLLATVHAGQESAEIDYDSKALFSGSLDHVGMEISDIVQVAAYDFPKADPEAPLVEIGMGTIDKSKPFLCVIGHNVAGVTYMMDYMEDNNLTDKMEIAGLCCTAIDLTRYKEADRRPPYAKVIGSMSKELKVIRSGMPDVIVVDEQCVRGDIVPEAQKLKIPVIASNPKIMYGLPNRTDADVDETMEELKSGKIPGCVMLDYDKLGELCVRLTMEMAPIRDAAGITALPTDEELVNMVAKCADCGACLLACPEEIDIPEAMGFAKKGDFSYFEEIHDTCIGCRRCEQVCKKEIPILNVIEKIAQKQIAEEKGLMRAGRGQVSDAEIRAEGLNLVMGTTPGIIAIIGCPNYAGGTKDVYYIAEEFLKRNFIVVTTGCGAMDIGMFKDADGKTLYERFPGGFQCGGLANIGSCVSNAHITGAAEKVAAIFAQRTLEGNLAEIGDYILNRVGACGLAWGAFSQKASSIGTGCNIFGIPAVLGPHSSKYRRALIAKTYEEDKWKVYDARNGQEMPIPPAPEFLLTTAETWQEAIPMMAKACIRPSDNSMGRAIKLTHWMELHKKYLGGKEPEDWWKFVRTEADLPLATREALLKELEKEHGWEIDWKRKKIISGPKIKFDVSAQPTNLKRLCKEA"
In [18]:
len(seq)
Out[18]:
In [33]:
seq
Out[33]:
In [61]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_A.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [72]:
data_s = data.astype(int)
t_s = (t>0.08).astype(float)
combined = data_s + t_s * 2
from matplotlib import colors
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
In [71]:
plt.imshow((t>0.08).astype(float), origin="bottom")
plt.colorbar()
Out[71]:
In [70]:
plt.imshow(data, origin="bottom")
Out[70]:
In [20]:
plt.imshow((t>0.5).astype(float), origin="bottom")
plt.colorbar()
Out[20]:
In [21]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_original.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [26]:
plt.imshow((t>0.1).astype(float), origin="bottom")
plt.colorbar()
Out[26]:
In [ ]:
In [ ]:
In [7]:
len(a[5])
Out[7]:
In [22]:
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_A = data
seq_A = getSeqFromRaptorXContact(fileLocation)
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_B = data
seq_B = getSeqFromRaptorXContact(fileLocation)
In [86]:
import textwrap
header = '''\
PFRMAT RR
TARGET {}
AUTHOR RaptorX-Contact
METHOD deep dilated residual networks (one variant of deep CNN). Consult jinboxu@gmail.com for details.
MODEL 1
'''
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/{pdb}.txt"
with open(fileLocation, "w") as out:
out.write(header.format(pdb))
out.write("\n".join(textwrap.wrap(seq, width=50))+"\n")
for index, d in data_A.iterrows():
# print(index)
i = int(d["i"])
j = int(d["j"])
p = round(d["p"], 8)
s = int(d["s"])
ss = int(d["ss"])
out.write(f"{i} {j} {s} {ss} {p}\n")
for index, d in data_B.iterrows():
# print(index)
i = int(d["i"]) + len(seq_A)
j = int(d["j"]) + len(seq_A)
p = round(d["p"], 8)
s = int(d["s"])
ss = int(d["ss"])
out.write(f"{i} {j} {s} {ss} {p}\n")
out.write("END\n")
In [80]:
for index, d in data_B.iterrows():
# print(index)
i = int(d["i"]) - 1 + len(seq_A)
j = int(d["j"]) - 1 + len(seq_A)
p = round(d["p"], 8)
s = int(d["s"])
ss = int(d["ss"])
In [84]:
data_A
Out[84]:
In [79]:
round(p, 8)
Out[79]:
In [23]:
seq = seq_A + seq_B
In [26]:
seq
Out[26]:
In [27]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_A.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
for index, d in data_B.iterrows():
# print(index)
i = int(d["i"]) - 1 + len(seq_A)
j = int(d["j"]) - 1 + len(seq_A)
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [28]:
plt.imshow((t>0.5).astype(float), origin="bottom")
plt.colorbar()
Out[28]:
In [29]:
def getSeqFromRaptorXContact(fileLocation):
with open(fileLocation) as f:
a = f.readlines()
i = 4
seq = ""
assert a[i] == "MODEL 1\n"
i += 1
while True:
line = a[i]
if line[0].isdigit():
break
i += 1
seq += line.strip()
# print(i)
return seq
def getContactMapFromPDB(pdbFile):
cutoff = 9.5
MAX_OFFSET = 6
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)
all_residues = list(structure.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
for j, res2 in enumerate(all_residues):
contact_table[i][j] = res1["CA"]-res2["CA"]
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
return data
In [53]:
pdbFile = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4_A.pdb"
cutoff = 9.5
MAX_OFFSET = 6
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)
# chainA = structure[0]["A"]
# all_residues = list(chainA.get_residues())
all_residues = []
for res in structure.get_residues():
if res.get_id()[0] == " ":
all_residues.append(res)
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
for j, res2 in enumerate(all_residues):
contact_table[i][j] = res1["CA"]-res2["CA"]
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
In [60]:
plt.imshow(data, origin="bottom")
Out[60]:
In [58]:
data_s = data.astype(int)
t_s = (t>0.55).astype(float)
In [30]:
data = getContactMapFromPDB("/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4.pdb")
In [ ]:
In [30]:
data = getContactMapFromPDB("/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb")
In [70]:
data_s = data.astype(int)
t_s = (t>0.55).astype(float)
In [72]:
fig, ax = plt.subplots()
# ax.imshow(data_s+1, origin="bottom")
ax.imshow(t, origin="bottom")
Out[72]:
In [60]:
combined = data_s + t_s * 2
In [63]:
combined = data_s + t_s * 2
from matplotlib import colors
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[63]:
In [87]:
pdbFile = "/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb"
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)
In [88]:
model = structure[0]
In [90]:
a = list(model.get_chains())
In [92]:
c = a[0]
In [94]:
c.id
Out[94]:
In [95]:
c = "ALL"
In [99]:
"ABC" is not "ALL"
Out[99]:
In [93]:
c.get_id()
Out[93]:
In [55]:
fig, ax = plt.subplots()
ax.imshow(data_s+1, origin="bottom")
ax.imshow(t_s, origin="bottom")
Out[55]:
In [43]:
plt.imshow(data_s, origin="bottom")
Out[43]:
In [44]:
# plt.imshow(-data_s, origin="bottom")
plt.imshow(t_s, origin="bottom")
# plt.colorbar()
Out[44]:
In [16]:
346
Out[16]:
In [17]:
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"
In [20]:
len(seq)
Out[20]:
In [18]:
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
n = int(info.query(f"Protein =='{pdb}'")["Length"])
t = np.zeros((n,n))
for index, d in data.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [ ]:
def getContactMap(pdb):
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/contactmap.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
n = int(info.query(f"Protein =='{pdb}'")["Length"])
t = np.zeros((n,n))
for index, d in data.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p