In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
sys.path.insert(0,'/Users/weilu/Research/opt_server/')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2
In [2]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
from Bio.PDB.Polypeptide import aa3
In [3]:
plt.rcParams['figure.figsize'] = [16.18033, 10] #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100
In [4]:
def get_interaction_data(structure):
# get all the pair of interaction, direct and mediated. as a dataFrame.
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
cb_density = calculate_cb_density(res_list, neighbor_list)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
rho_i = cb_density[res1globalindex]
for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+2.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
rho_j = cb_density[res2globalindex]
# if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
if abs(res2globalindex - res1globalindex) >= min_seq_sep or (res1chain != res2chain):
if res1.resname == res2.resname:
if not (res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex)):
continue
res1type = get_res_type(res_list, res1)
res2type = get_res_type(res_list, res2)
rij = get_interaction_distance(res1, res2)
theta = interaction_well(rij, r_min, r_max, kappa)
water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
# protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
# water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r", "ResId1", "ResId2"])
# contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
# contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
# contact_gammas["Type"] = contact_gammas["Interaction"]
# a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
# a["theta_gamma"] = a["Theta"] * a["Gamma"]
return data
def get_contact_gamma_info(gammaFile):
# check the gamma.
# read in gamma, and sort by size.
# gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
gamma = np.loadtxt(gammaFile)
res_type_map_letters = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G',
'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
inverse_res_type_map = dict(list(zip(list(range(20)), res_type_map_letters)))
c = 0
info_ = []
for i in range(20):
for j in range(i, 20):
info_.append(["Direct", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
if i != j:
info_.append(["Direct", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
c += 1
for i in range(20):
for j in range(i, 20):
info_.append(["Protein", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
if i != j:
info_.append(["Protein", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
info_.append(["Water", res_type_map_letters[i], res_type_map_letters[j], c+210, round(gamma[c+210],3)])
if i != j:
info_.append(["Water", res_type_map_letters[j], res_type_map_letters[i], c+210, round(gamma[c+210],3)])
c += 1
contact_gammas = pd.DataFrame(info_, columns=["Interaction", "Res1", "Res2", "Index", "Gamma"])
return contact_gammas
In [6]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_new_4_without_burial/1poa/1/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
In [19]:
In [72]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1fna/cbd_1fna.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
In [21]:
res_type_map_HP = {
'C': 0,
'M': 0,
'F': 0,
'I': 0,
'L': 0,
'V': 0,
'W': 0,
'Y': 0,
'A': 1,
'H': 1,
'T': 1,
'G': 1,
'P': 1,
'D': 1,
'E': 1,
'N': 1,
'Q': 1,
'R': 1,
'K': 1,
'S': 1
}
def calculate_property_density(res_list, neighbor_list, propertyTable, min_seq_sep=2, rmin=2.5):
num_residues = len(res_list)
density = np.zeros(num_residues)
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
for res2 in get_neighbors_within_radius(neighbor_list, res1, 9.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
if abs(res2index - res1index) >= min_seq_sep or (res1chain != res2chain):
rij = get_interaction_distance(res1, res2)
hasProperty = propertyTable[three_to_one(res2.resname)]
density[res1globalindex] += hasProperty * interaction_well(rij, rmin, 6.5, 5)
return density
In [99]:
def get_environment(structure):
res_type_map_HP = {
'C': 0,
'M': 0,
'F': 0,
'I': 0,
'L': 0,
'V': 0,
'W': 0,
'Y': 0,
'A': 1,
'H': 1,
'T': 1,
'G': 1,
'P': 1,
'D': 1,
'E': 1,
'N': 1,
'Q': 1,
'R': 1,
'K': 1,
'S': 1
}
isH = {}
isP = {}
for i in range(20):
isH[dindex_to_1[i]] = res_type_map_HP[dindex_to_1[i]]
isP[dindex_to_1[i]] = 1 - res_type_map_HP[dindex_to_1[i]]
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
density_H = calculate_property_density(res_list, neighbor_list, isH).round(3)
density_P = calculate_property_density(res_list, neighbor_list, isP).round(3)
environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
return environment_info
In [118]:
data_list = []
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/1fna.csv", index_col=0)
data_envr = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/1fna_environment.csv", index_col=0)
data_envr["Density_H"] = data_envr["Density_H"].round()
data_envr["Density_P"] = data_envr["Density_P"].round()
data_with_info = data.merge(data_envr, left_on="Index1", right_on="index").merge(data_envr, left_on="Index2", right_on="index")
data_ = data_with_info.query("Theta > 1e-1 and Type == 'Direct'").reset_index(drop=True)
data_list.append(data_.assign(Protein=pdb))
In [122]:
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data.csv", index_col=0)
In [209]:
data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
In [210]:
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[210]:
In [203]:
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[203]:
In [212]:
In [158]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1pne/cbd_1pne.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
In [159]:
res_type_map_HP = {
'C': 0,
'M': 0,
'F': 0,
'I': 0,
'L': 0,
'V': 0,
'W': 0,
'Y': 0,
'A': 1,
'H': 1,
'T': 1,
'G': 1,
'P': 1,
'D': 1,
'E': 1,
'N': 1,
'Q': 1,
'R': 1,
'K': 1,
'S': 1
}
isH = {}
isP = {}
for i in range(20):
isH[dindex_to_1[i]] = res_type_map_HP[dindex_to_1[i]]
isP[dindex_to_1[i]] = 1 - res_type_map_HP[dindex_to_1[i]]
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
density_H = calculate_property_density(res_list, neighbor_list, isH).round(3)
density_P = calculate_property_density(res_list, neighbor_list, isP).round(3)
environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
In [197]:
def calculate_property_density_debug(res_list, neighbor_list, propertyTable, min_seq_sep=2, rmin=2.5):
num_residues = len(res_list)
density = np.zeros(num_residues)
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
for res2 in get_neighbors_within_radius(neighbor_list, res1, 9.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
if abs(res2index - res1index) >= min_seq_sep or (res1chain != res2chain):
rij = get_interaction_distance(res1, res2)
hasProperty = propertyTable[three_to_one(res2.resname)]
theta = interaction_well(rij, rmin, 6.5, 5)
if res1globalindex == 64:
if hasProperty * theta > 0.1:
print(res1index, res1.resname, res2.resname, res2index, rij, hasProperty, theta)
density[res1globalindex] += hasProperty * interaction_well(rij, rmin, 6.5, 5)
return density
In [198]:
calculate_property_density_debug(res_list, neighbor_list, isP)[64]
Out[198]:
In [164]:
environment_info.query("index==108")
Out[164]:
In [195]:
data2.query("ResId2=='65' and Protein == '1pne'")
Out[195]:
In [221]:
b = data2.query("Res1=='GLU' and Res2=='ALA'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[221]:
In [ ]:
a = data2.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
In [154]:
a.sort_values("D_P")
Out[154]:
In [214]:
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[214]:
In [151]:
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[151]:
In [ ]:
In [140]:
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[140]:
In [152]:
sns.jointplot("D_H", "D_P", data=a, kind="hex")
Out[152]:
In [139]:
sns.jointplot("D_H", "D_P", data=a, kind="hex")
Out[139]:
In [ ]:
In [109]:
data_with_info = data.merge(data_envr, left_on="Index1", right_on="index").merge(data_envr, left_on="Index2", right_on="index")
data_with_info.query("Theta > 1e-1 and Type == 'Direct'").reset_index(drop=True)
Out[109]:
In [111]:
data_with_info.shape
Out[111]:
In [39]:
isH = {}
isP = {}
for i in range(20):
isH[dindex_to_1[i]] = res_type_map_HP[dindex_to_1[i]]
isP[dindex_to_1[i]] = 1 - res_type_map_HP[dindex_to_1[i]]
In [75]:
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
density_H = calculate_property_density(res_list, neighbor_list, isH).round()
density_P = calculate_property_density(res_list, neighbor_list, isP).round()
In [76]:
environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
In [77]:
data_with_info = data.merge(environment_info, left_on="Index1", right_on="index").merge(environment_info, left_on="Index2", right_on="index")
In [ ]:
In [80]:
a = data_with_info.query("Type=='Direct' and Theta > 1e-1")
In [87]:
a.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[87]:
In [112]:
a.shape
Out[112]:
In [88]:
a.query("Res1 =='LEU' and Res2 == 'ILE'")
Out[88]:
In [70]:
data.merge(environment_info, left_on="Index2", right_on="index")
Out[70]:
In [71]:
data
Out[71]:
In [66]:
environment_info
Out[66]:
In [ ]:
In [57]:
density_H
Out[57]:
In [50]:
density_P.round(1)
Out[50]:
In [51]:
cb_density.round(1)
Out[51]:
In [31]:
# get environmental information
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
cb_density = calculate_cb_density(res_list, neighbor_list)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
rho_i = cb_density[res1globalindex]
for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+2.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
rho_j = cb_density[res2globalindex]
if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
res1type = get_res_type(res_list, res1)
res2type = get_res_type(res_list, res2)
rij = get_interaction_distance(res1, res2)
theta = interaction_well(rij, r_min, r_max, kappa)
water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij])
data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij])
direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij])
# protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
# water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r"])
In [47]:
structure = s
kappa=5.0
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
cb_density = calculate_cb_density(res_list, neighbor_list)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
# kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
rho_i = cb_density[res1globalindex]
for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+2.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
rho_j = cb_density[res2globalindex]
if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
res1type = get_res_type(res_list, res1)
res2type = get_res_type(res_list, res2)
rij = get_interaction_distance(res1, res2)
theta = interaction_well(rij, r_min, r_max, kappa)
water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex])
data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex])
direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex])
# protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
# water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2"])
In [74]:
In [4]:
In [30]:
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_larger_excl_withoutBurial/iter_larger_excl_withoutBurial_30"
contact_gammas = get_contact_gamma_info(gammaFile)
contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]
# pdb = "1sfp"
# pdb = "1who"
pdb = "1jon"
pdb = "1neu"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_1_stronger_exclude_withoutBurial/{pdb}/0/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_1_stronger_exclude_withoutBurial/{pdb}/0/crystal_structure.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data_native = get_interaction_data(structure)
a_native = data_native.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a_native["theta_gamma"] = a_native["Theta"] * a_native["Gamma"]
In [31]:
a[["Type", "theta_gamma"]].groupby("Type").sum()
Out[31]:
In [32]:
a_native[["Type", "theta_gamma"]].groupby("Type").sum()
Out[32]:
In [33]:
a[["Type", "theta_gamma"]].groupby("Type").sum().sum()
Out[33]:
In [34]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2.sort_values("theta_gamma").tail(5)
Out[34]:
In [35]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3.sort_values("theta_gamma").head(5)
Out[35]:
In [36]:
b2_native = a_native.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2_native.sort_values("theta_gamma").tail(5)
Out[36]:
In [37]:
b3_native = a_native.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3_native.sort_values("theta_gamma").head(5)
Out[37]:
In [60]:
gammaFile = "/Users/weilu/Research/server/mar_2020/cath_dataset_shuffle_optimization/optimization_iter0_com_density_rmin_2p5/saved_gammas/iter0_cutoff600_impose_Aprime_constraint"
contact_gammas = get_contact_gamma_info(gammaFile)
contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]
pdb = "1sfp"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_0_stronger_exclude_volume/{pdb}/0/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]
In [61]:
a[["Type", "theta_gamma"]].groupby("Type").sum()
Out[61]:
In [62]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2.sort_values("theta_gamma").tail(5)
In [63]:
Out[63]:
In [20]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3.sort_values("theta_gamma").head(5)
In [23]:
Out[23]:
In [25]:
fastaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_0_stronger_exclude_volume/3cyr/0/crystal_structure.fasta"
seq = read_fasta(fastaFile)
In [26]:
seq.count("C")
Out[26]:
In [27]:
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_run/training_set.csv")
specific_decoys = data.query("Length < 150 and Length > 70").reset_index(drop=True)
pdb_list = specific_decoys["Protein"].to_list()
pdb_list = [a.lower() for a in pdb_list]
skip_pdb_list = ["1puc", "1skz"]
skip_pdb_list += ["1msc", "1fmb", "1gvp", "2tgi", "1whi", "1baj", "1rmd", "1div"] #dimer.
skip_pdb_list += ["1aqe"] # lots of ligand
filtered_pdb_list = [x for x in pdb_list if x not in skip_pdb_list]
pdb_list = filtered_pdb_list
In [65]:
info_ = []
for pdb in pdb_list:
seq = np.loadtxt(f"/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/database/S20_seq/{pdb}.seq", dtype=str)
seq = str(seq)
info_.append([pdb, seq, seq.count("C"), len(seq)])
In [66]:
data = pd.DataFrame(info_, columns=["Protein", "Seq", "Count", "Length"])
In [67]:
data.to_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/cys_count_data.csv")
In [47]:
data["Length"] = data["Seq"].apply(lambda x:len(x))
In [70]:
print(data.sort_values("Count").query("Count >= 6")["Protein"].to_list())
In [64]:
data
Out[64]:
In [20]:
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
contact_gammas = get_contact_gamma_info(gammaFile)
In [75]:
# contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
# contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]
a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]
In [77]:
a["theta_gamma"] = a["Theta"] * a["Gamma"]
In [78]:
a[["Type", "theta_gamma"]].groupby("Type").sum()
Out[78]:
In [79]:
a["theta_gamma"].sum()
Out[79]:
In [80]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
In [83]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
In [91]:
b3.sort_values("theta_gamma").head(200)["theta_gamma"].sum()
Out[91]:
In [96]:
a.query("Res1 == 'TYR' and Res2 == 'CYS' and Theta > 0.1 and Type =='Direct'").sort_values("r")
Out[96]:
In [86]:
b3.sort_values("theta_gamma").head(20)
Out[86]:
In [69]:
b2.sort_values("theta_gamma").tail(10)
Out[69]:
In [82]:
a.query("Res1 == 'CYS' and Res2 == 'CYS' and Theta > 0.1")
Out[82]:
In [56]:
a.sort_values("theta_gamma")
Out[56]:
In [4]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
def get_contact_info(pdbFile, seq, frame_index=0):
# frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
info_ = []
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
dis = a1 - a2
if dis < cutoff and j - i > MAX_OFFSET:
info_.append([pdb, i, j, seq[i], seq[j], dis])
info = pd.DataFrame(info_, columns=["Protein", "i", "j", "Resi", "Resj", "Dis"])
return info
In [82]:
data = pd.read_csv("training_set.csv")
specific_decoys = data.query("Length < 150 and Length > 70").reset_index(drop=True)
pdb_list = specific_decoys["Protein"].to_list()
pdb_list = [a.lower() for a in pdb_list]
skip_pdb_list = ["1puc", "1skz"]
skip_pdb_list += ["1msc", "1fmb", "1gvp", "2tgi", "1whi", "1baj", "1rmd", "1div"] #dimer.
skip_pdb_list += ["1aqe"] # lots of ligand
filtered_pdb_list = [x for x in pdb_list if x not in skip_pdb_list]
pdb_list = filtered_pdb_list
In [9]:
info_ = []
for pdb in pdb_list:
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
info = get_contact_info(pdbFile, seq)
info_.append(info)
info = pd.concat(info_)
In [10]:
In [14]:
direct_contact_table = np.zeros((20,20))
for i, line in info.iterrows():
resi = line["Resi"]
resj = line["Resj"]
dis = line["Dis"]
if dis < 6.5:
direct_contact_table[d1_to_index[resi]][d1_to_index[resj]] += 1
if resi != resj:
direct_contact_table[d1_to_index[resj]][d1_to_index[resi]] += 1
In [16]:
direct_contact_table.min()
Out[16]:
In [15]:
plt.imshow(direct_contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)
In [11]:
contact_table = np.zeros((20,20))
for i, line in info.iterrows():
resi = line["Resi"]
resj = line["Resj"]
dis = line["Dis"]
contact_table[d1_to_index[resi]][d1_to_index[resj]] += 1
if resi != resj:
contact_table[d1_to_index[resj]][d1_to_index[resi]] += 1
In [12]:
len(info)
Out[12]:
In [13]:
contact_table.min()
Out[13]:
In [115]:
plt.imshow(contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)
In [109]:
plt.imshow(contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)
In [88]:
info.groupby(["Resi", "Resj"])["Dis"].count()
Out[88]:
In [81]:
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
get_contact_info(pdbFile, seq)
Out[81]:
In [78]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
info_ = []
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
dis = a1 - a2
if dis < cutoff and j - i > MAX_OFFSET:
info_.append([pdb, i, j, seq[i], seq[j], dis])
In [74]:
In [6]:
def getContactMapFromPDB(pdbFile, seq, frame_index=0):
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
contact_table[i][j] = a1 - a2
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
return data
In [81]:
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
# pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/movie.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[81]:
In [7]:
pdb = "3lzt"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[7]:
In [66]:
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[66]:
In [58]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/cbd-openmmawsem.pdb"
seq = read_fasta("/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)
In [63]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/1baj/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
In [64]:
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[64]:
In [56]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/1baj/0/native.pdb"
seq = read_fasta("/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/crystal_structure.fasta")
frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
contact_table[i][j] = a1 - a2
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
In [55]:
Out[55]:
In [23]:
three_to_one("TRP")
Out[23]:
In [25]:
seq.count("W")
Out[25]:
In [ ]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/cbd-openmmawsem.pdb"
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
all_residues = list(structure.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
for j, res2 in enumerate(all_residues):
contact_table[i][j] = res1["CA"]-res2["CA"]
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
data_ca = data
In [11]:
fileLocation = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/tmp.spotcon"
data = pd.read_csv(fileLocation, skiprows=5, sep="\s+", names=["i","j","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_A = data
In [12]:
fileLocation = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/original.spotcon"
data = pd.read_csv(fileLocation, skiprows=5, sep="\s+", names=["i","j","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_original = data
In [14]:
data_original.dtypes
Out[14]:
In [15]:
data_A.dtypes
Out[15]:
In [17]:
seq = "MSKLTTGSFSIEDLESVQITINNIVGAAKEAAEEKEKELVNAGPTLFPGLEGYRDDWNFKLLDRYEPVITPMCDQCCYCTYGPCDLSGNKRGACGIDMKGHNGREFFLRVITGTACHAAHGRHLLDHLIEKYGEDLPLTLGQSNVLTPNITISTGLSPKTLGEVKPAMEYVEEQLTQLLATVHAGQESAEIDYDSKALFSGSLDHVGMEISDIVQVAAYDFPKADPEAPLVEIGMGTIDKSKPFLCVIGHNVAGVTYMMDYMEDNNLTDKMEIAGLCCTAIDLTRYKEADRRPPYAKVIGSMSKELKVIRSGMPDVIVVDEQCVRGDIVPEAQKLKIPVIASNPKIMYGLPNRTDADVDETMEELKSGKIPGCVMLDYDKLGELCVRLTMEMAPIRDAAGITALPTDEELVNMVAKCADCGACLLACPEEIDIPEAMGFAKKGDFSYFEEIHDTCIGCRRCEQVCKKEIPILNVIEKIAQKQIAEEKGLMRAGRGQVSDAEIRAEGLNLVMGTTPGIIAIIGCPNYAGGTKDVYYIAEEFLKRNFIVVTTGCGAMDIGMFKDADGKTLYERFPGGFQCGGLANIGSCVSNAHITGAAEKVAAIFAQRTLEGNLAEIGDYILNRVGACGLAWGAFSQKASSIGTGCNIFGIPAVLGPHSSKYRRALIAKTYEEDKWKVYDARNGQEMPIPPAPEFLLTTAETWQEAIPMMAKACIRPSDNSMGRAIKLTHWMELHKKYLGGKEPEDWWKFVRTEADLPLATREALLKELEKEHGWEIDWKRKKIISGPKIKFDVSAQPTNLKRLCKEA"
In [18]:
len(seq)
Out[18]:
In [33]:
seq
Out[33]:
In [61]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_A.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [72]:
data_s = data.astype(int)
t_s = (t>0.08).astype(float)
combined = data_s + t_s * 2
from matplotlib import colors
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
In [71]:
plt.imshow((t>0.08).astype(float), origin="bottom")
plt.colorbar()
Out[71]:
In [70]:
plt.imshow(data, origin="bottom")
Out[70]:
In [20]:
plt.imshow((t>0.5).astype(float), origin="bottom")
plt.colorbar()
Out[20]:
In [21]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_original.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [26]:
plt.imshow((t>0.1).astype(float), origin="bottom")
plt.colorbar()
Out[26]:
In [ ]:
In [ ]:
In [7]:
len(a[5])
Out[7]:
In [22]:
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_A = data
seq_A = getSeqFromRaptorXContact(fileLocation)
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_B = data
seq_B = getSeqFromRaptorXContact(fileLocation)
In [86]:
import textwrap
header = '''\
PFRMAT RR
TARGET {}
AUTHOR RaptorX-Contact
METHOD deep dilated residual networks (one variant of deep CNN). Consult jinboxu@gmail.com for details.
MODEL 1
'''
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/{pdb}.txt"
with open(fileLocation, "w") as out:
out.write(header.format(pdb))
out.write("\n".join(textwrap.wrap(seq, width=50))+"\n")
for index, d in data_A.iterrows():
# print(index)
i = int(d["i"])
j = int(d["j"])
p = round(d["p"], 8)
s = int(d["s"])
ss = int(d["ss"])
out.write(f"{i} {j} {s} {ss} {p}\n")
for index, d in data_B.iterrows():
# print(index)
i = int(d["i"]) + len(seq_A)
j = int(d["j"]) + len(seq_A)
p = round(d["p"], 8)
s = int(d["s"])
ss = int(d["ss"])
out.write(f"{i} {j} {s} {ss} {p}\n")
out.write("END\n")
In [80]:
for index, d in data_B.iterrows():
# print(index)
i = int(d["i"]) - 1 + len(seq_A)
j = int(d["j"]) - 1 + len(seq_A)
p = round(d["p"], 8)
s = int(d["s"])
ss = int(d["ss"])
In [84]:
data_A
Out[84]:
In [79]:
round(p, 8)
Out[79]:
In [23]:
seq = seq_A + seq_B
In [26]:
seq
Out[26]:
In [27]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_A.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
for index, d in data_B.iterrows():
# print(index)
i = int(d["i"]) - 1 + len(seq_A)
j = int(d["j"]) - 1 + len(seq_A)
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [28]:
plt.imshow((t>0.5).astype(float), origin="bottom")
plt.colorbar()
Out[28]:
In [29]:
def getSeqFromRaptorXContact(fileLocation):
with open(fileLocation) as f:
a = f.readlines()
i = 4
seq = ""
assert a[i] == "MODEL 1\n"
i += 1
while True:
line = a[i]
if line[0].isdigit():
break
i += 1
seq += line.strip()
# print(i)
return seq
def getContactMapFromPDB(pdbFile):
cutoff = 9.5
MAX_OFFSET = 6
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)
all_residues = list(structure.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
for j, res2 in enumerate(all_residues):
contact_table[i][j] = res1["CA"]-res2["CA"]
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
return data
In [53]:
pdbFile = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4_A.pdb"
cutoff = 9.5
MAX_OFFSET = 6
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)
# chainA = structure[0]["A"]
# all_residues = list(chainA.get_residues())
all_residues = []
for res in structure.get_residues():
if res.get_id()[0] == " ":
all_residues.append(res)
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
for j, res2 in enumerate(all_residues):
contact_table[i][j] = res1["CA"]-res2["CA"]
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
In [60]:
plt.imshow(data, origin="bottom")
Out[60]:
In [58]:
data_s = data.astype(int)
t_s = (t>0.55).astype(float)
In [30]:
data = getContactMapFromPDB("/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4.pdb")
In [ ]:
In [30]:
data = getContactMapFromPDB("/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb")
In [70]:
data_s = data.astype(int)
t_s = (t>0.55).astype(float)
In [72]:
fig, ax = plt.subplots()
# ax.imshow(data_s+1, origin="bottom")
ax.imshow(t, origin="bottom")
Out[72]:
In [60]:
combined = data_s + t_s * 2
In [63]:
combined = data_s + t_s * 2
from matplotlib import colors
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[63]:
In [87]:
pdbFile = "/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb"
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)
In [88]:
model = structure[0]
In [90]:
a = list(model.get_chains())
In [92]:
c = a[0]
In [94]:
c.id
Out[94]:
In [95]:
c = "ALL"
In [99]:
"ABC" is not "ALL"
Out[99]:
In [93]:
c.get_id()
Out[93]:
In [55]:
fig, ax = plt.subplots()
ax.imshow(data_s+1, origin="bottom")
ax.imshow(t_s, origin="bottom")
Out[55]:
In [43]:
plt.imshow(data_s, origin="bottom")
Out[43]:
In [44]:
# plt.imshow(-data_s, origin="bottom")
plt.imshow(t_s, origin="bottom")
# plt.colorbar()
Out[44]:
In [16]:
346
Out[16]:
In [17]:
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"
In [20]:
len(seq)
Out[20]:
In [18]:
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
n = int(info.query(f"Protein =='{pdb}'")["Length"])
t = np.zeros((n,n))
for index, d in data.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [ ]:
def getContactMap(pdb):
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/contactmap.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
n = int(info.query(f"Protein =='{pdb}'")["Length"])
t = np.zeros((n,n))
for index, d in data.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p