In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
sys.path.insert(0,'/Users/weilu/Research/opt_server/')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2
In [2]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
from Bio.PDB.Polypeptide import aa3
In [3]:
plt.rcParams['figure.figsize'] = [16.18033, 10] #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100
In [4]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_iterative_native_with_cbd_info.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data_native = a
data_native.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[4]:
In [5]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_iterative_with_cbd_info.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data_iterative = a
data_iterative.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[5]:
In [12]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data_old = a
data_old.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[12]:
In [13]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath_with_cbd_info.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data2 = a
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[13]:
In [9]:
b = data2.query("Res1=='VAL' and Res2=='LEU'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[9]:
In [28]:
b = data2.query("Res1=='SER' and Res2=='ILE'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[28]:
In [33]:
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")[:-100].hist("Theta", bins=50)
Out[33]:
In [16]:
b = data2.query("Res1=='GLU' and Res2=='ALA'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[16]:
In [14]:
data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
b = data2.query("Res1=='GLU' and Res2=='ALA'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[14]:
In [28]:
# data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
a = data_old.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[28]:
In [25]:
# data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
a = data2.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[25]:
In [30]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[30]:
In [30]:
info = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/info_collection.csv")
In [33]:
info = info.query("Steps==2001")[["Q", "Run", "Protein", "Folder"]].reset_index(drop=True)
In [34]:
info.query("Folder")
Out[34]:
In [38]:
data_iterative = data_iterative.merge(info, on=["Protein", "Run"])
In [43]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU' and Q < 0.5").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[43]:
In [44]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU' and Q > 0.5").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[44]:
In [42]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[42]:
In [40]:
sns.scatterplot("D_H", "D_P", hue="Q", data=data_iterative.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)
Out[40]:
In [153]:
sns.scatterplot("D_H", "D_P", hue="isTP", data=new_d, alpha=0.5)
Out[153]:
In [10]:
sns.scatterplot("D_H", "D_P", hue="isTP", data=new_d, alpha=0.5)
Out[10]:
In [101]:
pre = "/Users/weilu/Research/server/mar_2020/environmental_information/"
pdb = "1akr"
ii = 1
data = pd.read_csv(f"{pre}/iterative/{pdb}_{ii}.csv", index_col=0)
data_envr = pd.read_csv(f"{pre}/iterative/{pdb}_{ii}_environment.csv", index_col=0)
# data_envr["Density_H"] = data_envr["Density_H"].round()
# data_envr["Density_P"] = data_envr["Density_P"].round()
data_with_info = data.merge(data_envr, how='left', left_on="Index1", right_on="index").merge(data_envr, how='left', left_on="Index2", right_on="index")
data_ = data_with_info.query("Theta > 5e-2 and Type == 'Direct'").reset_index(drop=True)
# data_ = data_with_info
In [102]:
data_.query(f"Res1=='{res1}' and Res2=='{res2}'")
Out[102]:
In [128]:
# true positive
pdb = "1akr"
res1 = "LEU"
res2 = "LEU"
run = 0
a = data_native.query(f"Protein=='{pdb}'and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
b = data_iterative.query(f"Protein=='{pdb}' and Run=='{run}' and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
In [119]:
contacts = set(b["Index1"].astype(str) + "_" + b["Index2"].astype(str))
In [120]:
contacts
Out[120]:
In [6]:
def isTP(contact, native_contacts):
if contact in native_contacts:
return "TP"
if contact not in native_contacts:
return "FP"
res1 = "THR"
res2 = "PRO"
run = 0
pdb = "1akr"
pdb_list = data_iterative["Protein"].unique()
new_d = []
for run in range(2):
for pdb in pdb_list:
a = data_native.query(f"Protein=='{pdb}'and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
b = data_iterative.query(f"Protein=='{pdb}' and Run=='{run}' and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
native_contacts = set(a["Index1"].astype(str) + "_" + a["Index2"].astype(str))
b["Contact"] = b["Index1"].astype(str) + "_" + b["Index2"].astype(str)
b["isTP"] = b["Contact"].apply(isTP, native_contacts=native_contacts)
new_d.append(b)
new_d = pd.concat(new_d).reset_index(drop=True)
In [7]:
new_d
Out[7]:
In [ ]:
In [ ]:
In [125]:
native_contacts
Out[125]:
In [131]:
In [143]:
In [8]:
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_iter3/saved_gammas/iter3_z_weighted_2_cutoff400_impose_Aprime_constraint"
gamma_info = get_contact_gamma_info(gammaFile)
In [9]:
gamma_info.query("Interaction=='Direct'").sort_values("Gamma")
Out[9]:
In [11]:
gamma_info.query("Interaction=='Direct' and Res1=='L'")
Out[11]:
In [2]:
def get_contact_gamma_info(gammaFile):
# check the gamma.
# read in gamma, and sort by size.
# gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
gamma = np.loadtxt(gammaFile)
res_type_map_letters = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G',
'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
inverse_res_type_map = dict(list(zip(list(range(20)), res_type_map_letters)))
c = 0
info_ = []
for i in range(20):
for j in range(i, 20):
info_.append(["Direct", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
if i != j:
info_.append(["Direct", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
c += 1
for i in range(20):
for j in range(i, 20):
info_.append(["Protein", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
if i != j:
info_.append(["Protein", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
info_.append(["Water", res_type_map_letters[i], res_type_map_letters[j], c+210, round(gamma[c+210],3)])
if i != j:
info_.append(["Water", res_type_map_letters[j], res_type_map_letters[i], c+210, round(gamma[c+210],3)])
c += 1
contact_gammas = pd.DataFrame(info_, columns=["Interaction", "Res1", "Res2", "Index", "Gamma"])
return contact_gammas
In [59]:
data_native
Out[59]:
In [54]:
data_native.query(f"Res1=='{res1}' and Res2=='{res2}'")
Out[54]:
In [24]:
data_native.query("Res1=='LEU' and Res2=='LEU' and D_H > 9")
Out[24]:
In [23]:
sns.scatterplot("D_H", "D_P", data=data_native.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)
Out[23]:
In [22]:
sns.scatterplot("D_H", "D_P", data=data_native.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)
sns.scatterplot("D_H", "D_P", data=data_iterative.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)
Out[22]:
In [39]:
a = data_native.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
sns.jointplot("D_H", "D_P", data=a, kind="reg")
Out[39]:
In [20]:
cbd_info = pd.read_csv("/Users/weilu/opt/parameters/side_chain/cbd_cbd_real_contact_symmetric.csv")
In [22]:
cbd_info.query("ResName1 == 'SER'")
Out[22]:
In [ ]:
def get_r_min_max(a, res1, res2, type="Direct"):
res1_name = res1.get_resname()
res2_name = res2.get_resname()
if type == "Direct":
if res1_name == "GLY" or res2_name == "GLY":
r_min_res1_res2 = 2.5
r_max_res1_res2 = 6.5
else:
b = a.query(f"ResName1=='{res1_name}' and ResName2=='{res2_name}'")
if len(b) == 0:
b = a.query(f"ResName1=='{res2_name}' and ResName2=='{res1_name}'")
try:
r_min_res1_res2 = float(b["r_min"]) - 0.5
r_max_res1_res2 = float(b["r_max"]) + 1.5
except:
print("problem", b)
else:
if res1_name == "GLY" or res2_name == "GLY":
r_min_res1_res2 = 6.5
r_max_res1_res2 = 9.5
else:
b = a.query(f"ResName1=='{res1_name}' and ResName2=='{res2_name}'")
if len(b) == 0:
b = a.query(f"ResName1=='{res2_name}' and ResName2=='{res1_name}'")
try:
r_min_res1_res2 = float(b["r_max"]) + 1.5
r_max_res1_res2 = float(b["r_max"]) + 4.5
except:
print(b)
return r_min_res1_res2, r_max_res1_res2
def get_interaction_data_with_cbd_info(structure, cbd_info):
# get all the pair of interaction, direct and mediated. as a dataFrame.
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
# cb_density = calculate_cb_density(res_list, neighbor_list)
cb_density = calculate_cb_density_com_wellCenter(res_list, neighbor_list, cbd_info)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
rho_i = cb_density[res1globalindex]
for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+4.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
rho_j = cb_density[res2globalindex]
# if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
if abs(res2globalindex - res1globalindex) >= min_seq_sep or (res1chain != res2chain):
if res1.resname == res2.resname:
if not (res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex)):
continue
res1type = get_res_type(res_list, res1)
res2type = get_res_type(res_list, res2)
rij = get_interaction_distance(res1, res2)
# theta = interaction_well(rij, r_min, r_max, kappa)
r_min_res1_res2, r_max_res1_res2 = get_r_min_max(cbd_info, res1, res2, type="Mediated")
theta = interaction_well(rij, r_min_res1_res2, r_max_res1_res2, kappa)
water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
r_min_res1_res2, r_max_res1_res2 = get_r_min_max(cbd_info, res1, res2, type="Direct")
direct_theta = interaction_well(rij, r_min_res1_res2, r_max_res1_res2, kappa)
data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
# protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
# water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r", "ResId1", "ResId2"])
# contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
# contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
# contact_gammas["Type"] = contact_gammas["Interaction"]
# a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
# a["theta_gamma"] = a["Theta"] * a["Gamma"]
return data
def calculate_property_density_with_cbd_info(res_list, neighbor_list, propertyTable, cbd_info, min_seq_sep=2, rmin=2.5):
num_residues = len(res_list)
density = np.zeros(num_residues)
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
for res2 in get_neighbors_within_radius(neighbor_list, res1, 9.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
if abs(res2index - res1index) >= min_seq_sep or (res1chain != res2chain):
rij = get_interaction_distance(res1, res2)
hasProperty = propertyTable[three_to_one(res2.resname)]
r_min_res1_res2, r_max_res1_res2 = get_r_min_max(cbd_info, res1, res2, type="Direct")
density[res1globalindex] += hasProperty * interaction_well(rij, r_min_res1_res2, r_max_res1_res2, 5)
return density
def get_environment_with_cbd_info(structure, cbd_info):
res_type_map_HP = {
'C': 0,
'M': 0,
'F': 0,
'I': 0,
'L': 0,
'V': 0,
'W': 0,
'Y': 0,
'A': 1,
'H': 1,
'T': 1,
'G': 1,
'P': 1,
'D': 1,
'E': 1,
'N': 1,
'Q': 1,
'R': 1,
'K': 1,
'S': 1
}
isH = {}
isP = {}
for i in range(20):
isH[dindex_to_1[i]] = res_type_map_HP[dindex_to_1[i]]
isP[dindex_to_1[i]] = 1 - res_type_map_HP[dindex_to_1[i]]
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
density_H = calculate_property_density_with_cbd_info(res_list, neighbor_list, isH, cbd_info).round(3)
density_P = calculate_property_density_with_cbd_info(res_list, neighbor_list, isP, cbd_info).round(3)
environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
return environment_info
In [ ]:
In [6]:
def get_interaction_data(structure):
# get all the pair of interaction, direct and mediated. as a dataFrame.
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
cb_density = calculate_cb_density(res_list, neighbor_list)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
rho_i = cb_density[res1globalindex]
for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+4.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
rho_j = cb_density[res2globalindex]
# if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
if abs(res2globalindex - res1globalindex) >= min_seq_sep or (res1chain != res2chain):
if res1.resname == res2.resname:
if not (res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex)):
continue
res1type = get_res_type(res_list, res1)
res2type = get_res_type(res_list, res2)
rij = get_interaction_distance(res1, res2)
theta = interaction_well(rij, r_min, r_max, kappa)
water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
# protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
# water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r", "ResId1", "ResId2"])
# contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
# contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
# contact_gammas["Type"] = contact_gammas["Interaction"]
# a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
# a["theta_gamma"] = a["Theta"] * a["Gamma"]
return data
def get_contact_gamma_info(gammaFile):
# check the gamma.
# read in gamma, and sort by size.
# gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
gamma = np.loadtxt(gammaFile)
res_type_map_letters = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G',
'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
inverse_res_type_map = dict(list(zip(list(range(20)), res_type_map_letters)))
c = 0
info_ = []
for i in range(20):
for j in range(i, 20):
info_.append(["Direct", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
if i != j:
info_.append(["Direct", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
c += 1
for i in range(20):
for j in range(i, 20):
info_.append(["Protein", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
if i != j:
info_.append(["Protein", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
info_.append(["Water", res_type_map_letters[i], res_type_map_letters[j], c+210, round(gamma[c+210],3)])
if i != j:
info_.append(["Water", res_type_map_letters[j], res_type_map_letters[i], c+210, round(gamma[c+210],3)])
c += 1
contact_gammas = pd.DataFrame(info_, columns=["Interaction", "Res1", "Res2", "Index", "Gamma"])
return contact_gammas
In [6]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_new_4_without_burial/1poa/1/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
In [19]:
In [72]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1fna/cbd_1fna.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
In [21]:
In [99]:
def calculate_property_density(res_list, neighbor_list, propertyTable, min_seq_sep=2, rmin=2.5):
num_residues = len(res_list)
density = np.zeros(num_residues)
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
for res2 in get_neighbors_within_radius(neighbor_list, res1, 9.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
if abs(res2index - res1index) >= min_seq_sep or (res1chain != res2chain):
rij = get_interaction_distance(res1, res2)
hasProperty = propertyTable[three_to_one(res2.resname)]
density[res1globalindex] += hasProperty * interaction_well(rij, rmin, 6.5, 5)
return density
def get_environment(structure):
res_type_map_HP = {
'C': 0,
'M': 0,
'F': 0,
'I': 0,
'L': 0,
'V': 0,
'W': 0,
'Y': 0,
'A': 1,
'H': 1,
'T': 1,
'G': 1,
'P': 1,
'D': 1,
'E': 1,
'N': 1,
'Q': 1,
'R': 1,
'K': 1,
'S': 1
}
isH = {}
isP = {}
for i in range(20):
isH[dindex_to_1[i]] = res_type_map_HP[dindex_to_1[i]]
isP[dindex_to_1[i]] = 1 - res_type_map_HP[dindex_to_1[i]]
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
density_H = calculate_property_density(res_list, neighbor_list, isH).round(3)
density_P = calculate_property_density(res_list, neighbor_list, isP).round(3)
environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
return environment_info
In [118]:
data_list = []
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/1fna.csv", index_col=0)
data_envr = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/1fna_environment.csv", index_col=0)
data_envr["Density_H"] = data_envr["Density_H"].round()
data_envr["Density_P"] = data_envr["Density_P"].round()
data_with_info = data.merge(data_envr, left_on="Index1", right_on="index").merge(data_envr, left_on="Index2", right_on="index")
data_ = data_with_info.query("Theta > 1e-1 and Type == 'Direct'").reset_index(drop=True)
data_list.append(data_.assign(Protein=pdb))
In [122]:
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data.csv", index_col=0)
In [209]:
In [210]:
data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[210]:
In [203]:
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[203]:
In [212]:
In [158]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1pne/cbd_1pne.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
In [159]:
res_type_map_HP = {
'C': 0,
'M': 0,
'F': 0,
'I': 0,
'L': 0,
'V': 0,
'W': 0,
'Y': 0,
'A': 1,
'H': 1,
'T': 1,
'G': 1,
'P': 1,
'D': 1,
'E': 1,
'N': 1,
'Q': 1,
'R': 1,
'K': 1,
'S': 1
}
isH = {}
isP = {}
for i in range(20):
isH[dindex_to_1[i]] = res_type_map_HP[dindex_to_1[i]]
isP[dindex_to_1[i]] = 1 - res_type_map_HP[dindex_to_1[i]]
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
density_H = calculate_property_density(res_list, neighbor_list, isH).round(3)
density_P = calculate_property_density(res_list, neighbor_list, isP).round(3)
environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
In [197]:
def calculate_property_density_debug(res_list, neighbor_list, propertyTable, min_seq_sep=2, rmin=2.5):
num_residues = len(res_list)
density = np.zeros(num_residues)
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
for res2 in get_neighbors_within_radius(neighbor_list, res1, 9.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
if abs(res2index - res1index) >= min_seq_sep or (res1chain != res2chain):
rij = get_interaction_distance(res1, res2)
hasProperty = propertyTable[three_to_one(res2.resname)]
theta = interaction_well(rij, rmin, 6.5, 5)
if res1globalindex == 64:
if hasProperty * theta > 0.1:
print(res1index, res1.resname, res2.resname, res2index, rij, hasProperty, theta)
density[res1globalindex] += hasProperty * interaction_well(rij, rmin, 6.5, 5)
return density
In [198]:
calculate_property_density_debug(res_list, neighbor_list, isP)[64]
Out[198]:
In [164]:
environment_info.query("index==108")
Out[164]:
In [195]:
data2.query("ResId2=='65' and Protein == '1pne'")
Out[195]:
In [221]:
data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
b = data2.query("Res1=='GLU' and Res2=='ALA'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[221]:
In [13]:
a = data2.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
In [154]:
a.sort_values("D_P")
Out[154]:
In [214]:
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[214]:
In [151]:
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[151]:
In [ ]:
In [140]:
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[140]:
In [152]:
sns.jointplot("D_H", "D_P", data=a, kind="hex")
Out[152]:
In [139]:
sns.jointplot("D_H", "D_P", data=a, kind="hex")
Out[139]:
In [ ]:
In [109]:
data_with_info = data.merge(data_envr, left_on="Index1", right_on="index").merge(data_envr, left_on="Index2", right_on="index")
data_with_info.query("Theta > 1e-1 and Type == 'Direct'").reset_index(drop=True)
Out[109]:
In [111]:
data_with_info.shape
Out[111]:
In [39]:
isH = {}
isP = {}
for i in range(20):
isH[dindex_to_1[i]] = res_type_map_HP[dindex_to_1[i]]
isP[dindex_to_1[i]] = 1 - res_type_map_HP[dindex_to_1[i]]
In [75]:
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
density_H = calculate_property_density(res_list, neighbor_list, isH).round()
density_P = calculate_property_density(res_list, neighbor_list, isP).round()
In [76]:
environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
In [77]:
data_with_info = data.merge(environment_info, left_on="Index1", right_on="index").merge(environment_info, left_on="Index2", right_on="index")
In [ ]:
In [80]:
a = data_with_info.query("Type=='Direct' and Theta > 1e-1")
In [87]:
a.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[87]:
In [112]:
a.shape
Out[112]:
In [88]:
a.query("Res1 =='LEU' and Res2 == 'ILE'")
Out[88]:
In [70]:
data.merge(environment_info, left_on="Index2", right_on="index")
Out[70]:
In [71]:
data
Out[71]:
In [66]:
environment_info
Out[66]:
In [ ]:
In [57]:
density_H
Out[57]:
In [50]:
density_P.round(1)
Out[50]:
In [51]:
cb_density.round(1)
Out[51]:
In [31]:
# get environmental information
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
cb_density = calculate_cb_density(res_list, neighbor_list)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
rho_i = cb_density[res1globalindex]
for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+2.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
rho_j = cb_density[res2globalindex]
if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
res1type = get_res_type(res_list, res1)
res2type = get_res_type(res_list, res2)
rij = get_interaction_distance(res1, res2)
theta = interaction_well(rij, r_min, r_max, kappa)
water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij])
data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij])
direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij])
# protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
# water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r"])
In [47]:
structure = s
kappa=5.0
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
cb_density = calculate_cb_density(res_list, neighbor_list)
r_min_direct = 2.5
r_max_direct = 6.5
r_min = 6.5
r_max = 9.5
# kappa = 5.0
min_seq_sep = 10
density_threshold = 2.6
density_kappa = 7.0
# phi_mediated_contact_well = np.zeros((2, 20,20))
v_mediated = 0
data_ = []
for res1globalindex, res1 in enumerate(res_list):
res1index = get_local_index(res1)
res1chain = get_chain(res1)
rho_i = cb_density[res1globalindex]
for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+2.0):
res2index = get_local_index(res2)
res2chain = get_chain(res2)
res2globalindex = get_global_index(res_list, res2)
rho_j = cb_density[res2globalindex]
if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
res1type = get_res_type(res_list, res1)
res2type = get_res_type(res_list, res2)
rij = get_interaction_distance(res1, res2)
theta = interaction_well(rij, r_min, r_max, kappa)
water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex])
data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex])
direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex])
# protein_gamma = protein_gamma_ijm[0][res1type][res2type]*k_hypercharge
# water_gamma = water_gamma_ijm[0][res1type][res2type]*k_hypercharge
data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2"])
In [74]:
In [4]:
In [30]:
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_larger_excl_withoutBurial/iter_larger_excl_withoutBurial_30"
contact_gammas = get_contact_gamma_info(gammaFile)
contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]
# pdb = "1sfp"
# pdb = "1who"
pdb = "1jon"
pdb = "1neu"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_1_stronger_exclude_withoutBurial/{pdb}/0/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_1_stronger_exclude_withoutBurial/{pdb}/0/crystal_structure.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data_native = get_interaction_data(structure)
a_native = data_native.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a_native["theta_gamma"] = a_native["Theta"] * a_native["Gamma"]
In [31]:
a[["Type", "theta_gamma"]].groupby("Type").sum()
Out[31]:
In [32]:
a_native[["Type", "theta_gamma"]].groupby("Type").sum()
Out[32]:
In [33]:
a[["Type", "theta_gamma"]].groupby("Type").sum().sum()
Out[33]:
In [34]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2.sort_values("theta_gamma").tail(5)
Out[34]:
In [35]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3.sort_values("theta_gamma").head(5)
Out[35]:
In [36]:
b2_native = a_native.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2_native.sort_values("theta_gamma").tail(5)
Out[36]:
In [37]:
b3_native = a_native.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3_native.sort_values("theta_gamma").head(5)
Out[37]:
In [60]:
gammaFile = "/Users/weilu/Research/server/mar_2020/cath_dataset_shuffle_optimization/optimization_iter0_com_density_rmin_2p5/saved_gammas/iter0_cutoff600_impose_Aprime_constraint"
contact_gammas = get_contact_gamma_info(gammaFile)
contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]
pdb = "1sfp"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_0_stronger_exclude_volume/{pdb}/0/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]
In [61]:
a[["Type", "theta_gamma"]].groupby("Type").sum()
Out[61]:
In [62]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
b2.sort_values("theta_gamma").tail(5)
In [63]:
Out[63]:
In [20]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
b3.sort_values("theta_gamma").head(5)
In [23]:
Out[23]:
In [25]:
fastaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_0_stronger_exclude_volume/3cyr/0/crystal_structure.fasta"
seq = read_fasta(fastaFile)
In [26]:
seq.count("C")
Out[26]:
In [27]:
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_run/training_set.csv")
specific_decoys = data.query("Length < 150 and Length > 70").reset_index(drop=True)
pdb_list = specific_decoys["Protein"].to_list()
pdb_list = [a.lower() for a in pdb_list]
skip_pdb_list = ["1puc", "1skz"]
skip_pdb_list += ["1msc", "1fmb", "1gvp", "2tgi", "1whi", "1baj", "1rmd", "1div"] #dimer.
skip_pdb_list += ["1aqe"] # lots of ligand
filtered_pdb_list = [x for x in pdb_list if x not in skip_pdb_list]
pdb_list = filtered_pdb_list
In [65]:
info_ = []
for pdb in pdb_list:
seq = np.loadtxt(f"/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/database/S20_seq/{pdb}.seq", dtype=str)
seq = str(seq)
info_.append([pdb, seq, seq.count("C"), len(seq)])
In [66]:
data = pd.DataFrame(info_, columns=["Protein", "Seq", "Count", "Length"])
In [67]:
data.to_csv("/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/cys_count_data.csv")
In [47]:
data["Length"] = data["Seq"].apply(lambda x:len(x))
In [70]:
print(data.sort_values("Count").query("Count >= 6")["Protein"].to_list())
In [64]:
data
Out[64]:
In [20]:
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
contact_gammas = get_contact_gamma_info(gammaFile)
In [75]:
# contact_gammas["Res1"] = contact_gammas.apply(lambda x: one_to_three(x["Res1"]), axis=1)
# contact_gammas["Res2"] = contact_gammas.apply(lambda x: one_to_three(x["Res2"]), axis=1)
contact_gammas["Type"] = contact_gammas["Interaction"]
a = data.merge(contact_gammas, on=["Res1", "Res2", "Type"])
a["theta_gamma"] = a["Theta"] * a["Gamma"]
In [77]:
a["theta_gamma"] = a["Theta"] * a["Gamma"]
In [78]:
a[["Type", "theta_gamma"]].groupby("Type").sum()
Out[78]:
In [79]:
a["theta_gamma"].sum()
Out[79]:
In [80]:
b2 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].count().reset_index()
In [83]:
b3 = a.query("Theta > 0.1").groupby(["Res1", "Res2", "Type"])["theta_gamma"].sum().reset_index()
In [91]:
b3.sort_values("theta_gamma").head(200)["theta_gamma"].sum()
Out[91]:
In [96]:
a.query("Res1 == 'TYR' and Res2 == 'CYS' and Theta > 0.1 and Type =='Direct'").sort_values("r")
Out[96]:
In [86]:
b3.sort_values("theta_gamma").head(20)
Out[86]:
In [69]:
b2.sort_values("theta_gamma").tail(10)
Out[69]:
In [82]:
a.query("Res1 == 'CYS' and Res2 == 'CYS' and Theta > 0.1")
Out[82]:
In [56]:
a.sort_values("theta_gamma")
Out[56]:
In [4]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
def get_contact_info(pdbFile, seq, frame_index=0):
# frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
info_ = []
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
dis = a1 - a2
if dis < cutoff and j - i > MAX_OFFSET:
info_.append([pdb, i, j, seq[i], seq[j], dis])
info = pd.DataFrame(info_, columns=["Protein", "i", "j", "Resi", "Resj", "Dis"])
return info
In [82]:
data = pd.read_csv("training_set.csv")
specific_decoys = data.query("Length < 150 and Length > 70").reset_index(drop=True)
pdb_list = specific_decoys["Protein"].to_list()
pdb_list = [a.lower() for a in pdb_list]
skip_pdb_list = ["1puc", "1skz"]
skip_pdb_list += ["1msc", "1fmb", "1gvp", "2tgi", "1whi", "1baj", "1rmd", "1div"] #dimer.
skip_pdb_list += ["1aqe"] # lots of ligand
filtered_pdb_list = [x for x in pdb_list if x not in skip_pdb_list]
pdb_list = filtered_pdb_list
In [9]:
info_ = []
for pdb in pdb_list:
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
info = get_contact_info(pdbFile, seq)
info_.append(info)
info = pd.concat(info_)
In [10]:
In [14]:
direct_contact_table = np.zeros((20,20))
for i, line in info.iterrows():
resi = line["Resi"]
resj = line["Resj"]
dis = line["Dis"]
if dis < 6.5:
direct_contact_table[d1_to_index[resi]][d1_to_index[resj]] += 1
if resi != resj:
direct_contact_table[d1_to_index[resj]][d1_to_index[resi]] += 1
In [16]:
direct_contact_table.min()
Out[16]:
In [15]:
plt.imshow(direct_contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)
In [11]:
contact_table = np.zeros((20,20))
for i, line in info.iterrows():
resi = line["Resi"]
resj = line["Resj"]
dis = line["Dis"]
contact_table[d1_to_index[resi]][d1_to_index[resj]] += 1
if resi != resj:
contact_table[d1_to_index[resj]][d1_to_index[resi]] += 1
In [12]:
len(info)
Out[12]:
In [13]:
contact_table.min()
Out[13]:
In [115]:
plt.imshow(contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)
In [109]:
plt.imshow(contact_table, origin=0)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
_ = plt.yticks(np.arange(20), aa3)
In [88]:
info.groupby(["Resi", "Resj"])["Dis"].count()
Out[88]:
In [81]:
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
get_contact_info(pdbFile, seq)
Out[81]:
In [78]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
info_ = []
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
dis = a1 - a2
if dis < cutoff and j - i > MAX_OFFSET:
info_.append([pdb, i, j, seq[i], seq[j], dis])
In [74]:
In [6]:
def getContactMapFromPDB(pdbFile, seq, frame_index=0):
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
contact_table[i][j] = a1 - a2
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
return data
In [81]:
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
# pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/movie.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[81]:
In [7]:
pdb = "3lzt"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[7]:
In [66]:
pdb = "2a0b"
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/cbd-openmmawsem.pdb"
seq = read_fasta(f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/{pdb}/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)
pdbFile = f"/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/{pdb}/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[66]:
In [58]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/cbd-openmmawsem.pdb"
seq = read_fasta("/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/crystal_structure.fasta")
data_native = getContactMapFromPDB(pdbFile, seq)
In [63]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/1baj/0/lastFrame.pdb"
# pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/test/native.pdb"
data_predicted = getContactMapFromPDB(pdbFile, seq, frame_index=-1)
In [64]:
combined = data_native + data_predicted * 2
from matplotlib import colors
# white means not present in both
# red means present in the first but not the second
# blue means present in the second but not the first
# black means present in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[64]:
In [56]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_start_native_iter5/1baj/0/native.pdb"
seq = read_fasta("/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/crystal_structure.fasta")
frame_index = 0
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
models = list(structure.get_models())
model = models[frame_index]
all_residues = list(model.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
if seq[i] == "G":
a1 = res1["CA"]
else:
a1 = res1["CB"]
for j, res2 in enumerate(all_residues):
if seq[j] == "G":
a2 = res2["CA"]
else:
a2 = res2["CB"]
contact_table[i][j] = a1 - a2
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
In [55]:
Out[55]:
In [23]:
three_to_one("TRP")
Out[23]:
In [25]:
seq.count("W")
Out[25]:
In [ ]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1baj/cbd-openmmawsem.pdb"
cutoff = 9.5
MAX_OFFSET = 9
parser = PDBParser()
structure = parser.get_structure("x", pdbFile)
all_residues = list(structure.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
for j, res2 in enumerate(all_residues):
contact_table[i][j] = res1["CA"]-res2["CA"]
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
data_ca = data
In [11]:
fileLocation = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/tmp.spotcon"
data = pd.read_csv(fileLocation, skiprows=5, sep="\s+", names=["i","j","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_A = data
In [12]:
fileLocation = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/original.spotcon"
data = pd.read_csv(fileLocation, skiprows=5, sep="\s+", names=["i","j","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_original = data
In [14]:
data_original.dtypes
Out[14]:
In [15]:
data_A.dtypes
Out[15]:
In [17]:
seq = "MSKLTTGSFSIEDLESVQITINNIVGAAKEAAEEKEKELVNAGPTLFPGLEGYRDDWNFKLLDRYEPVITPMCDQCCYCTYGPCDLSGNKRGACGIDMKGHNGREFFLRVITGTACHAAHGRHLLDHLIEKYGEDLPLTLGQSNVLTPNITISTGLSPKTLGEVKPAMEYVEEQLTQLLATVHAGQESAEIDYDSKALFSGSLDHVGMEISDIVQVAAYDFPKADPEAPLVEIGMGTIDKSKPFLCVIGHNVAGVTYMMDYMEDNNLTDKMEIAGLCCTAIDLTRYKEADRRPPYAKVIGSMSKELKVIRSGMPDVIVVDEQCVRGDIVPEAQKLKIPVIASNPKIMYGLPNRTDADVDETMEELKSGKIPGCVMLDYDKLGELCVRLTMEMAPIRDAAGITALPTDEELVNMVAKCADCGACLLACPEEIDIPEAMGFAKKGDFSYFEEIHDTCIGCRRCEQVCKKEIPILNVIEKIAQKQIAEEKGLMRAGRGQVSDAEIRAEGLNLVMGTTPGIIAIIGCPNYAGGTKDVYYIAEEFLKRNFIVVTTGCGAMDIGMFKDADGKTLYERFPGGFQCGGLANIGSCVSNAHITGAAEKVAAIFAQRTLEGNLAEIGDYILNRVGACGLAWGAFSQKASSIGTGCNIFGIPAVLGPHSSKYRRALIAKTYEEDKWKVYDARNGQEMPIPPAPEFLLTTAETWQEAIPMMAKACIRPSDNSMGRAIKLTHWMELHKKYLGGKEPEDWWKFVRTEADLPLATREALLKELEKEHGWEIDWKRKKIISGPKIKFDVSAQPTNLKRLCKEA"
In [18]:
len(seq)
Out[18]:
In [33]:
seq
Out[33]:
In [61]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_A.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [72]:
data_s = data.astype(int)
t_s = (t>0.08).astype(float)
combined = data_s + t_s * 2
from matplotlib import colors
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
In [71]:
plt.imshow((t>0.08).astype(float), origin="bottom")
plt.colorbar()
Out[71]:
In [70]:
plt.imshow(data, origin="bottom")
Out[70]:
In [20]:
plt.imshow((t>0.5).astype(float), origin="bottom")
plt.colorbar()
Out[20]:
In [21]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_original.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [26]:
plt.imshow((t>0.1).astype(float), origin="bottom")
plt.colorbar()
Out[26]:
In [ ]:
In [ ]:
In [7]:
len(a[5])
Out[7]:
In [22]:
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_A = data
seq_A = getSeqFromRaptorXContact(fileLocation)
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_B = data
seq_B = getSeqFromRaptorXContact(fileLocation)
In [86]:
import textwrap
header = '''\
PFRMAT RR
TARGET {}
AUTHOR RaptorX-Contact
METHOD deep dilated residual networks (one variant of deep CNN). Consult jinboxu@gmail.com for details.
MODEL 1
'''
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/{pdb}.txt"
with open(fileLocation, "w") as out:
out.write(header.format(pdb))
out.write("\n".join(textwrap.wrap(seq, width=50))+"\n")
for index, d in data_A.iterrows():
# print(index)
i = int(d["i"])
j = int(d["j"])
p = round(d["p"], 8)
s = int(d["s"])
ss = int(d["ss"])
out.write(f"{i} {j} {s} {ss} {p}\n")
for index, d in data_B.iterrows():
# print(index)
i = int(d["i"]) + len(seq_A)
j = int(d["j"]) + len(seq_A)
p = round(d["p"], 8)
s = int(d["s"])
ss = int(d["ss"])
out.write(f"{i} {j} {s} {ss} {p}\n")
out.write("END\n")
In [80]:
for index, d in data_B.iterrows():
# print(index)
i = int(d["i"]) - 1 + len(seq_A)
j = int(d["j"]) - 1 + len(seq_A)
p = round(d["p"], 8)
s = int(d["s"])
ss = int(d["ss"])
In [84]:
data_A
Out[84]:
In [79]:
round(p, 8)
Out[79]:
In [23]:
seq = seq_A + seq_B
In [26]:
seq
Out[26]:
In [27]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_A.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
for index, d in data_B.iterrows():
# print(index)
i = int(d["i"]) - 1 + len(seq_A)
j = int(d["j"]) - 1 + len(seq_A)
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [28]:
plt.imshow((t>0.5).astype(float), origin="bottom")
plt.colorbar()
Out[28]:
In [29]:
def getSeqFromRaptorXContact(fileLocation):
with open(fileLocation) as f:
a = f.readlines()
i = 4
seq = ""
assert a[i] == "MODEL 1\n"
i += 1
while True:
line = a[i]
if line[0].isdigit():
break
i += 1
seq += line.strip()
# print(i)
return seq
def getContactMapFromPDB(pdbFile):
cutoff = 9.5
MAX_OFFSET = 6
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)
all_residues = list(structure.get_residues())
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
for j, res2 in enumerate(all_residues):
contact_table[i][j] = res1["CA"]-res2["CA"]
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
return data
In [53]:
pdbFile = "/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4_A.pdb"
cutoff = 9.5
MAX_OFFSET = 6
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)
# chainA = structure[0]["A"]
# all_residues = list(chainA.get_residues())
all_residues = []
for res in structure.get_residues():
if res.get_id()[0] == " ":
all_residues.append(res)
n = len(all_residues)
contact_table = np.zeros((n,n))
# print(pdb, n)#
for i, res1 in enumerate(all_residues):
for j, res2 in enumerate(all_residues):
contact_table[i][j] = res1["CA"]-res2["CA"]
data = (contact_table < cutoff)
remove_band = np.eye(n)
for i in range(1, MAX_OFFSET):
remove_band += np.eye(n, k=i)
remove_band += np.eye(n, k=-i)
data[remove_band==1] = 0
In [60]:
plt.imshow(data, origin="bottom")
Out[60]:
In [58]:
data_s = data.astype(int)
t_s = (t>0.55).astype(float)
In [30]:
data = getContactMapFromPDB("/Users/weilu/Research/server/feb_2020/SPOT-Contact-Helical-New/outputs/3cf4.pdb")
In [ ]:
In [30]:
data = getContactMapFromPDB("/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb")
In [70]:
data_s = data.astype(int)
t_s = (t>0.55).astype(float)
In [72]:
fig, ax = plt.subplots()
# ax.imshow(data_s+1, origin="bottom")
ax.imshow(t, origin="bottom")
Out[72]:
In [60]:
combined = data_s + t_s * 2
In [63]:
combined = data_s + t_s * 2
from matplotlib import colors
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
Out[63]:
In [87]:
pdbFile = "/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb"
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)
In [88]:
model = structure[0]
In [90]:
a = list(model.get_chains())
In [92]:
c = a[0]
In [94]:
c.id
Out[94]:
In [95]:
c = "ALL"
In [99]:
"ABC" is not "ALL"
Out[99]:
In [93]:
c.get_id()
Out[93]:
In [55]:
fig, ax = plt.subplots()
ax.imshow(data_s+1, origin="bottom")
ax.imshow(t_s, origin="bottom")
Out[55]:
In [43]:
plt.imshow(data_s, origin="bottom")
Out[43]:
In [44]:
# plt.imshow(-data_s, origin="bottom")
plt.imshow(t_s, origin="bottom")
# plt.colorbar()
Out[44]:
In [16]:
346
Out[16]:
In [17]:
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"
In [20]:
len(seq)
Out[20]:
In [18]:
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
n = int(info.query(f"Protein =='{pdb}'")["Length"])
t = np.zeros((n,n))
for index, d in data.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p
In [ ]:
def getContactMap(pdb):
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/contactmap.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
n = int(info.query(f"Protein =='{pdb}'")["Length"])
t = np.zeros((n,n))
for index, d in data.iterrows():
# print(index)
i = int(d["i"]) - 1
j = int(d["j"]) - 1
p = d["p"]
# print(i,j,p)
t[i,j] = p
t[j,i] = p