In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
sys.path.insert(0,'/Users/weilu/Research/opt_server/')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2
In [2]:
from Bio.PDB.Polypeptide import d1_to_index
from Bio.PDB.Polypeptide import dindex_to_1
from Bio.PDB.Polypeptide import aa3
In [3]:
plt.rcParams['figure.figsize'] = [16.18033, 10] #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100
In [4]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath_with_cbd_info_complete.csv", index_col=0)
# a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
# a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data2 = a
In [8]:
# Binary residue classification: 0 for C/M/F/I/L/V/W/Y, 1 for the rest.
res_type_map_HP = {
    'C': 0,
    'M': 0,
    'F': 0,
    'I': 0,
    'L': 0,
    'V': 0,
    'W': 0,
    'Y': 0,
    'A': 1,
    'H': 1,
    'T': 1,
    'G': 1,
    'P': 1,
    'D': 1,
    'E': 1,
    'N': 1,
    'Q': 1,
    'R': 1,
    'K': 1,
    'S': 1
}
# Pool the 20 per-residue-type density columns (both interaction partners,
# suffixes _x and _y) into two aggregate densities.
d_P = 0
d_H = 0
for res in res_type_map_HP.keys():
    density_x = f"Density_{res}_x"
    density_y = f"Density_{res}_y"
    if res_type_map_HP[res] == 1:
        d_P += data2[density_x] + data2[density_y]
    else:
        d_H += data2[density_x] + data2[density_y]
# NOTE(review): here the value-1 group feeds D_P, but the near-identical cell
# later in this notebook sends the value-1 group to D_H (and get_environment()
# also sets isH = map value, i.e. treats the value-1 group as "H").  The two
# conventions are opposite -- confirm which assignment is intended before
# trusting the sign of D_H / D_P / D_H_minus_P.
data2["D_H"] = d_H
data2["D_P"] = d_P
data2["D_H_minus_P"] = data2["D_H"] - data2["D_P"]
In [10]:
data_selected = data2.query("Res1=='ALA' and Res2=='ALA'").reset_index(drop=True)
data_selected["D_H_minus_P"].hist(bins=100)
Out[10]:
In [12]:
data_selected = data2.reset_index(drop=True)
data_selected["D_H_minus_P"].hist(bins=50)
Out[12]:
In [18]:
from sklearn.cluster import KMeans
random_state = 170
y_pred = KMeans(n_clusters=2, random_state=random_state).fit_predict(a)
In [30]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(a)
kmeans.labels_
kmeans.cluster_centers_
Out[30]:
In [36]:
from sklearn.manifold import TSNE
X_embedded = TSNE(n_components=1).fit_transform(a)
X_embedded.shape
Out[36]:
In [41]:
a= data_selected.iloc[:,10:30].values
X_embedded = TSNE(n_components=1).fit_transform(a)
data_selected["tsne"] = X_embedded
data_selected = data_selected.sort_values("tsne").reset_index(drop=True)
a= data_selected.iloc[:,10:30].values
plt.imshow(a, aspect=0.04)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
In [ ]:
In [25]:
data_selected
Out[25]:
In [13]:
from sklearn.manifold import MDS
data_selected = data2.query("Res1=='ALA' and Res2=='ALA'").reset_index(drop=True)
a= data_selected.iloc[:,10:30].values
embedding = MDS(n_components=1)
X_transformed = embedding.fit_transform(a)
X_transformed.shape
Out[13]:
In [17]:
data_selected
Out[17]:
In [16]:
data_selected["MDS"] = X_transformed
data_selected = data_selected.sort_values("MDS").reset_index(drop=True)
a= data_selected.iloc[:,10:30].values
plt.imshow(a, aspect=0.05)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
In [ ]:
In [8]:
data_selected = data2.query("Res1=='ALA' and Res2=='ALA'").reset_index(drop=True)
a= data_selected.iloc[:,10:30].values
plt.imshow(a, aspect=0.05)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
In [42]:
data_selected = data2.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a= data_selected.iloc[:,10:30].values
plt.imshow(a, aspect=0.005)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
In [42]:
# Near-duplicate of the H/P density cell earlier in the notebook, with the
# accumulation swapped: here the value-1 group feeds D_H (matching
# get_environment(), where isH = map value).
# NOTE(review): the earlier cell sends the value-1 group to D_P instead --
# only one convention can be right; confirm which is intended.
res_type_map_HP = {
    'C': 0,
    'M': 0,
    'F': 0,
    'I': 0,
    'L': 0,
    'V': 0,
    'W': 0,
    'Y': 0,
    'A': 1,
    'H': 1,
    'T': 1,
    'G': 1,
    'P': 1,
    'D': 1,
    'E': 1,
    'N': 1,
    'Q': 1,
    'R': 1,
    'K': 1,
    'S': 1
}
d_P = 0
d_H = 0
for res in res_type_map_HP.keys():
    density_x = f"Density_{res}_x"
    density_y = f"Density_{res}_y"
    if res_type_map_HP[res] == 1:
        d_H += data2[density_x] + data2[density_y]
    else:
        d_P += data2[density_x] + data2[density_y]
data2["D_H"] = d_H
data2["D_P"] = d_P
In [55]:
# Polar / non-polar split.  Note this differs from the H/P split above:
# here G, P and A count as non-polar, and Y counts as polar.
# (Identifier name contains typos -- "poloar_noplar" -- but renaming it would
# break any later references, so it is kept as-is.)
res_type_map_poloar_noplar = {
    'C': 0, 'M': 0, 'F': 0, 'I': 0, 'L': 0, 'V': 0, 'W': 0, 'G': 0, 'P': 0, 'A': 0,
    'Y': 1, 'H': 1, 'T': 1, 'D': 1, 'E': 1, 'N': 1, 'Q': 1, 'R': 1, 'K': 1, 'S': 1
}
# Pool the per-residue-type density columns (_x and _y partners) into the two
# aggregate densities.
d_noP = 0
d_P = 0
for res in res_type_map_poloar_noplar.keys():
    density_x = f"Density_{res}_x"
    density_y = f"Density_{res}_y"
    if res_type_map_poloar_noplar[res] == 1:
        d_P += data2[density_x] + data2[density_y]
    else:
        d_noP += data2[density_x] + data2[density_y]
data2["D_NoPolar"] = d_noP
data2["D_Polar"] = d_P
In [4]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath_with_cbd_info.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data2 = a
In [56]:
data2["D_NoPolar_minus_Polar"] = data2["D_NoPolar"] - data2["D_Polar"]
In [45]:
data2["D_H_minus_P"] = data2["D_H"] - data2["D_P"]
In [ ]:
["I", "L", "V"]
In [54]:
data_selected.iloc[:,10:30].sum().sort_values()
Out[54]:
In [60]:
data2["D_NoPolar_minus_Polar"].hist(bins=30)
Out[60]:
In [59]:
data_selected = data2.query("Res1=='ALA' and Res2=='ALA'").reset_index(drop=True)
data_selected["D_NoPolar_minus_Polar"].hist(bins=30)
Out[59]:
In [51]:
data_selected = data2.query("Res1=='ALA' and Res2=='ALA'").reset_index(drop=True)
data_selected["D_H_minus_P"].hist(bins=100)
Out[51]:
In [46]:
data2["D_H_minus_P"].hist(bins=50)
Out[46]:
In [43]:
b = data2.query("Res1=='VAL' and Res2=='LEU'").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=b, kind="scatter")
Out[43]:
In [49]:
b = data2.query("Res1=='VAL' and Res2=='LEU'").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=b, kind="scatter")
Out[49]:
In [5]:
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
b = data2.query("Res1=='VAL' and Res2=='LEU'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[5]:
In [ ]:
In [6]:
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[6]:
In [82]:
res1 = "THR"
res2 = "PRO"
data_selected = data2.query(f"Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
a= data_selected.iloc[:,10:30].values
plt.imshow(a, aspect=0.1)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
In [88]:
plt.imshow(a-b, aspect=0.1, vmin=-3, vmax=3, cmap="seismic")
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
In [81]:
b= data_selected.iloc[:,31:51].values
plt.imshow(b, aspect=0.1)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
In [60]:
In [ ]:
data_selected = data2.query("Res1=='ALA' and Res2=='ALA'").reset_index(drop=True)
a= data_selected.iloc[:,10:30].values
plt.imshow(a, aspect=0.005)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
In [90]:
data_selected = data2.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a= data_selected.iloc[:,10:30].values
plt.imshow(a, aspect=0.005)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
In [96]:
(data_selected.iloc[:,10:30] > 0.8).sum()
Out[96]:
In [94]:
data_selected.iloc[:,10:30].sum()
Out[94]:
In [95]:
data_selected = data2.query("Res1=='LEU' and Res2=='LEU'").sort_values(["Density_I_x", "Density_V_x"]).reset_index(drop=True)
a= data_selected.iloc[:,10:30].values
plt.imshow(a, aspect=0.005)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
In [75]:
a= data_selected.iloc[:,31:51].values
plt.imshow(a, aspect=0.001)
plt.colorbar()
_ = plt.xticks(np.arange(20), aa3)
In [74]:
data2.iloc[:,31:52]
Out[74]:
In [ ]:
In [32]:
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
res1 = "CYS"
res2 = "ASP"
for i in range(20):
res1 = one_to_three(dindex_to_1[i])
b = data2.query(f"Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
sns_plot = sns.jointplot("D_H", "D_P", data=b, kind="kde", xlim=(0,15), ylim=(0,20))
plt.title(f"{res1}_{res2}")
sns_plot.savefig(f"/Users/weilu/Research/data/environment_information/{res1}_{res2}.png")
plt.close()
In [34]:
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
res1 = "CYS"
res2 = "ASP"
for i in range(20):
res1 = one_to_three(dindex_to_1[i])
b = data2.query(f"Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
sns_plot = sns.jointplot("D_H", "D_P", data=b, kind="kde")
plt.title(f"{res1}_{res2}")
sns_plot.savefig(f"/Users/weilu/Research/data/environment_information/{res1}_{res2}_bound.png")
plt.close()
In [36]:
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
res1 = "CYS"
res2 = "ASP"
for i in range(20):
res1 = one_to_three(dindex_to_1[i])
b = data2.query(f"Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
sns_plot = sns.jointplot("D_H", "D_P", data=b, kind="scatter")
plt.title(f"{res1}_{res2}")
sns_plot.savefig(f"/Users/weilu/Research/data/environment_information/{res1}_{res2}_scatter.png")
plt.close()
In [37]:
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
b = data2.query("Res1=='LYS' and Res2=='ASP'").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[37]:
In [35]:
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
b = data2.query("Res1=='LYS' and Res2=='ASP'").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=b, kind="scatter")
Out[35]:
In [33]:
data2["D_H"].hist(bins=50)
Out[33]:
In [30]:
data2["D_P"].hist(bins=50)
Out[30]:
In [31]:
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
b = data2.query("Res1=='CYS' and Res2=='ASP'").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=b, kind="kde", xlim=(0,15), ylim=(0,20))
Out[31]:
In [12]:
X = data2.query("Res1=='CYS' and Res2=='ASP'")[["D_H", "D_P"]].values
In [ ]:
In [ ]:
In [ ]:
In [4]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_iterative_native_with_cbd_info.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data_native = a
data_native.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[4]:
In [5]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_iterative_with_cbd_info.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data_iterative = a
data_iterative.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[5]:
In [12]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data_old = a
data_old.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[12]:
In [13]:
Out[13]:
In [9]:
a = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath_with_cbd_info.csv", index_col=0)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
data2 = a
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
b = data2.query("Res1=='VAL' and Res2=='LEU'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[9]:
In [28]:
b = data2.query("Res1=='SER' and Res2=='ILE'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[28]:
In [33]:
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")[:-100].hist("Theta", bins=50)
Out[33]:
In [16]:
b = data2.query("Res1=='GLU' and Res2=='ALA'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[16]:
In [14]:
data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
b = data2.query("Res1=='GLU' and Res2=='ALA'").reset_index(drop=True)
b["D_H"] = b["Density_H_x"] + b["Density_H_y"]
b["D_P"] = b["Density_P_x"] + b["Density_P_y"]
sns.jointplot("D_H", "D_P", data=b, kind="kde")
Out[14]:
In [28]:
# data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
a = data_old.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[28]:
In [25]:
# data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
# data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
a = data2.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[25]:
In [30]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[30]:
In [30]:
info = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/info_collection.csv")
In [33]:
info = info.query("Steps==2001")[["Q", "Run", "Protein", "Folder"]].reset_index(drop=True)
In [34]:
info.query("Folder")
Out[34]:
In [38]:
data_iterative = data_iterative.merge(info, on=["Protein", "Run"])
In [43]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU' and Q < 0.5").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[43]:
In [44]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU' and Q > 0.5").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[44]:
In [42]:
a = data_iterative.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
sns.jointplot("D_H", "D_P", data=a, kind="kde")
Out[42]:
In [40]:
sns.scatterplot("D_H", "D_P", hue="Q", data=data_iterative.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)
Out[40]:
In [153]:
sns.scatterplot("D_H", "D_P", hue="isTP", data=new_d, alpha=0.5)
Out[153]:
In [10]:
sns.scatterplot("D_H", "D_P", hue="isTP", data=new_d, alpha=0.5)
Out[10]:
In [101]:
pre = "/Users/weilu/Research/server/mar_2020/environmental_information/"
pdb = "1akr"
ii = 1
data = pd.read_csv(f"{pre}/iterative/{pdb}_{ii}.csv", index_col=0)
data_envr = pd.read_csv(f"{pre}/iterative/{pdb}_{ii}_environment.csv", index_col=0)
# data_envr["Density_H"] = data_envr["Density_H"].round()
# data_envr["Density_P"] = data_envr["Density_P"].round()
data_with_info = data.merge(data_envr, how='left', left_on="Index1", right_on="index").merge(data_envr, how='left', left_on="Index2", right_on="index")
data_ = data_with_info.query("Theta > 5e-2 and Type == 'Direct'").reset_index(drop=True)
# data_ = data_with_info
In [102]:
data_.query(f"Res1=='{res1}' and Res2=='{res2}'")
Out[102]:
In [128]:
# true positive
pdb = "1akr"
res1 = "LEU"
res2 = "LEU"
run = 0
a = data_native.query(f"Protein=='{pdb}'and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
b = data_iterative.query(f"Protein=='{pdb}' and Run=='{run}' and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
In [119]:
contacts = set(b["Index1"].astype(str) + "_" + b["Index2"].astype(str))
In [120]:
contacts
Out[120]:
In [6]:
def isTP(contact, native_contacts):
    """Label a contact "TP" if it appears in the native contact set, else "FP"."""
    return "TP" if contact in native_contacts else "FP"
# Label each simulated THR-PRO contact as true/false positive against the
# native structure's contacts, across all proteins and two runs.
res1 = "THR"
res2 = "PRO"
run = 0
pdb = "1akr"
pdb_list = data_iterative["Protein"].unique()
new_d = []
for run in range(2):
    for pdb in pdb_list:
        # NOTE(review): "'{pdb}'and" lacks a space; pandas' Python-like query
        # parser happens to accept it, but confirm it filters as intended.
        a = data_native.query(f"Protein=='{pdb}'and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
        # NOTE(review): Run is compared as a string ('0'/'1') -- verify the
        # column dtype; an integer Run column would make this match nothing.
        b = data_iterative.query(f"Protein=='{pdb}' and Run=='{run}' and Res1=='{res1}' and Res2=='{res2}'").reset_index(drop=True)
        # A contact is keyed by its "Index1_Index2" pair of global indices.
        native_contacts = set(a["Index1"].astype(str) + "_" + a["Index2"].astype(str))
        b["Contact"] = b["Index1"].astype(str) + "_" + b["Index2"].astype(str)
        # extra kwarg is forwarded by Series.apply to isTP
        b["isTP"] = b["Contact"].apply(isTP, native_contacts=native_contacts)
        new_d.append(b)
new_d = pd.concat(new_d).reset_index(drop=True)
In [7]:
new_d
Out[7]:
In [ ]:
In [ ]:
In [125]:
native_contacts
Out[125]:
In [131]:
In [143]:
In [8]:
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_iter3/saved_gammas/iter3_z_weighted_2_cutoff400_impose_Aprime_constraint"
gamma_info = get_contact_gamma_info(gammaFile)
In [9]:
gamma_info.query("Interaction=='Direct'").sort_values("Gamma")
Out[9]:
In [11]:
gamma_info.query("Interaction=='Direct' and Res1=='L'")
Out[11]:
In [2]:
def get_contact_gamma_info(gammaFile):
    """Expand a flat contact-gamma vector into a tidy DataFrame.

    The file (read with np.loadtxt) is indexed as 210 "Direct" values followed
    by 210 "Protein" and 210 "Water" values, in upper-triangular 20x20 residue
    -pair order (so it must hold at least 630 entries).  Every off-diagonal
    pair is emitted in both residue orders, under the same index, so the frame
    can be merged symmetrically.

    Returns a DataFrame with columns Interaction, Res1, Res2, Index, Gamma.
    """
    gamma = np.loadtxt(gammaFile)
    letters = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G',
               'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
    rows = []

    def emit(kind, i, j, idx):
        # Record (i, j) and, for distinct residues, the mirrored (j, i) row.
        rows.append([kind, letters[i], letters[j], idx, round(gamma[idx], 3)])
        if i != j:
            rows.append([kind, letters[j], letters[i], idx, round(gamma[idx], 3)])

    idx = 0
    for i in range(20):
        for j in range(i, 20):
            emit("Direct", i, j, idx)
            idx += 1
    for i in range(20):
        for j in range(i, 20):
            emit("Protein", i, j, idx)
            # Water gammas live 210 slots after the Protein block.
            emit("Water", i, j, idx + 210)
            idx += 1
    return pd.DataFrame(rows, columns=["Interaction", "Res1", "Res2", "Index", "Gamma"])
In [59]:
data_native
Out[59]:
In [54]:
data_native.query(f"Res1=='{res1}' and Res2=='{res2}'")
Out[54]:
In [24]:
data_native.query("Res1=='LEU' and Res2=='LEU' and D_H > 9")
Out[24]:
In [23]:
sns.scatterplot("D_H", "D_P", data=data_native.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)
Out[23]:
In [22]:
sns.scatterplot("D_H", "D_P", data=data_native.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)
sns.scatterplot("D_H", "D_P", data=data_iterative.query("Res1=='LEU' and Res2=='LEU'"), alpha=0.5)
Out[22]:
In [39]:
a = data_native.query("Res1=='LEU' and Res2=='LEU'").reset_index(drop=True)
a["D_H"] = a["Density_H_x"] + a["Density_H_y"]
a["D_P"] = a["Density_P_x"] + a["Density_P_y"]
sns.jointplot("D_H", "D_P", data=a, kind="reg")
Out[39]:
In [20]:
cbd_info = pd.read_csv("/Users/weilu/opt/parameters/side_chain/cbd_cbd_real_contact_symmetric.csv")
In [22]:
cbd_info.query("ResName1 == 'SER'")
Out[22]:
In [ ]:
def get_r_min_max(a, res1, res2, type="Direct"):
    """Return the (r_min, r_max) well boundaries for a residue pair.

    Parameters
    ----------
    a : pandas.DataFrame
        CBD contact table with columns ResName1, ResName2, r_min, r_max.
        Only one orientation of a pair may be present; both are tried.
    res1, res2 : objects exposing ``get_resname()`` (e.g. Bio.PDB residues).
    type : str, default "Direct"
        "Direct" gives the first (contact) well; any other value gives the
        mediated (second-shell) well derived from the pair's r_max.

    Returns
    -------
    (float, float)
        Lower and upper boundary of the interaction well.

    Raises
    ------
    ValueError
        If the pair is absent from ``a`` in both orientations.  (The original
        version printed a message and then crashed with UnboundLocalError.)
    """
    res1_name = res1.get_resname()
    res2_name = res2.get_resname()

    def lookup_pair():
        # The table may store only one orientation of the pair; try both.
        b = a.query(f"ResName1=='{res1_name}' and ResName2=='{res2_name}'")
        if len(b) == 0:
            b = a.query(f"ResName1=='{res2_name}' and ResName2=='{res1_name}'")
        if len(b) == 0:
            print("problem", b)
            raise ValueError(f"no CBD entry for pair {res1_name}-{res2_name}")
        return b

    if type == "Direct":
        # GLY has no side chain; fall back to the generic direct well.
        if res1_name == "GLY" or res2_name == "GLY":
            return 2.5, 6.5
        b = lookup_pair()
        # .iloc[0] instead of float(Series): calling float() on a Series is
        # deprecated/removed in modern pandas.
        r_min_res1_res2 = float(b["r_min"].iloc[0]) - 0.5
        r_max_res1_res2 = float(b["r_max"].iloc[0]) + 1.5
    else:
        if res1_name == "GLY" or res2_name == "GLY":
            return 6.5, 9.5
        b = lookup_pair()
        # Mediated well sits just outside the direct well: [r_max+1.5, r_max+4.5].
        r_min_res1_res2 = float(b["r_max"].iloc[0]) + 1.5
        r_max_res1_res2 = float(b["r_max"].iloc[0]) + 4.5
    return r_min_res1_res2, r_max_res1_res2
def get_interaction_data_with_cbd_info(structure, cbd_info):
    """Enumerate residue-pair interactions of a structure as a DataFrame,
    using pair-specific wells from the CBD table.

    For every residue pair separated by >= min_seq_sep in sequence (or on
    different chains) three rows are recorded -- "Protein", "Water" and
    "Direct" -- each with its switching-function value (Theta).

    Parameters
    ----------
    structure : Bio.PDB structure.
    cbd_info : pandas.DataFrame consumed by get_r_min_max() and
        calculate_cb_density_com_wellCenter().

    Returns
    -------
    pandas.DataFrame with columns
        Res1, Res2, Type, Theta, Index1, Index2, r, ResId1, ResId2.
    """
    res_list = get_res_list(structure)
    neighbor_list = get_neighbor_list(structure)
    sequence = get_sequence_from_structure(structure)  # NOTE(review): unused here
    # cb_density = calculate_cb_density(res_list, neighbor_list)
    cb_density = calculate_cb_density_com_wellCenter(res_list, neighbor_list, cbd_info)
    r_min_direct = 2.5  # NOTE(review): unused -- wells come from get_r_min_max()
    r_max_direct = 6.5  # NOTE(review): unused
    r_min = 6.5
    r_max = 9.5  # also sizes the neighbor-search radius below (r_max + 4.0)
    kappa = 5.0
    min_seq_sep = 10
    density_threshold = 2.6
    density_kappa = 7.0
    v_mediated = 0
    data_ = []
    for res1globalindex, res1 in enumerate(res_list):
        res1index = get_local_index(res1)
        res1chain = get_chain(res1)
        rho_i = cb_density[res1globalindex]
        for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+4.0):
            res2index = get_local_index(res2)
            res2chain = get_chain(res2)
            res2globalindex = get_global_index(res_list, res2)
            rho_j = cb_density[res2globalindex]
            # if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
            if abs(res2globalindex - res1globalindex) >= min_seq_sep or (res1chain != res2chain):
                if res1.resname == res2.resname:
                    # For same-name pairs, keep only the forward ordering
                    # (res2 after res1) -- presumably to avoid recording the
                    # pair twice; confirm intent.
                    if not (res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex)):
                        continue
                res1type = get_res_type(res_list, res1)  # NOTE(review): unused
                res2type = get_res_type(res_list, res2)  # NOTE(review): unused
                rij = get_interaction_distance(res1, res2)
                # theta = interaction_well(rij, r_min, r_max, kappa)
                # Mediated well boundaries are pair-specific.
                r_min_res1_res2, r_max_res1_res2 = get_r_min_max(cbd_info, res1, res2, type="Mediated")
                theta = interaction_well(rij, r_min_res1_res2, r_max_res1_res2, kappa)
                water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
                protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
                data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
                data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
                # Direct well is also pair-specific.
                r_min_res1_res2, r_max_res1_res2 = get_r_min_max(cbd_info, res1, res2, type="Direct")
                direct_theta = interaction_well(rij, r_min_res1_res2, r_max_res1_res2, kappa)
                data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
    data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r", "ResId1", "ResId2"])
    return data
def calculate_property_density_with_cbd_info(res_list, neighbor_list, propertyTable, cbd_info, min_seq_sep=2, rmin=2.5):
    """Per-residue density of neighbors carrying a binary property, using the
    pair-specific Direct well from the CBD table.

    Parameters
    ----------
    res_list, neighbor_list : from get_res_list / get_neighbor_list.
    propertyTable : dict mapping one-letter residue code -> 0/1 weight.
    cbd_info : CBD table consumed by get_r_min_max().
    min_seq_sep : minimum local-index separation (same chain) to count a neighbor.
    rmin : NOTE(review): unused -- the well minimum comes from get_r_min_max().

    Returns
    -------
    numpy.ndarray of length len(res_list).
    """
    num_residues = len(res_list)
    density = np.zeros(num_residues)
    for res1globalindex, res1 in enumerate(res_list):
        res1index = get_local_index(res1)
        res1chain = get_chain(res1)
        for res2 in get_neighbors_within_radius(neighbor_list, res1, 9.0):
            res2index = get_local_index(res2)
            res2chain = get_chain(res2)
            res2globalindex = get_global_index(res_list, res2)
            if abs(res2index - res1index) >= min_seq_sep or (res1chain != res2chain):
                rij = get_interaction_distance(res1, res2)
                hasProperty = propertyTable[three_to_one(res2.resname)]
                r_min_res1_res2, r_max_res1_res2 = get_r_min_max(cbd_info, res1, res2, type="Direct")
                density[res1globalindex] += hasProperty * interaction_well(rij, r_min_res1_res2, r_max_res1_res2, 5)
    return density
def get_environment_with_cbd_info(structure, cbd_info):
    """Compute per-residue "H" and "P" neighbor densities with pair-specific
    (CBD-derived) direct wells.

    Returns a DataFrame with columns: index (residue global index),
    Density_H, Density_P.
    """
    # Binary residue classification: 0 for C/M/F/I/L/V/W/Y, 1 for the rest.
    # NOTE(review): isH mirrors the map value, so "H" here is the value-1
    # group (A, H, T, G, P, ...) -- confirm this is the intended convention.
    zero_group = "CMFILVWY"
    res_type_map_HP = {aa: (0 if aa in zero_group else 1)
                       for aa in "CMFILVWYAHTGPDENQRKS"}
    isH = {dindex_to_1[i]: res_type_map_HP[dindex_to_1[i]] for i in range(20)}
    isP = {aa: 1 - flag for aa, flag in isH.items()}
    res_list = get_res_list(structure)
    neighbor_list = get_neighbor_list(structure)
    sequence = get_sequence_from_structure(structure)
    density_H = calculate_property_density_with_cbd_info(res_list, neighbor_list, isH, cbd_info).round(3)
    density_P = calculate_property_density_with_cbd_info(res_list, neighbor_list, isP, cbd_info).round(3)
    environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
    return environment_info
In [ ]:
In [6]:
def get_interaction_data(structure):
    """Enumerate residue-pair interactions of a structure as a DataFrame,
    using fixed well boundaries (direct: 2.5-6.5, mediated: 6.5-9.5).

    For every residue pair separated by >= min_seq_sep in sequence (or on
    different chains) three rows are recorded -- "Protein", "Water" and
    "Direct" -- each with its switching-function value (Theta).

    Returns
    -------
    pandas.DataFrame with columns
        Res1, Res2, Type, Theta, Index1, Index2, r, ResId1, ResId2.
    """
    res_list = get_res_list(structure)
    neighbor_list = get_neighbor_list(structure)
    sequence = get_sequence_from_structure(structure)  # NOTE(review): unused here
    cb_density = calculate_cb_density(res_list, neighbor_list)
    r_min_direct = 2.5
    r_max_direct = 6.5
    r_min = 6.5
    r_max = 9.5  # also sizes the neighbor-search radius below (r_max + 4.0)
    kappa = 5.0
    min_seq_sep = 10
    density_threshold = 2.6
    density_kappa = 7.0
    v_mediated = 0
    data_ = []
    for res1globalindex, res1 in enumerate(res_list):
        res1index = get_local_index(res1)
        res1chain = get_chain(res1)
        rho_i = cb_density[res1globalindex]
        for res2 in get_neighbors_within_radius(neighbor_list, res1, r_max+4.0):
            res2index = get_local_index(res2)
            res2chain = get_chain(res2)
            res2globalindex = get_global_index(res_list, res2)
            rho_j = cb_density[res2globalindex]
            # if res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex):
            if abs(res2globalindex - res1globalindex) >= min_seq_sep or (res1chain != res2chain):
                if res1.resname == res2.resname:
                    # For same-name pairs, keep only the forward ordering
                    # (res2 after res1) -- presumably to avoid recording the
                    # pair twice; confirm intent.
                    if not (res2globalindex - res1globalindex >= min_seq_sep or (res1chain != res2chain and res2globalindex > res1globalindex)):
                        continue
                res1type = get_res_type(res_list, res1)  # NOTE(review): unused
                res2type = get_res_type(res_list, res2)  # NOTE(review): unused
                rij = get_interaction_distance(res1, res2)
                theta = interaction_well(rij, r_min, r_max, kappa)
                water_theta = prot_water_switchFunc_sigmaWater(rho_i, rho_j, density_threshold, density_kappa) * theta
                protein_theta = prot_water_switchFunc_sigmaProt(rho_i, rho_j, density_threshold, density_kappa) * theta
                data_.append([res1.resname, res2.resname, "Protein", round(protein_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
                data_.append([res1.resname, res2.resname, "Water", round(water_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
                direct_theta = interaction_well(rij, r_min_direct, r_max_direct, kappa)
                data_.append([res1.resname, res2.resname, "Direct", round(direct_theta, 3), res1globalindex, res2globalindex, rij, res1index, res2index])
    data = pd.DataFrame(data_, columns=["Res1", "Res2", "Type", "Theta", "Index1", "Index2", "r", "ResId1", "ResId2"])
    return data
def get_contact_gamma_info(gammaFile):
    """Expand a flat contact-gamma vector into a tidy DataFrame.

    NOTE(review): identical to the get_contact_gamma_info defined earlier in
    this notebook; this later definition silently shadows the earlier one.

    The vector (read with np.loadtxt) is indexed as 210 "Direct" values
    followed by 210 "Protein" and 210 "Water" values, in upper-triangular
    20x20 residue-pair order; each off-diagonal pair is emitted in both
    residue orders under the same index.
    """
    gamma = np.loadtxt(gammaFile)
    res_type_map_letters = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G',
                            'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
    inverse_res_type_map = dict(list(zip(list(range(20)), res_type_map_letters)))  # NOTE(review): unused
    c = 0
    info_ = []
    for i in range(20):
        for j in range(i, 20):
            info_.append(["Direct", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
            if i != j:
                # mirrored order recorded under the same gamma index
                info_.append(["Direct", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
            c += 1
    for i in range(20):
        for j in range(i, 20):
            info_.append(["Protein", res_type_map_letters[i], res_type_map_letters[j], c, round(gamma[c],3)])
            if i != j:
                info_.append(["Protein", res_type_map_letters[j], res_type_map_letters[i], c, round(gamma[c],3)])
            # Water gammas live 210 slots after the Protein block.
            info_.append(["Water", res_type_map_letters[i], res_type_map_letters[j], c+210, round(gamma[c+210],3)])
            if i != j:
                info_.append(["Water", res_type_map_letters[j], res_type_map_letters[i], c+210, round(gamma[c+210],3)])
            c += 1
    contact_gammas = pd.DataFrame(info_, columns=["Interaction", "Res1", "Res2", "Index", "Gamma"])
    return contact_gammas
In [6]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/iteration_new_4_without_burial/1poa/1/lastFrame.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
In [19]:
In [72]:
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1fna/cbd_1fna.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
In [21]:
In [99]:
def calculate_property_density(res_list, neighbor_list, propertyTable, min_seq_sep=2, rmin=2.5):
    """Per-residue density of neighbors carrying a binary property, using a
    fixed interaction well [rmin, 6.5] (kappa = 5).

    Parameters
    ----------
    res_list, neighbor_list : from get_res_list / get_neighbor_list.
    propertyTable : dict mapping one-letter residue code -> 0/1 weight.
    min_seq_sep : minimum local-index separation (same chain) to count a neighbor.
    rmin : lower boundary of the interaction well.

    Returns
    -------
    numpy.ndarray of length len(res_list).
    """
    num_residues = len(res_list)
    density = np.zeros(num_residues)
    for res1globalindex, res1 in enumerate(res_list):
        res1index = get_local_index(res1)
        res1chain = get_chain(res1)
        for res2 in get_neighbors_within_radius(neighbor_list, res1, 9.0):
            res2index = get_local_index(res2)
            res2chain = get_chain(res2)
            res2globalindex = get_global_index(res_list, res2)
            if abs(res2index - res1index) >= min_seq_sep or (res1chain != res2chain):
                rij = get_interaction_distance(res1, res2)
                hasProperty = propertyTable[three_to_one(res2.resname)]
                density[res1globalindex] += hasProperty * interaction_well(rij, rmin, 6.5, 5)
    return density
def get_environment(structure):
    """Compute per-residue hydrophobic (H) and polar (P) contact densities.

    Returns a DataFrame with columns: index, Density_H, Density_P (rounded to 3 dp).
    """
    # HP reduction of the 20 residue types: 0 = hydrophobic, 1 = polar.
    hydrophobic_letters = set("CMFILVWY")
    res_type_map_HP = {letter: (0 if letter in hydrophobic_letters else 1)
                       for letter in "CMFILVWYAHTGPDENQRKS"}
    # Indicator tables keyed by one-letter code: isH marks hydrophobic, isP polar.
    isH = {dindex_to_1[i]: res_type_map_HP[dindex_to_1[i]] for i in range(20)}
    isP = {letter: 1 - flag for letter, flag in isH.items()}
    res_list = get_res_list(structure)
    neighbor_list = get_neighbor_list(structure)
    sequence = get_sequence_from_structure(structure)
    density_H = calculate_property_density(res_list, neighbor_list, isH).round(3)
    density_P = calculate_property_density(res_list, neighbor_list, isP).round(3)
    environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
    return environment_info
In [118]:
data_list = []
# Join residue-pair interaction records with per-residue environment densities:
# once on the first residue of the pair (Index1) and once on the second (Index2).
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/1fna.csv", index_col=0)
data_envr = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/1fna_environment.csv", index_col=0)
data_envr["Density_H"] = data_envr["Density_H"].round()
data_envr["Density_P"] = data_envr["Density_P"].round()
data_with_info = data.merge(data_envr, left_on="Index1", right_on="index").merge(data_envr, left_on="Index2", right_on="index")
# Keep only direct-interaction rows with a non-negligible contact weight.
data_ = data_with_info.query("Theta > 1e-1 and Type == 'Direct'").reset_index(drop=True)
# NOTE(review): `pdb` is not defined in this cell — it relies on kernel state from an
# earlier (or deleted) cell, so this fails under Restart & Run All. Define pdb here.
data_list.append(data_.assign(Protein=pdb))
In [122]:
# Combined interaction+environment table for all proteins (precomputed upstream).
data = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data.csv", index_col=0)
In [209]:
In [210]:
data2 = pd.read_csv("/Users/weilu/Research/server/mar_2020/environmental_information/all_data_no_round_cath.csv", index_col=0)
# Count observations per residue-type pair; rarest (least sampled) pairs sort first.
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[210]:
In [203]:
# NOTE(review): exact duplicate of the cell above — remove one copy.
data2.groupby(["Res1", "Res2"])["Theta"].count().reset_index().sort_values("Theta")
Out[203]:
In [212]:
In [158]:
# Same extraction again, for the 1pne setup structure.
# NOTE(review): third copy of this load pattern — factor into a helper function.
pdbFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_run/setups/1pne/cbd_1pne.pdb"
parser = PDBParser()
structure = parser.get_structure("X", pdbFile)
data = get_interaction_data(structure)
In [159]:
# Inline version of get_environment() for the currently loaded `structure`:
# HP reduction of the 20 residue types (0 = hydrophobic, 1 = polar), then
# H/P indicator tables, then the two property densities.
res_type_map_HP = {letter: (0 if letter in "CMFILVWY" else 1)
                   for letter in "CMFILVWYAHTGPDENQRKS"}
isH = {dindex_to_1[i]: res_type_map_HP[dindex_to_1[i]] for i in range(20)}
isP = {letter: 1 - flag for letter, flag in isH.items()}
res_list = get_res_list(structure)
neighbor_list = get_neighbor_list(structure)
sequence = get_sequence_from_structure(structure)
density_H = calculate_property_density(res_list, neighbor_list, isH).round(3)
density_P = calculate_property_density(res_list, neighbor_list, isP).round(3)
environment_info = pd.DataFrame([density_H, density_P], index=["Density_H", "Density_P"]).T.reset_index()
In [197]:
def calculate_property_density_debug(res_list, neighbor_list, propertyTable, min_seq_sep=2, rmin=2.5, debug_index=64, theta_cutoff=0.1):
    """Debug variant of calculate_property_density that prints contributing contacts.

    Identical density computation, but for the residue at global index
    `debug_index` it prints every neighbor whose contribution exceeds
    `theta_cutoff`. The residue index and cutoff were previously hard-coded
    (64 and 0.1); they are now defaulted parameters (backward compatible).

    Returns
    -------
    numpy.ndarray
        Length-len(res_list) array of property densities.
    """
    density = np.zeros(len(res_list))
    for res1globalindex, res1 in enumerate(res_list):
        res1index = get_local_index(res1)
        res1chain = get_chain(res1)
        for res2 in get_neighbors_within_radius(neighbor_list, res1, 9.0):
            res2index = get_local_index(res2)
            res2chain = get_chain(res2)
            if abs(res2index - res1index) >= min_seq_sep or (res1chain != res2chain):
                rij = get_interaction_distance(res1, res2)
                hasProperty = propertyTable[three_to_one(res2.resname)]
                theta = interaction_well(rij, rmin, 6.5, 5)
                if res1globalindex == debug_index and hasProperty * theta > theta_cutoff:
                    print(res1index, res1.resname, res2.resname, res2index, rij, hasProperty, theta)
                # Reuse the already-computed theta instead of re-evaluating the well.
                density[res1globalindex] += hasProperty * theta
    return density
In [13]:
# Raw grep output pasted from the cluster: slurm jobs killed by the scheduler's
# time limit on 2020-04-20. Job ids are re-extracted with a regex in a later cell.
text = '''\
slurm-615289.out:slurmstepd: error: *** JOB 615289 ON bc8u7n4 CANCELLED AT 2020-04-20T07:47:48 DUE TO TIME LIMIT ***
slurm-615289.out:slurmstepd: error: *** STEP 615289.0 ON bc8u7n4 CANCELLED AT 2020-04-20T07:47:48 DUE TO TIME LIMIT ***
slurm-615293.out:slurmstepd: error: *** JOB 615293 ON bc6u27n2 CANCELLED AT 2020-04-20T07:52:18 DUE TO TIME LIMIT ***
slurm-615293.out:slurmstepd: error: *** STEP 615293.0 ON bc6u27n2 CANCELLED AT 2020-04-20T07:52:18 DUE TO TIME LIMIT ***
slurm-615296.out:slurmstepd: error: *** STEP 615296.0 ON bc6u23n1 CANCELLED AT 2020-04-20T08:07:20 DUE TO TIME LIMIT ***
slurm-615296.out:slurmstepd: error: *** JOB 615296 ON bc6u23n1 CANCELLED AT 2020-04-20T08:07:20 DUE TO TIME LIMIT ***
slurm-615297.out:slurmstepd: error: *** JOB 615297 ON bc6u23n3 CANCELLED AT 2020-04-20T08:07:50 DUE TO TIME LIMIT ***
slurm-615297.out:slurmstepd: error: *** STEP 615297.0 ON bc6u23n3 CANCELLED AT 2020-04-20T08:07:50 DUE TO TIME LIMIT ***
slurm-615305.out:slurmstepd: error: *** JOB 615305 ON bc6u27n3 CANCELLED AT 2020-04-20T08:40:55 DUE TO TIME LIMIT ***
slurm-615305.out:slurmstepd: error: *** STEP 615305.0 ON bc6u27n3 CANCELLED AT 2020-04-20T08:40:55 DUE TO TIME LIMIT ***
slurm-615306.out:slurmstepd: error: *** JOB 615306 ON bc8u7n1 CANCELLED AT 2020-04-20T08:42:25 DUE TO TIME LIMIT ***
slurm-615306.out:slurmstepd: error: *** STEP 615306.0 ON bc8u7n1 CANCELLED AT 2020-04-20T08:42:25 DUE TO TIME LIMIT ***
slurm-615314.out:slurmstepd: error: *** STEP 615314.0 ON bc6u23n4 CANCELLED AT 2020-04-20T09:02:26 DUE TO TIME LIMIT ***
slurm-615314.out:slurmstepd: error: *** JOB 615314 ON bc6u23n4 CANCELLED AT 2020-04-20T09:02:26 DUE TO TIME LIMIT ***
slurm-615323.out:slurmstepd: error: *** JOB 615323 ON bc8u15n5 CANCELLED AT 2020-04-20T09:23:30 DUE TO TIME LIMIT ***
slurm-615323.out:slurmstepd: error: *** STEP 615323.0 ON bc8u15n5 CANCELLED AT 2020-04-20T09:23:30 DUE TO TIME LIMIT ***
slurm-615332.out:slurmstepd: error: *** JOB 615332 ON bc8u7n7 CANCELLED AT 2020-04-20T10:00:31 DUE TO TIME LIMIT ***
slurm-615332.out:slurmstepd: error: *** STEP 615332.0 ON bc8u7n7 CANCELLED AT 2020-04-20T10:00:31 DUE TO TIME LIMIT ***
slurm-615335.out:slurmstepd: error: *** JOB 615335 ON bc8u7n7 CANCELLED AT 2020-04-20T10:09:03 DUE TO TIME LIMIT ***
slurm-615335.out:slurmstepd: error: *** STEP 615335.0 ON bc8u7n7 CANCELLED AT 2020-04-20T10:09:03 DUE TO TIME LIMIT ***
slurm-615337.out:slurmstepd: error: *** JOB 615337 ON bc6u23n3 CANCELLED AT 2020-04-20T10:20:04 DUE TO TIME LIMIT ***
slurm-615337.out:slurmstepd: error: *** STEP 615337.0 ON bc6u23n3 CANCELLED AT 2020-04-20T10:20:04 DUE TO TIME LIMIT ***
slurm-615346.out:slurmstepd: error: *** JOB 615346 ON bc6u15n4 CANCELLED AT 2020-04-20T10:44:10 DUE TO TIME LIMIT ***
slurm-615346.out:slurmstepd: error: *** STEP 615346.0 ON bc6u15n4 CANCELLED AT 2020-04-20T10:44:10 DUE TO TIME LIMIT ***
slurm-615352.out:slurmstepd: error: *** JOB 615352 ON bc8u7n5 CANCELLED AT 2020-04-20T10:47:10 DUE TO TIME LIMIT ***
slurm-615352.out:slurmstepd: error: *** STEP 615352.0 ON bc8u7n5 CANCELLED AT 2020-04-20T10:47:10 DUE TO TIME LIMIT ***
slurm-615354.out:slurmstepd: error: *** JOB 615354 ON bc6u23n7 CANCELLED AT 2020-04-20T10:51:10 DUE TO TIME LIMIT ***
slurm-615354.out:slurmstepd: error: *** STEP 615354.0 ON bc6u23n7 CANCELLED AT 2020-04-20T10:51:10 DUE TO TIME LIMIT ***
slurm-615359.out:slurmstepd: error: *** STEP 615359.0 ON bc6u23n7 CANCELLED AT 2020-04-20T11:08:13 DUE TO TIME LIMIT ***
slurm-615359.out:slurmstepd: error: *** JOB 615359 ON bc6u23n7 CANCELLED AT 2020-04-20T11:08:13 DUE TO TIME LIMIT ***
slurm-615372.out:slurmstepd: error: *** STEP 615372.0 ON bc6u11n3 CANCELLED AT 2020-04-20T11:32:49 DUE TO TIME LIMIT ***
slurm-615372.out:slurmstepd: error: *** JOB 615372 ON bc6u11n3 CANCELLED AT 2020-04-20T11:32:49 DUE TO TIME LIMIT ***
slurm-615374.out:slurmstepd: error: *** JOB 615374 ON bc6u27n3 CANCELLED AT 2020-04-20T11:36:50 DUE TO TIME LIMIT ***
slurm-615374.out:slurmstepd: error: *** STEP 615374.0 ON bc6u27n3 CANCELLED AT 2020-04-20T11:36:50 DUE TO TIME LIMIT ***
slurm-615395.out:slurmstepd: error: *** STEP 615395.0 ON bc9u7n6 CANCELLED AT 2020-04-20T12:32:24 DUE TO TIME LIMIT ***
slurm-615395.out:slurmstepd: error: *** JOB 615395 ON bc9u7n6 CANCELLED AT 2020-04-20T12:32:24 DUE TO TIME LIMIT ***
slurm-615396.out:slurmstepd: error: *** JOB 615396 ON bc9u7n6 CANCELLED AT 2020-04-20T12:39:55 DUE TO TIME LIMIT ***
slurm-615396.out:slurmstepd: error: *** STEP 615396.0 ON bc9u7n6 CANCELLED AT 2020-04-20T12:39:55 DUE TO TIME LIMIT ***
slurm-615397.out:slurmstepd: error: *** STEP 615397.0 ON bc9u7n6 CANCELLED AT 2020-04-20T12:40:25 DUE TO TIME LIMIT ***
slurm-615397.out:slurmstepd: error: *** JOB 615397 ON bc9u7n6 CANCELLED AT 2020-04-20T12:40:25 DUE TO TIME LIMIT ***
slurm-615406.out:slurmstepd: error: *** JOB 615406 ON bc6u19n8 CANCELLED AT 2020-04-20T13:07:30 DUE TO TIME LIMIT ***
slurm-615406.out:slurmstepd: error: *** STEP 615406.0 ON bc6u19n8 CANCELLED AT 2020-04-20T13:07:30 DUE TO TIME LIMIT ***
slurm-615417.out:slurmstepd: error: *** JOB 615417 ON bc9u11n1 CANCELLED AT 2020-04-20T13:40:37 DUE TO TIME LIMIT ***
slurm-615417.out:slurmstepd: error: *** STEP 615417.0 ON bc9u11n1 CANCELLED AT 2020-04-20T13:40:37 DUE TO TIME LIMIT ***
slurm-615446.out:slurmstepd: error: *** STEP 615446.0 ON bc8u15n6 CANCELLED AT 2020-04-20T15:25:52 DUE TO TIME LIMIT ***
slurm-615446.out:slurmstepd: error: *** JOB 615446 ON bc8u15n6 CANCELLED AT 2020-04-20T15:25:52 DUE TO TIME LIMIT ***
slurm-615447.out:slurmstepd: error: *** JOB 615447 ON bc8u15n6 CANCELLED AT 2020-04-20T15:38:54 DUE TO TIME LIMIT ***
slurm-615447.out:slurmstepd: error: *** STEP 615447.0 ON bc8u15n6 CANCELLED AT 2020-04-20T15:38:54 DUE TO TIME LIMIT ***
slurm-615448.out:slurmstepd: error: *** JOB 615448 ON bc6u15n6 CANCELLED AT 2020-04-20T15:40:54 DUE TO TIME LIMIT ***
slurm-615448.out:slurmstepd: error: *** STEP 615448.0 ON bc6u15n6 CANCELLED AT 2020-04-20T15:40:54 DUE TO TIME LIMIT ***
slurm-615449.out:slurmstepd: error: *** STEP 615449.0 ON bc6u19n5 CANCELLED AT 2020-04-20T15:43:54 DUE TO TIME LIMIT ***
slurm-615449.out:slurmstepd: error: *** JOB 615449 ON bc6u19n5 CANCELLED AT 2020-04-20T15:43:54 DUE TO TIME LIMIT ***
slurm-615450.out:slurmstepd: error: *** JOB 615450 ON bc9u23n7 CANCELLED AT 2020-04-20T15:44:24 DUE TO TIME LIMIT ***
slurm-615450.out:slurmstepd: error: *** STEP 615450.0 ON bc9u23n7 CANCELLED AT 2020-04-20T15:44:24 DUE TO TIME LIMIT ***'''
In [11]:
# Earlier batch of the same grep output (jobs killed on 2020-04-19).
# NOTE(review): re-assigns `text`, shadowing the 2020-04-20 paste from the other cell —
# whichever cell ran last determines which job ids the regex cell extracts.
text = '''\
slurm-614068.out:slurmstepd: error: *** JOB 614068 ON bc6u15n1 CANCELLED AT 2020-04-19T01:49:02 DUE TO TIME LIMIT ***
slurm-614068.out:slurmstepd: error: *** STEP 614068.0 ON bc6u15n1 CANCELLED AT 2020-04-19T01:49:02 DUE TO TIME LIMIT ***
slurm-614069.out:slurmstepd: error: *** JOB 614069 ON bc6u11n6 CANCELLED AT 2020-04-19T01:49:02 DUE TO TIME LIMIT ***
slurm-614069.out:slurmstepd: error: *** STEP 614069.0 ON bc6u11n6 CANCELLED AT 2020-04-19T01:49:02 DUE TO TIME LIMIT ***
slurm-614100.out:slurmstepd: error: *** JOB 614100 ON bc8u7n7 CANCELLED AT 2020-04-19T01:49:32 DUE TO TIME LIMIT ***
slurm-614100.out:slurmstepd: error: *** STEP 614100.0 ON bc8u7n7 CANCELLED AT 2020-04-19T01:49:32 DUE TO TIME LIMIT ***
slurm-614129.out:slurmstepd: error: *** JOB 614129 ON bc9u7n6 CANCELLED AT 2020-04-19T01:53:32 DUE TO TIME LIMIT ***
slurm-614129.out:slurmstepd: error: *** STEP 614129.0 ON bc9u7n6 CANCELLED AT 2020-04-19T01:53:32 DUE TO TIME LIMIT ***
slurm-614133.out:slurmstepd: error: *** JOB 614133 ON bc9u7n7 CANCELLED AT 2020-04-19T02:04:03 DUE TO TIME LIMIT ***
slurm-614133.out:slurmstepd: error: *** STEP 614133.0 ON bc9u7n7 CANCELLED AT 2020-04-19T02:04:03 DUE TO TIME LIMIT ***
slurm-614134.out:slurmstepd: error: *** JOB 614134 ON bc9u7n7 CANCELLED AT 2020-04-19T02:06:33 DUE TO TIME LIMIT ***
slurm-614134.out:slurmstepd: error: *** STEP 614134.0 ON bc9u7n7 CANCELLED AT 2020-04-19T02:06:33 DUE TO TIME LIMIT ***
slurm-614137.out:slurmstepd: error: *** STEP 614137.0 ON bc9u19n8 CANCELLED AT 2020-04-19T02:10:03 DUE TO TIME LIMIT ***
slurm-614137.out:slurmstepd: error: *** JOB 614137 ON bc9u19n8 CANCELLED AT 2020-04-19T02:10:03 DUE TO TIME LIMIT ***
slurm-614139.out:slurmstepd: error: *** JOB 614139 ON bc9u7n7 CANCELLED AT 2020-04-19T02:21:05 DUE TO TIME LIMIT ***
slurm-614139.out:slurmstepd: error: *** STEP 614139.0 ON bc9u7n7 CANCELLED AT 2020-04-19T02:21:05 DUE TO TIME LIMIT ***
slurm-614145.out:slurmstepd: error: *** JOB 614145 ON bc8u23n5 CANCELLED AT 2020-04-19T02:28:06 DUE TO TIME LIMIT ***
slurm-614145.out:slurmstepd: error: *** STEP 614145.0 ON bc8u23n5 CANCELLED AT 2020-04-19T02:28:06 DUE TO TIME LIMIT ***
slurm-614146.out:slurmstepd: error: *** JOB 614146 ON bc8u23n5 CANCELLED AT 2020-04-19T02:30:06 DUE TO TIME LIMIT ***
slurm-614146.out:slurmstepd: error: *** STEP 614146.0 ON bc8u23n5 CANCELLED AT 2020-04-19T02:30:06 DUE TO TIME LIMIT ***
slurm-614147.out:slurmstepd: error: *** JOB 614147 ON bc8u23n5 CANCELLED AT 2020-04-19T02:32:37 DUE TO TIME LIMIT ***
slurm-614147.out:slurmstepd: error: *** STEP 614147.0 ON bc8u23n5 CANCELLED AT 2020-04-19T02:32:37 DUE TO TIME LIMIT ***
slurm-614148.out:slurmstepd: error: *** JOB 614148 ON bc8u23n5 CANCELLED AT 2020-04-19T02:33:07 DUE TO TIME LIMIT ***
slurm-614148.out:slurmstepd: error: *** STEP 614148.0 ON bc8u23n5 CANCELLED AT 2020-04-19T02:33:07 DUE TO TIME LIMIT ***
slurm-614162.out:slurmstepd: error: *** JOB 614162 ON bc9u19n5 CANCELLED AT 2020-04-19T03:09:42 DUE TO TIME LIMIT ***
slurm-614162.out:slurmstepd: error: *** STEP 614162.0 ON bc9u19n5 CANCELLED AT 2020-04-19T03:09:42 DUE TO TIME LIMIT ***
slurm-614163.out:slurmstepd: error: *** JOB 614163 ON bc9u23n2 CANCELLED AT 2020-04-19T03:14:44 DUE TO TIME LIMIT ***
slurm-614163.out:slurmstepd: error: *** STEP 614163.0 ON bc9u23n2 CANCELLED AT 2020-04-19T03:14:44 DUE TO TIME LIMIT ***
slurm-614172.out:slurmstepd: error: *** JOB 614172 ON bc9u19n5 CANCELLED AT 2020-04-19T03:46:26 DUE TO TIME LIMIT ***
slurm-614172.out:slurmstepd: error: *** STEP 614172.0 ON bc9u19n5 CANCELLED AT 2020-04-19T03:46:26 DUE TO TIME LIMIT ***
slurm-614175.out:slurmstepd: error: *** JOB 614175 ON bc6u23n3 CANCELLED AT 2020-04-19T04:00:00 DUE TO TIME LIMIT ***
slurm-614175.out:slurmstepd: error: *** STEP 614175.0 ON bc6u23n3 CANCELLED AT 2020-04-19T04:00:00 DUE TO TIME LIMIT ***
slurm-614177.out:slurmstepd: error: *** STEP 614177.0 ON bc6u23n1 CANCELLED AT 2020-04-19T04:04:00 DUE TO TIME LIMIT ***
slurm-614177.out:slurmstepd: error: *** JOB 614177 ON bc6u23n1 CANCELLED AT 2020-04-19T04:04:00 DUE TO TIME LIMIT ***
slurm-614183.out:slurmstepd: error: *** STEP 614183.0 ON bc8u23n2 CANCELLED AT 2020-04-19T04:35:38 DUE TO TIME LIMIT ***
slurm-614183.out:slurmstepd: error: *** JOB 614183 ON bc8u23n2 CANCELLED AT 2020-04-19T04:35:38 DUE TO TIME LIMIT ***
slurm-614186.out:slurmstepd: error: *** JOB 614186 ON bc8u23n2 CANCELLED AT 2020-04-19T04:44:43 DUE TO TIME LIMIT ***
slurm-614186.out:slurmstepd: error: *** STEP 614186.0 ON bc8u23n2 CANCELLED AT 2020-04-19T04:44:43 DUE TO TIME LIMIT ***
slurm-614192.out:slurmstepd: error: *** JOB 614192 ON bc9u19n2 CANCELLED AT 2020-04-19T05:44:55 DUE TO TIME LIMIT ***
slurm-614192.out:slurmstepd: error: *** STEP 614192.0 ON bc9u19n2 CANCELLED AT 2020-04-19T05:44:55 DUE TO TIME LIMIT ***
slurm-614194.out:slurmstepd: error: *** STEP 614194.0 ON bc9u19n8 CANCELLED AT 2020-04-19T06:01:28 DUE TO TIME LIMIT ***
slurm-614194.out:slurmstepd: error: *** JOB 614194 ON bc9u19n8 CANCELLED AT 2020-04-19T06:01:28 DUE TO TIME LIMIT ***
slurm-614196.out:slurmstepd: error: *** STEP 614196.0 ON bc9u19n8 CANCELLED AT 2020-04-19T06:14:59 DUE TO TIME LIMIT ***
slurm-614196.out:slurmstepd: error: *** JOB 614196 ON bc9u19n8 CANCELLED AT 2020-04-19T06:14:59 DUE TO TIME LIMIT ***
slurm-614197.out:slurmstepd: error: *** JOB 614197 ON bc9u7n3 CANCELLED AT 2020-04-19T06:16:59 DUE TO TIME LIMIT ***
slurm-614197.out:slurmstepd: error: *** STEP 614197.0 ON bc9u7n3 CANCELLED AT 2020-04-19T06:16:59 DUE TO TIME LIMIT ***
slurm-614199.out:slurmstepd: error: *** JOB 614199 ON bc8u15n4 CANCELLED AT 2020-04-19T06:34:00 DUE TO TIME LIMIT ***
slurm-614199.out:slurmstepd: error: *** STEP 614199.0 ON bc8u15n4 CANCELLED AT 2020-04-19T06:34:00 DUE TO TIME LIMIT ***
slurm-614200.out:slurmstepd: error: *** STEP 614200.0 ON bc8u23n4 CANCELLED AT 2020-04-19T06:40:31 DUE TO TIME LIMIT ***
slurm-614200.out:slurmstepd: error: *** JOB 614200 ON bc8u23n4 CANCELLED AT 2020-04-19T06:40:31 DUE TO TIME LIMIT ***
slurm-614201.out:slurmstepd: error: *** STEP 614201.0 ON bc8u23n4 CANCELLED AT 2020-04-19T06:55:02 DUE TO TIME LIMIT ***
slurm-614201.out:slurmstepd: error: *** JOB 614201 ON bc8u23n4 CANCELLED AT 2020-04-19T06:55:02 DUE TO TIME LIMIT ***
slurm-614202.out:slurmstepd: error: *** STEP 614202.0 ON bc8u23n4 CANCELLED AT 2020-04-19T06:55:31 DUE TO TIME LIMIT ***
slurm-614202.out:slurmstepd: error: *** JOB 614202 ON bc8u23n4 CANCELLED AT 2020-04-19T06:55:31 DUE TO TIME LIMIT ***
slurm-614204.out:slurmstepd: error: *** JOB 614204 ON bc8u23n4 CANCELLED AT 2020-04-19T07:10:38 DUE TO TIME LIMIT ***
slurm-614204.out:slurmstepd: error: *** STEP 614204.0 ON bc8u23n4 CANCELLED AT 2020-04-19T07:10:38 DUE TO TIME LIMIT ***
slurm-614208.out:slurmstepd: error: *** STEP 614208.0 ON bc6u27n5 CANCELLED AT 2020-04-19T07:26:10 DUE TO TIME LIMIT ***
slurm-614208.out:slurmstepd: error: *** JOB 614208 ON bc6u27n5 CANCELLED AT 2020-04-19T07:26:10 DUE TO TIME LIMIT ***
slurm-614209.out:slurmstepd: error: *** JOB 614209 ON bc8u7n8 CANCELLED AT 2020-04-19T07:29:41 DUE TO TIME LIMIT ***
slurm-614209.out:slurmstepd: error: *** STEP 614209.0 ON bc8u7n8 CANCELLED AT 2020-04-19T07:29:41 DUE TO TIME LIMIT ***
slurm-614212.out:slurmstepd: error: *** STEP 614212.0 ON bc8u19n8 CANCELLED AT 2020-04-19T07:43:41 DUE TO TIME LIMIT ***
slurm-614212.out:slurmstepd: error: *** JOB 614212 ON bc8u19n8 CANCELLED AT 2020-04-19T07:43:41 DUE TO TIME LIMIT ***
slurm-614214.out:slurmstepd: error: *** JOB 614214 ON bc8u19n8 CANCELLED AT 2020-04-19T07:48:41 DUE TO TIME LIMIT ***
slurm-614214.out:slurmstepd: error: *** STEP 614214.0 ON bc8u19n8 CANCELLED AT 2020-04-19T07:48:41 DUE TO TIME LIMIT ***
slurm-614218.out:slurmstepd: error: *** JOB 614218 ON bc8u23n3 CANCELLED AT 2020-04-19T08:06:45 DUE TO TIME LIMIT ***
slurm-614218.out:slurmstepd: error: *** STEP 614218.0 ON bc8u23n3 CANCELLED AT 2020-04-19T08:06:45 DUE TO TIME LIMIT ***
slurm-614219.out:slurmstepd: error: *** STEP 614219.0 ON bc8u23n3 CANCELLED AT 2020-04-19T08:09:15 DUE TO TIME LIMIT ***
slurm-614219.out:slurmstepd: error: *** JOB 614219 ON bc8u23n3 CANCELLED AT 2020-04-19T08:09:15 DUE TO TIME LIMIT ***
slurm-614235.out:slurmstepd: error: *** JOB 614235 ON bc8u15n4 CANCELLED AT 2020-04-19T09:01:55 DUE TO TIME LIMIT ***
slurm-614235.out:slurmstepd: error: *** STEP 614235.0 ON bc8u15n4 CANCELLED AT 2020-04-19T09:01:55 DUE TO TIME LIMIT ***
slurm-614236.out:slurmstepd: error: *** STEP 614236.0 ON bc6u23n7 CANCELLED AT 2020-04-19T09:05:27 DUE TO TIME LIMIT ***
slurm-614236.out:slurmstepd: error: *** JOB 614236 ON bc6u23n7 CANCELLED AT 2020-04-19T09:05:27 DUE TO TIME LIMIT ***
slurm-614237.out:slurmstepd: error: *** STEP 614237.0 ON bc8u23n7 CANCELLED AT 2020-04-19T09:17:58 DUE TO TIME LIMIT ***
slurm-614237.out:slurmstepd: error: *** JOB 614237 ON bc8u23n7 CANCELLED AT 2020-04-19T09:17:58 DUE TO TIME LIMIT ***
slurm-614246.out:slurmstepd: error: *** STEP 614246.0 ON bc8u15n5 CANCELLED AT 2020-04-19T10:01:07 DUE TO TIME LIMIT ***
slurm-614246.out:slurmstepd: error: *** JOB 614246 ON bc8u15n5 CANCELLED AT 2020-04-19T10:01:07 DUE TO TIME LIMIT ***
slurm-614257.out:slurmstepd: error: *** JOB 614257 ON bc6u23n5 CANCELLED AT 2020-04-19T10:44:42 DUE TO TIME LIMIT ***
slurm-614257.out:slurmstepd: error: *** STEP 614257.0 ON bc6u23n5 CANCELLED AT 2020-04-19T10:44:42 DUE TO TIME LIMIT ***'''
In [14]:
import re
# Pull the numeric job id out of each "slurm-<jobid>.out" prefix.
# Raw string fixes the invalid '\.' escape in the plain-string regex literal
# (a DeprecationWarning since Python 3.6, a SyntaxWarning in newer versions).
b = [re.search(r'slurm-(.*)\.out', s).group(1) for s in text.split("\n")]
print(b)
In [19]:
In [21]:
import re
# NOTE(review): `s = a` depends on `a`, which is only assigned in a LATER cell
# (the compute_phis command-line example) — out-of-order execution; this cell
# fails under Restart & Run All unless `a` is defined first.
s = a
# result = re.search('slurm-(.*)\.out', s)
# Extract the protein-list index from the command line, e.g. "311".
# NOTE(review): the pattern should be a raw string (r'...') to avoid the '\.' escape warning.
result = re.search('proteins_name_list_(.*)\.txt', s)
print(result.group(0))
In [13]:
# Duplicate of the extraction cell above; raw string fixes the '\.' escape warning.
b = [re.search(r'slurm-(.*)\.out', s).group(1) for s in text.split("\n")]
In [15]:
# Deduplicate the job ids (set() does not preserve the original order).
b = list(set(b))
In [17]:
len(b)
Out[17]:
In [22]:
print(b)
In [ ]:
# Example compute_phis command line from which the protein-list filename is parsed.
a = "/home/wl45/opt/compute_phis.py -m 0 proteins_name_list/proteins_name_list_311.txt"
In [10]:
# NOTE(review): `result` comes from the earlier In [21] cell — cell-order dependent.
result.group(0)
Out[10]:
In [ ]: