In [2]:
from pyCodeLib import *
import warnings
import glob
import re
import numpy as np
import pandas as pd
from Bio.PDB.Polypeptide import one_to_three
from Bio.PDB.Polypeptide import three_to_one
warnings.filterwarnings('ignore')
# sys.path.insert(0, MYHOME)
%load_ext autoreload
%autoreload 2
In [4]:
normalized_mutli_iter0 = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600")
In [5]:
normalized_mutli_iter0.shape
Out[5]:
In [8]:
relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_group/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")
In [1]:
a = [0]*5
In [2]:
a[1] = 3
In [5]:
six_letter_code_combinations = ['000004', '000013', '000022', '000031', '000040', '000103', '000112', '000121', '000130', '000202', '000211', '000220', '000301', '000310', '000400', '001003', '001012', '001021', '001030', '001102', '001111', '001120', '001201', '001210', '001300', '002002', '002011', '002020', '002101', '002110', '002200', '003001', '003010', '003100', '004000', '010003', '010012', '010021', '010030', '010102', '010111', '010120', '010201', '010210', '010300', '011002', '011011', '011020', '011101', '011110', '011200', '012001', '012010', '012100', '013000', '020002', '020011', '020020', '020101', '020110', '020200', '021001', '021010', '021100', '022000', '030001', '030010', '030100', '031000', '040000', '100003', '100012', '100021', '100030', '100102', '100111', '100120', '100201', '100210', '100300', '101002', '101011', '101020', '101101', '101110', '101200', '102001', '102010', '102100', '103000', '110002', '110011', '110020', '110101', '110110', '110200', '111001', '111010', '111100', '112000', '120001', '120010', '120100', '121000', '130000', '200002', '200011', '200020', '200101', '200110', '200200', '201001', '201010', '201100', '202000', '210001', '210010', '210100', '211000', '220000', '300001', '300010', '300100', '301000', '310000', '400000']
In [7]:
from datetime import datetime
In [10]:
datetime.now()
Out[10]:
In [6]:
len(six_letter_code_combinations)
Out[6]:
In [4]:
# Count the six-digit codes whose digits (each in 0..4) add up to 4.
# A generator expression keeps the six indices local to the expression, so
# running this cell no longer rebinds the notebook-level names i, j, k, l, m
# and n (n is reused by other cells as a size).
digit_choices = range(5)
i_count = sum(
    1
    for i in digit_choices
    for j in digit_choices
    for k in digit_choices
    for l in digit_choices
    for m in digit_choices
    for n in digit_choices
    if i + j + k + l + m + n == 4
)
print(i_count)
In [9]:
relaitve_k.shape
Out[9]:
In [12]:
iter1 = np.zeros(normalized_mutli_iter0.shape)
In [ ]:
In [210]:
# relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_group_correct_phi/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")
# Unpack the flat relaitve_k vector: three symmetric n x n matrices (direct,
# protein-mediated, water-mediated), each stored as its upper triangle row by
# row, followed by n burial factors.
n = 2
c = 0  # running read position in relaitve_k
unpacked_pair_blocks = []
for well_index in range(3):
    pair_block = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            # Mirror the value so the matrix can be indexed in either order.
            pair_block[i][j] = pair_block[j][i] = relaitve_k[c]
            c += 1
    unpacked_pair_blocks.append(pair_block)
direct_k, protein_mediated, water_mediated = unpacked_pair_blocks
burial = np.zeros(n)
for i in range(n):
    burial[i] = relaitve_k[c]
    c += 1
In [212]:
relaitve_k
Out[212]:
In [27]:
direct_k
Out[27]:
In [213]:
protein_mediated
Out[213]:
In [214]:
water_mediated
Out[214]:
In [215]:
burial
Out[215]:
In [24]:
relaitve_k
Out[24]:
In [32]:
normalized_mutli_iter0[:10]
Out[32]:
In [37]:
res_type_map_letters[0]
Out[37]:
In [39]:
# Two-group classification of the 20 one-letter residue codes.
# Group 0: C, M, F, I, L, V, W, Y; group 1: the remaining twelve.
# (Presumably hydrophobic vs. polar, going by the "HP" suffix.)
res_type_map_HP = {letter: 0 for letter in "CMFILVWY"}
res_type_map_HP.update({letter: 1 for letter in "AHTGPDENQRKS"})
In [131]:
# Scale every entry of normalized_mutli_iter0 by the factor of its residue
# group(s). The first three stretches hold the residue pairs (i <= j) for the
# direct, protein-mediated and water-mediated matrices; the final stretch holds
# 3 x 20 per-residue burial entries.
iter1 = np.zeros(normalized_mutli_iter0.shape)
c = 0  # running position in the flat gamma vector
for pair_scale in (direct_k, protein_mediated, water_mediated):
    for i in range(20):
        type1 = res_type_map_HP[res_type_map_letters[i]]
        for j in range(i, 20):
            type2 = res_type_map_HP[res_type_map_letters[j]]
            iter1[c] = normalized_mutli_iter0[c] * pair_scale[type1][type2]
            c += 1
for i in range(3):
    for j in range(20):
        type2 = res_type_map_HP[res_type_map_letters[j]]
        iter1[c] = normalized_mutli_iter0[c] * burial[type2]
        c += 1
print(c)
In [349]:
def _read_symmetric_block(values, start, size):
    """Build a size x size symmetric matrix from values[start:].

    The values are consumed as an upper triangle, row by row (i <= j), and
    mirrored below the diagonal. Returns the matrix together with the index of
    the first value that was not consumed.
    """
    block = np.zeros((size, size))
    cursor = start
    for i in range(size):
        for j in range(i, size):
            block[i][j] = block[j][i] = values[cursor]
            cursor += 1
    return block, cursor


def getIterAfterGroupOptimization(pre, relaitve_k, group_map=None, letters=None, n_groups=2):
    """Scale a per-residue gamma vector by group-level factors.

    Parameters
    ----------
    pre : array
        Flat gamma vector. The first 690 entries are read: three stretches of
        210 residue-pair entries (pairs i <= j over 20 residues) followed by
        3 x 20 per-residue burial entries.
    relaitve_k : sequence of float
        Group-level factors: three symmetric n_groups x n_groups matrices
        (direct, protein-mediated, water-mediated), each given as its upper
        triangle, followed by n_groups burial factors.
    group_map : mapping, optional
        One-letter residue code -> group index. Defaults to the notebook-level
        ``res_type_map_HP``.
    letters : sequence, optional
        Residue index (0..19) -> one-letter code. Defaults to the
        notebook-level ``res_type_map_letters``.
    n_groups : int, optional
        Number of residue groups (default 2).

    Returns
    -------
    array
        Same shape as ``pre``; every entry that was read is multiplied by the
        factor of its group (pair). Entries past index 689 stay zero.

    The number of entries written is printed, as before.
    """
    # Resolve the defaults at call time so an explicit mapping can be passed in
    # instead of relying on whatever the notebook namespace currently holds.
    if group_map is None:
        group_map = res_type_map_HP
    if letters is None:
        letters = res_type_map_letters

    # Unpack the group-level factors.
    cursor = 0
    pair_factors = []
    for _ in range(3):
        block, cursor = _read_symmetric_block(relaitve_k, cursor, n_groups)
        pair_factors.append(block)
    burial = np.zeros(n_groups)
    for g in range(n_groups):
        burial[g] = relaitve_k[cursor]
        cursor += 1

    # Group index of each of the 20 residues, looked up once.
    groups = [group_map[letters[i]] for i in range(20)]

    iter1 = np.zeros(pre.shape)
    c = 0
    for factors in pair_factors:
        for i in range(20):
            for j in range(i, 20):
                iter1[c] = pre[c] * factors[groups[i]][groups[j]]
                c += 1
    for _ in range(3):
        for j in range(20):
            iter1[c] = pre[c] * burial[groups[j]]
            c += 1
    print(c)
    # normalized_iter1 = iter1 * np.std(pre) / np.std(iter1)
    return iter1
In [380]:
normalized_mutli_iter0 = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600")
relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_only_iter0/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")
t = getIterAfterGroupOptimization(normalized_mutli_iter0, relaitve_k)
# t = t * np.std(normalized_mutli_iter0) / np.std(t)
np.savetxt("/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_iter0_only", t)
In [313]:
normalized_mutli_iter0 = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600")
relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")
t = getIterAfterGroupOptimization(normalized_mutli_iter0, relaitve_k)
# t = t * np.std(normalized_mutli_iter0) / np.std(t)
np.savetxt("/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_normalized_correct_test_2", t)
In [246]:
np.mean(t)
Out[246]:
In [209]:
relaitve_k
Out[209]:
In [192]:
cutoff600 = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_mult_seq_2/gammas/cutoff_600")
In [193]:
np.mean(cutoff600)
Out[193]:
In [198]:
# Rescale cutoff600 to the mean and standard deviation of `original`
# (presumably the intent, since the following cells compare both statistics).
# The mean offset has to be added after the std rescaling: adding it inside
# the parentheses also multiplies np.mean(original) by the std ratio.
centered = (cutoff600 - np.mean(cutoff600)) * np.std(original) / np.std(cutoff600) + np.mean(original)
In [199]:
np.std(centered)
Out[199]:
In [200]:
np.mean(centered)
Out[200]:
In [203]:
np.std(original)
Out[203]:
In [202]:
np.mean(original)
Out[202]:
In [182]:
centered = normalized_mutli_iter0 - np.mean(normalized_mutli_iter0) + np.mean(original)
In [185]:
np.savetxt("/Users/weilu/Research/server/april_2019/complete_gammas/multi_original_centered", centered)
In [ ]:
In [183]:
np.mean(centered)
Out[183]:
In [179]:
np.mean(normalized_mutli_iter0)
Out[179]:
In [181]:
np.mean(original)
Out[181]:
In [159]:
np.sum(normalized_iter1_2- normalized_iter1_3)
Out[159]:
In [180]:
original = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/original_gamma")
In [ ]:
In [172]:
np.savetxt("/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_3", normalized_iter1_3)
In [ ]:
normalized_mutli_iter0 = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600")
relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_group_correct_phi/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")
In [137]:
t = getIterAfterGroupOptimization(normalized_mutli_iter0, relaitve_k)
In [132]:
np.std(iter1)
Out[132]:
In [133]:
normalized_iter1 = iter1 * np.std(normalized_mutli_iter0) / np.std(iter1)
In [147]:
np.savetxt("/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_2", -normalized_iter1)
In [151]:
np.savetxt("/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_check", normalized_iter1)
In [382]:
from collections import defaultdict
dataset = {"old":"1R69, 1UTG, 3ICB, 256BA, 4CPV, 1CCR, 2MHR, 1MBA, 2FHA".split(", "),
"new":"1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", "),
"test":["t089", "t120", "t251", "top7", "1ubq", "t0766", "t0778", "t0782", "t0792", "t0803", "t0815", "t0833", "t0842", "t0844"]}
dataset["combined"] = dataset["old"] + dataset["new"]
pdb_list_dic = {"../iterative_optimization_old_set":"old",
"../iterative_optimization_new_temp_range":"new",
"../iterative_optimization_biased_sampling":"new"}
pdb_list_dic_rev = {"old":"iterative_optimization_old_set",
"new":"iterative_optimization_new_temp_range"}
iteration_source_dic = {"bias_2":"../iterative_optimization_biased_sampling",
"bias_old_gamma":"../iterative_optimization_biased_sampling",
"iter1_with_bias_96percent":"../iterative_optimization_new_temp_range",
"iter1_with_bias_98percent":"../iterative_optimization_new_temp_range",
"new_iter1_0":"../iterative_optimization_new_temp_range",
"new_iter1_90":"../iterative_optimization_new_temp_range",
"new_iter1_96":"../iterative_optimization_new_temp_range",
"new_iter1_98":"../iterative_optimization_new_temp_range",
"new_iter1_combined_on_B":"../iterative_optimization_new_temp_range",
"new_iter2_8":"../iterative_optimization_new_temp_range",
"new_iter2_10":"../iterative_optimization_new_temp_range",
"old_new_iter2_8":"../iterative_optimization_new_temp_range",
"new_iter3_10":"../iterative_optimization_old_set",
"single":"../iterative_optimization_old_set",
"iter4_30":"../iterative_optimization_old_set",
"iter4_6":"../iterative_optimization_old_set",
"iter4_13":"../iterative_optimization_old_set",
"iter5_30":"../iterative_optimization_old_set",
"iter6_30":"../iterative_optimization_old_set",
"noFrag":"../iterative_optimization_old_set",
"iter0_normalized_noFrag":"../iterative_optimization_combined_train_set",
"iter1_normalized_noFrag":"../iterative_optimization_combined_train_set",
"iter2_normalized_noFrag":"../iterative_optimization_combined_train_set",
"iter3_normalized_noFrag":"../iterative_optimization_combined_train_set",
"iter4_normalized_noFrag":"../iterative_optimization_combined_train_set",
"iter3_normalized_noFrag_90":"../iterative_optimization_combined_train_set",
"iter5_normalized_noFrag":"../iterative_optimization_combined_train_set_with_frag",
"original":"../iterative_optimization_combined_train_set_with_frag",
"iter6_normalized_noFrag":"../iterative_optimization_combined_train_set_with_frag",
"iter0":"../iterative_optimization_combined_train_set_with_frag",
"without_contact":"../iterative_optimization_combined_train_set_with_frag",
"original_with_rg":"../iterative_optimization_combined_train_set_with_frag",
"iter1_with_rg":"../iterative_optimization_combined_train_set_with_frag",
"iter6_with_rg":"../iterative_optimization_combined_train_set_with_frag",
"iter2_with_rg":"../iterative_optimization_combined_train_set_with_frag",
"iter3_with_rg":"../iterative_optimization_combined_train_set_with_frag",
"iter2_with_rg_90":"../iterative_optimization_combined_train_set_with_frag",
"iter3_with_rg_less_frag":"../iterative_optimization_combined_train_set_with_frag",
"multi_iter0":"../iterative_optimization_combined_train_set_with_frag",
"multi_iter1":"../iterative_optimization_combined_train_set_with_frag",
"multi_groupedNorm":"../iterative_optimization_combined_train_set_with_frag",
"multi_iter2":"../iterative_optimization_combined_train_set_with_frag",
"multi_groupedNorm_check":"../iterative_optimization_combined_train_set_with_frag",
}
# Simulation folder -> key into `dataset`.
# This assignment replaces the three-entry pdb_list_dic defined earlier in this
# cell; it repeats those entries and adds the two combined-train-set folders.
pdb_list_dic = {"../iterative_optimization_old_set":"old",
"../iterative_optimization_new_temp_range":"new",
"../iterative_optimization_biased_sampling":"new",
"../iterative_optimization_combined_train_set":"combined",
"../iterative_optimization_combined_train_set_with_frag":"combined"}
# new_simulation_list = ["iter1_with_bias_96percent", "new_iter2_10"]
# old_protein_simulation_list = ["single", "new_iter3_10"]
# new_simulation_list = ["bias_2","bias_old_gamma", "iter1_with_bias_96percent", "iter1_with_bias_98percent", "new_iter2_10", "new_iter1_90", "new_iter2_8", "old_new_iter2_8"]
# old_protein_simulation_list = ["noFrag", "iter6_30", "iter5_30", "single", "new_iter3_10", "iter4_30", "iter4_6", "iter4_13"]
# combined_simulation_list = ["iter5_normalized_noFrag", "original", "iter0_normalized_noFrag", "iter1_normalized_noFrag", "iter2_normalized_noFrag", "iter3_normalized_noFrag", "iter4_normalized_noFrag", "iter3_normalized_noFrag_90"]
# new_data = ["iter5_normalized_noFrag", "original"]
new_simulation_list = []
old_protein_simulation_list = []
combined_simulation_list = ["multi_iter1", "iter3_with_rg_less_frag", "multi_iter0", "iter3_with_rg", "iter2_with_rg_90", "iter2_with_rg", "iter0", "without_contact", "original_with_rg", "iter1_with_rg", "iter6_with_rg"]
# new_data = ["multi_iter1"]
combined_simulation_list = ["multi_iter1", "iter3_with_rg_less_frag", "multi_iter0", "without_contact", "original_with_rg"]
# combined_simulation_list = ["multi_iter0"]
# combined_simulation_list = ["multi_groupedNorm"]
# combined_simulation_list = ["multi_iter2"]
new_data = []
# Map each protein (lower-cased, first four characters of its id) to the
# simulation folders that apply to it. Proteins in the "combined" set are also
# in "new" or "old", so their lists are extended twice.
simulation_location_list_dic = defaultdict(list)
simulations_by_dataset = (
    ("new", new_simulation_list),
    ("old", old_protein_simulation_list),
    ("combined", combined_simulation_list),
)
for dataset_key, simulations in simulations_by_dataset:
    for p in dataset[dataset_key]:
        name = p.lower()[:4]
        simulation_location_list_dic[name] += simulations
# simulation_location_list = ["multi_iter1"]
# NOTE(review): `os` is not imported explicitly in this notebook's import cell;
# presumably it arrives through `from pyCodeLib import *` — confirm.
cwd = os.getcwd()
print(cwd)
Run = 30
decoy_n = 1000
# Number of simulation folders per protein; the validation cells pass
# n*decoy_n to validate_hamiltonian_wei.
n = len(combined_simulation_list)
In [370]:
# Validate the group-level (relative k) gamma with the relative-k phi list.
# The call reads its list files relative to the working directory set here.
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_normalize_relative_k1_gamma"
data = validate_hamiltonian_wei("phi_list_relative_k.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[370]:
In [372]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_normalized_apr26"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[372]:
In [375]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[375]:
In [374]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[374]:
In [376]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/iter_multi_group_iter1_apr26"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[376]:
In [377]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/iter_multi_group_apr26_15"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[377]:
In [383]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_iter0_only"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[383]:
In [384]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/iter_multi_group_iter0_only_15"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[384]:
In [381]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_iter0_only"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[381]:
In [371]:
normalized_mutli_iter0 = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600")
relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")
t = getIterAfterGroupOptimization(normalized_mutli_iter0, relaitve_k)
t = t * np.std(normalized_mutli_iter0) / np.std(t)
new_gamma = t
np.savetxt("/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_normalized_apr26", t)
In [353]:
direct = np.loadtxt("/Users/weilu/Research/server/april_2019/phis/phi_pairwise_contact_well_1r69_multi_iter1_native_4.5_6.5_5.0_10")
burial = np.loadtxt("/Users/weilu/Research/server/april_2019/phis/phi_burial_well_1r69_multi_iter1_native_4.0")
phi = np.loadtxt("/Users/weilu/Research/server/april_2019/phis/phi_normalize_relative_k_1r69_multi_iter1_native_1")
burial_HP = phi[-2:]
In [302]:
np.sum(relaitve_k[-2:] * burial_HP)
Out[302]:
In [256]:
relaitve_k[-2:]
Out[256]:
In [328]:
np.sum(normalized_mutli_iter0[630:] * burial)
Out[328]:
In [329]:
np.sum(phi[-2:])
Out[329]:
In [330]:
np.sum(normalized_mutli_iter0[:210] * direct)
Out[330]:
In [331]:
np.sum(phi[:3])
Out[331]:
In [354]:
np.sum(new_gamma[:210] * direct)
Out[354]:
In [355]:
np.sum(relaitve_k[:3] * phi[:3])
Out[355]:
In [356]:
np.sum(new_gamma[630:] * burial)
Out[356]:
In [357]:
np.sum(relaitve_k[-2:] * burial_HP)
Out[357]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [242]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[242]:
In [243]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_group_correct_phi_2/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[243]:
In [ ]:
In [175]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[175]:
In [236]:
# Validate a gamma file against the optimization_test decoys.
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
# NOTE: the next assignment is overwritten immediately below, so only the
# relative-k gamma file is actually validated by this cell.
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_test_2"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_normalize_relative_k1_gamma"
data = validate_hamiltonian_wei("phi_list_relative_k.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[236]:
In [226]:
# Validate a gamma file against the optimization_test decoys.
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
# NOTE: the next assignment is overwritten immediately below, so only the
# relative-k gamma file is actually validated by this cell.
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_test"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_normalize_relative_k1_gamma"
data = validate_hamiltonian_wei("phi_list_relative_k.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[226]:
In [208]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_test"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[208]:
In [177]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_test"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[177]:
In [164]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/iter_multi_iter2"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[164]:
In [173]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_3"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[173]:
In [168]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_group_correct_phi_2/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[168]:
In [165]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_multi_iter1_correct_phi/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[165]:
In [166]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_2"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[166]:
In [205]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/multi_original_centered"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[205]:
In [171]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[171]:
In [170]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/iter_multi_iter1_correct_real"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[170]:
In [163]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/original_gamma"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data
Out[163]:
In [59]:
iter1[630:]
Out[59]:
In [61]:
# now, positive means favored.
# Tabulate the entries of iter1 from index 630 onward (the burial section):
# reshape them to 3 x 20, flip the sign, and transpose so each row is one
# residue index and the columns are rho1..rho3.
rhoGamma = pd.DataFrame(-iter1[630:].reshape(3,20).T, columns=["rho1", "rho2", "rho3"]).reset_index()
# Translate the residue index into one-letter and three-letter codes.
rhoGamma["oneLetter"] = rhoGamma["index"].apply(lambda x: inverse_res_type_map[x])
rhoGamma["Residue"] = rhoGamma["index"].apply(lambda x: one_to_three(inverse_res_type_map[x]))
rhoGamma = rhoGamma[["Residue", "rho1", "rho2", "rho3", "index", "oneLetter"]]
# Plain 20 x 3 array of the three rho columns (only used by the commented-out save below).
g = rhoGamma[["rho1", "rho2", "rho3"]].values
# np.savetxt("/Users/weilu/Research/server/feb_2019/burial_only_gamma.dat", g, fmt='%7.4f')
# rhoGamma
# Display the table ordered by the rank stored in hydrophobicity_map.
rhoGamma["hydrophobicityOrder"] = rhoGamma["oneLetter"].apply(lambda x: hydrophobicity_map[x])
rhoGamma.sort_values("hydrophobicityOrder")
Out[61]:
In [56]:
plt.plot(iter1.flatten())
Out[56]:
In [57]:
plt.plot(normalized_mutli_iter0.flatten())
Out[57]:
In [49]:
plot_contact_well(iter1[:210], inferBound=True, vmin=-2, vmax=2)
In [48]:
plot_contact_well(iter1[210:420], inferBound=True, vmin=-2, vmax=2)
In [ ]:
In [148]:
# pre = "/Users/weilu/Research/server_backup/feb_2019/jan_optimization/gammas/"
# pre = "/Users/weilu/Research/server/feb_2019/optimization_with_biased_iter1/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter1/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter3/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter4/gammas/"
pre = "/Users/weilu/Research/server/april_2019/optimization_multi_iter2/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter2_improved/gammas/"
# pp = "cath-dataset-nonredundant-S20Clean_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0"
# pp = "cath-dataset-nonredundant-S20Clean_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
pp = "proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
def _parse_complex_field(field):
    """Convert one text field such as '1.0+-2.0j' into a complex number.

    np.loadtxt passes bytes to converters under the legacy encoding='bytes'
    behaviour and str otherwise, so both are accepted here. The '+-' sequence
    written for negative imaginary parts is collapsed to '-' before parsing.
    """
    if isinstance(field, bytes):
        field = field.decode()
    return complex(field.replace('+-', '-'))


def _load_complex(path):
    """Load a text file as a complex array, parsing column 0 with _parse_complex_field."""
    return np.loadtxt(path, dtype=complex, converters={0: _parse_complex_field})


# File names of the optimisation outputs that share the prefix `pp`.
A_name = pp + "_A"
B_name = pp + "_B"
B_filtered_name = pp + "_B_filtered"
P_name = pp + "_P"
Gamma_name = pp + "_gamma"
Gamma_filtered_name = pp + "_gamma_filtered"
Lamb_name = pp + "_lamb"
Lamb_filtered_name = pp + "_lamb_filtered"
half_B_name = pp + "_half_B"
other_half_B_name = pp + "_other_half_B"
std_half_B_name = pp + "_std_half_B"

# Real-valued files are read directly; the filtered/eigenvalue files go
# through the complex-aware loader.
A = np.loadtxt(pre+A_name)
B = np.loadtxt(pre+B_name)
B_filtered = _load_complex(pre+B_filtered_name)
Gamma = np.loadtxt(pre+Gamma_name)
Gamma_filtered = _load_complex(pre+Gamma_filtered_name)
Lamb = _load_complex(pre+Lamb_name)
Lamb_filtered = _load_complex(pre+Lamb_filtered_name)
half_B = np.loadtxt(pre+half_B_name)
other_half_B = np.loadtxt(pre+other_half_B_name)
std_half_B = np.loadtxt(pre+std_half_B_name)
In [71]:
# Position of the C-C pair inside each 210-entry pair stretch (direct, then
# +210 and +420 for the two mediated stretches).
# The pair stretches in this notebook are packed as an upper triangle
# (for i in range(20): for j in range(i, 20)), so pair (i, i) sits at
# i*20 - i*(i-1)//2. The previous i*20 + i addresses a full 20 x 20 layout and
# points at a different pair.
# NOTE(review): assumes res_type_map uses the same residue ordering as the
# gamma vector — confirm.
cys = res_type_map["C"]
cc_0 = cys * 20 - cys * (cys - 1) // 2
cc_1 = cc_0 + 210
cc_2 = cc_0 + 420
In [72]:
[cc_0, cc_1, cc_2]
Out[72]:
In [40]:
A.shape
Out[40]:
In [41]:
B.shape
Out[41]:
In [79]:
368-210
Out[79]:
In [99]:
158/20
Out[99]:
In [100]:
158-140
Out[100]:
In [55]:
np.argmin(A[:630])
Out[55]:
In [42]:
plt.plot(A)
Out[42]:
In [140]:
In [149]:
# Work on copies so A and B stay untouched, then zero out everything that
# involves component 294: its row and column in BB and its entry in AA.
BB = B.copy()
AA = A.copy()
BB[294,:] = 0
BB[:,294] = 0
AA[294] = 0
In [150]:
total_phis = 690
num_decoys = 1000
filtered_gamma_modified, filtered_B, filtered_lamb, P, lamb = get_filtered_gamma_B_lamb_P_and_lamb(639, AA, BB, half_B, other_half_B, std_half_B, total_phis, num_decoys, mode=2)
In [151]:
plt.plot(filtered_gamma)
Out[151]:
In [158]:
plot_contact_well(filtered_gamma_modified[210:420], inferBound=True, vmin=-2, vmax=2)
In [157]:
plot_contact_well(filtered_gamma_modified[210:420], inferBound=False, vmin=-2, vmax=2)
In [159]:
plot_contact_well(filtered_gamma[210:420], inferBound=False, vmin=-2, vmax=2)
In [162]:
np.max(np.abs(filtered_gamma - filtered_gamma_modified))
Out[162]:
In [165]:
tt = filtered_gamma - filtered_gamma_modified
In [166]:
plot_contact_well(tt[210:420], inferBound=False, vmin=-2, vmax=2)
In [111]:
def get_filtered_gamma_B_lamb_P_and_lamb(cutoff_mode, A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, noise_iterations=10, relative_error_threshold=0.5, mode=2):
    """Compute gamma from A using a filtered inverse of B.

    B is eigen-decomposed with ``np.linalg.eig``; the eigenpairs are passed
    through ``sort_eigenvalues_and_eigenvectors`` and then, together with
    ``cutoff_mode``, through ``get_filtered_B_inv_lambda_and_P``, which
    returns the filtered inverse. ``cutoff_mode`` is printed as a progress
    trace.

    Only ``cutoff_mode``, ``A`` and ``B`` are read by this body; the other
    parameters are accepted but not used here.

    Returns:
        Tuple ``(filtered_gamma, filtered_B, filtered_lamb, P, lamb)`` where
        ``filtered_gamma = filtered_B_inv . A``, ``filtered_B`` is the matrix
        inverse of the filtered inverse, ``filtered_lamb`` and ``P`` are the
        values returned by the filtering helper, and ``lamb`` is the sorted,
        unfiltered eigenvalue array.
    """
    eigenvalues, eigenvectors = np.linalg.eig(B)
    eigenvalues, eigenvectors = sort_eigenvalues_and_eigenvectors(eigenvalues, eigenvectors)
    print(cutoff_mode)
    # The helper receives a copy so the array returned as `lamb` is a
    # separate object from whatever the helper works on.
    B_inv_filtered, kept_eigenvalues, eigenvectors = get_filtered_B_inv_lambda_and_P(
        np.copy(eigenvalues), cutoff_mode, eigenvectors)
    gamma = np.dot(B_inv_filtered, A)
    B_filtered = np.linalg.inv(B_inv_filtered)
    return gamma, B_filtered, kept_eigenvalues, eigenvectors, eigenvalues
In [154]:
# Run the filtering on the unmodified A / B inputs with cutoff mode 639.
total_phis, num_decoys = 690, 1000
reference_fit = get_filtered_gamma_B_lamb_P_and_lamb(
    639, A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, mode=2)
filtered_gamma, filtered_B, filtered_lamb, P, lamb = reference_fit
In [155]:
plot_contact_well(filtered_gamma[210:420], inferBound=True, vmin=-2, vmax=2)
In [156]:
In [76]:
np.array(np.where(B == B.min()))
Out[76]:
In [78]:
np.array(np.where(B == B.max()))
Out[78]:
In [77]:
plt.imshow(B)
plt.colorbar()
Out[77]:
In [ ]:
In [50]:
plt.plot(filtered_gamma)
Out[50]:
In [54]:
np.argmin(filtered_gamma[:630])
Out[54]:
In [46]:
plt.plot(Gamma)
Out[46]:
In [48]:
def get_filtered_gamma_B_lamb_P_and_lamb(cutoff_mode, A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, noise_iterations=10, relative_error_threshold=0.5, mode=2):
    """Return gamma solved with a filtered inverse of B, plus the pieces used.

    Steps: eigen-decompose ``B``, order the eigenpairs with
    ``sort_eigenvalues_and_eigenvectors``, build the filtered inverse with
    ``get_filtered_B_inv_lambda_and_P`` for the given ``cutoff_mode``
    (printed as a trace), multiply it into ``A``, and invert it to obtain the
    filtered ``B``.

    Apart from ``cutoff_mode``, ``A`` and ``B``, the parameters are not read
    by this body.

    Returns:
        ``(filtered_gamma, filtered_B, filtered_lamb, P, lamb)``; ``lamb`` is
        the sorted eigenvalue array before filtering.
    """
    spectrum, basis = sort_eigenvalues_and_eigenvectors(*np.linalg.eig(B))
    print(cutoff_mode)
    # Filter a copy of the spectrum; the original array is returned as `lamb`.
    spectrum_copy = np.copy(spectrum)
    inverse_filtered, spectrum_filtered, basis = get_filtered_B_inv_lambda_and_P(
        spectrum_copy, cutoff_mode, basis)
    gamma_filtered = np.dot(inverse_filtered, A)
    return gamma_filtered, np.linalg.inv(inverse_filtered), spectrum_filtered, basis, spectrum
In [49]:
total_phis = 690
num_decoys = 1000
# The definition in the cell directly above takes `cutoff_mode` as its first
# positional argument. Without it every argument shifts by one position and
# the call fails with a TypeError for the missing `num_decoys`.
# 600 is the value recorded in that definition's commented-out
# `cutoff_mode = 600` line.
filtered_gamma, filtered_B, filtered_lamb, P, lamb = get_filtered_gamma_B_lamb_P_and_lamb(600, A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, mode=2)
In [3]:
# Directory holding the optimization outputs. Earlier locations are kept
# commented out for reference; only the last assignment was ever effective.
# pre = "/Users/weilu/Research/server_backup/feb_2019/jan_optimization/gammas/"
# pre = "/Users/weilu/Research/server/feb_2019/optimization_with_biased_iter1/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter1/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter3/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter4/gammas/"
pre = "/Users/weilu/Research/server/april_2019/optimization_mult_seq_2/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter2_improved/gammas/"

# Common file-name prefix of the result set to load.
# pp = "cath-dataset-nonredundant-S20Clean_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0"
# pp = "cath-dataset-nonredundant-S20Clean_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
pp = "proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"


def _parse_complex(token):
    """Parse one complex field whose negative imaginary part is written as '+-'.

    np.loadtxt hands converters ``bytes`` or ``str`` depending on its
    ``encoding`` setting (and on the NumPy version's default), so both are
    accepted here.
    """
    text = token.decode() if isinstance(token, bytes) else token
    return complex(text.replace('+-', '-'))


def _load_complex(file_name):
    """Load a complex-valued text file from `pre`, converting column 0 with _parse_complex."""
    return np.loadtxt(pre+file_name, dtype=complex, converters={0: _parse_complex})


A_name = pp + "_A"
B_name = pp + "_B"
B_filtered_name = pp + "_B_filtered"
P_name = pp + "_P"
Gamma_name = pp + "_gamma"
Gamma_filtered_name = pp + "_gamma_filtered"
Lamb_name = pp + "_lamb"
Lamb_filtered_name = pp + "_lamb_filtered"
half_B_name = pp + "_half_B"
other_half_B_name = pp + "_other_half_B"
std_half_B_name = pp + "_std_half_B"

# Real-valued inputs.
A = np.loadtxt(pre+A_name)
B = np.loadtxt(pre+B_name)
Gamma = np.loadtxt(pre+Gamma_name)
half_B = np.loadtxt(pre+half_B_name)
other_half_B = np.loadtxt(pre+other_half_B_name)
std_half_B = np.loadtxt(pre+std_half_B_name)

# Complex-valued inputs.
B_filtered = _load_complex(B_filtered_name)
Gamma_filtered = _load_complex(Gamma_filtered_name)
Lamb = _load_complex(Lamb_name)
Lamb_filtered = _load_complex(Lamb_filtered_name)
In [19]:
# Values forwarded to the filtering routine together with the loaded matrices.
total_phis, num_decoys = 690, 1000
fit_result = get_filtered_gamma_B_lamb_P_and_lamb(
    A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, mode=2)
filtered_gamma, filtered_B, filtered_lamb, P, lamb = fit_result
In [18]:
def get_filtered_gamma_B_lamb_P_and_lamb(A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, noise_iterations=10, relative_error_threshold=0.5, mode=2, cutoff_mode=600):
    """Compute gamma from A using a filtered inverse of B.

    B is eigen-decomposed with ``np.linalg.eig``, the eigenpairs are ordered
    by ``sort_eigenvalues_and_eigenvectors``, and
    ``get_filtered_B_inv_lambda_and_P`` builds the filtered inverse for
    ``cutoff_mode``.

    Args:
        A: Array multiplied by the filtered inverse to obtain gamma.
        B: Square matrix to decompose.
        half_B, other_half_B, std_half_B, total_phis, num_decoys,
        noise_iterations, relative_error_threshold, mode: Accepted for
            call compatibility; not read by this body.
        cutoff_mode: Cutoff forwarded to ``get_filtered_B_inv_lambda_and_P``.
            Defaults to 600, the value that used to be hard-coded here, so
            existing calls behave as before.

    Returns:
        Tuple ``(filtered_gamma, filtered_B, filtered_lamb, P, lamb)``;
        ``lamb`` is the sorted eigenvalue array before filtering.
    """
    lamb, P = np.linalg.eig(B)
    lamb, P = sort_eigenvalues_and_eigenvectors(lamb, P)
    print(cutoff_mode)
    # Pass a copy so `lamb` is a separate object from the array the helper works on.
    filtered_B_inv, filtered_lamb, P = get_filtered_B_inv_lambda_and_P(
        np.copy(lamb), cutoff_mode, P)
    filtered_gamma = np.dot(filtered_B_inv, A)
    filtered_B = np.linalg.inv(filtered_B_inv)
    return filtered_gamma, filtered_B, filtered_lamb, P, lamb
In [20]:
np.savetxt("/Users/weilu/Research/server/april_2019/optimization_mult_seq_2/gammas/cutoff_600", filtered_gamma)
In [27]:
cutoff600 = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_mult_seq_2/gammas/cutoff_600")
In [28]:
np.std(cutoff600)
Out[28]:
In [25]:
original = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/original_gamma")
In [26]:
np.std(original)
Out[26]:
In [30]:
np.std(cutoff600 * np.std(original)/np.std(cutoff600))
Out[30]:
In [31]:
normalized_cutoff600 = cutoff600 * np.std(original)/np.std(cutoff600)
In [35]:
np.savetxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600", normalized_cutoff600)
In [12]:
plot_contact_well(Gamma_filtered[:210], inferBound=True, invert_sign=False)
In [11]:
plot_contact_well(filtered_gamma[:210], inferBound=True, invert_sign=False)
In [10]:
plt.plot(filtered_lamb)
plt.yscale('log')
In [7]:
plt.plot(lamb)
plt.yscale('log')
In [5]:
plt.plot(filtered_lamb)
plt.yscale('log')
In [6]:
plt.plot(Lamb)
plt.yscale('log')
In [2]:
folder = "/Users/weilu/Research/optimization/mediated_term"
In [264]:
data = glob.glob(folder+"/multisequenceanddcafrustratometry/*.fasta")
In [77]:
def getSeqFromFasta(location):
    """Return the sequence stored in a FASTA file as one string.

    The first line of the file is skipped; every following line is stripped
    of surrounding whitespace and the pieces are concatenated in order.
    """
    with open(location, "r") as handle:
        body_lines = handle.readlines()[1:]
    return "".join(row.strip() for row in body_lines)
In [265]:
len(data)
Out[265]:
In [56]:
data[0]
Out[56]:
In [266]:
from Bio.PDB.PDBParser import PDBParser

# Build one record per FASTA file: [name, full name, range length, FASTA
# length, sequence read from the PDB, problematic code].
# Problematic codes (a later check overwrites an earlier one, so only the
# last failing check is kept):
#   0 - no check failed
#   1 - residue range in the file name disagrees with the FASTA length
#   2 - a residue name could not be converted to a one-letter code
#   3 - a residue sits on a chain other than the one encoded in the name
#   4 - sequence read from the PDB differs from the FASTA sequence
targetPre = "/Users/weilu/Research/server/march_2019/optimization_mult_seq/original_pdbs/"
filtered_data = []
for i, one in enumerate(data):
    problematic = 0
    # e.g. one = '/Users/weilu/Research/optimization/mediated_term/multisequenceanddcafrustratometry/1VBHA_518-876.fasta'
    pre = one.split(".")[0]          # path without the ".fasta" extension
    p = pre.split("/")[-1]           # e.g. "1VBHA_518-876"
    name = p.split("_")[0]           # e.g. "1VBHA"
    chainName = name[-1]             # last character of the name is the chain id
    # rsplit keeps a leading minus sign of a negative start index with `low`.
    low, high = p.split("_")[1].rsplit('-', 1)
    length = int(high) - int(low) + 1
    os.system(f"cp {pre}.pdb {targetPre}{p}.pdb")
    seq1 = getSeqFromFasta(one)
    if length != len(seq1):
        print(i, name, length, len(seq1))
        problematic = 1
    pdbFileLocation = pre + ".pdb"
    structure = PDBParser().get_structure(name, pdbFileLocation)
    seq = ""
    for r in structure.get_residues():
        chain = r.get_full_id()[2]
        try:
            resName = three_to_one(r.get_resname())
        except KeyError:
            # Unknown residue name. Previously `resName` kept the value from
            # the preceding residue (or was undefined for the first residue),
            # so a wrong letter was silently appended. Record "X" instead.
            resName = "X"
            problematic = 2
        if chain != chainName:
            print(i, name, length, len(seq1), chain, chainName)
            problematic = 3
        seq += resName
    if seq != seq1:
        print(seq, seq1)
        problematic = 4
    filtered_data.append([name, p, length, len(seq1), seq, problematic])
In [268]:
filtered_data[0]
Out[268]:
In [271]:
complete_data = pd.DataFrame(filtered_data, columns=["Name", "FullName", "SeqLength", "Length", "Seq", "Problematic"])
# complete_data.to_csv(folder+"/data_info_3.csv")
In [270]:
complete_data.shape
Out[270]:
In [2]:
data = pd.read_csv("/Users/weilu/Research/server/april_2019/optimization_mult_seq/data_info_3.csv", index_col=0)
In [354]:
data.sort_values("Length").query("Problematic == 0")
Out[354]:
In [248]:
complete_data.query("Problematic == 0").shape
Out[248]:
In [249]:
# No entries have Problematic == 2 or 3.
complete_data.query("Problematic == 4").shape
Out[249]:
In [250]:
complete_data.query("Problematic == 0 and Length > 500")
Out[250]:
In [166]:
b = a.query("Length < 100").query("Problematic == 0").sample(10, random_state=0).reset_index(drop=True)
b.to_csv("/Users/weilu/Research/server/march_2019/optimization_mult_seq_test/chosen.csv")
In [284]:
data = pd.read_csv("/Users/weilu/Research/server/march_2019/optimization_mult_seq_test/chosen.csv", index_col=0)
In [349]:
Out[349]:
In [208]:
data
Out[208]:
In [285]:
for i, line in data.iterrows():
if "X" in line["Seq"]:
print(line)
In [302]:
complete_data_filtered.shape
Out[302]:
In [303]:
n = len(complete_data_filtered)
In [ ]:
In [332]:
In [333]:
In [334]:
number_of_runs
Out[334]:
In [345]:
# Split the protein names into files of at most `perRun` names each:
# proteins_name_list_0.txt, proteins_name_list_1.txt, ...
perRun = 10
n = len(complete_data_filtered)
# Derived from perRun so the number of files stays correct if perRun changes.
number_of_runs = int(np.ceil(n / perRun))
full_names = complete_data_filtered["FullName"]
count = 0
for i in range(number_of_runs):
    with open(to_location+f"proteins_name_list/proteins_name_list_{i}.txt", "w") as out:
        for fullName in full_names.iloc[count:count + perRun]:
            out.write(fullName+"\n")
    count = min(count + perRun, n)
In [347]:
# Write every FullName, one per line, into a single list file.
with open(to_location+f"proteins_name_list/proteins_name_list.txt", "w") as out:
    out.writelines(fullName+"\n" for fullName in complete_data_filtered["FullName"])
In [300]:
complete_data_filtered = complete_data.query("Problematic != 4").reset_index(drop=True)
In [336]:
from_location = "/Users/weilu/Research/optimization/mediated_term/multisequenceanddcafrustratometry/"
to_location = "/Users/weilu/Research/server/march_2019/optimization_mult_seq/"
# os.system("mkdir -p database/S20_seq")
# os.system("mkdir -p database/dompdb")
for i, line in complete_data_filtered.iterrows():
    print(i)
    fullName = line["FullName"]
    # Copy the filtered alignment and the cleaned PDB into the run directory.
    copy_commands = (
        f"cp {from_location}{fullName}_filtered_0.05.seqs {to_location}alignments/",
        f"cp {to_location}../optimization_mult_seq/cleaned_pdbs/{fullName}.pdb {to_location}database/dompdb/",
    )
    for command in copy_commands:
        os.system(command)
    # The .seq file receives the second line of the FASTA file followed by an
    # additional newline.
    seq_path = to_location+f"database/S20_seq/{fullName}.seq"
    fasta_path = from_location+f"{fullName}.fasta"
    with open(seq_path, "w") as out, open(fasta_path) as f:
        a = f.readlines()
        out.write(a[1]+"\n")
In [286]:
from_location = "/Users/weilu/Research/optimization/mediated_term/multisequenceanddcafrustratometry/"
to_location = "/Users/weilu/Research/server/march_2019/optimization_mult_seq_test/"
# os.system("mkdir -p database/S20_seq")
# os.system("mkdir -p database/dompdb")
for i, line in data.iterrows():
    fullName = line["FullName"]
    # Copy the filtered alignment and the cleaned PDB into the test directory.
    # The "aligments" spelling is the directory name generate_decoy_sequences
    # reads from, so it is kept as is.
    copy_commands = (
        f"cp {from_location}{fullName}_filtered_0.05.seqs {to_location}aligments/",
        f"cp {to_location}../optimization_mult_seq/cleaned_pdbs/{fullName}.pdb {to_location}database/dompdb/",
    )
    for command in copy_commands:
        os.system(command)
    # The .seq file receives the second line of the FASTA file followed by an
    # additional newline.
    seq_path = to_location+f"database/S20_seq/{fullName}.seq"
    fasta_path = from_location+f"{fullName}.fasta"
    with open(seq_path, "w") as out, open(fasta_path) as f:
        a = f.readlines()
        out.write(a[1]+"\n")
    # One single-entry name list per protein, numbered by the frame index.
    with open(to_location+f"proteins_name_list/proteins_name_list_{i}.txt", "w") as out:
        out.write(fullName+"\n")
In [288]:
loc = "/Users/weilu/Research/server/march_2019/optimization_mult_seq_test/phis/phi_burial_well_2HLQA_33-127_native_4.0"
a = np.loadtxt(loc)
In [290]:
a.shape
Out[290]:
In [293]:
b = np.average(a, axis=0)
In [294]:
len(b)
Out[294]:
In [297]:
c = np.zeros(100)
c[0:60] = b
In [298]:
c
Out[298]:
In [253]:
info = []
for i, line in complete_data.iterrows():
fullName = line["FullName"]
try:
with open(f"{from_location}{fullName}_filtered_0.05.seqs") as f:
a = f.readlines()
info.append([fullName, len(a)])
except:
print(fullName, line)
In [256]:
a = pd.DataFrame(info, columns=["Name", "Length"])
In [260]:
a.hist("Length", bins=100)
plt.yscale("log")
In [261]:
a.query("Length < 1e5").hist("Length", bins=100)
plt.yscale("log")
In [262]:
a.query("Length < 2e4").hist("Length", bins=100)
plt.yscale("log")
In [275]:
a.query("Length < 1e4").hist("Length", bins=100)
plt.yscale("log")
In [278]:
"777" in "woewowefggjeg77 77sdfsdf"
Out[278]:
In [263]:
len(a)
Out[263]:
In [240]:
def generate_decoy_sequences(fullName, location="./", num_decoys=1000):
    """Write shuffled decoy sequences for one protein.

    Reads ``{location}aligments/{fullName}_filtered_0.05.seqs``, draws
    ``num_decoys`` distinct lines from it at random, shuffles the characters
    of each drawn (stripped) line, and writes the results one per line to
    ``{location}decoys/multiShuffle/{fullName}.decoys``.

    Args:
        fullName: Protein identifier used in the file names.
        location: Base directory, including the trailing separator.
        num_decoys: Number of decoys to write. The previous body overwrote
            this argument with 10, so the value passed by the caller was
            ignored.

    Raises:
        ValueError: From ``random.sample`` if the alignment file has fewer
            than ``num_decoys`` lines.
    """
    # Local import: `random` is not among the notebook's visible imports.
    import random

    with open(location+f"aligments/{fullName}_filtered_0.05.seqs") as f:
        aligned_sequences = f.readlines()
    # The native sequence in database/S20_seq/{fullName}.seq used to be read
    # here into a variable that was never used; that read has been removed.
    with open(location+f"decoys/multiShuffle/{fullName}.decoys", "w") as out:
        for seq in random.sample(aligned_sequences, num_decoys):
            s = seq.strip()
            shuffled_seq = ''.join(random.sample(s, len(s)))
            out.write(shuffled_seq+"\n")
# print(shuffled_seq)
In [281]:
random.sample(c, 10)
Out[281]:
In [229]:
c = a + b
In [230]:
c[-4:]
Out[230]:
In [227]:
len(a)
Out[227]:
In [236]:
s = a[0]
''.join(random.sample(s,len(s)))
Out[236]:
In [234]:
random.shuffle(list(a[0].strip()))
In [237]:
In [214]:
len(a)
Out[214]:
In [176]:
pwd
Out[176]:
In [163]:
a.drop_duplicates("Name").shape
Out[163]:
In [ ]:
In [152]:
a.shape
Out[152]:
In [151]:
a.query("Problematic == 0").shape
Out[151]:
In [150]:
a.query("Problematic == 4")
Out[150]:
In [147]:
a.query("Problematic == 4")
Out[147]:
In [113]:
len(filtered_data)
Out[113]:
In [116]:
a = pd.DataFrame(filtered_data, columns=["Name", "Length", "Seq", "Problematic"])
a.to_csv(folder+"/data_info.csv")
In [123]:
b = a.query("Problematic == 0")
In [139]:
pd.read_csv(folder+"/data_info.csv", index_col=0)["Name"].values
Out[139]:
In [142]:
a.drop_duplicates("Name")
Out[142]:
In [140]:
a.query("Name == '5M3MC'")
Out[140]:
In [133]:
len(b)
Out[133]:
In [125]:
a.to_csv(folder+"/data_info.csv")
In [134]:
b.query("Length < 200").sample(1)
Out[134]:
In [135]:
b.query("Length < 200").shape
Out[135]:
In [131]:
b.sort_values("Length")["Length"].hist(bins=100)
Out[131]:
In [ ]:
In [ ]:
In [109]:
len(filtered_data)
Out[109]:
In [110]:
pd.DataFrame([["a", 1],["b", 2]])
Out[110]:
In [82]:
seq1 = getSeqFromFasta(one)
In [84]:
from Bio.PDB.PDBParser import PDBParser

# Parse "<pre>.pdb" and spell out its residues as a one-letter sequence,
# asserting that every residue belongs to chain "A".
pdbFileLocation = pre + ".pdb"
pdb_parser = PDBParser()
structure = pdb_parser.get_structure(name, pdbFileLocation)
seq = ""
for r in structure.get_residues():
    full_id = r.get_full_id()
    chain = full_id[2]
    resId = full_id[3][1]
    resName = three_to_one(r.get_resname())
    assert chain == "A"
    seq = seq + resName
In [81]:
len(seq)
Out[81]:
In [85]:
seq == seq1
Out[85]:
In [87]:
seq1
Out[87]:
In [86]:
seq
Out[86]:
In [76]:
targetPre = "/Users/weilu/Research/server/march_2019/optimization_mult_seq/original_pdbs/"
os.system(f"cp {pre}.pdb {targetPre}{name.lower()[:4]}.pdb")
Out[76]:
In [75]:
pre
Out[75]:
In [73]:
name
Out[73]:
In [61]:
protein = one.split(".")[-2]
In [69]:
import re
# Raw string: "\W" is not a valid escape sequence in an ordinary string
# literal and triggers an invalid-escape warning on recent Python versions.
# The resulting pattern is unchanged.
re.split(r"\W+", one)
Out[69]:
In [71]:
Out[71]:
In [62]:
one.split(".")
Out[62]:
In [7]:
alen(data)
Out[7]:
In [9]:
with open(folder+"/3GL5A_2-207_filtered_0.05.seqs", "r") as f:
d = f.readlines()
In [11]:
len(d)
Out[11]:
In [13]:
len(d[0])
Out[13]:
In [14]:
len(d[0].strip())
Out[14]:
In [15]:
d[0].strip()
Out[15]:
In [18]:
with open(folder+"/3GL5A_2-207.fasta", "r") as f:
fastaFile = f.readlines()
fasta = fastaFile[1].strip()
In [19]:
fasta
Out[19]:
In [20]:
len(fasta)
Out[20]:
In [21]:
from Bio.PDB.PDBParser import PDBParser

# Parse the cleaned 3gl5 structure and spell out its residues as a
# one-letter sequence, asserting that every residue belongs to chain "A".
pdbFileLocation = '/Users/weilu/Research/server/march_2019/optimization_mult_seq/cleaned_pdbs/3gl5.pdb'
pdb_parser = PDBParser()
structure = pdb_parser.get_structure('3gl5', pdbFileLocation)
seq = ""
for r in structure.get_residues():
    full_id = r.get_full_id()
    chain = full_id[2]
    resId = full_id[3][1]
    resName = three_to_one(r.get_resname())
    assert chain == "A"
    seq = seq + resName
In [ ]:
# Rebuild the one-letter sequence from the parsed structure; every residue
# must belong to chain "A".
seq = ""
for r in structure.get_residues():
    full_id = r.get_full_id()
    chain = full_id[2]
    resId = full_id[3][1]
    resName = three_to_one(r.get_resname())
    assert chain == "A"
    seq = seq + resName
In [55]:
seq == fasta
Out[55]:
In [22]:
a = pp.get_sequence()
In [ ]:
In [29]:
list(structure.get_chains())
Out[29]:
In [33]:
c = structure[0]["A"]
In [50]:
three_to_one('As')
In [51]:
# Walk the residues of `structure`, convert each three-letter name to its
# one-letter code and concatenate them; chain "A" is asserted for each one.
seq = ""
for r in structure.get_residues():
    residue_full_id = r.get_full_id()
    chain = residue_full_id[2]
    resId = residue_full_id[3][1]
    resName = three_to_one(r.get_resname())
    assert chain == "A"
    seq = seq + resName
In [52]:
seq
Out[52]:
In [54]:
len(seq)
Out[54]:
In [ ]:
In [37]:
r = list(c.get_residues())[0]
In [38]:
r.get_segid()
Out[38]:
In [39]:
r.get_id()
Out[39]:
In [42]:
r.get_full_id()
Out[42]:
In [48]:
r.get_resname()
Out[48]:
In [45]:
list(structure.get_residues())
Out[45]:
In [ ]: