In [2]:
from pyCodeLib import *
import warnings
import glob
import re
import numpy as np
import pandas as pd
from Bio.PDB.Polypeptide import one_to_three
from Bio.PDB.Polypeptide import three_to_one


warnings.filterwarnings('ignore')


# sys.path.insert(0, MYHOME)
%load_ext autoreload
%autoreload 2

In [4]:
normalized_mutli_iter0 = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600")

In [5]:
normalized_mutli_iter0.shape


Out[5]:
(690,)

In [8]:
relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_group/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")

In [1]:
a = [0]*5

In [2]:
a[1] = 3

In [5]:
six_letter_code_combinations = ['000004', '000013', '000022', '000031', '000040', '000103', '000112', '000121', '000130', '000202', '000211', '000220', '000301', '000310', '000400', '001003', '001012', '001021', '001030', '001102', '001111', '001120', '001201', '001210', '001300', '002002', '002011', '002020', '002101', '002110', '002200', '003001', '003010', '003100', '004000', '010003', '010012', '010021', '010030', '010102', '010111', '010120', '010201', '010210', '010300', '011002', '011011', '011020', '011101', '011110', '011200', '012001', '012010', '012100', '013000', '020002', '020011', '020020', '020101', '020110', '020200', '021001', '021010', '021100', '022000', '030001', '030010', '030100', '031000', '040000', '100003', '100012', '100021', '100030', '100102', '100111', '100120', '100201', '100210', '100300', '101002', '101011', '101020', '101101', '101110', '101200', '102001', '102010', '102100', '103000', '110002', '110011', '110020', '110101', '110110', '110200', '111001', '111010', '111100', '112000', '120001', '120010', '120100', '121000', '130000', '200002', '200011', '200020', '200101', '200110', '200200', '201001', '201010', '201100', '202000', '210001', '210010', '210100', '211000', '220000', '300001', '300010', '300100', '301000', '310000', '400000']

In [7]:
from datetime import datetime

In [10]:
datetime.now()


Out[10]:
datetime.datetime(2019, 5, 1, 13, 5, 15, 173443)

In [6]:
len(six_letter_code_combinations)


Out[6]:
126

In [4]:
i_count = 0
for i in range(5):
    for j in range(5):
        for k in range(5):
            for l in range(5):
                for m in range(5):
                    for n in range(5):
                        if i+j+k+l+m+n != 4:
                            continue

                        i_count += 1
print(i_count)


126

In [9]:
relaitve_k.shape


Out[9]:
(11,)

In [12]:
iter1 = np.zeros(normalized_mutli_iter0.shape)

In [ ]:


In [210]:
# relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_group_correct_phi/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")

n = 2
direct_k = np.zeros((n,n))

c = 0
for i in range(n):
    for j in range(i, n):
        direct_k[i][j] = relaitve_k[c]
        if i != j:
            direct_k[j][i] = direct_k[i][j]
        c += 1
protein_mediated = np.zeros((n,n))
for i in range(n):
    for j in range(i, n):
        protein_mediated[i][j] = relaitve_k[c]
        if i != j:
            protein_mediated[j][i] = protein_mediated[i][j]
        c += 1
water_mediated = np.zeros((n,n))
for i in range(n):
    for j in range(i, n):
        water_mediated[i][j] = relaitve_k[c]
        if i != j:
            water_mediated[j][i] = water_mediated[i][j]
        c += 1
burial = np.zeros(n)
for i in range(n):
    burial[i] = relaitve_k[c]
    c += 1

In [212]:
relaitve_k


Out[212]:
array([ -1.65544,   1.56111,  -4.24015,  -2.43098,   5.95314,  -9.54182,
       -85.27961,  25.48701,   4.63337,   7.21751,  -8.84623])

In [27]:
direct_k


Out[27]:
array([[0.4623 , 0.75084],
       [0.75084, 0.57789]])

In [213]:
protein_mediated


Out[213]:
array([[-2.43098,  5.95314],
       [ 5.95314, -9.54182]])

In [214]:
water_mediated


Out[214]:
array([[-85.27961,  25.48701],
       [ 25.48701,   4.63337]])

In [215]:
burial


Out[215]:
array([ 7.21751, -8.84623])

In [24]:
relaitve_k


Out[24]:
array([ 0.4623 ,  0.75084,  0.57789, -0.15398,  0.91116, -0.67762,
       -0.70208, -0.90577,  0.86935,  0.28824, -1.93509])

In [32]:
normalized_mutli_iter0[:10]


Out[32]:
array([ 0.53610512,  0.49380235,  0.30363139,  0.52868829,  0.1319227 ,
        0.46272257,  0.51411357,  0.25170126,  0.42018033, -0.80350618])

In [37]:
res_type_map_letters[0]


Out[37]:
'A'

In [39]:
res_type_map_HP = {
    'C': 0,
    'M': 0,
    'F': 0,
    'I': 0,
    'L': 0,
    'V': 0,
    'W': 0,
    'Y': 0,
    'A': 1,
    'H': 1,
    'T': 1,
    'G': 1,
    'P': 1,
    'D': 1,
    'E': 1,
    'N': 1,
    'Q': 1,
    'R': 1,
    'K': 1,
    'S': 1
}

In [131]:
iter1 = np.zeros(normalized_mutli_iter0.shape)
c = 0 
for i in range(20):
    type1 = res_type_map_HP[res_type_map_letters[i]]
    for j in range(i, 20):
        type2 = res_type_map_HP[res_type_map_letters[j]]
        iter1[c] = normalized_mutli_iter0[c] * direct_k[type1][type2]
        c += 1
for i in range(20):
    type1 = res_type_map_HP[res_type_map_letters[i]]
    for j in range(i, 20):
        type2 = res_type_map_HP[res_type_map_letters[j]]
        iter1[c] = normalized_mutli_iter0[c] * protein_mediated[type1][type2]
        c += 1
for i in range(20):
    type1 = res_type_map_HP[res_type_map_letters[i]]
    for j in range(i, 20):
        type2 = res_type_map_HP[res_type_map_letters[j]]
        iter1[c] = normalized_mutli_iter0[c] * water_mediated[type1][type2]
        c += 1
for i in range(3):
    for j in range(20):
        type2 = res_type_map_HP[res_type_map_letters[j]]
        iter1[c] = normalized_mutli_iter0[c] * burial[type2]
        c += 1
print(c)


690

In [349]:
def getIterAfterGroupOptimization(pre, relaitve_k):
    n = 2
    direct_k = np.zeros((n,n))

    c = 0
    for i in range(n):
        for j in range(i, n):
            direct_k[i][j] = relaitve_k[c]
            if i != j:
                direct_k[j][i] = direct_k[i][j]
            c += 1
    protein_mediated = np.zeros((n,n))
    for i in range(n):
        for j in range(i, n):
            protein_mediated[i][j] = relaitve_k[c]
            if i != j:
                protein_mediated[j][i] = protein_mediated[i][j]
            c += 1
    water_mediated = np.zeros((n,n))
    for i in range(n):
        for j in range(i, n):
            water_mediated[i][j] = relaitve_k[c]
            if i != j:
                water_mediated[j][i] = water_mediated[i][j]
            c += 1
    burial = np.zeros(n)
    for i in range(n):
        burial[i] = relaitve_k[c]
        c += 1
    iter1 = np.zeros(pre.shape)
    c = 0 
    for i in range(20):
        type1 = res_type_map_HP[res_type_map_letters[i]]
        for j in range(i, 20):
            type2 = res_type_map_HP[res_type_map_letters[j]]
            iter1[c] = pre[c] * direct_k[type1][type2]
            c += 1
    for i in range(20):
        type1 = res_type_map_HP[res_type_map_letters[i]]
        for j in range(i, 20):
            type2 = res_type_map_HP[res_type_map_letters[j]]
            iter1[c] = pre[c] * protein_mediated[type1][type2]
            c += 1
    for i in range(20):
        type1 = res_type_map_HP[res_type_map_letters[i]]
        for j in range(i, 20):
            type2 = res_type_map_HP[res_type_map_letters[j]]
            iter1[c] = pre[c] * water_mediated[type1][type2]
            c += 1
    for i in range(3):
        for j in range(20):
            type2 = res_type_map_HP[res_type_map_letters[j]]
            iter1[c] = pre[c] * burial[type2]
            c += 1
    print(c)
#     normalized_iter1 = iter1 * np.std(pre) / np.std(iter1) 
    return iter1

In [380]:
normalized_mutli_iter0 = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600")
relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_only_iter0/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")
t = getIterAfterGroupOptimization(normalized_mutli_iter0, relaitve_k)
# t = t * np.std(normalized_mutli_iter0) / np.std(t) 

np.savetxt("/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_iter0_only", t)


690

In [313]:
normalized_mutli_iter0 = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600")
relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")
t = getIterAfterGroupOptimization(normalized_mutli_iter0, relaitve_k)
# t = t * np.std(normalized_mutli_iter0) / np.std(t) 

np.savetxt("/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_normalized_correct_test_2", t)


690

In [246]:
np.mean(t)


Out[246]:
0.11065182543150173

In [209]:
relaitve_k


Out[209]:
array([ -1.65544,   1.56111,  -4.24015,  -2.43098,   5.95314,  -9.54182,
       -85.27961,  25.48701,   4.63337,   7.21751,  -8.84623])

In [192]:
cutoff600 = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_mult_seq_2/gammas/cutoff_600")

In [193]:
np.mean(cutoff600)


Out[193]:
-0.024137607291968073

In [198]:
centered = (cutoff600 - np.mean(cutoff600) + np.mean(original) ) * np.std(original) / np.std(cutoff600)

In [199]:
np.std(centered)


Out[199]:
0.4362224236601442

In [200]:
np.mean(centered)


Out[200]:
-0.21613870259219412

In [203]:
np.std(original)


Out[203]:
0.4362224236601442

In [202]:
np.mean(original)


Out[202]:
-0.11829680821739127

In [182]:
centered = normalized_mutli_iter0 - np.mean(normalized_mutli_iter0) + np.mean(original)

In [185]:
np.savetxt("/Users/weilu/Research/server/april_2019/complete_gammas/multi_original_centered", centered)

In [ ]:


In [183]:
np.mean(centered)


Out[183]:
-0.11829680821739127

In [179]:
np.mean(normalized_mutli_iter0)


Out[179]:
-0.044101537500306626

In [181]:
np.mean(original)


Out[181]:
-0.11829680821739127

In [159]:
np.sum(normalized_iter1_2- normalized_iter1_3)


Out[159]:
-18.333939374744403

In [180]:
original = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/original_gamma")

In [ ]:


In [172]:
np.savetxt("/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_3", normalized_iter1_3)

In [ ]:
normalized_mutli_iter0 = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600")
relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_group_correct_phi/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")

In [137]:
t = getIterAfterGroupOptimization(normalized_mutli_iter0, relaitve_k)


690

In [132]:
np.std(iter1)


Out[132]:
0.4896216168552758

In [133]:
normalized_iter1 = iter1 * np.std(normalized_mutli_iter0) / np.std(iter1)

In [147]:
np.savetxt("/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_2", -normalized_iter1)

In [151]:
np.savetxt("/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_check", normalized_iter1)

using this new gamma to compute the Z


In [382]:
from collections import defaultdict
dataset = {"old":"1R69, 1UTG, 3ICB, 256BA, 4CPV, 1CCR, 2MHR, 1MBA, 2FHA".split(", "),
            "new":"1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", "),
            "test":["t089", "t120", "t251", "top7", "1ubq", "t0766", "t0778", "t0782", "t0792", "t0803", "t0815", "t0833", "t0842", "t0844"]}
dataset["combined"] = dataset["old"] + dataset["new"]

pdb_list_dic = {"../iterative_optimization_old_set":"old",
                "../iterative_optimization_new_temp_range":"new",
                "../iterative_optimization_biased_sampling":"new"}
pdb_list_dic_rev = {"old":"iterative_optimization_old_set",
                        "new":"iterative_optimization_new_temp_range"}

iteration_source_dic = {"bias_2":"../iterative_optimization_biased_sampling",
                        "bias_old_gamma":"../iterative_optimization_biased_sampling",
                        "iter1_with_bias_96percent":"../iterative_optimization_new_temp_range",
                        "iter1_with_bias_98percent":"../iterative_optimization_new_temp_range",
                        "new_iter1_0":"../iterative_optimization_new_temp_range",
                        "new_iter1_90":"../iterative_optimization_new_temp_range",
                        "new_iter1_96":"../iterative_optimization_new_temp_range",
                        "new_iter1_98":"../iterative_optimization_new_temp_range",
                        "new_iter1_combined_on_B":"../iterative_optimization_new_temp_range",
                        "new_iter2_8":"../iterative_optimization_new_temp_range",
                        "new_iter2_10":"../iterative_optimization_new_temp_range",
                        "old_new_iter2_8":"../iterative_optimization_new_temp_range",
                        "new_iter3_10":"../iterative_optimization_old_set",
                        "single":"../iterative_optimization_old_set",
                        "iter4_30":"../iterative_optimization_old_set",
                        "iter4_6":"../iterative_optimization_old_set",
                        "iter4_13":"../iterative_optimization_old_set",
                        "iter5_30":"../iterative_optimization_old_set",
                        "iter6_30":"../iterative_optimization_old_set",
                        "noFrag":"../iterative_optimization_old_set",
                        "iter0_normalized_noFrag":"../iterative_optimization_combined_train_set",
                        "iter1_normalized_noFrag":"../iterative_optimization_combined_train_set",
                        "iter2_normalized_noFrag":"../iterative_optimization_combined_train_set",
                        "iter3_normalized_noFrag":"../iterative_optimization_combined_train_set",
                        "iter4_normalized_noFrag":"../iterative_optimization_combined_train_set",
                        "iter3_normalized_noFrag_90":"../iterative_optimization_combined_train_set",
                        "iter5_normalized_noFrag":"../iterative_optimization_combined_train_set_with_frag",
                        "original":"../iterative_optimization_combined_train_set_with_frag",
                        "iter6_normalized_noFrag":"../iterative_optimization_combined_train_set_with_frag",
                        "iter0":"../iterative_optimization_combined_train_set_with_frag",
                        "without_contact":"../iterative_optimization_combined_train_set_with_frag",
                        "original_with_rg":"../iterative_optimization_combined_train_set_with_frag",
                        "iter1_with_rg":"../iterative_optimization_combined_train_set_with_frag",
                        "iter6_with_rg":"../iterative_optimization_combined_train_set_with_frag",
                        "iter2_with_rg":"../iterative_optimization_combined_train_set_with_frag",
                        "iter3_with_rg":"../iterative_optimization_combined_train_set_with_frag",
                        "iter2_with_rg_90":"../iterative_optimization_combined_train_set_with_frag",
                        "iter3_with_rg_less_frag":"../iterative_optimization_combined_train_set_with_frag",
                        "multi_iter0":"../iterative_optimization_combined_train_set_with_frag",
                        "multi_iter1":"../iterative_optimization_combined_train_set_with_frag",
                            "multi_groupedNorm":"../iterative_optimization_combined_train_set_with_frag",
                            "multi_iter2":"../iterative_optimization_combined_train_set_with_frag",
                            "multi_groupedNorm_check":"../iterative_optimization_combined_train_set_with_frag",

                        }
pdb_list_dic = {"../iterative_optimization_old_set":"old",
                "../iterative_optimization_new_temp_range":"new",
                "../iterative_optimization_biased_sampling":"new",
                "../iterative_optimization_combined_train_set":"combined",
                "../iterative_optimization_combined_train_set_with_frag":"combined"}
# new_simulation_list = ["iter1_with_bias_96percent", "new_iter2_10"]
# old_protein_simulation_list = ["single", "new_iter3_10"]

# new_simulation_list = ["bias_2","bias_old_gamma", "iter1_with_bias_96percent", "iter1_with_bias_98percent", "new_iter2_10", "new_iter1_90", "new_iter2_8", "old_new_iter2_8"]
# old_protein_simulation_list = ["noFrag", "iter6_30", "iter5_30", "single", "new_iter3_10", "iter4_30", "iter4_6", "iter4_13"]
# combined_simulation_list = ["iter5_normalized_noFrag", "original", "iter0_normalized_noFrag", "iter1_normalized_noFrag", "iter2_normalized_noFrag", "iter3_normalized_noFrag", "iter4_normalized_noFrag", "iter3_normalized_noFrag_90"]
# new_data = ["iter5_normalized_noFrag", "original"]


new_simulation_list = []
old_protein_simulation_list = []
combined_simulation_list = ["multi_iter1", "iter3_with_rg_less_frag", "multi_iter0", "iter3_with_rg", "iter2_with_rg_90", "iter2_with_rg", "iter0", "without_contact", "original_with_rg", "iter1_with_rg", "iter6_with_rg"]
# new_data = ["multi_iter1"]
combined_simulation_list = ["multi_iter1", "iter3_with_rg_less_frag", "multi_iter0", "without_contact", "original_with_rg"]
# combined_simulation_list = ["multi_iter0"]

# combined_simulation_list = ["multi_groupedNorm"]
# combined_simulation_list = ["multi_iter2"]

new_data = []

simulation_location_list_dic = defaultdict(list)
for p in dataset["new"]:
    name = p.lower()[:4]
    simulation_location_list_dic[name] += new_simulation_list
for p in dataset["old"]:
    name = p.lower()[:4]
    simulation_location_list_dic[name] += old_protein_simulation_list
for p in dataset["combined"]:
    name = p.lower()[:4]
    simulation_location_list_dic[name] += combined_simulation_list

# simulation_location_list = ["multi_iter1"] 
cwd = os.getcwd()
print(cwd)
Run = 30
decoy_n = 1000
n = len(combined_simulation_list)


/Users/weilu/Research/server/april_2019/optimization_test

relative k


In [370]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_normalize_relative_k1_gamma"
data = validate_hamiltonian_wei("phi_list_relative_k.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.9069489161706255+0j)
Out[370]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.906949 1.453930 1.778054 0.357378
1 1enh 0.985537 0.839888 1.368144 0.536008
2 2gb1 1.079965 2.509042 3.105200 0.552016
3 2cro 2.712171 1.108918 2.882465 0.653921
4 1ctf 1.260369 3.503305 6.116724 2.073534
5 4icb -0.091443 1.979762 1.874206 1.154340
6 1r69 0.350150 1.993553 2.486989 1.409214
7 1utg 1.399246 2.698984 3.691827 0.709556
8 3icb 0.093880 1.922433 2.014342 0.978999
9 256b 2.327309 2.371710 4.135706 0.757955
10 4cpv 2.089008 2.375804 4.474900 1.004830
11 1ccr 3.186951 0.921298 5.584829 1.463321
12 2mhr 1.966495 2.557881 6.767437 2.140639
13 1mba 2.249461 1.433117 4.488645 1.358337
14 2fha 1.543130 2.580974 5.302115 1.763391

In [372]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_normalized_apr26"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.9069455039988377+0j)
Out[372]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.906946 9.509919 11.629953 2.337555
1 1enh 0.985544 5.493527 8.948746 3.505899
2 2gb1 1.079953 16.411642 20.310939 3.610618
3 2cro 2.712130 7.253434 18.853676 4.277171
4 1ctf 1.260374 22.914806 40.008656 13.562521
5 4icb -0.091411 12.948948 12.258769 7.550251
6 1r69 0.350148 13.039531 16.266988 9.217420
7 1utg 1.399223 17.653761 24.147659 4.641073
8 3icb 0.093915 12.573972 13.175347 6.403396
9 256b 2.327278 15.513414 27.051197 4.957630
10 4cpv 2.089022 15.540184 29.270021 6.572376
11 1ccr 3.186961 6.026478 36.529788 9.571286
12 2mhr 1.966502 16.731056 44.265006 14.001485
13 1mba 2.249460 9.374779 29.360368 8.884615
14 2fha 1.543141 16.881999 34.680556 11.533981

In [375]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (3.323809191418299+0j)
Out[375]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 3.323809 1.935051 6.717076 1.438718
1 1enh 3.206551 6.203263 12.503053 1.964662
2 2gb1 4.640695 -23.920461 -11.641314 2.645972
3 2cro 5.246270 -11.166243 3.202014 2.738756
4 1ctf 4.148457 25.670878 34.944346 2.235402
5 4icb 5.672571 -15.031520 -1.553487 2.376001
6 1r69 2.869913 -5.568738 1.474874 2.454294
7 1utg 2.076055 -18.356731 -12.177535 2.976412
8 3icb 4.405987 -13.377446 -1.660375 2.659352
9 256b 4.418805 3.026435 17.662955 3.312326
10 4cpv 5.842750 20.329227 40.155155 3.393253
11 1ccr 6.485876 -23.627812 9.488204 5.105866
12 2mhr 4.514758 -5.264950 17.102839 4.954371
13 1mba 5.537162 40.929257 70.042394 5.257772
14 2fha 7.556860 -32.222225 9.979156 5.584513

In [374]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.15290199537968704+0j)
Out[374]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.152902 -28.992596 -27.184842 11.822957
1 1enh 0.080996 -23.960075 -23.121380 10.354832
2 2gb1 -0.504960 -18.192487 -25.066090 13.612181
3 2cro -0.619151 -26.656842 -44.731148 29.192063
4 1ctf -0.430731 -60.023665 -76.668315 38.642787
5 4icb -0.614493 -49.083201 -69.524317 33.265010
6 1r69 0.408498 -37.485392 -27.237745 25.086142
7 1utg -0.956318 -18.708530 -50.304104 33.038778
8 3icb -0.538191 -50.699963 -68.670571 33.390762
9 256b -0.186851 -67.584370 -73.547922 31.916056
10 4cpv -0.691643 -58.752905 -101.823112 62.272337
11 1ccr -0.672497 -37.727471 -77.759887 59.528026
12 2mhr -0.457610 -70.312479 -102.303078 69.907978
13 1mba -0.451267 -89.587258 -130.133837 89.850445
14 2fha -0.355404 -82.171720 -109.048777 75.623992

In [376]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/iter_multi_group_iter1_apr26"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.23269582913642559+0j)
Out[376]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.232696 -26.336071 -23.465021 12.338212
1 1enh 0.204255 -22.836269 -20.384657 12.002719
2 2gb1 -0.394936 -11.794488 -17.293273 13.923234
3 2cro -0.444232 -24.889428 -38.738475 31.175265
4 1ctf -0.278589 -53.062370 -62.911836 35.354805
5 4icb -0.578150 -46.013131 -67.931117 37.910554
6 1r69 0.505721 -33.713731 -21.420448 24.308420
7 1utg -0.863709 -11.777266 -42.230769 35.259001
8 3icb -0.500993 -47.891833 -66.613554 37.369212
9 256b -0.031826 -64.406388 -65.483201 33.834013
10 4cpv -0.600748 -55.059796 -94.363732 65.424952
11 1ccr -0.467269 -37.146376 -65.641440 60.982137
12 2mhr -0.307871 -66.738309 -88.078593 69.315572
13 1mba -0.354899 -90.443144 -124.245982 95.246368
14 2fha -0.243840 -79.204624 -99.550069 83.437564

In [377]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/iter_multi_group_apr26_15"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.18567200837329298+0j)
Out[377]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.185672 -28.596654 -26.312348 12.302911
1 1enh 0.134702 -24.069929 -22.553492 11.257720
2 2gb1 -0.461209 -16.014403 -22.490274 14.041064
3 2cro -0.546939 -26.568145 -43.347749 30.679096
4 1ctf -0.373558 -58.607862 -72.875724 38.194466
5 4icb -0.599089 -48.995039 -70.523280 35.934994
6 1r69 0.447542 -36.836032 -25.511019 25.304926
7 1utg -0.918358 -16.325185 -48.204239 34.713113
8 3icb -0.522555 -50.756979 -69.460113 35.791684
9 256b -0.122972 -67.890950 -72.002800 33.437333
10 4cpv -0.654508 -58.639945 -101.195434 65.019058
11 1ccr -0.589819 -38.385172 -74.661279 61.503753
12 2mhr -0.398437 -70.522166 -98.927534 71.292061
13 1mba -0.411607 -92.060912 -130.818521 94.161724
14 2fha -0.308386 -82.910254 -107.760686 80.582183

In [383]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_iter0_only"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (-0.20044274150919+0j)
Out[383]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 -0.200443 26.498155 23.223222 16.338494
1 1enh -0.224065 18.649030 15.219895 15.304195
2 2gb1 0.632764 3.723007 17.056591 21.071984
3 2cro 0.432941 14.881535 31.400690 38.155686
4 1ctf 0.378874 50.288398 67.915396 46.524718
5 4icb 0.417804 32.078290 47.512736 36.941838
6 1r69 -0.496760 27.972490 13.808359 28.513013
7 1utg 0.891971 -7.382588 30.243406 42.182976
8 3icb 0.407629 33.879715 48.640539 36.211407
9 256b 0.041400 66.984388 68.544168 37.675657
10 4cpv 0.986098 9.420485 78.917183 70.476440
11 1ccr 0.443941 7.317954 44.252068 83.195927
12 2mhr 0.287101 44.839586 68.482680 82.351003
13 1mba 0.201569 63.491286 85.860671 110.976397
14 2fha 0.241917 49.863443 71.516896 89.507635

In [384]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/iter_multi_group_iter0_only_15"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.14970740626443105+0j)
Out[384]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.149707 -28.259689 -26.561327 11.344543
1 1enh 0.071669 -23.481936 -22.768952 9.948251
2 2gb1 -0.494498 -18.239103 -24.662173 12.989061
3 2cro -0.627937 -26.355153 -43.972560 28.055989
4 1ctf -0.433019 -58.685738 -74.814858 37.248060
5 4icb -0.622210 -48.344127 -68.395996 32.226869
6 1r69 0.402387 -36.784597 -26.984334 24.355301
7 1utg -0.957036 -19.196521 -49.649987 31.820610
8 3icb -0.543217 -49.907310 -67.488938 32.365723
9 256b -0.193622 -65.671211 -71.636725 30.810112
10 4cpv -0.677680 -59.005624 -99.804336 60.203485
11 1ccr -0.683525 -37.839953 -76.846807 57.067209
12 2mhr -0.465383 -69.297433 -100.698953 67.474523
13 1mba -0.462731 -88.044327 -128.142472 86.655346
14 2fha -0.360476 -81.085130 -107.397026 72.992176

In [381]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_iter0_only"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (4.06968441135203+0j)
Out[381]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 4.069684 26.498155 40.507580 3.442386
1 1enh 3.747978 18.649030 30.963431 3.285612
2 2gb1 8.822935 3.723007 39.947730 4.105745
3 2cro 8.341816 14.881535 75.326804 7.246056
4 1ctf 10.097734 50.288398 117.903667 6.696083
5 4icb 11.171815 32.078290 88.932142 5.089043
6 1r69 5.525025 27.972490 62.748057 6.294192
7 1utg 6.575540 -7.382588 76.739193 12.793137
8 3icb 8.518524 33.879715 89.215953 6.495989
9 256b 6.622561 66.984388 111.338028 6.697355
10 4cpv 16.553031 9.420485 155.765507 8.840980
11 1ccr 14.099696 7.317954 136.490708 9.161386
12 2mhr 11.054889 44.839586 169.517190 11.278051
13 1mba 11.633520 63.491286 197.090582 11.483996
14 2fha 7.378837 49.863443 164.145318 15.487789

explore why E native is different for relative k gamma and the converted gamma.


In [371]:
normalized_mutli_iter0 = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600")
relaitve_k = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_normalize_relative_k1_gamma")
t = getIterAfterGroupOptimization(normalized_mutli_iter0, relaitve_k)
t = t * np.std(normalized_mutli_iter0) / np.std(t) 
new_gamma = t 
np.savetxt("/Users/weilu/Research/server/april_2019/gammas_complete/mutli_group_normalized_apr26", t)


690

In [353]:
direct = np.loadtxt("/Users/weilu/Research/server/april_2019/phis/phi_pairwise_contact_well_1r69_multi_iter1_native_4.5_6.5_5.0_10")
burial = np.loadtxt("/Users/weilu/Research/server/april_2019/phis/phi_burial_well_1r69_multi_iter1_native_4.0")
phi = np.loadtxt("/Users/weilu/Research/server/april_2019/phis/phi_normalize_relative_k_1r69_multi_iter1_native_1")
burial_HP = phi[-2:]

In [302]:
np.sum(relaitve_k[-2:] * burial_HP)


Out[302]:
181.60457402507433

In [256]:
relaitve_k[-2:]


Out[256]:
array([ 4.57706, -6.48611])

In [328]:
np.sum(normalized_mutli_iter0[630:] * burial)


Out[328]:
-19.547737520544807

In [329]:
np.sum(phi[-2:])


Out[329]:
-19.54711038572897

In [330]:
np.sum(normalized_mutli_iter0[:210] * direct)


Out[330]:
-7.374369838900499

In [331]:
np.sum(phi[:3])


Out[331]:
-7.374392610377298

In [354]:
np.sum(new_gamma[:210] * direct)


Out[354]:
16.001611484499612

In [355]:
np.sum(relaitve_k[:3] * phi[:3])


Out[355]:
16.00176745298115

In [356]:
np.sum(new_gamma[630:] * burial)


Out[356]:
181.60640789040195

In [357]:
np.sum(relaitve_k[-2:] * burial_HP)


Out[357]:
181.60457402507436

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [242]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (2.744680117094902+0j)
Out[242]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 2.744680 -32.219842 -20.971483 4.098241
1 1enh 3.616229 -51.479522 -32.537929 5.237941
2 2gb1 4.698842 8.525538 40.254484 6.752503
3 2cro 4.892017 -36.455338 -1.082517 7.230723
4 1ctf 3.061144 113.542466 134.879489 6.970278
5 4icb 4.617961 -36.362889 -4.101792 6.986005
6 1r69 1.377066 8.263962 18.042864 7.101259
7 1utg 1.375256 -3.878534 5.994301 7.178907
8 3icb 4.444853 -36.109087 -4.618184 7.084802
9 256b 3.403290 3.288068 40.446049 10.918253
10 4cpv 5.660298 140.116817 189.434827 8.712971
11 1ccr 5.818824 104.588676 173.718623 11.880399
12 2mhr 2.721218 121.753237 157.016101 12.958487
13 1mba 5.718847 244.551172 310.771398 11.579297
14 2fha 6.235800 21.944437 101.277068 12.722126

In [243]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_group_correct_phi_2/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list_contact.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (2.579928002470748+0j)
Out[243]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 2.579928 -33.977177 -23.593693 4.024718
1 1enh 3.590208 -51.435739 -32.089524 5.388606
2 2gb1 4.966652 6.861182 41.266754 6.927317
3 2cro 4.812885 -37.586820 -1.291249 7.541333
4 1ctf 3.140788 117.970983 140.782726 7.263063
5 4icb 4.389622 -34.149328 -1.797693 7.370027
6 1r69 1.250809 8.237188 17.441793 7.358924
7 1utg 1.447186 -2.462500 7.999819 7.229420
8 3icb 4.302885 -33.453960 -2.008127 7.308080
9 256b 3.624872 -0.526098 39.328192 10.994675
10 4cpv 5.235787 148.183003 197.615142 9.441206
11 1ccr 5.525826 120.348338 188.914952 12.408393
12 2mhr 2.656941 132.720021 167.568377 13.115967
13 1mba 5.931559 256.459363 326.936490 11.881721
14 2fha 6.050397 29.334929 108.782911 13.131037

In [ ]:

only one run, multi iter 2


In [175]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (5.329596253586245+0j)
Out[175]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 5.329596 -97.490309 -11.267791 16.178058
1 1enh 3.714175 -129.790529 -63.729653 17.786152
2 2gb1 9.303248 -156.198400 11.705171 18.047845
3 2cro 9.808932 -171.007406 28.101269 20.298711
4 1ctf 10.923785 44.513313 263.822895 20.076337
5 4icb 9.388147 -160.762884 86.912963 26.381759
6 1r69 5.347680 -113.548726 -1.558844 20.941770
7 1utg 8.482160 -129.933827 51.161856 21.350186
8 3icb 8.834743 -147.311402 76.505361 25.333703
9 256b 7.752919 -149.944852 24.894471 22.551417
10 4cpv 15.651142 -118.345673 278.619219 25.363318
11 1ccr 14.086395 -293.867064 150.567719 31.550640
12 2mhr 10.610058 -97.147941 224.520829 30.317344
13 1mba 15.496629 -92.132306 432.294745 33.841364
14 2fha 13.902511 -444.312967 25.493008 33.792887

In [236]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_test_2"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_normalize_relative_k1_gamma"
data = validate_hamiltonian_wei("phi_list_relative_k.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (25.37327750437509+0j)
Out[236]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 25.373278 -141.980195 138.927374 11.071001
1 1enh 35.224855 -185.818663 178.272177 10.336191
2 2gb1 31.875889 -193.651406 170.329905 11.418703
3 2cro 18.048462 -197.713919 197.456484 21.894962
4 1ctf 26.178092 -209.678978 212.458506 16.125602
5 4icb 32.994030 -304.544591 292.572074 18.097719
6 1r69 25.290466 -220.435921 199.251162 16.594676
7 1utg 23.817054 -250.875944 224.280605 19.950265
8 3icb 35.156813 -311.585984 289.434001 17.095406
9 256b 31.570309 -282.307440 333.839540 19.516660
10 4cpv 28.146730 -323.381353 345.619029 23.768316
11 1ccr 28.837517 -249.293803 328.415837 20.033266
12 2mhr 35.866471 -370.579611 400.421671 21.496435
13 1mba 20.424964 -332.500548 401.482988 35.935610
14 2fha 37.334589 -503.366857 559.524078 28.469336

In [226]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_test/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_test"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/proteins_name_list_phi_normalize_relative_k1_gamma"
data = validate_hamiltonian_wei("phi_list_relative_k.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (3.0670839917531443+0j)
Out[226]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 3.067084 -238.620221 114.875728 115.254734
1 1enh 3.087372 -308.282547 147.093439 147.496311
2 2gb1 3.296432 -321.514388 139.313861 139.796077
3 2cro 2.930175 -329.919730 169.772897 170.533366
4 1ctf 2.841338 -399.383490 215.946459 216.563457
5 4icb 3.044248 -528.601822 257.988935 258.385923
6 1r69 3.111858 -369.613618 174.224203 174.763048
7 1utg 3.174108 -424.083501 193.830033 194.673107
8 3icb 3.115520 -538.988067 254.236175 254.604122
9 256b 2.661892 -490.339644 294.042965 294.671114
10 4cpv 2.893818 -611.661822 321.839125 322.584531
11 1ccr 2.977748 -544.268326 273.940978 274.774538
12 2mhr 2.899444 -665.584277 349.233332 350.004227
13 1mba 2.512601 -613.240376 403.595047 404.694295
14 2fha 2.813174 -876.186132 481.941820 482.774214

In [208]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_test"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (-0.7785625572725587+0j)
Out[208]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 -0.778563 15.724115 14.971336 0.966882
1 1enh -0.857263 20.420186 19.486177 1.089524
2 2gb1 -3.660392 22.661257 18.202556 1.218094
3 2cro -0.126194 22.437517 22.221975 1.708010
4 1ctf 0.747117 26.715554 27.930645 1.626372
5 4icb -0.884219 35.802427 34.521224 1.448965
6 1r69 -1.162904 24.753218 23.086217 1.433482
7 1utg -2.077597 30.541210 26.121620 2.127260
8 3icb -1.704480 36.528655 34.051203 1.453494
9 256b 3.410401 31.958260 38.078629 1.794619
10 4cpv -1.161129 44.975473 42.444982 2.179336
11 1ccr -0.907471 38.520517 36.522833 2.201375
12 2mhr 0.668119 45.214261 46.822112 2.406534
13 1mba 4.119132 41.933685 53.820336 2.885717
14 2fha 1.383783 59.922998 63.932649 2.897600

In [177]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_test"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (-2.172711921492115+0j)
Out[177]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 -2.172712 -0.812395 -2.676399 0.857916
1 1enh -2.884478 2.234617 0.210406 0.701760
2 2gb1 -6.858332 6.108313 -0.996106 1.035882
3 2cro -5.708815 2.456680 -5.591613 1.409801
4 1ctf -7.987736 -3.609573 -12.731486 1.141990
5 4icb -5.848075 2.842043 -6.288045 1.561212
6 1r69 -3.885521 0.939568 -3.437161 1.126420
7 1utg -5.652986 8.483598 -3.210749 2.068703
8 3icb -6.237105 2.744125 -5.767669 1.364703
9 256b -2.701631 -3.874825 -7.968332 1.515198
10 4cpv -11.008795 7.779069 -15.951679 2.155617
11 1ccr -8.367106 9.250335 -9.770382 2.273273
12 2mhr -5.074708 2.278361 -11.650519 2.744765
13 1mba -7.479056 -4.900818 -23.517526 2.489179
14 2fha -6.152933 5.202115 -12.278705 2.841055

In [164]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/iter_multi_iter2"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.32813540429785293+0j)
Out[164]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.328135 -32.869283 -29.846201 9.212907
1 1enh 0.303215 -31.402105 -29.015923 7.869601
2 2gb1 -0.343159 -41.623280 -45.192385 10.400725
3 2cro -0.397593 -38.206859 -47.125332 22.431188
4 1ctf -0.366632 -67.848082 -78.532530 29.142195
5 4icb -0.468538 -61.885816 -74.212406 26.308603
6 1r69 0.195245 -48.203319 -44.644285 18.228570
7 1utg -0.847994 -38.432243 -59.511640 24.857951
8 3icb -0.418489 -61.971023 -72.895200 26.103857
9 256b -0.040790 -75.917259 -76.932887 24.899176
10 4cpv -0.610212 -83.433184 -112.415115 47.494841
11 1ccr -0.481562 -85.150915 -106.995663 45.362243
12 2mhr -0.300094 -104.695659 -120.331010 52.101564
13 1mba -0.368211 -136.028997 -161.274277 68.561921
14 2fha -0.159214 -122.910580 -132.059784 57.464970

In [173]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_correct_3"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (-1.0420057481487388+0j)
Out[173]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 -1.042006 -9.058026 -11.510900 2.353993
1 1enh -1.126255 -4.057777 -7.927308 3.435750
2 2gb1 -1.044006 -15.997458 -19.737067 3.581981
3 2cro -2.925307 -7.412005 -19.996809 4.302046
4 1ctf -1.222518 -26.003415 -43.825628 14.578284
5 4icb 0.102962 -13.290380 -12.572156 6.975611
6 1r69 -1.601661 -12.362236 -20.246264 4.922407
7 1utg -1.514912 -18.593834 -25.837844 4.781804
8 3icb -0.064607 -13.015977 -13.409528 6.091497
9 256b -2.292311 -15.182407 -26.539098 4.954255
10 4cpv -2.118797 -19.152332 -34.836469 7.402379
11 1ccr -2.948578 -7.855197 -39.535346 10.744211
12 2mhr -1.875728 -21.543533 -49.444234 14.874601
13 1mba -2.297710 -17.877586 -39.535855 9.426025
14 2fha -1.682448 -20.314614 -38.002604 10.513247

In [168]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_group_correct_phi_2/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (2.5843292106511555+0j)
Out[168]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 2.584329 -33.807592 -23.465494 4.001850
1 1enh 3.610412 -51.494636 -32.257894 5.328130
2 2gb1 4.913295 6.976602 41.021585 6.929155
3 2cro 4.889124 -37.601514 -1.406690 7.403131
4 1ctf 3.162037 117.876041 140.684784 7.213307
5 4icb 4.476919 -34.095312 -1.752761 7.224287
6 1r69 1.219380 8.299799 17.437454 7.493687
7 1utg 1.466626 -2.411346 8.005802 7.102799
8 3icb 4.263285 -33.445365 -1.932287 7.391737
9 256b 3.627542 -0.494520 39.495007 11.023863
10 4cpv 5.225571 148.032052 197.362497 9.440202
11 1ccr 5.643339 120.125345 188.810689 12.171048
12 2mhr 2.691431 132.423451 167.102701 12.885062
13 1mba 5.978182 256.545129 326.984100 11.782675
14 2fha 6.140449 28.811918 108.015605 12.898680

In [165]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/optimization_multi_iter1_correct_phi/gammas/proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma_filtered"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (2.4828147964542224+0j)
Out[165]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 2.482815 52.923033 86.995881 13.723475
1 1enh 2.076786 119.546608 158.009825 18.520551
2 2gb1 3.587303 -265.793075 -162.701408 28.737928
3 2cro 4.713304 -65.778917 48.827093 24.315428
4 1ctf 3.704938 -271.101438 -179.634801 24.687765
5 4icb 6.014450 -105.103500 17.294480 20.350651
6 1r69 2.546984 -28.691472 24.690745 20.958996
7 1utg 2.611957 -110.528885 -39.555397 27.172538
8 3icb 4.344024 -85.056741 10.790746 22.064218
9 256b 3.994876 -177.101871 -54.729052 30.632442
10 4cpv 3.684924 -440.666970 -288.295656 41.349916
11 1ccr 6.585857 -780.011644 -373.283277 61.757851
12 2mhr 3.056886 -382.741909 -214.142939 55.153830
13 1mba 4.949694 -679.071064 -403.728362 55.628231
14 2fha 7.034065 -685.383920 -243.335724 62.843920

In [166]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/mutli_group_normalized_2"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.7984852942465899+0j)
Out[166]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.798485 -42.201317 -40.045188 2.700274
1 1enh 2.048859 -60.508035 -53.077833 3.626507
2 2gb1 0.381139 -43.389702 -42.064389 3.477240
3 2cro 0.076409 -53.111697 -52.789757 4.213383
4 1ctf 1.325868 -65.886841 -56.782921 6.866385
5 4icb 1.081715 -87.941315 -83.279520 4.309632
6 1r69 2.801610 -63.959089 -52.527439 4.080387
7 1utg 1.653784 -59.885841 -53.371786 3.938879
8 3icb 1.624683 -89.009963 -82.171401 4.209167
9 256b -0.543316 -103.627364 -109.609560 11.010526
10 4cpv -0.919007 -83.395480 -90.027557 7.216572
11 1ccr 0.743565 -77.168440 -72.276134 6.579527
12 2mhr 1.493558 -103.078464 -91.634176 7.662436
13 1mba -0.687656 -87.801019 -96.517339 12.675403
14 2fha -0.796232 -145.314736 -155.005611 12.170916

In [205]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/multi_original_centered"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.26079451804949966+0j)
Out[205]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.260795 -37.653693 -34.422850 12.388461
1 1enh 0.091020 -33.308153 -32.331360 10.731647
2 2gb1 -0.198443 -33.140230 -35.959843 14.208699
3 2cro -0.593647 -40.948309 -59.660158 31.520177
4 1ctf -0.361268 -78.841639 -93.762336 41.300911
5 4icb -0.622126 -64.862661 -87.379830 36.193928
6 1r69 -0.101881 -50.839146 -53.472092 25.843375
7 1utg -0.923017 -30.534971 -63.922862 36.172572
8 3icb -0.547125 -66.389580 -86.168221 36.150146
9 256b -0.104779 -93.564173 -97.329818 35.938777
10 4cpv -0.679349 -85.117739 -131.133611 67.735241
11 1ccr -0.589801 -68.835126 -106.731532 64.252908
12 2mhr -0.465439 -98.311869 -133.685459 76.000545
13 1mba -0.436230 -127.280681 -170.647809 99.413397
14 2fha -0.276815 -126.550131 -149.917875 84.416606

In [171]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.156274796025097+0j)
Out[171]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.156275 -28.992596 -27.142675 11.837614
1 1enh 0.075814 -23.960075 -23.186892 10.198434
2 2gb1 -0.499431 -18.192487 -24.988922 13.608364
3 2cro -0.612100 -26.656842 -44.565857 29.258327
4 1ctf -0.431936 -60.023665 -76.721743 38.658643
5 4icb -0.610629 -49.083201 -69.411432 33.290614
6 1r69 -0.089119 -37.485392 -39.636875 24.141582
7 1utg -0.954058 -18.708530 -50.076712 32.878710
8 3icb -0.540551 -50.699963 -68.712193 33.321977
9 256b -0.183055 -67.584370 -73.428494 31.925518
10 4cpv -0.695240 -58.752905 -102.008073 62.216131
11 1ccr -0.673638 -37.727471 -77.747647 59.408998
12 2mhr -0.460260 -70.312479 -102.460676 69.847943
13 1mba -0.452302 -89.587258 -130.290985 89.992404
14 2fha -0.362563 -82.171720 -109.491216 75.351115

In [170]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/iter_multi_iter1_correct_real"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.24369756397575776+0j)
Out[170]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.243698 -28.741377 -26.161028 10.588324
1 1enh 0.270351 -25.886326 -23.443027 9.037508
2 2gb1 -0.240983 -15.460156 -18.359206 12.030101
3 2cro -0.485629 -27.005730 -39.535258 25.800605
4 1ctf -0.375159 -42.298917 -54.979141 33.799556
5 4icb -0.513951 -46.522550 -61.541893 29.223288
6 1r69 -0.050060 -32.394509 -33.459440 21.273186
7 1utg -0.922741 -16.768300 -43.551507 29.025711
8 3icb -0.444028 -47.888668 -60.936800 29.385847
9 256b -0.054282 -59.810647 -61.324630 27.891162
10 4cpv -0.617861 -38.415016 -72.157007 54.610989
11 1ccr -0.556534 -22.365945 -51.482234 52.317211
12 2mhr -0.408011 -50.049477 -75.310683 61.913025
13 1mba -0.371424 -55.788522 -85.343778 79.572742
14 2fha -0.253979 -69.981606 -86.904398 66.630593

In [163]:
os.chdir("/Users/weilu/Research/server/april_2019/optimization_multi_iter2/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/april_2019/complete_gammas/original_gamma"
data = validate_hamiltonian_wei("phi_list.txt", "proteins_name_list.txt", gamma_file_name, "lammps", n*decoy_n, mode=2, simulation_location_list_dic=simulation_location_list_dic)
data


0 (0.06838831996057486+0j)
Out[163]:
Protein Z_scores E_native E_mgs Std_mg
0 1fc2 0.068388 -54.879664 -54.392256 7.127063
1 1enh -0.512487 -54.113328 -58.485539 8.531361
2 2gb1 0.115170 -65.788891 -64.764799 8.892017
3 2cro -0.749168 -75.607544 -88.995377 17.870262
4 1ctf -0.451089 -95.643924 -105.205415 21.196441
5 4icb -0.573953 -96.301123 -109.478452 22.958915
6 1r69 -0.347996 -74.085447 -78.791857 13.524329
7 1utg -0.933715 -76.555773 -97.768524 22.718659
8 3icb -0.413450 -97.942451 -107.483931 23.077737
9 256b -0.307580 -126.514286 -133.193973 21.716881
10 4cpv -0.650470 -139.856600 -168.022385 43.300666
11 1ccr -0.991737 -111.007824 -153.573178 42.920010
12 2mhr -0.633280 -143.913085 -172.393175 44.972377
13 1mba -0.535166 -198.213895 -228.846468 57.239354
14 2fha -0.319088 -208.326703 -224.255503 49.919749

In [59]:
iter1[630:]


Out[59]:
array([-7.94458461e-01,  5.82561979e-01,  8.81648026e-01,  1.60095596e+00,
        4.39400207e-01,  1.02069292e+00,  1.85344695e+00,  6.47701792e-01,
       -4.03048013e-01,  3.99310506e-01,  3.23222270e-01,  1.50913733e+00,
        2.75128353e-01,  3.50502249e-01,  7.68935287e-01,  4.61405177e-01,
       -6.49050916e-03,  2.48547014e-01,  2.51877279e-01,  3.68950950e-01,
        8.80556512e-01,  1.73488159e+00,  1.09766260e+00,  1.21498559e+00,
        1.12133666e-01,  1.90410625e+00,  1.75084781e+00, -1.21537498e+00,
        1.44796500e-01,  1.44428763e-01, -2.33323410e-02,  1.60804382e+00,
       -3.57990358e-02,  1.26253609e-01, -1.57683077e+00,  5.20236658e-01,
        3.63045053e-01, -5.32826422e-03, -5.13790148e-04,  1.75547685e-01,
        3.41442253e+00,  7.55751228e-01,  1.19306689e+00,  1.31426924e+00,
       -3.41152814e-01,  1.86974210e+00,  1.71213491e+00, -7.53094269e-01,
       -6.22444952e-01, -1.97211272e-01, -3.29479364e-01,  8.61016603e-01,
       -2.64396832e-01,  1.25981308e-01, -3.26205940e+00,  8.31759618e-01,
        8.92754917e-01,  2.57398790e-02,  7.83639915e-02, -1.99526043e-01])

In [61]:
# now, positive means favored.
rhoGamma = pd.DataFrame(-iter1[630:].reshape(3,20).T, columns=["rho1", "rho2", "rho3"]).reset_index()
rhoGamma["oneLetter"] = rhoGamma["index"].apply(lambda x: inverse_res_type_map[x])
rhoGamma["Residue"] = rhoGamma["index"].apply(lambda x: one_to_three(inverse_res_type_map[x]))
rhoGamma = rhoGamma[["Residue", "rho1", "rho2", "rho3", "index", "oneLetter"]]
g = rhoGamma[["rho1", "rho2", "rho3"]].values
# np.savetxt("/Users/weilu/Research/server/feb_2019/burial_only_gamma.dat", g, fmt='%7.4f')
# rhoGamma
rhoGamma["hydrophobicityOrder"] = rhoGamma["oneLetter"].apply(lambda x: hydrophobicity_map[x])
rhoGamma.sort_values("hydrophobicityOrder")


Out[61]:
Residue rho1 rho2 rho3 index oneLetter hydrophobicityOrder
1 ARG -0.582562 -1.734882 -0.755751 1 R 0
11 LYS -1.509137 -1.608044 -0.861017 11 K 1
2 ASN -0.881648 -1.097663 -1.193067 2 N 2
5 GLN -1.020693 -1.904106 -1.869742 5 Q 3
3 ASP -1.600956 -1.214986 -1.314269 3 D 4
6 GLU -1.853447 -1.750848 -1.712135 6 E 5
8 HIS 0.403048 -0.144797 0.622445 8 H 6
18 TYR -0.251877 0.000514 -0.078364 18 Y 7
17 TRP -0.248547 0.005328 -0.025740 17 W 8
15 SER -0.461405 -0.520237 -0.831760 15 S 9
16 THR 0.006491 -0.363045 -0.892755 16 T 10
7 GLY -0.647702 1.215375 0.753094 7 G 11
14 PRO -0.768935 1.576831 3.262059 14 P 12
0 ALA 0.794458 -0.880557 -3.414423 0 A 13
12 MET -0.275128 0.035799 0.264397 12 M 14
4 CYS -0.439400 -0.112134 0.341153 4 C 15
13 PHE -0.350502 -0.126254 -0.125981 13 F 16
10 LEU -0.323222 0.023332 0.329479 10 L 17
19 VAL -0.368951 -0.175548 0.199526 19 V 18
9 ILE -0.399311 -0.144429 0.197211 9 I 19

In [56]:
plt.plot(iter1.flatten())


Out[56]:
[<matplotlib.lines.Line2D at 0x1a2492d6a0>]

In [57]:
plt.plot(normalized_mutli_iter0.flatten())


Out[57]:
[<matplotlib.lines.Line2D at 0x1a24632630>]

In [49]:
plot_contact_well(iter1[:210], inferBound=True, vmin=-2, vmax=2)



In [48]:
plot_contact_well(iter1[210:420], inferBound=True, vmin=-2, vmax=2)



In [ ]:


In [148]:
# pre = "/Users/weilu/Research/server_backup/feb_2019/jan_optimization/gammas/"
# pre = "/Users/weilu/Research/server/feb_2019/optimization_with_biased_iter1/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter1/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter3/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter4/gammas/"
pre = "/Users/weilu/Research/server/april_2019/optimization_multi_iter2/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter2_improved/gammas/"
# pp = "cath-dataset-nonredundant-S20Clean_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0"
# pp = "cath-dataset-nonredundant-S20Clean_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
pp = "proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
A_name = pp + "_A"
B_name = pp + "_B"
B_filtered_name = pp + "_B_filtered"
P_name = pp + "_P"
Gamma_name = pp + "_gamma"
Gamma_filtered_name = pp + "_gamma_filtered"
Lamb_name = pp + "_lamb"
Lamb_filtered_name = pp + "_lamb_filtered"

A = np.loadtxt(pre+A_name)
B = np.loadtxt(pre+B_name)
B_filtered = np.loadtxt(pre+B_filtered_name, dtype=complex, converters={
                           0: lambda s: complex(s.decode().replace('+-', '-'))})
Gamma = np.loadtxt(pre+Gamma_name)
Gamma_filtered = np.loadtxt(pre+Gamma_filtered_name, dtype=complex, converters={
                           0: lambda s: complex(s.decode().replace('+-', '-'))})
Lamb = np.loadtxt(pre+Lamb_name, dtype=complex, converters={
                           0: lambda s: complex(s.decode().replace('+-', '-'))})
Lamb_filtered = np.loadtxt(pre+Lamb_filtered_name, dtype=complex, converters={
                           0: lambda s: complex(s.decode().replace('+-', '-'))})

half_B_name = pp + "_half_B"
half_B = np.loadtxt(pre+half_B_name)
other_half_B_name = pp + "_other_half_B"
other_half_B = np.loadtxt(pre+other_half_B_name)
std_half_B_name = pp + "_std_half_B"
std_half_B = np.loadtxt(pre+std_half_B_name)

In [71]:
cc_0 = res_type_map["C"] * 20 + res_type_map["C"]
cc_1 = res_type_map["C"] * 20 + res_type_map["C"] + 210
cc_2 = res_type_map["C"] * 20 + res_type_map["C"] + 420

In [72]:
[cc_0, cc_1, cc_2]


Out[72]:
[84, 294, 504]

In [40]:
A.shape


Out[40]:
(690,)

In [41]:
B.shape


Out[41]:
(690, 690)

In [79]:
368-210


Out[79]:
158

In [99]:
158/20


Out[99]:
7.9

In [100]:
158-140


Out[100]:
18

In [55]:
np.argmin(A[:630])


Out[55]:
365

In [42]:
plt.plot(A)


Out[42]:
[<matplotlib.lines.Line2D at 0x1a24b29080>]

In [140]:


In [149]:
BB = B.copy()
AA = A.copy()
BB[294,:] = 0
BB[:,294] = 0
AA[294] = 0

In [150]:
total_phis = 690
num_decoys = 1000
filtered_gamma_modified, filtered_B, filtered_lamb, P, lamb = get_filtered_gamma_B_lamb_P_and_lamb(639, AA, BB, half_B, other_half_B, std_half_B, total_phis, num_decoys, mode=2)


639

In [151]:
plt.plot(filtered_gamma)


Out[151]:
[<matplotlib.lines.Line2D at 0x1a2436bba8>]

In [158]:
plot_contact_well(filtered_gamma_modified[210:420], inferBound=True, vmin=-2, vmax=2)



In [157]:
plot_contact_well(filtered_gamma_modified[210:420], inferBound=False, vmin=-2, vmax=2)



In [159]:
plot_contact_well(filtered_gamma[210:420], inferBound=False, vmin=-2, vmax=2)



In [162]:
np.max(np.abs(filtered_gamma - filtered_gamma_modified))


Out[162]:
0.8638396308661012

In [165]:
tt = filtered_gamma - filtered_gamma_modified

In [166]:
plot_contact_well(tt[210:420], inferBound=False, vmin=-2, vmax=2)



In [111]:
def get_filtered_gamma_B_lamb_P_and_lamb(cutoff_mode, A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, noise_iterations=10, relative_error_threshold=0.5, mode=2):
    lamb, P = np.linalg.eig(B)
    lamb, P = sort_eigenvalues_and_eigenvectors(lamb, P)

#     cutoff_mode = 600
    print(cutoff_mode)

    filtered_lamb = np.copy(lamb)
    filtered_B_inv, filtered_lamb, P = get_filtered_B_inv_lambda_and_P(
        filtered_lamb, cutoff_mode, P)

    filtered_gamma = np.dot(filtered_B_inv, A)
    filtered_B = np.linalg.inv(filtered_B_inv)
    return filtered_gamma, filtered_B, filtered_lamb, P, lamb

In [154]:
total_phis = 690
num_decoys = 1000
filtered_gamma, filtered_B, filtered_lamb, P, lamb = get_filtered_gamma_B_lamb_P_and_lamb(639, A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, mode=2)


639

In [155]:
plot_contact_well(filtered_gamma[210:420], inferBound=True, vmin=-2, vmax=2)



In [156]:




In [76]:
np.array(np.where(B == B.min()))


Out[76]:
array([[365, 640],
       [640, 365]])

In [78]:
np.array(np.where(B == B.max()))


Out[78]:
array([[368],
       [368]])

In [77]:
plt.imshow(B)
plt.colorbar()


Out[77]:
<matplotlib.colorbar.Colorbar at 0x1a198a1048>

In [ ]:


In [50]:
plt.plot(filtered_gamma)


Out[50]:
[<matplotlib.lines.Line2D at 0x1a1982ee80>]

In [54]:
np.argmin(filtered_gamma[:630])


Out[54]:
284

In [46]:
plt.plot(Gamma)


Out[46]:
[<matplotlib.lines.Line2D at 0x1a251d70f0>]

In [48]:
def get_filtered_gamma_B_lamb_P_and_lamb(cutoff_mode, A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, noise_iterations=10, relative_error_threshold=0.5, mode=2):
    lamb, P = np.linalg.eig(B)
    lamb, P = sort_eigenvalues_and_eigenvectors(lamb, P)

#     cutoff_mode = 600
    print(cutoff_mode)

    filtered_lamb = np.copy(lamb)
    filtered_B_inv, filtered_lamb, P = get_filtered_B_inv_lambda_and_P(
        filtered_lamb, cutoff_mode, P)

    filtered_gamma = np.dot(filtered_B_inv, A)
    filtered_B = np.linalg.inv(filtered_B_inv)
    return filtered_gamma, filtered_B, filtered_lamb, P, lamb

In [49]:
total_phis = 690
num_decoys = 1000
filtered_gamma, filtered_B, filtered_lamb, P, lamb = get_filtered_gamma_B_lamb_P_and_lamb(A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, mode=2)


600

In [3]:
# pre = "/Users/weilu/Research/server_backup/feb_2019/jan_optimization/gammas/"
# pre = "/Users/weilu/Research/server/feb_2019/optimization_with_biased_iter1/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter1/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter3/gammas/"
pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter4/gammas/"
pre = "/Users/weilu/Research/server/april_2019/optimization_mult_seq_2/gammas/"
# pre = "/Users/weilu/Research/server/march_2019/optimization_weighted_by_q_iter2_improved/gammas/"
# pp = "cath-dataset-nonredundant-S20Clean_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0"
# pp = "cath-dataset-nonredundant-S20Clean_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
pp = "proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
A_name = pp + "_A"
B_name = pp + "_B"
B_filtered_name = pp + "_B_filtered"
P_name = pp + "_P"
Gamma_name = pp + "_gamma"
Gamma_filtered_name = pp + "_gamma_filtered"
Lamb_name = pp + "_lamb"
Lamb_filtered_name = pp + "_lamb_filtered"

A = np.loadtxt(pre+A_name)
B = np.loadtxt(pre+B_name)
B_filtered = np.loadtxt(pre+B_filtered_name, dtype=complex, converters={
                           0: lambda s: complex(s.decode().replace('+-', '-'))})
Gamma = np.loadtxt(pre+Gamma_name)
Gamma_filtered = np.loadtxt(pre+Gamma_filtered_name, dtype=complex, converters={
                           0: lambda s: complex(s.decode().replace('+-', '-'))})
Lamb = np.loadtxt(pre+Lamb_name, dtype=complex, converters={
                           0: lambda s: complex(s.decode().replace('+-', '-'))})
Lamb_filtered = np.loadtxt(pre+Lamb_filtered_name, dtype=complex, converters={
                           0: lambda s: complex(s.decode().replace('+-', '-'))})

half_B_name = pp + "_half_B"
half_B = np.loadtxt(pre+half_B_name)
other_half_B_name = pp + "_other_half_B"
other_half_B = np.loadtxt(pre+other_half_B_name)
std_half_B_name = pp + "_std_half_B"
std_half_B = np.loadtxt(pre+std_half_B_name)

In [19]:
total_phis = 690
num_decoys = 1000
filtered_gamma, filtered_B, filtered_lamb, P, lamb = get_filtered_gamma_B_lamb_P_and_lamb(A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, mode=2)


600

In [18]:
def get_filtered_gamma_B_lamb_P_and_lamb(A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, noise_iterations=10, relative_error_threshold=0.5, mode=2):
    lamb, P = np.linalg.eig(B)
    lamb, P = sort_eigenvalues_and_eigenvectors(lamb, P)

    cutoff_mode = 600
    print(cutoff_mode)

    filtered_lamb = np.copy(lamb)
    filtered_B_inv, filtered_lamb, P = get_filtered_B_inv_lambda_and_P(
        filtered_lamb, cutoff_mode, P)

    filtered_gamma = np.dot(filtered_B_inv, A)
    filtered_B = np.linalg.inv(filtered_B_inv)
    return filtered_gamma, filtered_B, filtered_lamb, P, lamb

In [20]:
np.savetxt("/Users/weilu/Research/server/april_2019/optimization_mult_seq_2/gammas/cutoff_600", filtered_gamma)

In [27]:
cutoff600 = np.loadtxt("/Users/weilu/Research/server/april_2019/optimization_mult_seq_2/gammas/cutoff_600")

In [28]:
np.std(cutoff600)


Out[28]:
0.23875279981306488

In [25]:
original = np.loadtxt("/Users/weilu/Research/server/april_2019/complete_gammas/original_gamma")

In [26]:
np.std(original)


Out[26]:
0.4362224236601442

In [30]:
np.std(cutoff600 * np.std(original)/np.std(cutoff600))


Out[30]:
0.4362224236601442

In [31]:
normalized_cutoff600 = cutoff600 * np.std(original)/np.std(cutoff600)

In [35]:
np.savetxt("/Users/weilu/Research/server/april_2019/complete_gammas/normalized_cutoff600", normalized_cutoff600)

In [12]:
plot_contact_well(Gamma_filtered[:210], inferBound=True, invert_sign=False)



In [11]:
plot_contact_well(filtered_gamma[:210], inferBound=True, invert_sign=False)



In [10]:
plt.plot(filtered_lamb)
plt.yscale('log')



In [7]:
plt.plot(lamb)
plt.yscale('log')



In [5]:
plt.plot(filtered_lamb)
plt.yscale('log')



In [6]:
plt.plot(Lamb)
plt.yscale('log')



In [2]:
folder = "/Users/weilu/Research/optimization/mediated_term"

In [264]:
data = glob.glob(folder+"/multisequenceanddcafrustratometry/*.fasta")

In [77]:
def getSeqFromFasta(location):
    seq = ""
    with open(location, "r") as f:
        fastaFile = f.readlines()
    for line in fastaFile[1:]:
        seq += line.strip()
    return seq

In [265]:
len(data)


Out[265]:
1829

In [56]:
data[0]


Out[56]:
'/Users/weilu/Research/optimization/mediated_term/multisequenceanddcafrustratometry/1VBHA_518-876.fasta'

In [266]:
filtered_data = []
for i, one in enumerate(data):
    problematic = 0
    # one = '/Users/weilu/Research/optimization/mediated_term/multisequenceanddcafrustratometry/1VBHA_518-876.fasta'
    pre = one.split(".")[0]
    p = pre.split("/")[-1]
    name = p.split("_")[0]
    chainName = name[-1]
    low, high = p.split("_")[1].rsplit('-', 1)
#     try:
#         # low, high = p.split("_")[1].split("-")
#         low, high = p.split("_")[1].rsplit('-', 1)
#     except:
#         problematic = True
#         print(one)
#         continue
    length = int(high) - int(low) + 1

    targetPre = "/Users/weilu/Research/server/march_2019/optimization_mult_seq/original_pdbs/"
    # os.system(f"cp {pre}.pdb {targetPre}{name.lower()[:4]}.pdb")
    os.system(f"cp {pre}.pdb {targetPre}{p}.pdb")


    seq1 = getSeqFromFasta(one)

    if length != len(seq1):
        print(i, name, length, len(seq1))
        problematic = 1

    from Bio.PDB.PDBParser import PDBParser
    pdbFileLocation = pre + ".pdb"
    structure = PDBParser().get_structure(name, pdbFileLocation)
    seq = ""
    for r in structure.get_residues():
        _, _, chain, (_, resId, _) = r.get_full_id()
        try:
            resName = three_to_one(r.get_resname())
        except:
            problematic = 2
        # assert chain == "A"
        if chain != chainName:
            print(i, name, length, len(seq1), chain, chainName)
            problematic = 3
        seq += resName

    if seq != seq1:
        print(seq, seq1)
        problematic = 4
    # if not problematic:
        # filtered_data.append(one)
    filtered_data.append([name, p, length, len(seq1), seq, problematic])


6 2HR3D 68 63
9 4IHCH 257 237
11 4M7IA 417 172
13 5K98D 255 244
14 3GVZB 230 222
15 3C6MC 193 190
17 3JU1B 340 325
23 1W1WD 1211 316
25 3EBRA 93 89
27 3U07C 133 113
34 5I47B 193 181
35 2OC6B 96 94
36 5F1RB 76 69
37 4R0VB 341 322
39 1A0IA 233 225
40 3JU6D 216 196
41 3SDVA 269 263
42 3JC75 157 128
44 2WJSA 142 115
47 2ARRA 410 355
48 4CDGB 128 111
49 2QF9B 157 131
50 3EFZB 227 223
51 2VKTA 238 231
54 2PVSB 338 330
57 4H3VB 140 128
60 3HRRB 320 305
61 4MAFH 224 208
62 2QGGA 83 81
65 3MJFA 194 186
66 4UNSB 195 174
67 2G8SB 333 327
70 1OG0H 316 308
72 5D51L 551 549
76 4A0RB 269 251
78 5KF7A 113 110
79 4QQUA 325 324
80 5BYHM 193 192
84 3CRRA 260 175
85 4R6IA 108 106
86 4C5HA 81 66
88 3FZ0D 344 315
91 2YKQC 98 83
93 3THZB 203 181
97 3RC0B 135 124
100 4FX5A 160 156
101 5LQWh 93 0
NFLKKLRNEQVTIELKNGTTVWGTLQSVSPQMNAILTDVKLTLPSDNIASLQYINIRGNTIRQIIL 
104 2PF5E 93 81
106 3HWJB 158 147
107 4NA3B 128 120
109 3S5MA 229 207
114 3F9SB 130 128
118 4ZELB 118 112
119 4KBRH 153 147
121 3T7JB 114 108
122 3MI6D 84 83
125 1RYHB 193 159
128 3UE9D 427 411
130 7PCKD 61 122
131 2XHEB 202 185
136 2FNAB 235 227
143 4BY7X 128 113
148 1LXND 93 91
156 4RCNB 211 175
159 4R40C 104 102
162 2IMJD 130 128
165 3IG4F 122 118
167 4QLAB 296 252
168 3V2IA 193 184
170 5CAEB 209 208
172 5CVTB 150 149
175 4Z85A 162 148
177 3TQOA 63 61
183 5TPMD 129 125
187 4R7GA 255 254
188 4MEEA 261 252
189 3EEQB 75 72
190 3RQZC 208 205
194 1ZFJA 77 74
195 5FVBX 176 175
196 3MKAX 492 191
197 3X1DA 276 250
199 3NETB 338 295
200 5CUSD 164 142
204 3VKHB 339 186
211 5E9HB 523 516
214 3QO8A 112 109
215 4I9KB 202 149
216 4KC9A 69 63
218 5DY3A 354 349
222 4GP3B 115 113
227 5I2CD 69 61
231 2A9UB 111 110
233 3UBYB 195 163
235 4WYKC 151 149
238 3CNXC 145 138
239 3SWHB 158 105
240 1UT9A 501 497
245 5SZJB 135 127
246 2O8FB 167 165
248 4R4UD 256 238
249 4L7UA 137 134
250 3BRMB 296 262
251 3IVFA 97 58
253 4K06A 118 107
256 4MXWZ 142 119
263 3T1VD 187 183
266 4NV6A 128 127
271 3RHFD 231 224
272 1P49A 136 132
274 3L87A 179 174
278 5CZKB 320 309
280 2R39A 110 106
287 2HH9B 72 70
290 4ZAJA 89 78
291 3ZJCF 213 206
298 4QVSA 166 162
299 2OD0B 93 91
301 4FMOB 155 140
302 1PDIR 57 0
GAIMMWAADSLPSDAWRFCHGGTVSASDCPLYASRIGTRYGGSSSNPGLPDMRGLFV 
303 4F54A 175 163
304 4PF1D 257 252
307 2FWVA 160 150
309 2Q47B 150 147
314 3CEDC 74 73
317 2WWWD 284 254
318 1OFEB 397 385
326 2GM3F 157 131
327 4WKIA 211 204
329 4BQQB 162 147
331 4M4WO 192 179
KKQQSLMKSMYIQQDLLGATFQQVDISDPSRLAMFQHVTDFLKSYNETGKGKGLYLYGKFGVGKTFMLAAIANELAEKEYSSMIVYVPEFVRELKNSLQDQTLEEKLNMVKTTPVLMLDDIGAESTSWVRDEVIGTVLQHRMSQQLPTFFSSNFSPDELKHHFTYEKEEVKAARLMERIL KQQSLMKSMYIQQDLLGATFQQVDISDPSRLAMFQHVTDFLKSYNETGKGKGLYLYGKFGVGKTFMLAAIANELAEKEYSSMIVYVPEFVRELKNSLQDQTLEEKLNMVKTTPVLMLDDIGAESTSWVRDEVIGTVLQHRMSQQLPTFFSSNFSPDELKHHFTYEKEEVKAARLMERIL
336 5EZBB 193 113
337 5LQWB 93 0
ILLEPIYEVDITVHAPLLPIVEELMKKRRGSRIYKTIKVAGTPLLEVRGQVPVIESAGFETDLRLSTNGLGMCQLYFWHKIWRKVPGDVLDKD 
343 2A8IB 310 272
345 4A2IV 177 158
349 4BXIA 105 98
354 4ZC0B 289 262
355 4XXBA 99 84
358 5FRTE 98 84
359 2O1PA 179 173
360 1UXOA 179 177
361 5LQWB 98 0
AEWSLVRIYSGLLKRGDTVRILDTSQSESRQKRQLHDISKTEDDETPSCEVEEIGLLGGRYVYPVHEAHKGQIVLIKGISSAYIKSATLY 
362 3VKFB 575 519
363 1V0YA 171 164
368 5KI6A 131 123
371 4ZGZC 359 325
375 5I0CA 80 78
376 3B40A 390 385
378 3A1FA 151 128
380 3VHPB 238 239
387 5JPQk 260 166
389 3GKUC 75 71
391 3A98D 127 94
392 4TMAK 56 52
397 3KHKB 170 150
399 2GLTA 175 155
400 5EAYD 100 96
402 4B9XA 123 121
405 4US2S 118 102
407 5BUYF 136 119
408 3I4TA 237 217
411 3WVFB 284 273
416 2GYQA 159 151
TMEDLLLHGLRDIYYAEQQITKALPKMIEQATNRDLSQGLTSHLEETQKQIERLDQVFKKLGQKPSGVNCPAIDGLIKEADETAGEIADKTVLDAAIVANAQAVEHYEIARYGTLIAWAEELGHDDIVRFLTTNLNEEKAANTKLNTVALRA TMEDLLLHGLRDIYYAEQQITKALPKMIEQATNRDLSQGLTSHLEETQKQIERLDQVFKKLGQKPSGVNCPAIDGLIKEADETAGEIADKTVLDAAIVANAQAVEHYEIARYGTLIAWAEELGHDDIVRFLTTNLNEEKAANTKLNTVALR
417 1WR2A 224 222
419 3QTAB 103 99
421 2BDRB 162 155
422 2VQYA 150 148
423 3IBZA 175 172
430 5LQWX 536 0
FLALTSDSGNLSIVQIHAGALRLKTLVNQPLTRTTLRRVSPISYMEIDPNGRCIILSSVEQNKLCFLVDYAQKLRISSPLEIIRPHMVTLDMAVVDVNFNNPCFVTLEIDNAATQLSVHLELGLNHIVKKADYLVNPSANFVLSLPDLSRYNITTSLSDNNNPFVVIGFENHILVKDMNGFFSLKVEIPKRSITNSRHKNVTIISGIVQKLKNDFFVLLQSNHGDLFKLTVSPDTRPLVQLSYFDTIQNSHQLHIFKNGYLFALSEMNNNFLFQFEKLGVEKNDFSNVLTSSLVFEPSKLQNLSILSQQLNLNPSIKSQIVSDSPLSIATKHFTNNKIITLTNAVNYSNLISTSLPPNATKLWLIPDPTTGDNNTLLFITFPKKTMILQIDNTPDEATRSAFLSQDTTIHTCLMGSHSIIQVCTAELRHITGKSRYSNWVPPAGIRIVCATSSKTQLIISLSNYELVYFKIDVSSDLIELTTHPELDTMPSKVAIV 
433 5M3MA 623 498
435 1IYWB 66 0
DVEEWRRRQEKRLKELLALAERSQRKLASPGFREKAPKEVVEAEEARLKENLEQAERIREALSQIG 
437 1ON8B 255 252
438 2OZ5B 273 261
447 4FLEA 187 183
448 5DN6G 287 260
450 2APJD 237 233
452 2RHSD 108 94
461 3KFVA 69 55
463 4IT5D 77 73
465 5CHEB 151 148
466 5K1EA 547 546
467 1QQEA 284 275
472 5CBQF 266 259
479 3THZA 97 93
480 5AH4B 123 121
481 4W8FB 221 217
482 3WEOA 143 124
483 5G55A 143 113
484 2Y2WF 220 214
486 1WPXB 177 162
489 5C7JB 303 299
490 3C6AA 170 162
492 3IHJA 402 392
493 4KFFC 418 417
496 3C4VB 181 174
497 5A15O 96 89
502 5JENC 83 80
505 3G5TA 263 257
510 3L22A 385 374
517 1XETD 226 222
520 2JI4A 185 144
524 4HXCA 226 223
526 2OZ0B 349 332
529 4DCMA 173 161
531 4D1EA 112 111
535 5LKDB 140 123
538 5CHEB 105 103
543 3WX6B 150 91
545 2IPXA 228 215
546 2ZTSC 235 218
550 5JPQj 216 140
554 3M3IH 169 151
556 1PVND 481 350
557 2Q4MA 182 153
562 5JLCA 468 459
564 2PG3A 212 202
565 3OHGA 190 184
571 5JJLF 75 70
573 3KJSD 210 200
578 1XHND 167 163
579 3MXQA 120 108
581 3B49A 188 179
583 4LD0B 152 142
584 2WYIB 496 493
585 4XD7C 128 127
587 4EHIB 115 96
588 4R3AB 77 60
599 3RQSB 186 183
600 3J99F 79 66
604 3MJFA 102 101
605 5DWUB 108 101
607 4NHAA 170 134
620 3MPVB 93 80
621 5SYTA 246 219
622 3EBWB 145 136
623 4EBBB 428 420
624 3CU2B 207 205
627 2VZNB 150 146
628 5AZYB 98 96
630 2P4HX 234 222
633 2VOSA 241 229
636 5FGJD 331 325
640 4W5WA 148 119
642 4KMRB 168 162
645 2O8RB 349 327
647 4ECCA 88 66
650 4LCMD 390 381
651 5LHKA 313 309
652 4KQTA 147 145
654 4XAJD 995 183
655 3UJ4B 202 166
658 5CVIB 70 69
659 1YGUB 260 230
661 1CBUC 178 177
663 2GCLB 93 85
665 5SVAZ 237 224
667 1O7LD 65 58
669 1I6VD 590 263
PETEAKVCERCAVEVTRSIVRRYRMAHIELATPAAHIWFVKDVPSKIATLLDLSATELEQVLYFNKYIVLDPKAAVLDAVPVEKRQLLTDDDDDDDDDDDDDDDDDDDIDARMGAEAIQELLKELDLEKLERELLEEMKHPSRARRAKARKRLEVVRAFLDSGNRPEWMILEAVPVLPPDATSDLNDLYRRLINRNNRLKKLLAQGAPEIIIRNEKRMLQEAVDAVIDNGRRGSPVTNPGSERPLRSLTDILSGKQGRFRQNL PETEAKVCERCAVEVTRSIVRRYRMAHIELATPAAHIWFVKDVPSKIATLLDLSATELEQVLYFNKYIVLDPKAAVLDAVPVEKRQLLTDXXXXXXXXXXXXXXXXXXIDARMGAEAIQELLKELDLEKLERELLEEMKHPSRARRAKARKRLEVVRAFLDSGNRPEWMILEAVPVLPPDATSDLNDLYRRLINRNNRLKKLLAQGAPEIIIRNEKRMLQEAVDAVIDNGRRGSPVTNPGSERPLRSLTDILSGKQGRFRQNL
670 4E1BA 137 116
671 2OA2A 77 75
676 2Z14A 104 99
677 1FNNB 85 76
679 4FN5A 127 125
682 4EBRB 68 66
683 4M3CG 246 244
689 4O8SA 218 212
695 4P5OD 59 52
696 1KNXF 172 171
699 5C9FD 91 84
703 4USAA 119 118
706 2G40A 113 102
707 1XP4A 101 100
708 3GU3B 195 188
709 3GLJA 77 154
711 5IN9B 160 157
712 3NKLB 96 94
713 2DQBF 88 86
716 4GI5B 218 213
717 4QT8B 106 82
718 3IYVI 66 0
NLAGAEELFARKFNALFAQGNYSEAAKVAANAPKGILRTPDTIRRFQSVPAQPGQTSPLLQYFGIL 
720 2EJQB 98 87
723 4F6PA 294 219
724 3WFSD 136 130
729 2WQZC 172 142
730 3BGEB 168 147
731 3HS0I 272 264
733 3BB5F 101 96
737 5DV2A 339 305
739 2IA7A 93 92
742 3ON2D 86 83
745 5DA9A 219 218
747 2HV2F 219 210
748 2G2QC 109 98
749 3QV0A 209 170
757 3DMYB 157 151
758 3EAGB 103 99
759 5M3MG 82 81
760 3G3OA 330 258
761 4QRKA 110 109
762 3GZNC 521 519
767 3GIGB 68 67
771 3CPKA 132 106
773 3O1JD 297 293
774 5L09B 147 143
775 3E0SB 234 228
780 2Q4LB 176 145
782 4H18D 279 251
784 3DCZA 108 89
786 5AR1A 277 216
787 1FOEG 133 131
789 3PZDA 166 120
791 4XU3B 126 124
793 4NHOA 202 201
794 2G6GA 264 260
800 3B85B 205 180
802 4JQIA 164 161
803 3IEDA 180 155
804 3FDIB 175 152
805 3NNBA 282 266
806 4P2BA 199 173
809 2ITBB 145 141
812 5LQWO 88 0
ITRCFISGFPMNIVQLGPTGYQTMGRSSGGLNVSVHPTSILFAQRPSKYVLYQQLMLTSKEFIRDCLVIPKEEWLIDMVPQI 
813 4DIOB 230 183
815 4ZO4D 180 172
816 3MI6D 258 253
817 4WO7B 120 119
820 5HCNA 251 236
821 3TWMB 130 113
828 3CEGB 159 158
831 4WJUB 74 43
832 2QIPA 150 149
836 2HWXA 163 149
840 4NBQC 84 75
842 3E2VA 378 358
844 4NN2B 92 88
845 4PVFB 400 384
846 3V97B 215 207
848 5DA9A 183 173
849 4PO7A 167 163
850 2O8RA 109 92
851 4OURB 182 168
853 5FWWB 83 78
854 2NSMA 300 291
857 3J2NZ 194 181
858 5T5MA 471 470
859 4HJFA 239 230
860 3Q6VB 235 198
865 4BUJE 276 23
869 3EQ1B 79 74
871 2HH9B 165 155
872 4R98B 161 159
875 2Q4PA 77 76
877 3SV1C 159 133
879 4N6BF 105 101
882 5LGDB 158 140
889 4KP4B 1061 69
891 1DCUD 205 189
892 3WXBB 218 214
893 3FGVB 89 86
894 2D1HB 76 68
899 1OAZB 118 111
901 3PZ5B 203 183
902 5JTWE 267 242
903 4OFYC 126 105
907 2YQYB 148 114
909 5CZWA 197 193
917 2B3TB 196 157
918 1QR6B 256 252
920 5FUED 340 312
923 1KTRH 144 109
926 3B1BB 336 291
928 3R8FD 69 68
929 3IH5D 191 188
931 1RY2A 86 74
932 4I07A 250 249
940 2WYND 479 468
945 3K11A 373 364
946 4J5TA 515 498
947 5CHHA 185 174
948 3D89A 119 114
949 3W6WB 329 295
956 4Q1FB 235 225
957 5LLIK 66 57
960 1XFCB 129 114
961 3H1TA 97 96
962 2P5XB 198 195
963 2R44A 131 126
967 2JB8A 113 95
970 5F9ZB 86 76
976 1Z3HB 371 368
977 5JEAJ 148 124
981 2YF0A 340 336
982 3U2RA 61 57
983 2BMXC 142 141
986 3OXQF 73 62
987 4DQ9B 141 123
990 2I76B 133 129
992 4I6MC 69 68
993 2YXBA 130 125
996 1E5SA 168 138
999 4BINA 216 204
1000 4XVZD 137 105
1001 4RNHA 162 156
1002 1KR2F 218 179
1003 3BNWB 137 127
1004 4FZSB 211 189
1008 4Q95B 132 123
1010 4A27B 163 125
1016 4ZZDA 75 69
1019 4E5VB 248 244
1020 2X51A 702 639
1023 2YFIE 278 269
1029 1RT8A 122 119
1030 3IHLB 271 227
1033 2QLXB 101 94
1034 3IWFB 77 76
1035 5IE9D 79 68
1037 1UNFX 120 111
1040 3GO5A 55 54
1043 5EPVB 83 80
1046 1YA0B 115 93
1047 1VP8A 149 145
1048 4OLED 100 98
1049 4TZSB 203 197
1055 2E7YB 226 215
1060 4IT5D 69 68
1068 4OQBF 81 73
1069 1ZENA 342 324
1072 4YB7L 74 72
1074 3A69A 79 70
1076 3LMLF 109 108
1077 3TQUD 190 188
1078 1G6IA 488 482
1081 5JIGA 126 117
1085 4F38B 191 185
1090 4UM2A 357 288
1094 2ZHHA 66 61
1102 2FDSB 296 291
1103 1JEYB 236 226
1104 3VKHB 719 642
1105 1I4LA 198 182
1110 3LWUA 333 326
1115 4YZOD 142 139
1119 2WAFA 243 231
1120 5C5BC 243 235
1124 3PFEA 164 161
1127 4P0SH 184 140
1130 4KQKB 336 333
1132 3HWUA 115 111
1133 3UFGB 282 275
1134 2NTII 127 125
1139 3SOOC 65 55
1144 2OIIC 136 117
1146 1KL7B 82 81
1147 4JJYB 290 287
1148 1RQDB 311 296
1149 3KH1B 162 155
1150 3NM3F 201 194
1153 1QXPB 146 140
1156 3QK9B 155 142
1158 1CT9D 159 145
1160 4KY9P 244 226
1161 1C41J 178 148
1162 2XBZA 128 119
1164 2Q4AB 305 298
1167 3SDOB 382 339
1172 5BWKP 319 276
1174 3CPGA 238 226
1175 5DDWD 490 485
1176 2D4CB 238 226
1177 4XGCD 154 149
1178 4PJ3A 307 293
1179 4UMGA 123 95
1183 3VUFA 262 244
1184 4YYEB 109 108
1185 4MIMD 201 199
1189 2FIUB 93 91
1190 1LW7A 162 149
1194 2WUQB 252 222
1202 3HQ1A 170 133
1203 2WSHA 100 98
1204 1JVAB 463 430
1205 2ZDSF 254 252
1206 4EK7B 496 465
1207 2O2KB 282 276
1211 4JOIB 99 78
1214 2V3ME 101 99
1216 4R3AB 118 111
1219 4ADXG 59 0
KMKTKIFRVKGKFLMGDKLQPFTKELNAIREEEIYERLYSEFGSKHRVPRSKVKIEEIE 
1220 4FGMA 104 102
1221 1M2WB 166 162
1222 4NCSA 315 289
1224 4LZHA 141 132
1227 5AOUA 1244 237
1230 6CTSA 380 376
1231 5M3FB 135 126
1234 3EAYA 266 209
1235 5AE3D 274 246
1243 3IS1X 111 86
1244 3LWKA 89 85
1246 3SDVA 208 206
1248 3JC73 393 302
1252 2D28C 107 104
1253 4O5PB 203 200
1254 3IBWB 76 73
1256 4MK6A 77 75
1258 1VBVA 94 76
1259 1XTDA 76 73
1262 5KZMA 260 248
1265 2YPNA 239 222
1266 3ULPD 112 107
1268 4U3KA 159 158
1270 1IPAA 77 72
1272 4B0MM 136 118
1279 2GVCE 167 166
1280 3S04B 84 70
1283 5CQQB 76 72
1285 2BDQB 209 205
1288 3Q98A 188 179
1290 5AFXB 289 279
1293 2QT1A 144 135
1296 4ARZA 243 227
1297 2NRKA 160 156
1298 4Z9RB 348 346
1299 4J2CC 99 96
1302 4M4DB 238 224
1304 3E20K 140 113
1309 4BZIG 114 97
1310 1A0RP 218 188
1314 4Q8BB 154 146
PQNIRTLKLAGTQDEYGRPVLLLNNKRWHDPVTETPKVGTTEIWSIINPTRGTHPIHLHLVSFRVLDRRPFDIARYQESGELSYTGPAVPPPPSEKGWKDTIQAHAGEVLRIAATFGPYSGRYVWHCHILEHEDYDMMRPMDITDPH QNIRTLKLAGTQDEYGRPVLLLNNKRWHDPVTETPKVGTTEIWSIINPTRGTHPIHLHLVSFRVLDRRPFDIARYQESGELSYTGPAVPPPPSEKGWKDTIQAHAGEVLRIAATFGPYSGRYVWHCHILEHEDYDMMRPMDITDPH
1316 4QJVD 77 71
1317 2NXFA 264 257
1318 4P7DD 127 125
1323 1DYPA 232 228
1325 2X34B 167 164
1327 4WCXC 109 96
1328 5J3BB 56 48
1329 2QY0C 72 68
1330 3D01L 149 143
1331 1JZTB 177 175
1332 3MEMA 201 194
1336 2CSUB 127 125
1340 4I3HB 172 167
1341 3GW2A 63 62
1347 2W55H 571 555
1348 4QKQL 261 253
1352 4ADXA 136 0
APGNTLPLAEIPEGVPVCNVESSPGDGGKFARASGVNAQLLTHDRNVAVVKLPSGEMKRLDPQCRATIGVVAGGGRTDKPFVKAGNKHHKMKARGTKWPNVRGVAMNAVDHPFGGGGRQHPGKPKSISRNAPPGRK 
1353 1A0PA 83 76
1354 5EEOA 666 656
1356 2W61A 318 312
1357 5A2EA 109 88
1358 4YN3B 78 61
1360 4PW7F 171 158
1362 3KLBA 155 154
1364 3K2QC 318 292
1366 3GL5A 206 204
1367 3FIJH 216 196
1368 1SXJE 177 143
1369 3RCHB 380 340
1372 3DI4B 182 178
1374 5TBOA 344 302
1377 2YY3C 85 84
1378 5K36A 81 69
1385 5CU5B 262 233
1389 4Q42D 313 301
1393 3WSZA 454 435
1400 3EN0C 208 202
1401 3AC0D 400 386
1403 4NNJC 251 243
1404 2E6VE 227 214
1407 1YPXA 356 305
1410 1FPQA 55 53
1412 1TLLB 174 134
1414 2O8IA 159 148
1419 2IDXC 170 167
1421 1U5KB 79 75
1428 2O7VA 225 221
1429 1YFHC 85 66
1430 3OOVB 148 142
1431 2E0WB 512 460
1435 5BROA 123 113
1436 4B6WA 85 82
1443 3GPKB 306 98
1445 4H9MA 100 99
1446 1ZLGA 98 0
AKPENLSASFIVQDVNITGHFSWKMAKANLYQPMTGFQVTWAEVTTESRQNSLPNSIISQSQILPSDHYVLTVPNLRPSTLYRLEVQVLTPGGEGPAT 
1448 2UVFB 389 384
1449 3VMFB 135 123
1458 2XR1B 126 121
1459 3GZRB 115 113
1460 5LP6D 130 113
1468 1RE0B 208 182
1470 3LMMD 85 83
1472 1QUTA 299 289
1473 3O6DA 254 252
1474 3S6DA 266 255
1476 3CNVC 143 139
1478 4YOCA 170 157
1479 4D2IB 211 192
1481 4D73A 172 163
1483 3VCRB 201 199
1485 3GZDD 399 382
1490 3I0ZB 145 142
1494 4O61A 205 198
1496 3JCRK 130 0
SVYRVRNLSNPAKKFKIEANAGQLYLTGVVVLHKDVNVVVVEGGPKAQKKFKRLMLHRIKWDEQTSNTDEESDEEAVKKTNKCVLVWEGTAKDRSFGEMKFKQCPTENMAREHFKKHGAEHYWDLA 
1498 5A68A 202 196
1502 3VKHB 291 274
1503 2ICGA 125 123
1507 3MYPD 263 259
1509 2AQ5A 62 61
1510 2YX0A 187 185
1511 1HX8B 276 267
1515 5FMWV 230 0
FSYSKNETYQLFLSYSSKKEKMFLHVKGEIHLGRFVMRNRDVVLTTTFVDDIKALPTTYEKGEYFAFLETYGTHYSSSGSLGGLYELIYVLDKASMKRKGVELKDIKRCLGYHLDVSLAFSEISVGAEFNKDDCVKRGEGRAVNITSENLIDDVVSLIRGGTRKYAFELKEKLLRGTVIDVTDFVNWASSINDAPVLISQKLSPIYNLVPVKMKNAHLKKQNLERAIEDY 
1517 3PG5D 211 202
1519 2VAXL 319 283
1520 2H9EC 65 39
1521 3H4VH 272 251
1527 1OFEB 294 288
1528 4A9AC 92 73
1529 5KHQB 116 92
1531 5ERCA 120 110
1533 2CGTU 103 79
1536 4ZH4L 212 161
1537 4O3MA 73 70
1539 5K36C 197 184
1540 4BZIG 270 253
1541 3H8WA 172 161
1543 2QJVB 259 248
1544 3OWAD 166 162
1547 3DJME 93 91
1548 3LYVF 53 44
1550 1STFI 111 89
1560 5TJHF 116 111
1564 3LXUX 479 418
1565 3P27B 116 115
1567 2IOJB 107 104
1568 5I45A 149 147
1571 3FZ3F 214 147
1572 1Q9UB 63 61
1573 3VB9D 108 107
1574 5AHRA 107 94
1576 5DMHB 229 217
1580 4APFA 117 114
1589 4Q2UO 81 80
1590 3CBWB 319 306
1592 5K5XA 358 283
1594 5IJPA 120 107
1595 5JQKB 179 169
1600 5HLZH 107 75
1603 3F9UB 107 103
1604 2V53A 138 120
1606 2I6EH 264 260
1607 4XGJA 181 146
1608 4F882 116 112
1610 1XM3D 246 240
1611 4EXQA 350 346
1613 4JDUA 200 191
1616 2VRTD 85 78
1618 2ZBKG 79 72
1619 2CNBD 365 346
1620 4H9DC 55 54
1621 4FINB 83 80
1623 4S3RA 520 504
1627 3GKUC 50 39
1628 3NWJB 179 161
1629 1D5BH 196 203
1630 3GZED 117 105
1632 5E5HB 217 216
1633 3MZKC 255 247
1637 2FVMD 450 440
1649 1T62B 118 115
1653 3W1YA 118 109
1655 2FRXD 98 82
1658 5FB1A 72 65
1659 4R6IB 86 84
1661 3JC77 188 130
1662 3E15D 257 250
1664 2OWMD 369 308
1668 4IL3B 187 182
1669 3LRAA 53 50
1674 4DVGB 177 172
1676 4V02D 101 98
1677 3QKIC 525 524
1680 3BUTA 97 95
1682 3SN6A 364 327
1685 4CEJA 414 402
1687 1KYQC 146 133
1688 3HD8C 167 164
1689 3C1QB 100 96
1692 1VLIA 240 223
1693 1R6ZZ 230 11
1696 4YXVB 181 168
1699 3BF4B 80 76
1701 1O65B 116 112
EHGGPDRALCHYPREHYLYWAREFPEQAELFVAPAFGENLSTDGLTESNVYMGDIFRWGEALIQVSQPRSPCYKLNYHFDISDIAQLMQNTGKVGWLYSVIAPGKVSADAPLE HGGPDRALCHYPREHYLYWAREFPEQAELFVAPAFGENLSTDGLTESNVYMGDIFRWGEALIQVSQPRSPCYKLNYHFDISDIAQLMQNTGKVGWLYSVIAPGKVSADAPLE
1702 5D8GA 390 386
1704 5JJWB 93 89
1705 5HMQF 140 136
1706 3Q3CA 128 121
1707 4J57B 128 112
1709 4P4PA 126 118
1710 2QEAC 148 146
1713 3ILVA 303 277
1714 3EEAB 144 141
1716 3D6TB 121 91
1717 1A7JA 209 200
1719 5LQWO 126 0
SLENLYILGALNSKGTITRLGKMMCEFPCEPEFAKVLYTAATHEQCQGVLEECLTIVSMLHETPSLFIGRDAAASVLSDHILYLEIFNQWRNSKFSRSWCQDHKIQFKTMLRVRNIRNQL 
1723 4DLQA 225 214
1725 3TYSA 61 60
1727 3EJVA 149 143
1728 5H80B 112 100
1729 2HLJA 138 132
1730 1WD5A 190 188
1731 3HLTC 185 174
1732 5FIHA 77 73
1733 4CBVF 98 97
1737 3NZEB 260 253
1742 3HTRB 81 79
1743 4QBLF 117 95
1745 3QOUA 90 87
1750 3NBMA 95 92
1752 3P8AB 188 184
1757 2O8FB 327 325
1764 1P9NB 136 126
1765 2PC6D 63 61
1766 3MN2B 82 80
1767 5ERVA 76 71
1775 3HM0D 102 91
1778 3QTYA 157 150
1788 2JI4A 119 112
1789 3NM3F 271 246
1790 3HN3E 187 183
1796 4MTNA 69 68
1802 5AV7D 136 133
1803 4DIED 216 196
1806 3CR7D 166 163
1809 5BYHM 158 125
1810 3L0QB 210 198
1811 3MDOB 202 195
1815 4XD7F 75 70
1821 4S2AA 445 441
1824 5IM3B 99 96
1825 3JYGF 176 174
1828 4WY9A 278 273

In [268]:
filtered_data[0]


Out[268]:
['1VBHA',
 '1VBHA_518-876',
 359,
 359,
 'PALSGDLGTFMAWVDDVRKLKVLANADTPDDALTARNNGAQGIGLCRTEHMFFASDERIKAVRQMIMAPTLELRQQALDRLLPYQRSDFEGIFRAMDGLPVTIRLLDPPLHEFLPEGNIEDIVSELCAETGANQEDALARIEKLSEVNPMLGFRGCRLGISYPELTEMQARAIFEAAIAMTNQGVQVFPEIMVPLVGTPQELGHQVTLIRQVAEKVFANVGKTIGYKVGTMIEIPRAALVADEIAEQAEFFSFGTNDLTQMTFGYSRDDVGKFIPVYLAQGILQHDPFEVLDQRGVGELVKFATERGRKARPNLKVGICGEHGGEPSSVAFFAKAGLDYVSCSPFRVPIARLAAAQVLV',
 0]

In [271]:
complete_data = pd.DataFrame(filtered_data, columns=["Name", "FullName", "SeqLength", "Length", "Seq", "Problematic"])
# complete_data.to_csv(folder+"/data_info_3.csv")

In [270]:
complete_data.shape


Out[270]:
(1829, 6)

In [2]:
data = pd.read_csv("/Users/weilu/Research/server/april_2019/optimization_mult_seq/data_info_3.csv", index_col=0)

In [354]:
data.sort_values("Length").query("Problematic == 0")


Out[354]:
Name FullName SeqLength Length Seq Problematic
519 5JNEE 5JNEE_357-406 50 50 MSLQDPISYTRMKYPSKSINCKHLQCFDALWFLHSQLQIPTWQCPV... 0
1009 5JVHW 5JVHW_1-51 51 51 MKIKLVRSVIGRPGNQVKTVQALGLRKIGDSREVSDTPAVRGMVKT... 0
1425 4B48A 4B48A_1-52 52 52 MAKVRIYQLAKELGMETQELLELLDQMGVAYKSHASTLEEKDAEAV... 0
852 2IYJB 2IYJB_6-57 52 52 QQTLAKMGIKSSDIQPAPVAGMKTVLTNSGVLYITDDGKHIIQGPM... 0
843 5A4OB 5A4OB_14-66 53 53 QKSATQRIRISHRWYRGRRYVDVRLVVVDRDGDFVPTRQGISIRPE... 0
1306 1YWWA 1YWWA_4-56 53 53 DVIKGKWKQLTGKIKERWGDLTDDDLQAADGHAEYLVGKLQERYGW... 0
464 2LW7A 2LW7A_6-58 53 53 LEELVKLQGERVRGLKQQKASAELIEEEVAKLLKLKAQLGPDESKQ... 0
1615 2KZWA 2KZWA_2-54 53 53 NARDNKFNTWNDSRGNYWSDYEGSDENGDGIGDSAYAVNPEAGSMD... 0
1271 4V1Ag 4V1Ag_43-95 53 53 AREFVEREVTDFARRNPGVVIYVNPRPCCVPRVVAEYLNGAVREES... 0
675 3CXLA 3CXLA_206-258 53 53 HNFKVHTFRGPHWCEYCANFMWGLIAQGVKCADCGLNVHKQCSKMV... 0
904 5TJJB 5TJJB_7-59 53 53 SVALAARVLKLLSRHKYRQSTLTEIAERLGVNKTTCLRVLRTLERE... 0
732 3O8ZA 3O8ZA_241-293 53 53 SLFAALNDLPVKTEHLKESGLGRVVIFYTKSKRVEAQLARLAEKLI... 0
123 5JA2B 5JA2B_1-53 53 53 MTSVFDRDDIQFQVVVNHEEQYSIWPEYKEIPQGWRAAGKSGLKKD... 0
95 5LWBA 5LWBA_692-744 53 53 CVKCKTTCFMSAISCSCKPGLLVCLHHVKELCSCPPYKYKLRYRYT... 0
420 5FTWA 5FTWA_2-54 53 53 DTYSVFTTKWKQLTGVDLTLYKEAQMKRRLTSLYEKKGFQSFKDFA... 0
1712 4P4PA 4P4PA_97-150 54 54 QELTQVHGFGPRAAAALFDREGIFTVDELLQKADSIPSLTDQQRVG... 0
1500 4NTYB 4NTYB_6-59 54 54 FCYEDPPFFQKCGAFVDSYYFNRSRITCVHFFYGQCDVNQNHFTTM... 0
617 2DSRB 2DSRB_6-59 54 54 CPPCSEEKLARCRPPVGCEELVREPGCGCCATCALGLGMPCGVYTP... 0
874 5TMFF 5TMFF_357-410 54 54 ALSKLSEREAMVLKLRKGLIDGREHTLEEVGAFFGVTRERIRQIEN... 0
1024 4AKKB 4AKKB_331-384 54 54 ASLKDALEERKLIEKAKSVLMTYQGMQEEQAWQALRKMAMDKNQRM... 0
293 4E6KI 4E6KI_2-55 54 54 YVCLCQGVTDNQIRDAIYEGCCSYREVREATGVGTQCGKCASLAKQ... 0
705 2V53A 2V53A_78-132 55 55 CQDPTSCPAPIGEFEKVCSNDNKTFDSSCHFFATKCTLEGTKKGHK... 0
1512 5AX2A 5AX2A_6-60 55 55 DEIEIEDMTFEPENQMFTYPCPCGDRFQIYLDDMFEGEKVAVCPSC... 0
1319 2LNHA 2LNHA_2-56 55 55 SNFQHIGHVGWDPNTGFDLNNLDPELKNLFDMCGISEAQLKDRETS... 0
1223 1HLVA 1HLVA_2-56 55 55 GPKRRQLTFREKSRIIQEVEENPDLRKGEIARRFNIPPSTLSTILK... 0
415 1Q1VA 1Q1VA_320-374 55 55 PTDEELKETIKKLLASANLEEVTMKQICKKVYENYPTYDLTERKDF... 0
396 4KRTB 4KRTB_285-340 56 56 TFLNVREEGSLNSRIVDKINSGDIFRIDWVDSDFIGWYRITTKNGK... 0
878 2CVEA 2CVEA_132-187 56 56 FLVPFAEVGRVYALLEARALKAEETYTPEGVRFALLLPKPEREGFL... 0
1492 2GA1B 2GA1B_33-88 56 56 IQITPGVCGGQARIRNTRIPVWTLVAYRQQGAPDKELLANYPGLTA... 0
56 1HFET 1HFET_40-95 56 56 KDYMLDRINGVYGADAKFPVRASQDNTQVKALYKSYLEKPLGHKSH... 0
... ... ... ... ... ... ...
59 2XQ1P 2XQ1P_8-401 394 394 TTSQGCPVSDPFTTQRIPLDSTGYKYAPPIGPLLLQDFKLIDTLSH... 0
21 2C81A 2C81A_13-407 395 395 QHSDRTRRKIEEVFQSNRWAISGYWTGEESMERKFAKAFADFNGVP... 0
105 2D4VD 2D4VD_28-425 398 398 IIPFIEGDGIGCDVTPAMRSVVDAAVAKVYGGQRQIAWMELFAGQK... 0
634 16PKA 16PKA_9-408 400 400 INECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKVLTEG... 0
822 2A3LA 2A3LA_384-790 407 407 NVRKVDTHVHHSACMNQKHLLRFIKSKLRKEPDEVVIFRDGTYLTL... 0
1439 3RYBA 3RYBA_80-495 416 416 KFIKGGAADVALDKESKTATITLRKDLKWSDGSEVTAKDYEFTYET... 0
653 2CTZB 2CTZB_4-421 418 418 ETLQLHAGYEPEPTTLSRQVPIYPTTSYVFKSPEHAANLFALKEFG... 0
594 2VDCF 2VDCF_1-419 419 419 CGVGFIAAIDGKPRRSVVEKGIEALKAVWHRGAVDADGKTGDGAGI... 0
887 2XE4A 2XE4A_27-447 421 421 YVEGEDRGPNPMNPPRYREDPYFWMRDDDRKDPAVIEHLNKEKVYF... 0
173 3DINB 3DINB_1-427 427 427 MILFDKNKRILKKYAKMVSKINQIESDLRSKKNSELIRLSMVLKEK... 0
1301 1QBBA 1QBBA_338-766 429 429 FPYRGIFLDVARNFHKKDAVLRLLDQMAAYKLNKFHFHLSDDEGWR... 0
477 1WYVG 1WYVG_1-433 433 433 MDYTPHTEEEIREMLRRVGAASLEDLFAHLPKEILSPPIDLPEPLP... 0
1195 5EX4B 5EX4B_25-457 433 433 RLDAALAKPAAQQPTWPADQALAMRTVLESVPPVTVPSEIVRLQEQ... 0
160 1ZQ1D 1ZQ1D_15-449 435 435 LKVGLEIHRQLDTKKLFSPVPSELSDKVEFTFQRRLRPTMSELGEI... 0
566 4RMFA 4RMFA_116-558 443 443 GEEARLKYRYLDLRREGPGNALRLRSKVNAAARSVLAEHDFVEIET... 0
455 2VYCJ 2VYCJ_140-590 451 451 PPLFSALMKYSDIHEYSWAAPGHQGGVGFTKTPAGRFYHDYYGENL... 0
1812 4CEJA 4CEJA_11-462 452 452 WTDDQWNAIVSTGQDILVAAAAGSGKTAVLVERMIRKITAEENPID... 0
373 1AC5A 1AC5A_15-468 454 454 PGLSEVPDPSNIPQMHAGHIPLRSEDADEQDSSDLEYFFWKFTNND... 0
265 4TXGA 4TXGA_155-608 454 454 RIIGYFTSWRTGKDGSPAYLASDIPWSKLTHINYAFAHVDGSNKLS... 0
171 4WN9C 4WN9C_53-508 456 456 CAYAGCKGVVMGPIKDMVHITHGPIGCSFYTWGGRRFKSKPENGTG... 0
1070 4YOCA 4YOCA_1139-1594 456 456 LRTLDVFSGCGGLSEGFHQAGISDTLWAIEMWDPAAQAFRLNNPGS... 0
1795 4REPA 4REPA_3-482 480 480 NAIVIGAGIGGLAAALRLRHQGYSVTIFEKNDYAGGKLHAIEKDGY... 0
895 5D0FB 5D0FB_1037-1517 481 481 MTSIQMVSAMKSTSILPDQNIAAMAAGLPHFSTNYMRCWGRDVFIS... 0
1552 5TLSD 5TLSD_4-494 491 491 ESRIKDISLAEFGLQDMEIAKTDMMGLVELQRKYRDSKPLKGARIT... 0
1278 3WBHB 3WBHB_3-498 496 496 VKNVILMIGDGMGPQQVGLLETYANQAPDSIYDGEPTAFHQLAKEG... 0
722 2WZSH 2WZSH_235-735 501 501 IGFKTRKGEQVNARIASSFISFEQAAANMNELGKDNIEQLAQKGKD... 0
559 5LBQA 5LBQA_288-826 539 539 VSGLAAARQLQSFGMDVTLLEARDRVGGRVATFRKGNYVADLGAMV... 0
1073 4AMFB 4AMFB_11-553 543 543 LGFDSIPAATTDTISLPKGYKSSVLISWGQPLHKNGPAFDPSGNGT... 0
1209 5IPNC 5IPNC_717-1264 548 548 VAKRGGVVQYVDASRIVIKVNEDEMYPGEAGIDIYNLTKYTRSNQN... 0
1645 1ULVA 1ULVA_281-956 676 676 ALRTQYDVSLMTVKSHEDKTFPGAFIASLTIPWGQAASAETHREGY... 0

1087 rows × 6 columns

problem 1 is probably fine. because not continues in seq, may not means not continues in space

1087 without any problem


In [248]:
complete_data.query("Problematic == 0").shape


Out[248]:
(1087, 5)

In [249]:
# no problematic = 2 or 3.
complete_data.query("Problematic == 4").shape


Out[249]:
(19, 5)

In [250]:
complete_data.query("Problematic == 0 and Length > 500")


Out[250]:
Name FullName Length Seq Problematic
559 5LBQA 5LBQA_288-826 539 VSGLAAARQLQSFGMDVTLLEARDRVGGRVATFRKGNYVADLGAMV... 0
722 2WZSH 2WZSH_235-735 501 IGFKTRKGEQVNARIASSFISFEQAAANMNELGKDNIEQLAQKGKD... 0
1073 4AMFB 4AMFB_11-553 543 LGFDSIPAATTDTISLPKGYKSSVLISWGQPLHKNGPAFDPSGNGT... 0
1209 5IPNC 5IPNC_717-1264 548 VAKRGGVVQYVDASRIVIKVNEDEMYPGEAGIDIYNLTKYTRSNQN... 0
1645 1ULVA 1ULVA_281-956 676 ALRTQYDVSLMTVKSHEDKTFPGAFIASLTIPWGQAASAETHREGY... 0

In [166]:
b = a.query("Length < 100").query("Problematic == 0").sample(10, random_state=0).reset_index(drop=True)
b.to_csv("/Users/weilu/Research/server/march_2019/optimization_mult_seq_test/chosen.csv")

In [284]:
data = pd.read_csv("/Users/weilu/Research/server/march_2019/optimization_mult_seq_test/chosen.csv", index_col=0)

In [349]:



Out[349]:
Name FullName Length Seq Problematic
0 4TVYB 4TVYB_248-332 85 APAFSHLLDERFTWGGVELHFDVEKGHITRAQVFTDSLNPAPLEAL... 0
1 5LBQA 5LBQA_178-264 87 AFQSRLPHDRMTSQEAACFPDIISGPQQTQKVFLFIRNRTLQLWLD... 0
2 3C2VA 3C2VA_26-117 92 VPSFDFGGYVVGSDLKEANLYCKQDGMLCGVPFAQEVFNQCELQVE... 0
3 2HLQA 2HLQA_33-127 95 LCAFKDPYQQDLGIGESRISHENGTILCSKGSTCYGLWEKSKGDIN... 0
4 5J3BB 5J3BB_5-62 58 STNDFKPGLKVMLDSNPCSIMENEYVKPGKGQAFNRVKLRNLKTGK... 0
5 4ZLHB 4ZLHB_190-257 68 HMASDDLDRAMTLLKKGAAADKNSARVSIMMGRVFMAKGEYAKAVE... 0
6 3O8ZA 3O8ZA_241-293 53 SLFAALNDLPVKTEHLKESGLGRVVIFYTKSKRVEAQLARLAEKLI... 0
7 4ZRJA 4ZRJA_226-315 90 NKKGTELLLGVDALGLHIYDPENRLTPKISFPWNEIRNISYSDKEF... 0
8 2IX1A 2IX1A_24-81 58 GVVKATEKGFGFLEVDAQKSYFIPPPQMKKVMHGDRIIAVIHSEKE... 0
9 4UR3F 4UR3F_328-393 66 FCETCKKCARECPSKAITEGPRTFEGRSIHNQSGKLQWQNDYNKCL... 0

In [208]:
data


Out[208]:
Name FullName Length Seq Problematic
0 4TVYB 4TVYB_248-332 85 APAFSHLLDERFTWGGVELHFDVEKGHITRAQVFTDSLNPAPLEAL... 0
1 5LBQA 5LBQA_178-264 87 AFQSRLPHDRMTSQEAACFPDIISGPQQTQKVFLFIRNRTLQLWLD... 0
2 3C2VA 3C2VA_26-117 92 VPSFDFGGYVVGSDLKEANLYCKQDGMLCGVPFAQEVFNQCELQVE... 0
3 2HLQA 2HLQA_33-127 95 LCAFKDPYQQDLGIGESRISHENGTILCSKGSTCYGLWEKSKGDIN... 0
4 5J3BB 5J3BB_5-62 58 STNDFKPGLKVMLDSNPCSIMENEYVKPGKGQAFNRVKLRNLKTGK... 0
5 4ZLHB 4ZLHB_190-257 68 HMASDDLDRAMTLLKKGAAADKNSARVSIMMGRVFMAKGEYAKAVE... 0
6 3O8ZA 3O8ZA_241-293 53 SLFAALNDLPVKTEHLKESGLGRVVIFYTKSKRVEAQLARLAEKLI... 0
7 4ZRJA 4ZRJA_226-315 90 NKKGTELLLGVDALGLHIYDPENRLTPKISFPWNEIRNISYSDKEF... 0
8 2IX1A 2IX1A_24-81 58 GVVKATEKGFGFLEVDAQKSYFIPPPQMKKVMHGDRIIAVIHSEKE... 0
9 4UR3F 4UR3F_328-393 66 FCETCKKCARECPSKAITEGPRTFEGRSIHNQSGKLQWQNDYNKCL... 0

In [285]:
for i, line in data.iterrows():
    if "X" in line["Seq"]:
        print(line)

In [302]:
complete_data_filtered.shape


Out[302]:
(1810, 6)

In [303]:
n = len(complete_data_filtered)

In [ ]:


In [332]:


In [333]:


In [334]:
number_of_runs


Out[334]:
181

In [345]:
n = len(complete_data_filtered)
number_of_runs = int(np.ceil(n/10))
perRun = 10
count = 0
for i in range(number_of_runs):
    with open(to_location+f"proteins_name_list/proteins_name_list_{i}.txt", "w") as out:
        cc = 0
        while count < n and cc < perRun:
            fullName = complete_data_filtered.iloc[count]["FullName"]
            out.write(fullName+"\n")
            cc += 1
            count += 1

In [347]:
with open(to_location+f"proteins_name_list/proteins_name_list.txt", "w") as out:
    for i, line in complete_data_filtered.iterrows():
        out.write(line["FullName"]+"\n")

In [300]:
complete_data_filtered = complete_data.query("Problematic != 4").reset_index(drop=True)

In [336]:
from_location = "/Users/weilu/Research/optimization/mediated_term/multisequenceanddcafrustratometry/"
to_location = "/Users/weilu/Research/server/march_2019/optimization_mult_seq/"
# os.system("mkdir -p database/S20_seq")
# os.system("mkdir -p database/dompdb")
for i, line in complete_data_filtered.iterrows():
    # print(line["FullName"])
    print(i)
    fullName = line["FullName"]
    os.system(f"cp {from_location}{fullName}_filtered_0.05.seqs {to_location}alignments/")
    os.system(f"cp {to_location}../optimization_mult_seq/cleaned_pdbs/{fullName}.pdb {to_location}database/dompdb/")
    with open(to_location+f"database/S20_seq/{fullName}.seq", "w") as out:
        with open(from_location+f"{fullName}.fasta") as f:
            a = f.readlines()
        out.write(a[1]+"\n")


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809

In [286]:
from_location = "/Users/weilu/Research/optimization/mediated_term/multisequenceanddcafrustratometry/"
to_location = "/Users/weilu/Research/server/march_2019/optimization_mult_seq_test/"
# os.system("mkdir -p database/S20_seq")
# os.system("mkdir -p database/dompdb")
for i, line in data.iterrows():
    # print(line["FullName"])
    fullName = line["FullName"]
    os.system(f"cp {from_location}{fullName}_filtered_0.05.seqs {to_location}aligments/")
    os.system(f"cp {to_location}../optimization_mult_seq/cleaned_pdbs/{fullName}.pdb {to_location}database/dompdb/")
    with open(to_location+f"database/S20_seq/{fullName}.seq", "w") as out:
        with open(from_location+f"{fullName}.fasta") as f:
            a = f.readlines()
        out.write(a[1]+"\n")
    with open(to_location+f"proteins_name_list/proteins_name_list_{i}.txt", "w") as out:
        out.write(fullName+"\n")

In [288]:
loc = "/Users/weilu/Research/server/march_2019/optimization_mult_seq_test/phis/phi_burial_well_2HLQA_33-127_native_4.0"
a = np.loadtxt(loc)

In [290]:
a.shape


Out[290]:
(1000, 60)

In [293]:
b = np.average(a, axis=0)

In [294]:
len(b)


Out[294]:
60

In [297]:
c = np.zeros(100)
c[0:60] = b

In [298]:
c


Out[298]:
array([1.02120895e+00, 2.04774029e+00, 2.04850226e+00, 4.86511504e+00,
       1.99914379e+00, 3.93221736e+00, 3.89030093e+00, 3.46330586e+00,
       1.56779611e+00, 2.86871570e+00, 3.12796018e+00, 2.79853589e+00,
       6.99447186e-01, 1.12783344e+00, 2.78140343e+00, 3.46941678e+00,
       3.27886956e+00, 4.57498445e-01, 1.92307424e+00, 1.29925846e+00,
       2.22593738e+00, 1.26467769e+00, 1.85645096e+00, 1.31403768e+00,
       6.44498297e+00, 1.17688198e+00, 2.69877366e+00, 3.95480128e+00,
       9.99425448e-01, 1.66408298e+00, 2.05632997e+00, 1.74496335e+00,
       3.18549685e-01, 2.97527360e+00, 1.24112624e+00, 2.12193816e+00,
       2.02052799e+00, 4.13360332e-01, 2.07197791e+00, 1.66660266e+00,
       1.18441951e-02, 2.21237903e-02, 7.97624586e-01, 3.87243867e-02,
       1.70387622e+00, 2.14795223e-02, 1.01061107e-01, 3.85974894e-03,
       2.04567699e-02, 1.47721438e-01, 2.24155235e-02, 3.05324222e-02,
       6.71733582e-03, 4.34403645e-01, 1.59724987e-03, 2.64722104e-02,
       1.23581031e-01, 1.03140067e-01, 4.60238350e-02, 6.76725773e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

check the number of aligned seqs.

this shows, problematic==4 is bad.


In [253]:
info = []
for i, line in complete_data.iterrows():
    fullName = line["FullName"]
    try:
        with open(f"{from_location}{fullName}_filtered_0.05.seqs") as f:
            a = f.readlines()
            info.append([fullName, len(a)])
    except:
        print(fullName, line)


5LQWh_5-97 Name                                                       5LQWh
FullName                                              5LQWh_5-97
Length                                                        93
Seq            NFLKKLRNEQVTIELKNGTTVWGTLQSVSPQMNAILTDVKLTLPSD...
Problematic                                                    4
Name: 101, dtype: object
1PDIR_344-400 Name                                                       1PDIR
FullName                                           1PDIR_344-400
Length                                                        57
Seq            GAIMMWAADSLPSDAWRFCHGGTVSASDCPLYASRIGTRYGGSSSN...
Problematic                                                    4
Name: 302, dtype: object
5LQWB_856-948 Name                                                       5LQWB
FullName                                           5LQWB_856-948
Length                                                        93
Seq            ILLEPIYEVDITVHAPLLPIVEELMKKRRGSRIYKTIKVAGTPLLE...
Problematic                                                    4
Name: 337, dtype: object
5LQWB_481-578 Name                                                       5LQWB
FullName                                           5LQWB_481-578
Length                                                        98
Seq            AEWSLVRIYSGLLKRGDTVRILDTSQSESRQKRQLHDISKTEDDET...
Problematic                                                    4
Name: 361, dtype: object
5LQWX_147-682 Name                                                       5LQWX
FullName                                           5LQWX_147-682
Length                                                       536
Seq            FLALTSDSGNLSIVQIHAGALRLKTLVNQPLTRTTLRRVSPISYME...
Problematic                                                    4
Name: 430, dtype: object
1IYWB_797-862 Name                                                       1IYWB
FullName                                           1IYWB_797-862
Length                                                        66
Seq            DVEEWRRRQEKRLKELLALAERSQRKLASPGFREKAPKEVVEAEEA...
Problematic                                                    4
Name: 435, dtype: object
3IYVI_356-421 Name                                                       3IYVI
FullName                                           3IYVI_356-421
Length                                                        66
Seq            NLAGAEELFARKFNALFAQGNYSEAAKVAANAPKGILRTPDTIRRF...
Problematic                                                    4
Name: 718, dtype: object
5LQWO_775-862 Name                                                       5LQWO
FullName                                           5LQWO_775-862
Length                                                        88
Seq            ITRCFISGFPMNIVQLGPTGYQTMGRSSGGLNVSVHPTSILFAQRP...
Problematic                                                    4
Name: 812, dtype: object
4ADXG_2-60 Name                                                       4ADXG
FullName                                              4ADXG_2-60
Length                                                        59
Seq            KMKTKIFRVKGKFLMGDKLQPFTKELNAIREEEIYERLYSEFGSKH...
Problematic                                                    4
Name: 1219, dtype: object
4ADXA_89-224 Name                                                       4ADXA
FullName                                            4ADXA_89-224
Length                                                       136
Seq            APGNTLPLAEIPEGVPVCNVESSPGDGGKFARASGVNAQLLTHDRN...
Problematic                                                    4
Name: 1352, dtype: object
1ZLGA_549-646 Name                                                       1ZLGA
FullName                                           1ZLGA_549-646
Length                                                        98
Seq            AKPENLSASFIVQDVNITGHFSWKMAKANLYQPMTGFQVTWAEVTT...
Problematic                                                    4
Name: 1446, dtype: object
3JCRK_544-673 Name                                                       3JCRK
FullName                                           3JCRK_544-673
Length                                                       130
Seq            SVYRVRNLSNPAKKFKIEANAGQLYLTGVVVLHKDVNVVVVEGGPK...
Problematic                                                    4
Name: 1496, dtype: object
5FMWV_251-480 Name                                                       5FMWV
FullName                                           5FMWV_251-480
Length                                                       230
Seq            FSYSKNETYQLFLSYSSKKEKMFLHVKGEIHLGRFVMRNRDVVLTT...
Problematic                                                    4
Name: 1515, dtype: object
5LQWO_619-744 Name                                                       5LQWO
FullName                                           5LQWO_619-744
Length                                                       126
Seq            SLENLYILGALNSKGTITRLGKMMCEFPCEPEFAKVLYTAATHEQC...
Problematic                                                    4
Name: 1719, dtype: object

In [256]:
a = pd.DataFrame(info, columns=["Name", "Length"])

In [260]:
a.hist("Length", bins=100)
plt.yscale("log")



In [261]:
a.query("Length < 1e5").hist("Length", bins=100)
plt.yscale("log")


2e4 seems a good cut off


In [262]:
a.query("Length < 2e4").hist("Length", bins=100)
plt.yscale("log")



In [275]:
a.query("Length < 1e4").hist("Length", bins=100)
plt.yscale("log")



In [278]:
"777" in "woewowefggjeg77 77sdfsdf"


Out[278]:
False

In [263]:
len(a)


Out[263]:
1815

In [240]:
def generate_decoy_sequences(fullName, location="./", num_decoys=1000):
    num_decoys = 10
    # location = to_location
    with open(location+f"aligments/{fullName}_filtered_0.05.seqs") as f:
        a = f.readlines()

    with open(location+f"database/S20_seq/{fullName}.seq") as f:
        b = f.readlines()

    with open(location+f"decoys/multiShuffle/{fullName}.decoys", "w") as out:
        for seq in random.sample(a, num_decoys):
            s = seq.strip()
            shuffled_seq = ''.join(random.sample(s,len(s)))
            out.write(shuffled_seq+"\n")
        # print(shuffled_seq)

In [281]:
random.sample(c, 10)


Out[281]:
['HCGKCRACITVCPTDAIVAPPYQLDRRCISYTIEHDGSIPEDLREGGNRIESGFGCDDCQLMCPWT\n',
 'HCGSCRACLDACPTDAFPAPPYKLDRRCISYTIEHKGPIDPDIRERGNRIESGYGCDDCLAACPWT\n',
 'RCGSCNRCLEACPTGALVRAPGQINKICISYTQTKEDIPEPYRKKIGNRLESGYGCDTCQVVCPYT\n',
 'HCGSCTKCLDICPTDAFPAPPFRLDRRCISYTIEHRGPVDATLRPAGNRIESGYGCDDCLAICPWT\n',
 'FCIFCGKCPFVCPHSAIRSKFPEADPPTFKHQVKGKEFQKGEHISYVEDCESGTGCQLCVDVCPAT\n',
 'FCSSCSLCIDNCPTGAIDKSRFVIRHHCLTNFNESDEPIPEWVNADHNAVESGVGCMRCQDVCPHT\n',
 'RCGDCRDCVDACPVKAFTGRFTEAEREARHYARRCEKYLNGLEASTGYGYESGGVCGMCLNICPHT\n',
 'RCGSCQACLDACPTDAFPAPPYELDRRCISYTIELDGPVPAEFRAAGNRIESGYGCDDCLAVCPWT\n',
 'HCGSCRACLDICPTEAFPAPPYRLDRRCISYTIEHKGPIAPELREKGNRIESGYGCDDCLAICPWT\n',
 'QCGRCVACITTCPTGAIVEPPYTVDRRCISYTIELEGAIPEAFRPLGNRIESGYGCDDCQLICPWT\n']

In [229]:
c = a + b

In [230]:
c[-4:]


Out[230]:
['FCGTCRKCLDICPTKAIVHPPFVVDRRCIAYTIENNDFPEYIKNNLNGWIESGAGCDLCQDVCPWT\n',
 'KCIGCGRCFKVCPRDVLTIVPQRAVDVDFDDDDDDDDSDNSFMTLSAMDCESGIGCEACSRVCPFT\n',
 'FCETCKKCARECPSKAITEGPRTFEGRSIHNQSGKLQWQNDYNKCLGYWPESGGYCGVCVAVCPFT\n',
 'FCETCKKCARECPSKAITEGPRTFEGRSIHNQSGKLQWQNDYNKCLGYWPESGGYCGVCVAVCPFT\n']

In [227]:
len(a)


Out[227]:
3251

In [236]:
s = a[0]
''.join(random.sample(s,len(s)))


Out[236]:
'DEVCPTVKPG\nYRGQCAELIFCDHPSGLGSICLEDNCGPFRWKSTPSICALGSVLTHRSCRDLECDL'

In [234]:
random.shuffle(list(a[0].strip()))

In [237]:



STCLIFKGNNKSCCNETPPEWICNDFQIEACTWTKANNFSSIDSFKKGAYAICYCTKCSKFSEGKS
VTCPCIRARHVCPLYDESKDGEWGCFVGYGPATCRDPLRAPQTTGAGYNCCTCIEIDSAPHPEVSA
DFQCVNETGRLFGRCTTATKQCGGTGTRTQYGKSACQGIEQIPKEDMLCPETCNFPVKCPALCQLG
KFDSYIDVCMAGIEREFRRSVVDCNCQVLVNGPSATPGTFCAGHVCGGMAKATLPCLCPKDCGQQN
TRCAGPEFDVDPCIVAIGLGRPPGTAEAYTESKHCCNIALERQATANVGCADVCPDIWNVTICCIW
GDEVSCPCCICRADYGVCPCIEPRVLTAAKGTQSCLYSCDERGLILPFSTHEWITTRLVIFPRNGD
CKCDCGCTPKVCGWGYPDIRIYAAIGSEEYNPVATGNRVIQDICRYAISPTSILQNTKICPDVECV
TASGANACIGCEQDKTPGVQTLCCAPISPRAFLARTAFLMVYKHPLNGKCKCIEECEEEAGAAQGC
TFPSNVLGVKIWVLVLNCNTPCINNCCECCCAFAGDDRRGKPHYIIEGEDTDDQRPWPFCTAIKYI
ETAPRRGLDPSSCTENCACYPKGACRINDDVTPWKGCTSIDYCQLAGFCLFPVHGAPERIPDICLL

In [214]:
len(a)


Out[214]:
3250

In [176]:
pwd


Out[176]:
'/Users/weilu/opt/notebook/Optimization'

In [163]:
a.drop_duplicates("Name").shape


Out[163]:
(1746, 5)

In [ ]:


In [152]:
a.shape


Out[152]:
(1829, 5)

In [151]:
a.query("Problematic == 0").shape


Out[151]:
(1087, 5)

In [150]:
a.query("Problematic == 4")


Out[150]:
Name FullName Length Seq Problematic
101 5LQWh 5LQWh_5-97 93 NFLKKLRNEQVTIELKNGTTVWGTLQSVSPQMNAILTDVKLTLPSD... 4
302 1PDIR 1PDIR_344-400 57 GAIMMWAADSLPSDAWRFCHGGTVSASDCPLYASRIGTRYGGSSSN... 4
331 4M4WO 4M4WO_104-295 192 KKQQSLMKSMYIQQDLLGATFQQVDISDPSRLAMFQHVTDFLKSYN... 4
337 5LQWB 5LQWB_856-948 93 ILLEPIYEVDITVHAPLLPIVEELMKKRRGSRIYKTIKVAGTPLLE... 4
361 5LQWB 5LQWB_481-578 98 AEWSLVRIYSGLLKRGDTVRILDTSQSESRQKRQLHDISKTEDDET... 4
416 2GYQA 2GYQA_10-168 159 TMEDLLLHGLRDIYYAEQQITKALPKMIEQATNRDLSQGLTSHLEE... 4
430 5LQWX 5LQWX_147-682 536 FLALTSDSGNLSIVQIHAGALRLKTLVNQPLTRTTLRRVSPISYME... 4
435 1IYWB 1IYWB_797-862 66 DVEEWRRRQEKRLKELLALAERSQRKLASPGFREKAPKEVVEAEEA... 4
669 1I6VD 1I6VD_29-618 590 PETEAKVCERCAVEVTRSIVRRYRMAHIELATPAAHIWFVKDVPSK... 4
718 3IYVI 3IYVI_356-421 66 NLAGAEELFARKFNALFAQGNYSEAAKVAANAPKGILRTPDTIRRF... 4
812 5LQWO 5LQWO_775-862 88 ITRCFISGFPMNIVQLGPTGYQTMGRSSGGLNVSVHPTSILFAQRP... 4
1219 4ADXG 4ADXG_2-60 59 KMKTKIFRVKGKFLMGDKLQPFTKELNAIREEEIYERLYSEFGSKH... 4
1314 4Q8BB 4Q8BB_359-512 154 PQNIRTLKLAGTQDEYGRPVLLLNNKRWHDPVTETPKVGTTEIWSI... 4
1352 4ADXA 4ADXA_89-224 136 APGNTLPLAEIPEGVPVCNVESSPGDGGKFARASGVNAQLLTHDRN... 4
1446 1ZLGA 1ZLGA_549-646 98 AKPENLSASFIVQDVNITGHFSWKMAKANLYQPMTGFQVTWAEVTT... 4
1496 3JCRK 3JCRK_544-673 130 SVYRVRNLSNPAKKFKIEANAGQLYLTGVVVLHKDVNVVVVEGGPK... 4
1515 5FMWV 5FMWV_251-480 230 FSYSKNETYQLFLSYSSKKEKMFLHVKGEIHLGRFVMRNRDVVLTT... 4
1701 1O65B 1O65B_56-171 116 EHGGPDRALCHYPREHYLYWAREFPEQAELFVAPAFGENLSTDGLT... 4
1719 5LQWO 5LQWO_619-744 126 SLENLYILGALNSKGTITRLGKMMCEFPCEPEFAKVLYTAATHEQC... 4

In [147]:
a.query("Problematic == 4")


Out[147]:
Name FullName Length Seq Problematic
101 5LQWh 5LQWh_5-97 93 NFLKKLRNEQVTIELKNGTTVWGTLQSVSPQMNAILTDVKLTLPSD... 4
302 1PDIR 1PDIR_344-400 57 GAIMMWAADSLPSDAWRFCHGGTVSASDCPLYASRIGTRYGGSSSN... 4
331 4M4WO 4M4WO_104-295 192 KKQQSLMKSMYIQQDLLGATFQQVDISDPSRLAMFQHVTDFLKSYN... 4
337 5LQWB 5LQWB_856-948 93 ILLEPIYEVDITVHAPLLPIVEELMKKRRGSRIYKTIKVAGTPLLE... 4
361 5LQWB 5LQWB_481-578 98 AEWSLVRIYSGLLKRGDTVRILDTSQSESRQKRQLHDISKTEDDET... 4
416 2GYQA 2GYQA_10-168 159 TMEDLLLHGLRDIYYAEQQITKALPKMIEQATNRDLSQGLTSHLEE... 4
430 5LQWX 5LQWX_147-682 536 FLALTSDSGNLSIVQIHAGALRLKTLVNQPLTRTTLRRVSPISYME... 4
435 1IYWB 1IYWB_797-862 66 DVEEWRRRQEKRLKELLALAERSQRKLASPGFREKAPKEVVEAEEA... 4
669 1I6VD 1I6VD_29-618 590 PETEAKVCERCAVEVTRSIVRRYRMAHIELATPAAHIWFVKDVPSK... 4
718 3IYVI 3IYVI_356-421 66 NLAGAEELFARKFNALFAQGNYSEAAKVAANAPKGILRTPDTIRRF... 4
812 5LQWO 5LQWO_775-862 88 ITRCFISGFPMNIVQLGPTGYQTMGRSSGGLNVSVHPTSILFAQRP... 4
1219 4ADXG 4ADXG_2-60 59 KMKTKIFRVKGKFLMGDKLQPFTKELNAIREEEIYERLYSEFGSKH... 4
1314 4Q8BB 4Q8BB_359-512 154 PQNIRTLKLAGTQDEYGRPVLLLNNKRWHDPVTETPKVGTTEIWSI... 4
1352 4ADXA 4ADXA_89-224 136 APGNTLPLAEIPEGVPVCNVESSPGDGGKFARASGVNAQLLTHDRN... 4
1446 1ZLGA 1ZLGA_549-646 98 AKPENLSASFIVQDVNITGHFSWKMAKANLYQPMTGFQVTWAEVTT... 4
1496 3JCRK 3JCRK_544-673 130 SVYRVRNLSNPAKKFKIEANAGQLYLTGVVVLHKDVNVVVVEGGPK... 4
1515 5FMWV 5FMWV_251-480 230 FSYSKNETYQLFLSYSSKKEKMFLHVKGEIHLGRFVMRNRDVVLTT... 4
1701 1O65B 1O65B_56-171 116 EHGGPDRALCHYPREHYLYWAREFPEQAELFVAPAFGENLSTDGLT... 4
1719 5LQWO 5LQWO_619-744 126 SLENLYILGALNSKGTITRLGKMMCEFPCEPEFAKVLYTAATHEQC... 4

In [113]:
len(filtered_data)


Out[113]:
1829

In [116]:
a = pd.DataFrame(filtered_data, columns=["Name", "Length", "Seq", "Problematic"])
a.to_csv(folder+"/data_info.csv")

In [123]:
b = a.query("Problematic == 0")

In [139]:
pd.read_csv(folder+"/data_info.csv", index_col=0)["Name"].values


Out[139]:
array(['1VBHA', '1K1GA', '3OHBA', ..., '4O1PD', '3BJEB', '4WY9A'],
      dtype=object)

In [142]:
a.drop_duplicates("Name")


Out[142]:
Name Length Seq Problematic
0 1VBHA 359 PALSGDLGTFMAWVDDVRKLKVLANADTPDDALTARNNGAQGIGLC... 0
1 1K1GA 88 SDKVMIPQDEYPEINFVGLLIGPRGNTLKNIEKECNAKIMIRGKGS... 0
2 3OHBA 128 PLSSRPVVKSMMSNKNLRGKSCNSIVDCISWLEVFCAELTSRIQDL... 0
3 2IVXA 141 SSRWFFTREQLENTPSRRCGVEADKELSCRQQAANLIQEMGQRLNV... 0
4 3IKKB 107 VLSLEPQHELKFRGPFTDVVTTNLKLGNPTDRNVCFKVKTTAPRRY... 0
5 2YTUA 85 PPNMTTNERRVIVPADPTLWTQEHVRQWLEWAIKEYSLMEIDTSFF... 0
6 2HR3D 68 VQFSQLVVLGAIDRLGGDVTPSELAAAERRSSNLAALLRELERGGL... 1
7 2P3HA 95 TETSPDKWLIDGDTPLDEVERAIGYELPEGDYETISGLLFDHANAL... 0
8 1X4UA 71 RYPTNNFGNCTGCSATFSVLKKRRSCSNCGNSFCSRCCSFKVPKSS... 0
9 4IHCH 257 EVEDNIRARMEEGYQYVRCQMGMYGGPKRSPRSKTPGIYFDPEAYA... 1
10 4IM7A 248 DNVTFPSTMVDRIVPAVTEDTLAKIEQLTGVRDAAGVACEPFRQWV... 0
11 4M7IA 417 KKVYLYIQMQLCRKENLKDWMNGRCTIEERERSVCLHIFLQIAEAV... 1
12 5IY9W 177 LLEMSAVVPDGIVAFFTSYQYMESTVASWYEQGILENIQRNKLLFI... 0
13 5K98D 255 ISVAGAQEKTALLRIGNDWCIPKGITPTTHIIKLPILSQSVDNEYY... 1
14 3GVZB 230 TASEGSLLAKNRDWKPDHAQSLRLLHPEHGYAYLGLYADNGSEPGI... 1
15 3C6MC 193 DLAYTRAIMGSGKEDYTGKDVLILGGGDGGILCEIVKLKPKMVTMV... 1
16 1UB9A 79 VRLGIMIFLLPRRKAPFSQIQKVLDLTPGNLDSHIRVLERNGLVKT... 0
17 3JU1B 340 VGVVTLNVEKALNALDLDVRATVQLNLWKKDPLIACVVLDGSGEKA... 1
18 4FGCE 81 PDFATIYISYIPDEKMVESKSLKLYLFSFRNHGDFHEDCMNIIMND... 0
19 3BZKA 65 VNAVGVDVNTASAALLARISGLNSTLAQNIVAHRDANGAFRTRDEL... 0
20 2M2KA 80 FKRYPKDALRLKRQGVGQVRFTLDRQGHVLAVTLVSSAGLPSLDRE... 0
21 2C81A 395 QHSDRTRRKIEEVFQSNRWAISGYWTGEESMERKFAKAFADFNGVP... 0
22 3B39A 56 SAFLFNSLMPQVDLSTPDGRARLSTLALPLISQVPGETLRIYLRQE... 0
23 1W1WD 1211 RLVGLELSNFKSYRGVTKVGFGESNFTSIIGPNGSGKSNMMDAISF... 1
24 1LYQB 102 HPELKSSVPQADSAVAAPEKIQLNFSENLTVKFSGAKLTMTGMKGM... 0
25 3EBRA 93 PWPFAPYSNDVVKYFKIDPVRGETITLLKAPAGEPRHHHTGTVIVY... 1
26 5FWZA 89 DIEMIATTMSVPRQVEVTEKFKSLVTAHNGKDEEMKDVAQDMKNYM... 0
27 3U07C 133 LNEFQNPTINNDTLHLGWLDVAAEPVIVSVPDDEGRYWILHTDGHY... 1
28 5J3BB 58 STNDFKPGLKVMLDSNPCSIMENEYVKPGKGQAFNRVKLRNLKTGK... 0
29 3BYRA 80 LPPEEVERIRAFLQERIRGRALEVHDLKTRRAGPRSFLEFHLVVRG... 0
... ... ... ... ...
1796 4MTNA 69 AGYRTKVAVSCADSNIDPVGACVGVRGARIRNVGEELGGERIEVVR... 1
1797 1MIYB 149 LAAREERWALLCHALGVQESRPFLRAWKLPNKVVDEAGAILTALAD... 0
1798 3KYHD 203 QPVSFQHSDVEEKLLAHDYYVCEKTDGLRVLMFIVINPVTGEQGCF... 0
1799 1W7CA 88 LAKEEVQEVLDLLHSTYNITEVTKADFFSNYVLWIETLKPNKTEAL... 0
1800 1PZSA 161 TAPDGTKVATAKFEFANGYATVTIATTGVGKLTPGFHGLHIHQVGK... 0
1801 2WP7A 143 YPVKLYVYDLSKGLARRLSPIMLGKQLEGIWHTSIVVHKDEFFFGS... 0
1802 5AV7D 136 GNFWSFRPVNKINQIVISYGGGGNNPIALTFSSTKAGSKDTITVGG... 1
1803 4DIED 216 VAVDGPSGTGKSSVAKELARQLGASYLDTGAMYRIVTLWVLRAGVD... 1
1804 1IXRB 60 FELLLSVSGVGPKVALALLSALPPRLLARALLEGDARLLTSASGVG... 0
1805 5J3EB 165 SHWLMKSEPESRLEKGVDVKFSIEDLKAQPKQTTCWDGVRNYQARN... 0
1806 3CR7D 166 RGLTIWLTGLSASGKSTLAVELEHQLVRDRRVHAYRLDGDNIRFGL... 1
1807 3PFLB 112 FGPGANPMHGRDQKGAVASLTSVAKLPFAYAKDGISYTFSIVPNAL... 0
1808 1XD3C 209 WLPLEANPEVTNQFLKQLGLHPNWQFVDVYGMDPELLSMVPRPVCA... 0
1810 3L0QB 210 IALIGGTSTAHASRSAHFISGIWGPYYSAILPEYWLNEGGQSATGA... 1
1811 3MDOB 202 QGGDVIVGLASSGQATYEKEYNGGGSNGLTSARHDVFSKYLAKKYP... 1
1813 4PV1A 190 VTSKYVPPHVNIFYCLGGITLTCFLIQFATGFAMTFYYKPTVTEAY... 0
1814 4A6EA 90 LLNDYANGFMVSQVLFAACELGVFDLLAEAPGPLDVAAVAAGVRAS... 0
1815 4XD7F 75 VIQVGPVVDVKFNGHLPIYNALKIQHKARNENEVDIDLTLEVALHL... 1
1816 3WO1A 165 IGTPLILKPTYLASSIGVTLITDTETAEDEFNRVNDYLKSINVPKA... 0
1817 5E27B 77 VIDGSIWDAIAGCEAGGNWAINTGNGYYGGVQFDQGTWEANGGLRY... 0
1818 5K12F 131 CNHVLSLSFPIRRDDGSWEVIEGYRAQHSQHRTPCKGGIRYSTDVS... 0
1819 1QFWB 105 RPRCRPINATLAVEKEGCPVCITVNTTICAGYCPTMTRVLQGVLPA... 0
1820 4ZQHB 127 LGTPDMKTPIAHALAYPERIKSGVMPLDLYQLGSLKFLAPDLDKFA... 0
1822 4HQEB 94 ILGRSWNGLIINYLSRCNDCSAHFSDMKRDLKTITPRALSLKLSEL... 0
1823 4PCQD 75 ISVNLQSSARGKIRSFIQQIRRKRQVMDVYFLAGADDFILHVAARD... 0
1824 5IM3B 99 LRVIKRNGTVVPYTDDKITVAITKAFLAVEAAASSRIHDTVRRLTE... 1
1825 3JYGF 176 QIAKEFDFCYGHRVWSQELNPDFSLDPCLSCRHLHGHQGKVIVHLE... 1
1826 4O1PD 57 MHRKTKQDQERIRKGGATALMDAAEKGHVGVVTILLHAMKAEVDAR... 0
1827 3BJEB 304 DRIIFVGDPGRVDVISGYFDKDSIRASRDHREIRFATGTYKGTPVT... 0
1828 4WY9A 278 NITKNTEDILASITKEYATQTQGIFGEMIALNKSISGTLTEMFRST... 1

1746 rows × 4 columns


In [140]:
a.query("Name == '5M3MC'")


Out[140]:
Name Length Seq Problematic
38 5M3MC 273 NFDLINIDTSIANAFRRIMISEVPSVAAEYVYFFNNTSVIQDEVLA... 0
580 5M3MC 138 VYFFNNTSVIQDEVLAHRIGLVPLKVDPDMLTWVDSNLPDDEKFTD... 0

In [133]:
len(b)


Out[133]:
1087

In [125]:
a.to_csv(folder+"/data_info.csv")

In [134]:
b.query("Length < 200").sample(1)


Out[134]:
Name Length Seq Problematic
135 1ZRTR 132 IFVDVSAVEVGTQLTVKWRGKPVFIRRRDEKDIELARSVPLGALRD... 0

In [135]:
b.query("Length < 200").shape


Out[135]:
(868, 4)

In [131]:
b.sort_values("Length")["Length"].hist(bins=100)


Out[131]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2846f4e0>

In [ ]:


In [ ]:


In [109]:
len(filtered_data)


Out[109]:
1087

In [110]:
pd.DataFrame([["a", 1],["b", 2]])


Out[110]:
0 1
0 a 1
1 b 2

In [82]:
seq1 = getSeqFromFasta(one)

In [84]:
from Bio.PDB.PDBParser import PDBParser
pdbFileLocation = pre + ".pdb"
structure = PDBParser().get_structure(name, pdbFileLocation)
seq = ""
for r in structure.get_residues():
    _, _, chain, (_, resId, _) = r.get_full_id()
    resName = three_to_one(r.get_resname())
    assert chain == "A"
    seq += resName

In [81]:
len(seq)


Out[81]:
359

In [85]:
seq == seq1


Out[85]:
True

In [87]:
seq1


Out[87]:
'PALSGDLGTFMAWVDDVRKLKVLANADTPDDALTARNNGAQGIGLCRTEHMFFASDERIKAVRQMIMAPTLELRQQALDRLLPYQRSDFEGIFRAMDGLPVTIRLLDPPLHEFLPEGNIEDIVSELCAETGANQEDALARIEKLSEVNPMLGFRGCRLGISYPELTEMQARAIFEAAIAMTNQGVQVFPEIMVPLVGTPQELGHQVTLIRQVAEKVFANVGKTIGYKVGTMIEIPRAALVADEIAEQAEFFSFGTNDLTQMTFGYSRDDVGKFIPVYLAQGILQHDPFEVLDQRGVGELVKFATERGRKARPNLKVGICGEHGGEPSSVAFFAKAGLDYVSCSPFRVPIARLAAAQVLV'

In [86]:
seq


Out[86]:
'PALSGDLGTFMAWVDDVRKLKVLANADTPDDALTARNNGAQGIGLCRTEHMFFASDERIKAVRQMIMAPTLELRQQALDRLLPYQRSDFEGIFRAMDGLPVTIRLLDPPLHEFLPEGNIEDIVSELCAETGANQEDALARIEKLSEVNPMLGFRGCRLGISYPELTEMQARAIFEAAIAMTNQGVQVFPEIMVPLVGTPQELGHQVTLIRQVAEKVFANVGKTIGYKVGTMIEIPRAALVADEIAEQAEFFSFGTNDLTQMTFGYSRDDVGKFIPVYLAQGILQHDPFEVLDQRGVGELVKFATERGRKARPNLKVGICGEHGGEPSSVAFFAKAGLDYVSCSPFRVPIARLAAAQVLV'

In [76]:
targetPre = "/Users/weilu/Research/server/march_2019/optimization_mult_seq/original_pdbs/"
os.system(f"cp {pre}.pdb {targetPre}{name.lower()[:4]}.pdb")


Out[76]:
0

In [75]:
pre


Out[75]:
'/Users/weilu/Research/optimization/mediated_term/multisequenceanddcafrustratometry/1VBHA_518-876'

In [73]:
name


Out[73]:
'1VBHA'

In [61]:
protein = one.split(".")[-2]

In [69]:
import re
re.split("\W+", one)


Out[69]:
['',
 'Users',
 'weilu',
 'Research',
 'optimization',
 'mediated_term',
 'multisequenceanddcafrustratometry',
 '1VBHA_518',
 '876',
 'fasta']

In [71]:



Out[71]:
'1VBHA_518-876'

In [62]:
one.split(".")


Out[62]:
['/Users/weilu/Research/optimization/mediated_term/multisequenceanddcafrustratometry/1VBHA_518-876',
 'fasta']

In [7]:
alen(data)


Out[7]:
1829

In [9]:
with open(folder+"/3GL5A_2-207_filtered_0.05.seqs", "r") as f:
    d = f.readlines()

In [11]:
len(d)


Out[11]:
6705

In [13]:
len(d[0])


Out[13]:
205

In [14]:
len(d[0].strip())


Out[14]:
204

In [15]:
d[0].strip()


Out[15]:
'RVEIWSDIACPWCYVGKARFEKALAAFPHRDGVEVVHRSFELDPGRAKDDVQPVLTLTAKYGSQEQAQAGEDNLGAQAAAEGLAYRTRDRDHGSTFRAQVASLYVKETFQWLAFDEGVFAALWADDRDVGDADVRLADIADGVGLDGEEIRTVVDDPEAWRDRLRDEFADAREAGITGVPTFVYDGHYGARGAVPPSQLERLLT'

In [18]:
with open(folder+"/3GL5A_2-207.fasta", "r") as f:
    fastaFile = f.readlines()
fasta = fastaFile[1].strip()

In [19]:
fasta


Out[19]:
'RVEIWSDIACPWCYVGKARFEKALAAFPHRDGVEVVHRSFELDPGRAKDDVQPVLTLTAKYGSQEQAQAGEDNLGAQAAAEGLAYRTRDRDHGSTFDLHRLLHLAKERGRHEALLDAFYRGNFADERSVFNDDERLVELAVGAGLDAEEVRAVLADPAAYADEVRADEREAAQLGATGVPFFVLDRAYGVSGAQPAEVFTQALT'

In [20]:
len(fasta)


Out[20]:
204

In [21]:
from Bio.PDB.PDBParser import PDBParser
pdbFileLocation = '/Users/weilu/Research/server/march_2019/optimization_mult_seq/cleaned_pdbs/3gl5.pdb'
structure = PDBParser().get_structure('3gl5', pdbFileLocation)
seq = ""
for r in structure.get_residues():
    _, _, chain, (_, resId, _) = r.get_full_id()
    resName = three_to_one(r.get_resname())
    assert chain == "A"
    seq += resName


RVEIWSDIACPWCYVGKARFEKALAAFPHRDGVEVVHRSFELDPGRAKDDVQPVLT
LTAKYG
SQEQAQAGEDNLGAQAAAEGLAYRTRDRDHGSTFDLHRLLHLAKERGRHEALLDAFYRGNFADERSVFNDDERLVELAVGAGLDAEEVRAVLADPAAYADEVRADEREAAQLGATGVPFFVLDRAYGVSGAQPAEVFTQALT

In [ ]:
seq = ""
for r in structure.get_residues():
    _, _, chain, (_, resId, _) = r.get_full_id()
    resName = three_to_one(r.get_resname())
    assert chain == "A"
    seq += resName

In [55]:
seq == fasta


Out[55]:
True

In [22]:
a = pp.get_sequence()

In [ ]:


In [29]:
list(structure.get_chains())


Out[29]:
[<Chain id=A>]

In [33]:
c = structure[0]["A"]

In [50]:
three_to_one('As')


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-50-9962ca1efbf3> in <module>
----> 1 three_to_one('As')

~/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/Polypeptide.py in three_to_one(s)
    143     KeyError: 'MSE'
    144     """
--> 145     i = d3_to_index[s]
    146     return dindex_to_1[i]
    147 

KeyError: 'As'

In [51]:
seq = ""
for r in structure.get_residues():
    _, _, chain, (_, resId, _) = r.get_full_id()
    resName = three_to_one(r.get_resname())
    assert chain == "A"
    seq += resName

In [52]:
seq


Out[52]:
'RVEIWSDIACPWCYVGKARFEKALAAFPHRDGVEVVHRSFELDPGRAKDDVQPVLTLTAKYGSQEQAQAGEDNLGAQAAAEGLAYRTRDRDHGSTFDLHRLLHLAKERGRHEALLDAFYRGNFADERSVFNDDERLVELAVGAGLDAEEVRAVLADPAAYADEVRADEREAAQLGATGVPFFVLDRAYGVSGAQPAEVFTQALT'

In [54]:
len(seq)


Out[54]:
204

In [ ]:


In [37]:
r = list(c.get_residues())[0]

In [38]:
r.get_segid()


Out[38]:
'    '

In [39]:
r.get_id()


Out[39]:
(' ', 1, ' ')

In [42]:
r.get_full_id()


Out[42]:
('3gl5', 0, 'A', (' ', 1, ' '))

In [48]:
r.get_resname()


Out[48]:
'THR'

In [45]:
list(structure.get_residues())


Out[45]:
[<Residue ARG het=  resseq=1 icode= >,
 <Residue VAL het=  resseq=2 icode= >,
 <Residue GLU het=  resseq=3 icode= >,
 <Residue ILE het=  resseq=4 icode= >,
 <Residue TRP het=  resseq=5 icode= >,
 <Residue SER het=  resseq=6 icode= >,
 <Residue ASP het=  resseq=7 icode= >,
 <Residue ILE het=  resseq=8 icode= >,
 <Residue ALA het=  resseq=9 icode= >,
 <Residue CYS het=  resseq=10 icode= >,
 <Residue PRO het=  resseq=11 icode= >,
 <Residue TRP het=  resseq=12 icode= >,
 <Residue CYS het=  resseq=13 icode= >,
 <Residue TYR het=  resseq=14 icode= >,
 <Residue VAL het=  resseq=15 icode= >,
 <Residue GLY het=  resseq=16 icode= >,
 <Residue LYS het=  resseq=17 icode= >,
 <Residue ALA het=  resseq=18 icode= >,
 <Residue ARG het=  resseq=19 icode= >,
 <Residue PHE het=  resseq=20 icode= >,
 <Residue GLU het=  resseq=21 icode= >,
 <Residue LYS het=  resseq=22 icode= >,
 <Residue ALA het=  resseq=23 icode= >,
 <Residue LEU het=  resseq=24 icode= >,
 <Residue ALA het=  resseq=25 icode= >,
 <Residue ALA het=  resseq=26 icode= >,
 <Residue PHE het=  resseq=27 icode= >,
 <Residue PRO het=  resseq=28 icode= >,
 <Residue HIS het=  resseq=29 icode= >,
 <Residue ARG het=  resseq=30 icode= >,
 <Residue ASP het=  resseq=31 icode= >,
 <Residue GLY het=  resseq=32 icode= >,
 <Residue VAL het=  resseq=33 icode= >,
 <Residue GLU het=  resseq=34 icode= >,
 <Residue VAL het=  resseq=35 icode= >,
 <Residue VAL het=  resseq=36 icode= >,
 <Residue HIS het=  resseq=37 icode= >,
 <Residue ARG het=  resseq=38 icode= >,
 <Residue SER het=  resseq=39 icode= >,
 <Residue PHE het=  resseq=40 icode= >,
 <Residue GLU het=  resseq=41 icode= >,
 <Residue LEU het=  resseq=42 icode= >,
 <Residue ASP het=  resseq=43 icode= >,
 <Residue PRO het=  resseq=44 icode= >,
 <Residue GLY het=  resseq=45 icode= >,
 <Residue ARG het=  resseq=46 icode= >,
 <Residue ALA het=  resseq=47 icode= >,
 <Residue LYS het=  resseq=48 icode= >,
 <Residue ASP het=  resseq=49 icode= >,
 <Residue ASP het=  resseq=50 icode= >,
 <Residue VAL het=  resseq=51 icode= >,
 <Residue GLN het=  resseq=52 icode= >,
 <Residue PRO het=  resseq=53 icode= >,
 <Residue VAL het=  resseq=54 icode= >,
 <Residue LEU het=  resseq=55 icode= >,
 <Residue THR het=  resseq=56 icode= >,
 <Residue LEU het=  resseq=57 icode= >,
 <Residue THR het=  resseq=58 icode= >,
 <Residue ALA het=  resseq=59 icode= >,
 <Residue LYS het=  resseq=60 icode= >,
 <Residue TYR het=  resseq=61 icode= >,
 <Residue GLY het=  resseq=62 icode= >,
 <Residue SER het=  resseq=63 icode= >,
 <Residue GLN het=  resseq=64 icode= >,
 <Residue GLU het=  resseq=65 icode= >,
 <Residue GLN het=  resseq=66 icode= >,
 <Residue ALA het=  resseq=67 icode= >,
 <Residue GLN het=  resseq=68 icode= >,
 <Residue ALA het=  resseq=69 icode= >,
 <Residue GLY het=  resseq=70 icode= >,
 <Residue GLU het=  resseq=71 icode= >,
 <Residue ASP het=  resseq=72 icode= >,
 <Residue ASN het=  resseq=73 icode= >,
 <Residue LEU het=  resseq=74 icode= >,
 <Residue GLY het=  resseq=75 icode= >,
 <Residue ALA het=  resseq=76 icode= >,
 <Residue GLN het=  resseq=77 icode= >,
 <Residue ALA het=  resseq=78 icode= >,
 <Residue ALA het=  resseq=79 icode= >,
 <Residue ALA het=  resseq=80 icode= >,
 <Residue GLU het=  resseq=81 icode= >,
 <Residue GLY het=  resseq=82 icode= >,
 <Residue LEU het=  resseq=83 icode= >,
 <Residue ALA het=  resseq=84 icode= >,
 <Residue TYR het=  resseq=85 icode= >,
 <Residue ARG het=  resseq=86 icode= >,
 <Residue THR het=  resseq=87 icode= >,
 <Residue ARG het=  resseq=88 icode= >,
 <Residue ASP het=  resseq=89 icode= >,
 <Residue ARG het=  resseq=90 icode= >,
 <Residue ASP het=  resseq=91 icode= >,
 <Residue HIS het=  resseq=92 icode= >,
 <Residue GLY het=  resseq=93 icode= >,
 <Residue SER het=  resseq=94 icode= >,
 <Residue THR het=  resseq=95 icode= >,
 <Residue PHE het=  resseq=96 icode= >,
 <Residue ASP het=  resseq=97 icode= >,
 <Residue LEU het=  resseq=98 icode= >,
 <Residue HIS het=  resseq=99 icode= >,
 <Residue ARG het=  resseq=100 icode= >,
 <Residue LEU het=  resseq=101 icode= >,
 <Residue LEU het=  resseq=102 icode= >,
 <Residue HIS het=  resseq=103 icode= >,
 <Residue LEU het=  resseq=104 icode= >,
 <Residue ALA het=  resseq=105 icode= >,
 <Residue LYS het=  resseq=106 icode= >,
 <Residue GLU het=  resseq=107 icode= >,
 <Residue ARG het=  resseq=108 icode= >,
 <Residue GLY het=  resseq=109 icode= >,
 <Residue ARG het=  resseq=110 icode= >,
 <Residue HIS het=  resseq=111 icode= >,
 <Residue GLU het=  resseq=112 icode= >,
 <Residue ALA het=  resseq=113 icode= >,
 <Residue LEU het=  resseq=114 icode= >,
 <Residue LEU het=  resseq=115 icode= >,
 <Residue ASP het=  resseq=116 icode= >,
 <Residue ALA het=  resseq=117 icode= >,
 <Residue PHE het=  resseq=118 icode= >,
 <Residue TYR het=  resseq=119 icode= >,
 <Residue ARG het=  resseq=120 icode= >,
 <Residue GLY het=  resseq=121 icode= >,
 <Residue ASN het=  resseq=122 icode= >,
 <Residue PHE het=  resseq=123 icode= >,
 <Residue ALA het=  resseq=124 icode= >,
 <Residue ASP het=  resseq=125 icode= >,
 <Residue GLU het=  resseq=126 icode= >,
 <Residue ARG het=  resseq=127 icode= >,
 <Residue SER het=  resseq=128 icode= >,
 <Residue VAL het=  resseq=129 icode= >,
 <Residue PHE het=  resseq=130 icode= >,
 <Residue ASN het=  resseq=131 icode= >,
 <Residue ASP het=  resseq=132 icode= >,
 <Residue ASP het=  resseq=133 icode= >,
 <Residue GLU het=  resseq=134 icode= >,
 <Residue ARG het=  resseq=135 icode= >,
 <Residue LEU het=  resseq=136 icode= >,
 <Residue VAL het=  resseq=137 icode= >,
 <Residue GLU het=  resseq=138 icode= >,
 <Residue LEU het=  resseq=139 icode= >,
 <Residue ALA het=  resseq=140 icode= >,
 <Residue VAL het=  resseq=141 icode= >,
 <Residue GLY het=  resseq=142 icode= >,
 <Residue ALA het=  resseq=143 icode= >,
 <Residue GLY het=  resseq=144 icode= >,
 <Residue LEU het=  resseq=145 icode= >,
 <Residue ASP het=  resseq=146 icode= >,
 <Residue ALA het=  resseq=147 icode= >,
 <Residue GLU het=  resseq=148 icode= >,
 <Residue GLU het=  resseq=149 icode= >,
 <Residue VAL het=  resseq=150 icode= >,
 <Residue ARG het=  resseq=151 icode= >,
 <Residue ALA het=  resseq=152 icode= >,
 <Residue VAL het=  resseq=153 icode= >,
 <Residue LEU het=  resseq=154 icode= >,
 <Residue ALA het=  resseq=155 icode= >,
 <Residue ASP het=  resseq=156 icode= >,
 <Residue PRO het=  resseq=157 icode= >,
 <Residue ALA het=  resseq=158 icode= >,
 <Residue ALA het=  resseq=159 icode= >,
 <Residue TYR het=  resseq=160 icode= >,
 <Residue ALA het=  resseq=161 icode= >,
 <Residue ASP het=  resseq=162 icode= >,
 <Residue GLU het=  resseq=163 icode= >,
 <Residue VAL het=  resseq=164 icode= >,
 <Residue ARG het=  resseq=165 icode= >,
 <Residue ALA het=  resseq=166 icode= >,
 <Residue ASP het=  resseq=167 icode= >,
 <Residue GLU het=  resseq=168 icode= >,
 <Residue ARG het=  resseq=169 icode= >,
 <Residue GLU het=  resseq=170 icode= >,
 <Residue ALA het=  resseq=171 icode= >,
 <Residue ALA het=  resseq=172 icode= >,
 <Residue GLN het=  resseq=173 icode= >,
 <Residue LEU het=  resseq=174 icode= >,
 <Residue GLY het=  resseq=175 icode= >,
 <Residue ALA het=  resseq=176 icode= >,
 <Residue THR het=  resseq=177 icode= >,
 <Residue GLY het=  resseq=178 icode= >,
 <Residue VAL het=  resseq=179 icode= >,
 <Residue PRO het=  resseq=180 icode= >,
 <Residue PHE het=  resseq=181 icode= >,
 <Residue PHE het=  resseq=182 icode= >,
 <Residue VAL het=  resseq=183 icode= >,
 <Residue LEU het=  resseq=184 icode= >,
 <Residue ASP het=  resseq=185 icode= >,
 <Residue ARG het=  resseq=186 icode= >,
 <Residue ALA het=  resseq=187 icode= >,
 <Residue TYR het=  resseq=188 icode= >,
 <Residue GLY het=  resseq=189 icode= >,
 <Residue VAL het=  resseq=190 icode= >,
 <Residue SER het=  resseq=191 icode= >,
 <Residue GLY het=  resseq=192 icode= >,
 <Residue ALA het=  resseq=193 icode= >,
 <Residue GLN het=  resseq=194 icode= >,
 <Residue PRO het=  resseq=195 icode= >,
 <Residue ALA het=  resseq=196 icode= >,
 <Residue GLU het=  resseq=197 icode= >,
 <Residue VAL het=  resseq=198 icode= >,
 <Residue PHE het=  resseq=199 icode= >,
 <Residue THR het=  resseq=200 icode= >,
 <Residue GLN het=  resseq=201 icode= >,
 <Residue ALA het=  resseq=202 icode= >,
 <Residue LEU het=  resseq=203 icode= >,
 <Residue THR het=  resseq=204 icode= >]

In [ ]: