In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir
import glob
import matplotlib.pyplot as plt
import matplotlib as mpl
# sys.path.insert(0,'..')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.Polypeptide import one_to_three
from Bio.PDB.Polypeptide import three_to_one
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
# from small_script.myFunctions import *
sys.path.insert(0, "/Users/weilu/openmmawsem")
from helperFunctions.myFunctions import *
from collections import defaultdict
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2
In [2]:
plt.rcParams['figure.figsize'] = np.array([16.18033, 10]) #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100
plt.rcParams.update({'font.size': 22})
In [71]:
# pre = "/Users/weilu/Research/server_backup/feb_2019/jan_optimization/gammas/"
# pre = "/Users/weilu/Research/server/april_2019/optimization_test/gammas/"
pre = "/Users/weilu/Research/server/sep_2019/peptide_optimization/optimization/gammas/"
# pp = "cath-dataset-nonredundant-S20Clean_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0"
# pp = "proteins_name_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
pp = f"protein_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
A_name = pp + "_A"
B_name = pp + "_B"
B_filtered_name = pp + "_B_filtered"
P_name = pp + "_P"
Gamma_name = pp + "_gamma"
Gamma_filtered_name = pp + "_gamma_filtered"
Lamb_name = pp + "_lamb"
Lamb_filtered_name = pp + "_lamb_filtered"
A = np.loadtxt(pre+A_name)
B = np.loadtxt(pre+B_name)
B_filtered = np.loadtxt(pre+B_filtered_name, dtype=complex, converters={
0: lambda s: complex(s.decode().replace('+-', '-'))})
Gamma = np.loadtxt(pre+Gamma_name)
Gamma_filtered = np.loadtxt(pre+Gamma_filtered_name, dtype=complex, converters={
0: lambda s: complex(s.decode().replace('+-', '-'))})
Lamb = np.loadtxt(pre+Lamb_name, dtype=complex, converters={
0: lambda s: complex(s.decode().replace('+-', '-'))})
Lamb_filtered = np.loadtxt(pre+Lamb_filtered_name, dtype=complex, converters={
0: lambda s: complex(s.decode().replace('+-', '-'))})
half_B_name = pp + "_half_B"
half_B = np.loadtxt(pre+half_B_name)
other_half_B_name = pp + "_other_half_B"
other_half_B = np.loadtxt(pre+other_half_B_name)
std_half_B_name = pp + "_std_half_B"
std_half_B = np.loadtxt(pre+std_half_B_name)
# pre = "/Users/weilu/Research/server/april_2019/"
location = pre + "../../phis/protein_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_phi_decoy_summary.txt"
A_prime = np.loadtxt(location)
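Note on the complex converters above: the saved _B_filtered/_gamma_filtered/_lamb files appear to store complex numbers in a form like 1.5+-0.5j, which Python's complex() cannot parse directly; the '+-' to '-' replacement fixes that. A minimal illustration:
# complex("1.5+-0.5j") raises ValueError; after the replacement it parses fine
s = "1.5+-0.5j"
print(complex(s.replace('+-', '-')))   # (1.5-0.5j)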
In [63]:
pre = "/Users/weilu/Research/server/sep_2019/peptide_optimization/optimization/gammas/"
name_pre = f"protein_list_phi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
A_name = f"{name_pre}_A"
B_name = f"{name_pre}_B"
B_filtered_name = f"{name_pre}_B_filtered"
P_name = f"{name_pre}_P"
Gamma_name = f"{name_pre}_gamma"
Gamma_filtered_name = f"{name_pre}_gamma_filtered"
Lamb_name = f"{name_pre}_lamb"
Lamb_filtered_name = f"{name_pre}_lamb_filtered"
A = np.loadtxt(pre+A_name)
B = np.loadtxt(pre+B_name)
B_filtered = np.loadtxt(pre+B_filtered_name, dtype=complex, converters={
0: lambda s: complex(s.decode().replace('+-', '-'))})
Gamma = np.loadtxt(pre+Gamma_name)
Gamma_filtered = np.loadtxt(pre+Gamma_filtered_name, dtype=complex, converters={
0: lambda s: complex(s.decode().replace('+-', '-'))})
Lamb = np.loadtxt(pre+Lamb_name, dtype=complex, converters={
0: lambda s: complex(s.decode().replace('+-', '-'))})
Lamb_filtered = np.loadtxt(pre+Lamb_filtered_name, dtype=complex, converters={
0: lambda s: complex(s.decode().replace('+-', '-'))})
In [100]:
# the maximum difference between the loaded and the recomputed eigenvalues is about 1e-5.
np.abs(lamb - Lamb).max()
Out[100]:
In [99]:
plt.plot(Lamb)
plt.yscale("log")
In [105]:
lamb, P = np.linalg.eig(B)
lamb, P = sort_eigenvalues_and_eigenvectors(lamb, P)
filtered_lamb = np.copy(lamb)
cutoff_mode = 100
filtered_B_inv, filtered_lamb, P = get_filtered_B_inv_lambda_and_P(filtered_lamb,
cutoff_mode, P)
filtered_gamma = np.dot(filtered_B_inv, A)
filtered_B = np.linalg.inv(filtered_B_inv)
plot_contact_well(filtered_gamma[:210], inferBound=True)
plot_contact_well(filtered_gamma[210:420], inferBound=True)
plot_contact_well(filtered_gamma[420:], inferBound=True)
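For reference, a minimal sketch of the kind of mode filtering get_filtered_B_inv_lambda_and_P performs, assuming the common convention of flattening the spectrum beyond cutoff_mode before inverting (the exact convention in pyCodeLib may differ):
def filtered_B_inv_sketch(lamb, P, cutoff_mode):
    # assumes a real symmetric B, eigenvalues sorted in decreasing order,
    # with the matching eigenvectors as the columns of P
    lamb = np.real(np.copy(lamb))
    P = np.real(P)
    lamb[cutoff_mode:] = lamb[cutoff_mode - 1]      # flatten the noisy tail of the spectrum
    return P.dot(np.diag(1.0 / lamb)).dot(P.T)      # rebuild B^-1 from the filtered spectrum
# e.g. filtered_B_inv_sketch(lamb, P, 100)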
In [106]:
save_gamma_pre = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/"
np.savetxt(f"{save_gamma_pre}/cutoff100", filtered_gamma)
In [107]:
os.chdir('/Users/weilu/opt/notebook/Optimization')
Out[107]:
In [109]:
os.chdir("/Users/weilu/Research/server/sep_2019/peptide_optimization/optimization/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
gamma_file_name = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/cutoff100"
data = validate_hamiltonian_wei("phi_list.txt", "protein_list_tiny", gamma_file_name, "shuffle", 1000, mode=0)
data
Out[109]:
In [113]:
gamma_file_name = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/converted_original_gamma.dat"
original = np.loadtxt(gamma_file_name)
gamma_file_name = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/converted_original_gamma_2.dat"
original_2 = np.loadtxt(gamma_file_name)
In [117]:
data.plot("Protein", "E_native")
Out[117]:
In [120]:
os.chdir("/Users/weilu/Research/server/sep_2019/peptide_optimization/optimization/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
# gamma_file_name = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/cutoff100"
gamma_file_name = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/original_gamma"
data = validate_hamiltonian_wei("phi_list.txt", "protein_list_tiny", gamma_file_name, "shuffle", 1000, mode=0)
data
Out[120]:
In [121]:
data
Out[121]:
In [123]:
gamma_file_name = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/original_gamma"
original_gamma = np.loadtxt(gamma_file_name)
In [124]:
np.dot(A_prime, original_gamma)
Out[124]:
In [126]:
# we want to impose an additional constraint so that A' * gamma = constant (-562.23).
c = -562.23
B_inv = filtered_B_inv
lambda_2 = (A_prime.dot(B_inv).dot(A) - c) / (A_prime.dot(B_inv).dot(A_prime) )
gamma_new = B_inv.dot(A-A_prime*lambda_2)
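The lambda_2 above is just the Lagrange multiplier for the constraint A_prime.dot(gamma) = c: substituting gamma = B_inv.dot(A - lambda_2*A_prime) into that constraint and solving gives lambda_2 = (A_prime.dot(B_inv).dot(A) - c) / (A_prime.dot(B_inv).dot(A_prime)). Quick numerical check (should print a value close to c = -562.23):
print(A_prime.dot(B_inv).dot(A - A_prime*lambda_2))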
In [127]:
np.dot(A_prime, gamma_new)
Out[127]:
In [128]:
plot_contact_well(filtered_gamma[:210], inferBound=True)
plot_contact_well(filtered_gamma[210:420], inferBound=True)
plot_contact_well(filtered_gamma[420:], inferBound=True)
In [129]:
# save the gamma with the A'*gamma = c constraint imposed
save_gamma_pre = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/"
np.savetxt(f"{save_gamma_pre}/cutoff100_impose_Aprime_constraint", gamma_new)
In [130]:
os.chdir("/Users/weilu/Research/server/sep_2019/peptide_optimization/optimization/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
# gamma_file_name = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/cutoff100"
gamma_file_name = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/cutoff100_impose_Aprime_constraint"
data = validate_hamiltonian_wei("phi_list.txt", "protein_list_tiny", gamma_file_name, "shuffle", 1000, mode=0)
data
Out[130]:
In [131]:
data
Out[131]:
In [133]:
# mix gammas so that we don't overfit too much.
alpha = 0.95
mixed_gamma = alpha*original_gamma + (1-alpha)*gamma_new
save_gamma_pre = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/"
np.savetxt(f"{save_gamma_pre}/mixed_original_and_cutoff100_impose_Aprime_constraint", mixed_gamma)
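If needed, the mixing fraction could be scanned rather than fixed at 0.95; a minimal sketch reusing the validation call from the cells above (the file names written inside this loop are hypothetical):
os.chdir("/Users/weilu/Research/server/sep_2019/peptide_optimization/optimization/")
for alpha in [0.5, 0.7, 0.9, 0.95]:
    mixed = alpha*original_gamma + (1 - alpha)*gamma_new
    fname = f"{save_gamma_pre}/mixed_alpha_{alpha}"   # hypothetical file name for this scan
    np.savetxt(fname, mixed)
    d = validate_hamiltonian_wei("phi_list.txt", "protein_list_tiny", fname, "shuffle", 1000, mode=0)
    print(alpha, d["E_native"].mean())   # E_native is one of the columns plotted earlier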
In [134]:
os.chdir("/Users/weilu/Research/server/sep_2019/peptide_optimization/optimization/")
# gamma_file_name = "gamma_iter1_combined_mar06.dat"
# gamma_file_name = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/cutoff100"
gamma_file_name = "/Users/weilu/Research/server/sep_2019/peptide_optimization/saved_gammas/mixed_original_and_cutoff100_impose_Aprime_constraint"
data = validate_hamiltonian_wei("phi_list.txt", "protein_list_tiny", gamma_file_name, "shuffle", 1000, mode=0)
data
Out[134]:
In [135]:
data
Out[135]:
In [ ]:
In [110]:
# with additional constraint
data
Out[110]:
In [ ]:
In [ ]:
# constraint T_f = T_c
B_inverse = np.linalg.pinv(B)
up = A.dot(B_inverse).dot(A) - A_prime.dot(B_inverse).dot(A)
down = A.dot(B_inverse).dot(A_prime) - A_prime.dot(B_inverse).dot(A_prime)
lambda_1 = up / down
print(lambda_1)
g = B_inverse.dot(A - A_prime*lambda_1)
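Because lambda_1 is chosen so that up - lambda_1*down = 0, the resulting g satisfies A.dot(g) = A_prime.dot(g) up to numerical error; a quick check:
print(A.dot(g), A_prime.dot(g))
print(np.isclose(A.dot(g), A_prime.dot(g)))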
In [102]:
# lamb, P = np.linalg.eig(B)
# lamb, P = sort_eigenvalues_and_eigenvectors(lamb, P)
# filtered_lamb = np.copy(lamb)
# cutoff_mode = 200
# filtered_B_inv, filtered_lamb, P = get_filtered_B_inv_lambda_and_P(filtered_lamb,
# cutoff_mode, P)
# filtered_gamma = np.dot(filtered_B_inv, A)
# filtered_B = np.linalg.inv(filtered_B_inv)
# plot_contact_well(filtered_gamma[:210], inferBound=True)
In [ ]:
In [ ]:
In [ ]:
# "mkdir -p database/dompdb"
# "mkdir -p database/S20_seq"
# "mkdir optimization"
# cp ~/opt/optimization/phi_list_contact.txt phi_list.txt
In [9]:
def mycp(source, target):
os.system(f"cp {source} {target}")
do = os.system
def getSeq(fileLocation):
p = PDBParser()
s = p.get_structure("test", fileLocation)
seq = ""
residues = list(s.get_residues())
for residue in residues:
res_id = residue.get_id()[0]
if res_id==' ':
residue_name = residue.get_resname()
seq += three_to_one(residue_name)
return seq
# get chains and seq
def getChainsAndSeq(fileLocation):
# fileLocation = "/Users/weilu/Research/examples/optimization/optimization/Structure_Ensemble/1.pdb"
p = PDBParser()
pdb = p.get_structure("test", fileLocation)
residues = list(pdb.get_residues())
seq = ""
chains = ""
for residue in residues:
res_id = residue.get_id()[0]
chain = residue.get_full_id()[2]
if res_id==' ':
residue_name = residue.get_resname()
seq += three_to_one(residue_name)
chains += chain
return chains, seq
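A quick usage example of the two helpers, using the Structure_Ensemble pdb referenced in the comment above:
exampleLocation = "/Users/weilu/Research/examples/optimization/optimization/Structure_Ensemble/1.pdb"
chains, seq = getChainsAndSeq(exampleLocation)
print(len(seq), len(chains))            # one letter per standard residue in both strings
print(set(chains))                      # chain ids present in the structure
print(seq == getSeq(exampleLocation))   # getSeq should return the same sequence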
In [40]:
pdbFolderList = glob.glob("/Users/weilu/Downloads/Optimization_Xfunnel/Structure_Ensemble_*")
In [34]:
pdbFolderList = [ '/Users/weilu/Downloads/Optimization_Xfunnel/Structure_Ensemble_2BNQ',
'/Users/weilu/Downloads/Optimization_Xfunnel/Structure_Ensemble_4FTV']
In [4]:
pdbFolder = pdbFolderList[0]
In [5]:
pdbName = pdbFolder.split("_")[-1]
In [6]:
pdbName
Out[6]:
In [41]:
len(pdbFolderList)
Out[41]:
In [43]:
# pre = "/Users/weilu/Research/server/sep_2019/peptide_optimization_specific_test"
pre = "/Users/weilu/Research/server/sep_2019/peptide_optimization_trial_4_duplicate"
do(f"mkdir -p {pre}/database/dompdb")
do(f"mkdir -p {pre}/database/S20_seq")
for pdbFolder in pdbFolderList:
pdbName = pdbFolder.split("_")[-1]
source = pdbFolder + f"/*ab.pdb"
p_list = glob.glob(source)
for p in p_list:
target = f"{pre}/database/dompdb/{pdbName}.pdb"
        ## copy native pdb into dompdb
mycp(p, target)
        ## extract the native seq and write it into S20_seq
seq = getSeq(target)
fileLocation = f"{pre}/database/S20_seq/{pdbName}.seq"
with open(fileLocation, "w") as out:
out.write(seq+"\n")
In [44]:
do(f"mkdir -p {pre}/optimization")
## write protein_list
fileLocation = f"{pre}/optimization/protein_list"
with open(fileLocation, "w") as out:
for pdbFolder in pdbFolderList:
pdbName = pdbFolder.split("_")[-1]
out.write(f"{pdbName}\n")
In [45]:
do(f"mkdir -p {pre}/optimization/decoys/shuffle")
# generate decoys
for pdbFolder in pdbFolderList:
pdbName = pdbFolder.split("_")[-1]
source = pdbFolder + f"/*ab.pdb"
p_list = glob.glob(source)
# print(p_list, source)
assert len(p_list) == 1
chain_seq, seq = getChainsAndSeq(p_list[0])
    print(pdbName, len(seq), len(chain_seq))
decoy_list = []
with open(f"{pdbFolder}/peptide.txt") as f:
for line in f:
pep = line.strip()
assert len(pep) == 9
for c in list(set(chain_seq)):
if chain_seq.count(c) == 9:
first_c = chain_seq.find(c)
a = list(seq)
a[first_c:first_c+9] = pep
decoy = "".join(a)
decoy_list.append(decoy)
fileLocation = f"{pre}/optimization/decoys/shuffle/{pdbName}.decoys"
with open(fileLocation, "w") as out:
for decoy in decoy_list:
out.write(decoy+"\n")
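A small sanity check on the decoys from the last pdbFolder processed above, assuming each decoy should keep the native length and differ from the native sequence only inside one 9-residue peptide window:
for decoy in decoy_list:
    assert len(decoy) == len(seq)
    diffs = [i for i in range(len(seq)) if decoy[i] != seq[i]]
    # every substitution should fall inside a single 9-residue stretch
    assert len(diffs) == 0 or diffs[-1] - diffs[0] < 9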
In [46]:
do(f"mkdir -p {pre}/phis")
do(f"cp ~/opt/optimization/phi_list_contact.txt {pre}/optimization/phi_list.txt")
Out[46]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [29]:
do(f"mkdir -p {pre}/optimization/decoys/shuffle")
# generate decoys
for pdbFolder in pdbFolderList:
pdbName = pdbFolder.split("_")[-1]
source = pdbFolder + f"/*ab.pdb"
p_list = glob.glob(source)
if pdbName != "3D39":
continue
# print(p_list, source)
assert len(p_list) == 1
chain_seq, seq = getChainsAndSeq(p_list[0])
    print(pdbName, len(seq), len(chain_seq))
decoy_list = []
decoy = ""
with open(f"{pdbFolder}/peptide.txt") as f:
for line in f:
pep = line.strip()
assert len(pep) == 9
for c in list(set(chain_seq)):
if chain_seq.count(c) == 9:
first_c = chain_seq.find(c)
a = list(seq)
a[first_c:first_c+9] = pep
decoy = "".join(a)
decoy_list.append(decoy)
fileLocation = f"{pre}/optimization/decoys/shuffle/{pdbName}.decoys"
with open(fileLocation, "w") as out:
for decoy in decoy_list:
out.write(decoy+"\n")
In [31]:
list(set(chain_seq))
Out[31]:
In [32]:
chain_seq.count("Y")
Out[32]:
In [ ]:
In [205]:
## write protein_list
fileLocation = f"{pre}/optimization/protein_list"
with open(fileLocation, "w") as out:
for pdbFolder in pdbFolderList:
pdbName = pdbFolder.split("_")[-1]
for i in range(1, 6):
out.write(f"{pdbName}_{i}\n")
# generate decoys
for pdbFolder in pdbFolderList:
pdbName = pdbFolder.split("_")[-1]
for i in range(1, 6):
        # source = pdbFolder + f"/{i}.pdb"
        source = pdbFolder + f"/*ab.pdb"
        p_list = glob.glob(source)
        assert len(p_list) == 1
        chain_seq, seq = getChainsAndSeq(p_list[0])
print(pdbName, i, len(seq), len(chain_seq))
decoy_list = []
with open(f"{pdbFolder}/peptide.txt") as f:
for line in f:
pep = line.strip()
assert len(pep) == 9
for c in list(set(chain_seq)):
if chain_seq.count(c) == 9:
first_c = chain_seq.find(c)
a = list(seq)
a[first_c:first_c+9] = pep
decoy = "".join(a)
decoy_list.append(decoy)
fileLocation = f"{pre}/optimization/decoys/shuffle/{pdbName}_{i}.decoys"
with open(fileLocation, "w") as out:
for decoy in decoy_list:
out.write(decoy+"\n")
In [19]:
In [168]:
# ensure they are all 9-residue peptides.
for pdbFolder in pdbFolderList:
pdbName = pdbFolder.split("_")[-1]
# print(pdbName)
with open(f"{pdbFolder}/peptide.txt") as f:
for line in f:
pep = line.strip()
assert len(pep) == 9
In [160]:
fileLocation
Out[160]:
In [164]:
for c in "ABCDEFG":
if chain_seq.count(c) == 9:
first_c = chain_seq.find(c)
a = list(seq)
a[first_c:first_c+9] = pep
decoy = "".join(a)
decoy_list.append(decoy)
Out[164]:
In [ ]:
In [3]:
# get seq
fileLocation = "/Users/weilu/Research/examples/optimization/optimization/Structure_Ensemble/1.pdb"
p = PDBParser()
pdb = p.get_structure("test", fileLocation)
residues = list(pdb.get_residues())
seq = ""
chains = ""
for residue in residues:
res_id = residue.get_id()[0]
chain = residue.get_full_id()[2]
if res_id==' ':
residue_name = residue.get_resname()
seq += three_to_one(residue_name)
chains += chain
In [5]:
# get decoy
decoy_list = []
with open("/Users/weilu/Research/examples/optimization/optimization/Structure_Ensemble/peptide.txt") as f:
for line in f:
pep = line.strip()
assert len(pep) == 9
a = list(seq)
a[180:189] = pep
decoy = "".join(a)
decoy_list.append(decoy)
In [38]:
for i in range(1, 91):
pre = "/Users/weilu/Research/server/sep_2019/peptide_optimization"
fileLocation = f"{pre}/database/S20_seq/{i}.seq"
with open(fileLocation, "w") as out:
out.write(seq+"\n")
In [40]:
for i in range(1, 91):
fileLocation = f"{pre}/optimization/decoys/shuffle/{i}.decoys"
with open(fileLocation, "w") as out:
for decoy in decoy_list:
out.write(decoy+"\n")
In [42]:
with open("/Users/weilu/Research/server/sep_2019/peptide_optimization/optimization/protein_list", "w") as out:
for i in range(1, 91):
out.write(f"{i}\n")
In [41]:
for i in range(1, 91):
os.system(f"cp /Users/weilu/Research/examples/optimization/optimization/Structure_Ensemble/{i}.pdb /Users/weilu/Research/server/sep_2019/peptide_optimization/database/dompdb/")
In [39]:
len(decoy_list)
Out[39]:
In [6]:
seq[180:189]
Out[6]:
In [49]:
a = list(seq)
a[180:189] = list('FIFLLFLTL')
In [151]:
In [26]:
all_seq = []
for i in range(1, 91):
fileLocation = f"/Users/weilu/Research/examples/optimization/optimization/Structure_Ensemble/{i}.pdb"
seq = getSeq(fileLocation)
assert len(seq) == 414
all_seq.append(seq)
# assert preSeq == seq
# preSeq = seq
In [27]:
all_seq[0]
Out[27]:
In [28]:
all_seq[0] == all_seq[1]
Out[28]:
In [29]:
for i in range(90):
if all_seq[i] != all_seq[0]:
print(i)
In [9]:
def getAllFrames(movieLocation):
# movieLocation = "/Users/weilu/Research/examples/openMM_simulation/test_2/movie.pdb"
location = movieLocation
with open(location) as f:
a = f.readlines()
n = len(a)
# get the position of every model title
model_title_index_list = []
for i in range(n):
if len(a[i]) >= 5 and a[i][:5] == "MODEL":
model_title_index = i
model_title_index_list.append(model_title_index)
model_title_index_list.append(n)
check_array = np.diff(model_title_index_list)
if np.allclose(check_array, check_array[0]):
size = check_array[0]
elif np.allclose(check_array[:-1], check_array[0]) and check_array[-1] == check_array[0] + 1:
# this is ok. with extra "END"
size = check_array[0]
else:
        print("!!!! Something is wrong !!!!")
        print(check_array)
        size = None
    return a, size
In [10]:
a, size = getAllFrames("/Users/weilu/Research/examples/openMM_simulation/test_2/movie.pdb")  # example movie from the comment in getAllFrames
num_of_frames = len(a) // size
In [11]:
frame = 5
oneFrame = a[size*frame:size*(frame+1)]
In [60]:
frame = num_of_frames
oneFrame = a[size*frame:size*(frame+1)]
In [61]:
oneFrame
Out[61]:
In [45]:
# s = p.get_structure("test", f)
# residues = list(s.get_residues())
In [24]:
import io
f = io.StringIO("".join(oneFrame))
MAX_OFFSET=4
DISTANCE_CUTOFF=9.5
s = p.get_structure("test", f)
chains = s[0].get_list()
# collect the CA (for GLY) / CB coordinates from the parsed structure
native_coords = []
for chain in chains:
dis = []
all_res = []
for res in chain:
is_regular_res = res.has_id('CA') and res.has_id('O')
res_id = res.get_id()[0]
if (res.get_resname()=='GLY'):
native_coords.append(res['CA'].get_coord())
elif (res_id==' ' or res_id=='H_MSE' or res_id=='H_M3L' or res_id=='H_CAS') and is_regular_res:
native_coords.append(res['CB'].get_coord())
else:
print('ERROR: irregular residue at %s!' % res)
exit()
native_contacts_table = compute_native_contacts(native_coords, MAX_OFFSET, DISTANCE_CUTOFF)
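For reference, a minimal sketch of what compute_native_contacts presumably produces, assuming the usual definition of a symmetric 0/1 table over residue pairs that are more than MAX_OFFSET apart in sequence and closer than DISTANCE_CUTOFF in space (the pyCodeLib implementation may differ in details):
def compute_native_contacts_sketch(coords, max_offset, distance_cutoff):
    coords = np.asarray(coords)
    n = len(coords)
    table = np.zeros((n, n))
    for i in range(n):
        for j in range(i + max_offset + 1, n):
            if np.linalg.norm(coords[i] - coords[j]) < distance_cutoff:
                table[i, j] = table[j, i] = 1
    return table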
In [25]:
native_contacts_table
Out[25]:
In [62]:
plt.imshow(native_contacts_table, origin='lower')
Out[62]:
In [ ]: