In [1]:
# --- standard library ---
import argparse
import fileinput
import glob
import os
import platform
import random
import sys
import time
from collections import defaultdict
from datetime import datetime
from itertools import product
from os import listdir
from random import seed, randint
import imp  # NOTE: deprecated module; kept because it was already imported

# --- third-party ---
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns
from scipy.interpolate import griddata
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
from Bio.PDB.Polypeptide import one_to_three
from Bio.PDB.Polypeptide import three_to_one
from Bio.PDB.PDBParser import PDBParser

# --- local ---
# sys.path.insert(0,'..')
# from notebookFunctions import *
# from .. import notebookFunctions
from pyCodeLib import *
from small_script.myFunctions import *
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2
In [34]:
# Global figure defaults: golden-ratio canvas, white background, 100 dpi, large font.
plt.rcParams.update({
    'figure.figsize': [16.18033, 10],  # golden ratio
    'figure.facecolor': 'w',
    'figure.dpi': 100,
    'font.size': 22,
})
In [3]:
In [233]:
# Proteins included in this benchmark (the longer list with 2jo1 kept for reference).
pdb_list = "4a2n 3kp9 5xpd 2xov_complete 5d91 6e67A".split()
# pdb_list = ["2xov_complete", "6e67A", "5xpd", "3kp9", "4a2n", "5d91", "2jo1"]
In [308]:
# Chain lengths (computed further below and cached); build "name\nlength" tick
# labels with proteins ordered short -> long.
length_info = pd.read_csv("/Users/weilu/Research/server/jun_2019/simluation_hybrid/length_info.csv", index_col=0)
length_info = length_info.sort_values("Length").reset_index()
pdb_list_sorted_by_length = list(length_info.Protein.unique())
length_info_sorted_by_length = list(length_info.Length.unique())
label_list = [f"{p}\n{n}" for p, n in
              zip(pdb_list_sorted_by_length, length_info_sorted_by_length)]
In [234]:
# Collect per-frame info.dat files (the "_middle" checkpoints) for every
# protein/run into one DataFrame and cache it as a dated CSV.
simulationType = "simluation_hybrid"
# folder = "original"
folder = "second_small_batch"
all_data = []
for pdb in pdb_list:
    for i in range(3):
        for restart in range(1):
            location = f"/Users/weilu/Research/server/jun_2019/{simulationType}/{folder}/{pdb}/{i}_middle/info.dat"
            try:
                # raw string: "\s" is an invalid escape sequence in a plain literal
                tmp = pd.read_csv(location, sep=r"\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Restart=restart)
                all_data.append(tmp)
            except (OSError, pd.errors.EmptyDataError):
                # missing/empty runs are expected; report which one and move on
                # (was a bare `except:` that also swallowed KeyboardInterrupt)
                print(pdb, i, restart)
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}_middle.csv")
In [235]:
# Load the cached "_middle"-checkpoint frames written by the loading cell above.
fileLocation = "/Users/weilu/Research/data/openMM/simluation_hybrid_second_small_batch_06-30_middle.csv"
ha = pd.read_csv(fileLocation, index_col=0)
In [236]:
# Load the cached single-memory run frames.
fileLocation = "/Users/weilu/Research/data/openMM/simluation_hybrid_second_small_batch_06-29.csv"
single = pd.read_csv(fileLocation, index_col=0)
In [237]:
# Tag each set with its fragment-memory scheme and stack them into one frame.
combined = pd.concat([single.assign(Frag="single"), ha.assign(Frag="ha")])
In [272]:
plt.rcParams.update({'font.size': 12})
# Native-structure reference rows: the Steps < 1 frame of Run 0 per protein/scheme.
native_energy = combined.query("Steps < 1 and Run == 0").reset_index(drop=True)
# Fragment-memory energy vs Q_wat, one panel per protein; dashed lines mark the
# native-structure energies of the two fragment schemes.
y_show = "Fragment"
g = sns.FacetGrid(combined.query("Steps > 100"), col="Protein", col_wrap=2, hue="Frag", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Q_wat", y_show, alpha=0.5).add_legend())
for ax in g.axes:
    # panel title is "Protein = <name>"; recover the protein name from it
    name = ax.title.get_text().split(" ")[-1]
    ref_values = native_energy.query(f"Protein == '{name}'")[y_show]
    # Draw up to two native reference lines (blue = first scheme, orange = second).
    # Bounded iteration replaces a bare `except: pass` that silently swallowed
    # every error, and also guards the previously-unchecked .iloc[0] access.
    for ref, color in zip(list(ref_values.iloc[:2]), ["blue", "orange"]):
        ax.axhline(ref, ls="--", color=color, linewidth=4)
In [292]:
Out[292]:
In [295]:
# Inspect the per-group best-Q_wat rows (assembled in the cell below).
max_Q_data
Out[295]:
In [ ]:
# Row label of the frame with the highest Q_wat.
# NOTE(review): `t` is defined in a LATER cell — relies on out-of-order execution.
frame = t["Q_wat"].idxmax()
In [294]:
# Pull, for every protein/Frag pair, the full row of the frame with the best Q_wat.
d = combined.query("Steps > 2").reset_index(drop=True)
d.Protein = pd.Categorical(d.Protein, categories=pdb_list)
# idxmax yields positional row labels here because the index was just reset
t = d.groupby(["Protein", "Frag"])["Q_wat"].idxmax().reset_index()
max_Q_data = d.iloc[list(t["Q_wat"])].reset_index(drop=True)
ax = sns.lineplot(x="Protein", y="Q_wat", hue="Frag", style="Frag",
                  markers=True, ms=10, data=max_Q_data)
_ = ax.set_xticklabels(labels=label_list[1:], rotation=0, ha='center')
In [241]:
# Best (maximum) Q_wat reached per protein for each fragment-memory scheme.
d = combined.query("Steps > 2").reset_index(drop=True)
d.Protein = pd.Categorical(d.Protein, categories=pdb_list)
max_Q_data = (d.groupby(["Protein", "Frag"])["Q_wat"]
                .max()
                .reset_index())
ax = sns.lineplot(x="Protein", y="Q_wat", hue="Frag", style="Frag",
                  markers=True, ms=10, data=max_Q_data)
_ = ax.set_xticklabels(labels=label_list[1:], rotation=0, ha='center')
In [242]:
# Best (maximum) Q_mem reached per protein for each fragment-memory scheme.
d = combined.query("Steps > 2").reset_index(drop=True)
d.Protein = pd.Categorical(d.Protein, categories=pdb_list, ordered=True)
max_Q_data = (d.groupby(["Protein", "Frag"])["Q_mem"]
                .max()
                .reset_index())
ax = sns.lineplot(x="Protein", y="Q_mem", hue="Frag", style="Frag",
                  markers=True, ms=10, data=max_Q_data)
_ = ax.set_xticklabels(labels=label_list[1:], rotation=0, ha='center')
In [160]:
# Collect the end-of-run info.dat files for every protein/run into one
# DataFrame and cache it as a dated CSV.
simulationType = "simluation_hybrid"
# folder = "original"
folder = "second_small_batch"
all_data = []
for pdb in pdb_list:
    for i in range(3):
        for restart in range(1):
            location = f"/Users/weilu/Research/server/jun_2019/{simulationType}/{folder}/{pdb}/{i}/info.dat"
            try:
                # raw string: "\s" is an invalid escape sequence in a plain literal
                tmp = pd.read_csv(location, sep=r"\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Restart=restart)
                all_data.append(tmp)
            except (OSError, pd.errors.EmptyDataError):
                # missing/empty runs are expected; report which one and move on
                # (was a bare `except:` that also swallowed KeyboardInterrupt)
                print(pdb, i, restart)
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}.csv")
In [195]:
# Reload the cached single-memory frames for the per-run plots below.
fileLocation = "/Users/weilu/Research/data/openMM/simluation_hybrid_second_small_batch_06-29.csv"
data = pd.read_csv(fileLocation, index_col=0)
In [162]:
# Best (maximum) Q_wat per protein for each independent run.
d = data.query("Steps > 2").reset_index(drop=True)
d.Protein = pd.Categorical(d.Protein, categories=pdb_list, ordered=True)
max_Q_data = (d.groupby(["Protein", "Run"])["Q_wat"]
                .max()
                .reset_index())
ax = sns.lineplot(x="Protein", y="Q_wat", hue="Run", style="Run",
                  markers=True, ms=10, data=max_Q_data)
_ = ax.set_xticklabels(labels=pdb_list, rotation=45, ha='right')
In [163]:
# Best (maximum) Q_mem per protein for each independent run.
d = data.query("Steps > 2").reset_index(drop=True)
d.Protein = pd.Categorical(d.Protein, categories=pdb_list, ordered=True)
max_Q_data = (d.groupby(["Protein", "Run"])["Q_mem"]
                .max()
                .reset_index())
ax = sns.lineplot(x="Protein", y="Q_mem", hue="Run", style="Run",
                  markers=True, ms=10, data=max_Q_data)
_ = ax.set_xticklabels(labels=pdb_list, rotation=45, ha='right')
In [75]:
# Attach per-protein chain lengths (from length_info) to every frame row.
data = data.merge(length_info, on="Protein")
In [79]:
# Order rows by protein length so downstream plots run short -> long.
data = data.sort_values("Length").reset_index(drop=True)
In [86]:
In [90]:
In [96]:
# Peek at the current best-Q table (built in a nearby cell; out-of-order execution).
max_Q_data
Out[96]:
In [109]:
# Best-Q_wat frame per protein/run, proteins ordered by chain length.
d = data.query("Steps > 2").reset_index(drop=True)
d.Protein = pd.Categorical(d.Protein, categories=pdb_list_sorted_by_length, ordered=True)
# tail(1) after sorting keeps the single highest-Q_wat row of each group
max_Q_data = d.sort_values("Q_wat").groupby(["Protein", "Run"]).tail(1).reset_index()
ax = sns.lineplot(x="Protein", y="Q_wat", hue="Run", style="Run",
                  markers=True, ms=10, data=max_Q_data)
_ = ax.set_xticklabels(labels=label_list, rotation=0, ha='center')
In [92]:
# Best (maximum) Q_mem per protein/run, proteins ordered by chain length.
d = data.query("Steps > 2").reset_index(drop=True)
d.Protein = pd.Categorical(d.Protein, categories=pdb_list_sorted_by_length, ordered=True)
max_Q_data = (d.groupby(["Protein", "Run"])["Q_mem"]
                .max()
                .reset_index())
ax = sns.lineplot(x="Protein", y="Q_mem", hue="Run", style="Run",
                  markers=True, ms=10, data=max_Q_data)
_ = ax.set_xticklabels(labels=label_list, rotation=0, ha='center')
In [57]:
# Best (maximum) Q_mem per protein/run, proteins in the original pdb_list order.
d = data.query("Steps > 2").reset_index(drop=True)
d.Protein = pd.Categorical(d.Protein, categories=pdb_list, ordered=True)
max_Q_data = (d.groupby(["Protein", "Run"])["Q_mem"]
                .max()
                .reset_index())
ax = sns.lineplot(x="Protein", y="Q_mem", hue="Run", style="Run",
                  markers=True, ms=10, data=max_Q_data)
# _ = ax.set_xticklabels(labels=pdb_list, rotation=0, ha='right')
In [ ]:
In [24]:
# Collect the fragment-HA energy traces for every protein/run into one
# DataFrame and cache it as a dated CSV.
simulationType = "simluation_hybrid"
# folder = "original"
folder = "second_small_batch"
all_data = []
for pdb in pdb_list:
    for i in range(3):
        for restart in range(1):
            location = f"/Users/weilu/Research/server/jun_2019/{simulationType}/{folder}/{pdb}/{i}/frag_ha_energy_{i}.dat"
            try:
                # raw string: "\s" is an invalid escape sequence in a plain literal
                tmp = pd.read_csv(location, sep=r"\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Restart=restart)
                all_data.append(tmp)
            except (OSError, pd.errors.EmptyDataError):
                # missing/empty runs are expected; report which one and move on
                # (was a bare `except:` that also swallowed KeyboardInterrupt)
                print(pdb, i, restart)
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}_HA.csv")
In [27]:
# Load the cached fragment-HA energies produced by the loading cell above.
fileLocation = "/Users/weilu/Research/data/openMM/simluation_hybrid_second_small_batch_06-29_HA.csv"
ha = pd.read_csv(fileLocation, index_col=0)
In [29]:
# Single-memory vs HA fragment runs, tagged by scheme, in one frame.
combined = pd.concat([data.assign(Frag="single"), ha.assign(Frag="ha")])
In [47]:
# Native reference rows: the Steps < 1 frame of Run 0 for each protein/scheme.
native_energy = combined.query("Steps < 1 and Run == 0").reset_index(drop=True)
In [48]:
native_energy
Out[48]:
In [49]:
# Fragment-memory energy vs Q_mem, one panel per protein; dashed lines mark the
# native-structure energies of the two fragment schemes.
y_show = "Fragment"
g = sns.FacetGrid(combined.query("Steps > 100"), col="Protein", col_wrap=2, hue="Frag", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Q_mem", y_show, alpha=0.5).add_legend())
for ax in g.axes:
    # panel title is "Protein = <name>"; recover the protein name from it
    name = ax.title.get_text().split(" ")[-1]
    ref_values = native_energy.query(f"Protein == '{name}'")[y_show]
    # Draw up to two native reference lines (blue = first scheme, orange = second).
    # Bounded iteration fixes the unguarded `.iloc[1]` that raised IndexError for
    # any protein with a single native row (the sibling Q_wat cell guarded it).
    for ref, color in zip(list(ref_values.iloc[:2]), ["blue", "orange"]):
        ax.axhline(ref, ls="--", color=color, linewidth=4)
In [37]:
# Fragment energy vs Q_wat for ALL frames (no Steps filter, unlike the cell above).
y_show = "Fragment"
g = sns.FacetGrid(combined, col="Protein",col_wrap=2, hue="Frag", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Q_wat", y_show, alpha=0.5).add_legend())
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'multi_iter0_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'original_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="orange", linewidth=4)
# NOTE(review): every axhline below is commented out, so this loop only computes
# an unused `name` per panel — dead code carried over from the previous cell.
for ax in g.axes:
    name= ax.title.get_text().split(" ")[-1]
    # print(name)
    # energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
    # ax.axhline(energy, ls="--", color="blue", linewidth=4)
    # energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[1]
    # ax.axhline(energy, ls="--", color="orange", linewidth=4)
In [ ]:
In [296]:
# Control run started from the native structure of 3kp9.
infoLocation = "/Users/weilu/Research/server/jun_2019/simluation_hybrid/native/3kp9/info.dat"
In [297]:
info = pd.read_csv(infoLocation, sep="\s+")
In [302]:
# Q_mem over time — should stay high/flat if the native structure is stable.
info.plot("Steps", "Q_mem")
Out[302]:
In [303]:
# Same stability check for the water-phase Q.
info.plot("Steps", "Q_wat")
Out[303]:
In [ ]:
# Compute each protein's residue count from its prepared PDB and persist the
# table as length_info.csv (consumed by the label-building cell near the top).
# Fixes: removed dead `pdb = "2jo1"` (immediately shadowed by the loop variable)
# and hoisted the PDBParser out of the loop (no need to re-create it per file).
info = []
parser = PDBParser()
for pdb in pdb_list:
    pdbLocation = f"/Users/weilu/Research/server/jun_2019/simluation_hybrid/setup/{pdb}/{pdb}.pdb"
    structure = parser.get_structure('X', pdbLocation)
    # total residue count, including hetero residues if any are present
    n = len(list(structure.get_residues()))
    info.append([pdb, n])
length_info = pd.DataFrame(info, columns=["Protein", "Length"])
length_info.to_csv("/Users/weilu/Research/server/jun_2019/simluation_hybrid/length_info.csv")
In [ ]:
In [ ]:
In [14]:
# data1 = pd.read_csv("/Users/weilu/Research/data/openMM/openMM_membrane_structure_prediction_with_pulling_06-18.csv", index_col=0)
# data2 = pd.read_csv("/Users/weilu/Research/data/openMM/openMM_multiLetter_frags_original_06-04.csv", index_col=0)
# data3 = pd.read_csv("/Users/weilu/Research/data/openMM/openMM_multiLetter_jun02_original_06-03.csv", index_col=0)
# Distribution of Q_wat across all sampled frames (Steps > 100), one box per protein.
d = pd.concat([
    # data3.assign(Scheme="original"),
    data.assign(Scheme="singleMemory").query("Steps > 100"),
])
d.Protein = pd.Categorical(d.Protein,
                           categories=pdb_list,
                           ordered=True)
# Removed the unused `max_data` groupby on "Qc" (never read; would KeyError if
# the column is absent). Keyword x=/y=: positional axis args are removed in
# seaborn >= 0.12.
sns.boxplot(x="Protein", y="Q_wat", hue="Scheme", data=d)
_ = plt.xticks(rotation=30)
In [13]:
# data1 = pd.read_csv("/Users/weilu/Research/data/openMM/openMM_membrane_structure_prediction_with_pulling_06-18.csv", index_col=0)
# data2 = pd.read_csv("/Users/weilu/Research/data/openMM/openMM_multiLetter_frags_original_06-04.csv", index_col=0)
# data3 = pd.read_csv("/Users/weilu/Research/data/openMM/openMM_multiLetter_jun02_original_06-03.csv", index_col=0)
# Distribution of Q_mem across all sampled frames (Steps > 100), one box per protein.
d = pd.concat([
    # data3.assign(Scheme="original"),
    data.assign(Scheme="singleMemory").query("Steps > 100"),
])
d.Protein = pd.Categorical(d.Protein,
                           categories=pdb_list,
                           ordered=True)
# Removed the unused `max_data` groupby on "Qc" (never read; would KeyError if
# the column is absent). Keyword x=/y=: positional axis args are removed in
# seaborn >= 0.12.
sns.boxplot(x="Protein", y="Q_mem", hue="Scheme", data=d)
_ = plt.xticks(rotation=30)
In [134]:
# Extract the PDB model (frame) with the highest Q_wat from a trajectory movie.
# NOTE(review): depends on `a` (movie lines) and `n` (= len(a)), which are
# defined in LATER cells — this cell only works with out-of-order execution.
location = "/Users/weilu/Research/server/jun_2019/simluation_hybrid/second_small_batch/2xov_complete/0_longer/info.dat"
t = pd.read_csv(location, sep="\s+")
# skip first two.
t = t.query("Steps > 1").reset_index(drop=True)
# positional index of the best-Q_wat frame (index was just reset, so labels == positions)
frame = t["Q_wat"].idxmax()
# get the position of every model title
model_title_index_list = []
for i in range(n):
    if len(a[i]) >= 5 and a[i][:5] == "MODEL":
        model_title_index = i
        model_title_index_list.append(model_title_index)
model_title_index_list.append(n)
# every model should span the same number of lines; otherwise the fixed-size
# slicing below would cut frames at the wrong boundaries
check_array = np.diff(model_title_index_list)
if not np.allclose(check_array, check_array[0]):
    print("!!!! Someting is wrong !!!!")
else:
    size = check_array[0]
    # write out exactly the lines belonging to the chosen frame
    with open(f"max_Q_wat_frame_{frame}.pdb", "w") as out:
        out.write("".join(a[size*frame:size*(frame+1)]))
In [164]:
# Positional index of the best-Q_wat frame (after the Steps > 1 filter above).
t["Q_wat"].idxmax()
Out[164]:
In [138]:
# Load the whole trajectory movie as a list of lines.
location = "/Users/weilu/Research/server/jun_2019/simluation_hybrid/second_small_batch/2xov_complete/0_longer/movie.pdb"
with open(location) as f:
    a = f.readlines()
In [142]:
# total number of lines in the movie file
n = len(a)
In [143]:
n
Out[143]:
In [153]:
# convert to an array for the vectorized np.diff arithmetic in the frame-extraction cell
model_title_index_list = np.array(model_title_index_list)
In [157]:
In [ ]:
# Inspect the line span of the best-Q_wat frame within the movie lines.
frame = t["Q_wat"].idxmax()
# The original cell ended with the bare slice `size*frame:size*(frame+1)`, which
# is a SyntaxError outside a subscript; wrap it in the indexing of `a` that the
# frame-extraction cell above actually performs.
a[size*frame:size*(frame+1)]
In [151]:
In [ ]:
# Spot-check a line ~1/1000 of the way into the movie file (scratch inspection).
a[n//1000]
In [ ]:
# Find the index of the last "MODEL" record in the movie file by scanning backwards.
# str.startswith replaces the manual length-check-plus-slice comparison (idiom;
# behavior is identical for all inputs).
n = len(a)
for i in range(n - 1, -1, -1):
    if a[i].startswith("MODEL"):
        print(i)
        break
In [ ]:
In [123]:
# Full info row of the frame with the highest Q_wat.
t.iloc[t["Q_wat"].idxmax()]
Out[123]:
In [243]:
# Collect all downloaded PDB files (glob is imported in the notebook's import cell).
a = glob.glob("/Users/weilu/openmmawsem/PDBs/*.pdb")
In [244]:
# number of PDB files found
len(a)
Out[244]:
In [264]:
# Sanity-check residue numbering in the first 10 downloaded PDBs: flag any file
# whose standard residues are not consecutively numbered.
parser = PDBParser(QUIET=True)
for loc in a[:10]:
    # print(loc)
    structure = parser.get_structure('X', loc)
    res_list = list(structure.get_residues())
    n = len(res_list)
    res = res_list[0]
    resId = res.get_id()
    # residue id tuple is (hetero flag, sequence number, insertion code)
    preResidueIndex = resId[1]
    for res in res_list[1:]:
        resId = res.get_id()
        if resId[0] == " ":
            # standard (non-hetero) residue: expect numbering to advance by exactly 1
            if resId[1] == preResidueIndex + 1:
                pass
            else:
                print("Error?", loc, res.get_full_id(), resId[1], preResidueIndex)
                break
        # NOTE(review): preResidueIndex is updated for hetero residues too (this
        # line sits outside the `if`), so a HETATM between standard residues can
        # trigger a false "Error?" — confirm whether that is intended.
        preResidueIndex = resId[1]
In [250]:
# First residue of the last structure parsed in the loop above (scratch inspection).
res = res_list[0]
In [276]:
# Extract the one-letter amino-acid sequence of 1F8E, skipping hetero residues
# (waters/ligands carry a non-blank hetero flag in their residue id).
structure = parser.get_structure('X', "/Users/weilu/openmmawsem/PDBs/1F8E.pdb")
seq = "".join(
    three_to_one(res.get_resname())
    for res in structure.get_residues()
    if res.get_id()[0] == " "
)
In [277]:
# Extracted sequence, for visual comparison with the reference below.
seq
Out[277]:
In [279]:
# window around residues 85-95 where the two sequences are being compared
seq[85:95]
Out[279]:
In [280]:
# Reference sequence (presumably the expected 1F8E sequence — TODO confirm provenance).
s = "RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPDECRFYALSQGTTIRGKHSNGTIHDRSQYRALISWPLSSPPTVYNSRVECIGWSSTSCHDGKTRMSICISGPNNNASAVIWYNRRPVTEINTWARNILRTQESECVCHNGVCPVVFTDGSATGPAETRIYYFKEGKILKWEPLAGTAKHIEECSCYGERAEITCTCRDNWQGSNRPVIRIDPVAMTHTSQYICSPVLTDNPRPNDPTVGKCNDPYPGNNNNGVKGFSYLDGVNTWLGRTISIASRSGYEMLKVPNALTDDKSKPTQGQTIVLNTDWSGYSGSFMDYWAEGECYRACFYVELIRGRPKEDKVWWTSNSIVSMCSSTEFLGQWDWPDGAKIEYFL"
In [281]:
# same window in the reference sequence
s[85:95]
Out[281]:
In [ ]: