In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
# sys.path.insert(0,'..')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.Polypeptide import one_to_three
from Bio.PDB.Polypeptide import three_to_one
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
from small_script.myFunctions import *
from collections import defaultdict
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2
In [2]:
# Notebook-wide matplotlib defaults, applied in a single update call.
plt.rcParams.update({
    'figure.figsize': np.array([16.18033, 10]),  # golden ratio
    'figure.facecolor': 'w',
    'figure.dpi': 100,
    'font.size': 22,
})
In [3]:
# PDB identifiers iterated over by the loading cells of this notebook.
pdb_list = [
    "1akr",
    "1opd",
    "1ptf",
    "1tig",
    "1tmy",
    "2acy",
    "5nul",
]
In [100]:
# Read the `info.dat` table of every (folder, protein, run) combination,
# tag each table with its origin, and cache the concatenation as a dated CSV.
simulationType = "evaluation_simulation"
run_n = 5
# folder_list = ["iteration_14_frag_zBias_2", "iteration_14_contact_stronger_frag_zBias_2", "iteration_13_frag_zBias_2", "iteration_12_frag_zBias_2", "run2_frag_cbd_shift_center_iter10_include_native", "run2_frag_cbd_shift_center_iter4", "run2_frag_old", "run2_frag_cbd", "run2_frag_cbd_shift_center"]
folder_list = ["iteration_16_frag_zBias_3", "iteration_15_frag_zBias_2", "iteration_14_frag_zBias_2", "iteration_14_contact_stronger_frag_zBias_2", "iteration_12_frag_zBias_2", "run2_frag_old", "run2_frag_cbd", "run2_frag_cbd_shift_center"]
all_data = []
for folder in folder_list:
    for pdb in pdb_list:
        for i in range(run_n):
            pre = f"/Users/weilu/Research/server/jun_week1_2020/{simulationType}/{folder}/{pdb}/{i}"
            info_file = "info.dat"
            location = f"{pre}/{info_file}"
            try:
                # Raw string: "\s" is an invalid escape sequence in a normal literal.
                tmp = pd.read_csv(location, sep=r"\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Folder=folder)
                all_data.append(tmp)
            except Exception:
                # Best effort: report the run that could not be loaded and keep
                # going. `Exception` (not a bare except) lets Ctrl-C through.
                print(pdb, i, folder, location)
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
# `folder` still holds the last entry of folder_list here, so the cache file
# is named after that folder only.
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)
In [101]:
# Reload a cached summary table; the file name (including its date stamp) is hard-coded.
data = pd.read_csv("/Users/weilu/Research/data/openMM/evaluation_simulation_run2_frag_cbd_shift_center_06-03.csv")
# Leave the iteration_12 folder out of this comparison.
data = data.query("Folder !='iteration_12_frag_zBias_2'").reset_index(drop=True)
y = "Q"
# Keep rows with Steps > 1950. The index is reset so that the labels returned by
# idxmax below coincide with the positions that iloc expects.
d = data.query("Steps > 1950").reset_index(drop=True)
# Index of the row with the largest Q for every (Protein, Folder) pair.
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# One marked line per Folder: best Q of each protein.
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')
# plt.legend(["original model", "CBD model"])
In [95]:
# Rows with Steps > 1950, without the 'run2_frag_old' folder.
d50 = data.query("Steps > 1950 and Folder !='run2_frag_old'").reset_index(drop=True)
# One panel per protein (three per row), colored by Folder: Fragment on x, Q on y.
g = sns.FacetGrid(d50, col="Protein", hue="Folder", col_wrap=3, sharex=False)
g = g.map(sns.scatterplot, "Fragment", "Q").add_legend()
# plt.legend()
In [96]:
# Same panels for all folders, with Contact on x instead of Fragment.
# Note: this rebinds d50, which the boxplot cells further down read.
d50 = data.query("Steps > 1950").reset_index(drop=True)
g = sns.FacetGrid(d50, col="Protein", hue="Folder", col_wrap=3, sharex=False)
g = g.map(sns.scatterplot, "Contact", "Q").add_legend()
# plt.legend()
In [32]:
# NOTE(review): textually identical to the previous cell.
d50 = data.query("Steps > 1950").reset_index(drop=True)
g = sns.FacetGrid(d50, col="Protein", hue="Folder", col_wrap=3, sharex=False)
g = g.map(sns.scatterplot, "Contact", "Q").add_legend()
# plt.legend()
In [27]:
# NOTE(review): textually identical to the previous cell.
d50 = data.query("Steps > 1950").reset_index(drop=True)
g = sns.FacetGrid(d50, col="Protein", hue="Folder", col_wrap=3, sharex=False)
g = g.map(sns.scatterplot, "Contact", "Q").add_legend()
# plt.legend()
In [33]:
sns.boxplot("Protein", "Q", hue="Folder", data=d50)
Out[33]:
In [19]:
sns.boxplot("Protein", "Contact", hue="Folder", data=d50)
Out[19]:
In [71]:
# After sorting by Q ascending, the last row of each (Protein, Folder) group is
# the one with the largest Q.
max_Q_data = d.sort_values("Q").groupby(["Protein", "Folder"]).tail(1).reset_index(drop=True)
sub_data = max_Q_data
# Uses `y` as left by whichever cell assigned it last.
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
In [58]:
# Display the selected rows.
sub_data
Out[58]:
In [28]:
# Read the `info.dat` table of every (folder, protein, run) combination,
# tag each table with its origin, and cache the concatenation as a dated CSV.
simulationType = "evaluation_simulation"
run_n = 3
folder_list = ["run1_cbd_withBeta_stronger_side_chain_new_exclude", "run1_cbd_withBeta_stronger_side_chain", "run1", "run1_cbd_withBeta"]
all_data = []
for folder in folder_list:
    for pdb in pdb_list:
        for i in range(run_n):
            pre = f"/Users/weilu/Research/server/may_week1_2020//{simulationType}/{folder}/{pdb}/{i}"
            info_file = "info.dat"
            location = f"{pre}/{info_file}"
            try:
                # Raw string: "\s" is an invalid escape sequence in a normal literal.
                tmp = pd.read_csv(location, sep=r"\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Folder=folder)
                all_data.append(tmp)
            except Exception:
                # Best effort: report the run that could not be loaded and keep
                # going. `Exception` (not a bare except) lets Ctrl-C through.
                print(pdb, i, folder, location)
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
# `folder` still holds the last entry of folder_list here, so the cache file
# is named after that folder only.
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)
In [29]:
In [33]:
# Halve the default figure size for the cells that follow.
plt.rcParams['figure.figsize'] = 0.5 * np.array([16.18033, 10]) #golden ratio
In [63]:
# Reload a cached summary table; the file name (including its date stamp) is hard-coded.
data = pd.read_csv("/Users/weilu/Research/data/openMM/evaluation_simulation_run1_cbd_withBeta_05-04.csv")
data = data.reset_index(drop=True)
y = "Q"
# Keep rows with Steps > 1000; the index is reset so idxmax labels match iloc positions.
d = data.query("Steps > 1000").reset_index(drop=True)
# Index of the row with the largest Q for every (Protein, Folder) pair.
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
# Drop two of the folders before plotting.
sub_data = max_Q_data.query("Folder != 'run1_cbd_withBeta' and Folder !='run1_cbd_withBeta_stronger_side_chain'")
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')
# NOTE(review): the labels are assigned by position, so they presumably rely on
# the order in which the two remaining folders are drawn -- confirm.
plt.legend(["original model", "CBD model"])
Out[63]:
In [30]:
y = "Q"
# No Steps filter here: the whole table is searched for the best Q.
d = data
# idxmax returns index labels while iloc takes positions; the two agree only
# while `data` carries a default 0..n-1 index.
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')
In [36]:
# Combined key so each (Folder, Run) pair can be used as a single hue level.
data["Folder_Run"]= data["Folder"] + data["Run"].astype(str)
In [39]:
# Restore the full-size default figure.
plt.rcParams['figure.figsize'] = np.array([16.18033, 10]) #golden ratio
In [50]:
chosen = data.query("Steps > 100 and Folder != 'run1_cbd_withBeta' and Folder !='run1_cbd_withBeta_stronger_side_chain'")
sns.boxplot("Q", "Protein", data=chosen, hue="Folder_Run")
Out[50]:
In [42]:
chosen = data.query("Steps > 1500 and Folder != 'run1_cbd_withBeta' and Folder !='run1_cbd_withBeta_stronger_side_chain'")
sns.boxplot("Q", "Protein", data=chosen, hue="Folder_Run")
Out[42]:
In [53]:
# Half-size figure for this plot.
plt.rcParams['figure.figsize'] = 0.5 * np.array([16.18033, 10]) #golden ratio
# chosen = data.query("Steps > 100 and Folder != 'run1_cbd_withBeta' and Folder !='run1_cbd_withBeta_stronger_side_chain'")
chosen = data.query("Steps > 1500")
# After sorting by Q ascending, the last row of each (Folder, Protein) group has the largest Q.
max_Q_data = chosen.sort_values("Q").groupby(["Folder", "Protein"]).tail(1)
# sub_data = max_Q_data.query("Folder != 'run1_cbd_withBeta' and Folder !='run1_cbd_withBeta_stronger_side_chain'")
sub_data = max_Q_data
# Uses `y` as left by whichever cell assigned it last.
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')
# plt.legend(["original model", "CBD model"])
In [49]:
plt.rcParams['figure.figsize'] = 0.5 * np.array([16.18033, 10]) #golden ratio
# Relies on `chosen` as left by an earlier cell.
max_Q_data = chosen.sort_values("Q").groupby(["Folder", "Protein"]).tail(1)
sub_data = max_Q_data.query("Folder != 'run1_cbd_withBeta' and Folder !='run1_cbd_withBeta_stronger_side_chain'")
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')
# NOTE(review): labels are assigned by position -- confirm they match the drawing order.
plt.legend(["original model", "CBD model"])
Out[49]:
In [10]:
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_0_02-07.csv", index_col=0)
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_1_02-10.csv", index_col=0)
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteratiteration_native_new_4ion_2_02-11.csv", index_col=0)
# Reload a cached summary table; the first CSV column is used as the index.
data = pd.read_csv("/Users/weilu/Research/data/openMM/evaluation_simulation_run1_cbd_withBeta_05-03.csv", index_col=0)
sub_pdb_list = pdb_list
# Make Protein categorical with the categories ordered as in pdb_list.
data.Protein = pd.Categorical(data.Protein,
categories=sub_pdb_list)
y = "Steps"
d = data
# Index of the row with the largest Steps value for every (Protein, Folder, Run).
t = d.groupby(["Protein", "Folder", "Run"])[y].idxmax().reset_index()
# Despite its name, max_Q_data holds the max-Steps rows here, not the max-Q rows.
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Folder != 'iter3_environment'")
# Q distribution of the selected rows, one curve per Folder.
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
# Legend placed outside the axes, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[10]:
In [83]:
sub_data
Out[83]:
In [84]:
sns.boxplot("Contact", "Folder", data=sub_data)
Out[84]:
In [76]:
# Reload a cached summary table; the first CSV column is used as the index.
data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_3_02-17.csv", index_col=0)
# sub_pdb_list = pdb_list
# data.Protein = pd.Categorical(data.Protein,
# categories=sub_pdb_list)
y = "Steps"
d = data
# Index of the row with the largest Steps value for every (Protein, Folder, Run).
t = d.groupby(["Protein", "Folder", "Run"])[y].idxmax().reset_index()
# Despite its name, max_Q_data holds the max-Steps rows here, not the max-Q rows.
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Folder != 'iter3_environment'")
# Q distribution of the selected rows, one curve per Folder.
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[76]:
In [79]:
sns.boxplot("Fragment", "Folder", data=sub_data)
Out[79]:
In [77]:
sns.boxplot("Contact", "Folder", data=sub_data)
Out[77]:
In [401]:
# Load the frustration-selection table; the first CSV column is used as the index.
raw_data_all = pd.read_csv("/Users/weilu/Research/frustration_selection/data.csv", index_col=0)
In [433]:
# Show the run directory left in `pre` by the most recently executed loading loop.
pre
Out[433]:
In [449]:
# For every (folder, protein, run): read the number stored in time.dat and the
# length of whatever read_fasta returns for crystal_structure.fasta.
all_data = []
for folder in folder_list:
    for pdb in pdb_list:
        for i in range(run_n):
            pre = f"/Users/weilu/Research/server/mar_2020/{simulationType}/{folder}/{pdb}/{i}"
            timeFile = f"{pre}/time.dat"
            # Named `run_time` so the `time` module imported at the top of the
            # notebook is not overwritten by a float.
            run_time = float(np.loadtxt(timeFile))
            Length = len(read_fasta(f"{pre}/crystal_structure.fasta"))
            all_data.append([pdb, i, run_time, Length, folder])
In [450]:
data = pd.DataFrame(all_data, columns=["Protein", "Run", "Time", "Length", "Folder"])
In [459]:
data_selected = data.query("Folder=='iter4_shift_well' or Folder=='iteration_4' or Folder=='iter3_shift_well'")
sns.lmplot("Length", "Time", hue="Folder", data=data_selected)
Out[459]:
In [64]:
# Read the `info.dat` table of every (folder, protein, run) combination,
# tag each table with its origin, and cache the concatenation as a dated CSV.
simulationType = "mass_iterative_run"
run_n = 2
# Only the last of the successive folder_list assignments took effect; the
# earlier ones were overwritten before use and have been removed.
folder_list = ["iteration_4_z_weighted_2", "new_iter1_environment_new", "new_iter2_environment_new", "new_iter3_environment_new", "new_iter4_environment_new", "new_iter5_environment_new"]
all_data = []
for folder in folder_list:
    for pdb in pdb_list:
        for i in range(run_n):
            pre = f"/Users/weilu/Research/server/apr_2020/{simulationType}/{folder}/{pdb}/{i}"
            info_file = "info.dat"
            location = f"{pre}/{info_file}"
            try:
                # Raw string: "\s" is an invalid escape sequence in a normal literal.
                tmp = pd.read_csv(location, sep=r"\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Folder=folder)
                all_data.append(tmp)
            except Exception:
                # Best effort: report the run that could not be loaded and keep
                # going. `Exception` (not a bare except) lets Ctrl-C through.
                print(pdb, i, folder)
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
# `folder` still holds the last entry of folder_list here, so the cache file
# is named after that folder only.
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)
In [69]:
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_0_02-07.csv", index_col=0)
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_1_02-10.csv", index_col=0)
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteratiteration_native_new_4ion_2_02-11.csv", index_col=0)
# Reload a cached summary table; the first CSV column is used as the index.
data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_new_iter5_environment_new_05-01.csv", index_col=0)
sub_pdb_list = pdb_list
# Make Protein categorical with the categories ordered as in pdb_list.
data.Protein = pd.Categorical(data.Protein,
categories=sub_pdb_list)
In [70]:
y = "Steps"
d = data
# Index of the row with the largest Steps value for every (Protein, Folder, Run).
t = d.groupby(["Protein", "Folder", "Run"])[y].idxmax().reset_index()
# Despite its name, max_Q_data holds the max-Steps rows here, not the max-Q rows.
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
# Leave one folder out of the plot.
sub_data = max_Q_data.query("Folder !='new_iter1_environment_new'")
# sub_data = max_Q_data.query("Folder != 'iter3_environment'")
# Q distribution of the selected rows, one curve per Folder.
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[70]:
In [66]:
y = "Steps"
d = data
# Index of the row with the largest Steps value for every (Protein, Folder, Run).
t = d.groupby(["Protein", "Folder", "Run"])[y].idxmax().reset_index()
# Despite its name, max_Q_data holds the max-Steps rows here, not the max-Q rows.
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
In [67]:
# sub_data = max_Q_data.query("Folder != 'iter3_environment'")
# Q distribution of the selected rows, one curve per Folder.
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[67]:
In [30]:
# Same plot with the 'iter3_environment' folder removed.
sub_data = max_Q_data.query("Folder != 'iter3_environment'")
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[30]:
In [18]:
# Plots whatever `sub_data` currently holds.
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[18]:
In [31]:
sns.boxplot("Q", "Folder", data=sub_data)
Out[31]:
In [32]:
sns.boxplot("Contact", "Folder", data=sub_data)
Out[32]:
In [14]:
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[14]:
In [474]:
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[474]:
In [432]:
sns.boxplot("Contact", "Folder", data=sub_data)
Out[432]:
In [393]:
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[393]:
In [396]:
sub_data.query("Folder=='iteration_4'").sort_values("Contact")
Out[396]:
In [385]:
sub_data = max_Q_data.query("Folder=='iteration_0_stronger_exclude_volume_stronger_side_chain' or\
Folder=='iteration_0_stronger_exclude_volume_stronger_side_chain_k4' or\
Folder=='iteration_0_stronger_exclude_volume'")
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[385]:
In [381]:
sub_data = max_Q_data.query("Folder=='iteration_1_stronger_exclude_withoutBurial_bugfix' or\
Folder=='iteration_2_bug_fixed' or\
Folder=='iteration_0_stronger_exclude_volume'")
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[381]:
In [379]:
sub_data = max_Q_data.query("Folder!='iteration_1_stronger_exclude_withoutBurial' and Folder!='iteration_0_stronger_exclude_volume_k10'")
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
Out[379]:
In [370]:
sub_data.query("Protein == '1cxc'")
Out[370]:
In [367]:
sns.boxplot("Q", "Folder", data=sub_data)
Out[367]:
In [349]:
sns.boxplot("Contact", "Folder", data=sub_data)
Out[349]:
In [331]:
# Same column subset taken from two folders, so they can be joined run by run.
a = sub_data.query("Folder == 'iteration_0_stronger_exclude_volume'")[["Q", "Contact", "Fragment", "Exclude_Side", "Run", "Protein", "Folder"]].reset_index(drop=True)
b = sub_data.query("Folder == 'iteration_2_stronger_exclude_withoutBurial'")[["Q", "Contact", "Fragment", "Exclude_Side", "Run", "Protein", "Folder"]].reset_index(drop=True)
In [332]:
# Inner join on (Run, Protein); shared columns get the _x (from a) and _y (from b) suffixes.
c = a.merge(b, on=["Run", "Protein"])
In [333]:
# Positive values mean the folder in `a` reached the higher Q.
c["Q_diff"] = c["Q_x"] - c["Q_y"]
In [334]:
# Rows with the smallest Contact value in the `b` folder.
c.sort_values("Contact_y").head()
Out[334]:
In [338]:
# Rows with the largest Q_diff.
c.sort_values("Q_diff").tail()
Out[338]:
In [339]:
# Inspect the rows of a single protein.
sub_data.query("Protein == '1erv'")
Out[339]:
In [335]:
sub_data = max_Q_data.query("Folder=='iteration_0_stronger_exclude_volume' or Folder=='iteration_2_stronger_exclude_withoutBurial'")
sns.lineplot("Protein", "Q", hue="Folder", data=sub_data)
Out[335]:
In [336]:
sub_data = max_Q_data.query("Folder=='iteration_0_stronger_exclude_volume' or Folder=='iteration_2_stronger_exclude_withoutBurial'")
sns.lineplot("Protein", "Exclude_Side", hue="Folder", data=sub_data)
Out[336]:
In [323]:
a = pd.concat([selected, selected2])
sns.lineplot("Protein", "Exclude_Side", hue="Folder", data=a)
Out[323]:
In [321]:
selected = data.query("Folder == 'iteration_native_new_4' and Steps == 0").reset_index(drop=True)
selected2 = data.query("Folder == 'iteration_1_stronger_exclude_withoutBurial' and Steps == 2001").reset_index(drop=True)
sns.scatterplot("Contact", "Exclude_Side", data=selected, color="red")
sns.scatterplot("Contact", "Exclude_Side", hue="Folder", data=selected2)
Out[321]:
In [307]:
# selected = data.query("Folder == 'iteration_0_stronger_exclude_volume_k10' and Steps == 2001").reset_index(drop=True)
sns.scatterplot("Contact", "Exclude_Side", hue="Folder", data=sub_data)
Out[307]:
In [255]:
# Keep three folders and plot their Q distributions, one curve per Folder.
sub_data = max_Q_data.query("Folder=='iteration_0_cbd' or Folder=='iteration_0_stronger_exclude_volume' or Folder=='iteration_0_stronger_exclude_volume_k10'")
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend()
Out[255]:
In [259]:
# Inspect a single protein; relies on `selected` as left by another cell.
selected.query("Protein == '1vcc'")
Out[259]:
In [258]:
# Step-2001 rows of one folder; show the five with the smallest Q.
selected = data.query("Folder == 'iteration_0_stronger_exclude_volume' and Steps == 2001").reset_index(drop=True)
selected.sort_values("Q").head(5)
Out[258]:
In [237]:
selected = data.query("Folder == 'iteration_0_stronger_exclude_volume_k10' and Steps == 2001").reset_index(drop=True)
sns.scatterplot("Contact", "Exclude_Side", hue="Q", data=selected)
Out[237]:
In [239]:
selected.sort_values("Exclude_Side").tail(5)
Out[239]:
In [240]:
selected = data.query("Folder == 'iteration_0_stronger_exclude_volume' and Steps == 2001").reset_index(drop=True)
sns.scatterplot("Contact", "Exclude_Side", hue="Q", data=selected)
Out[240]:
In [241]:
selected.sort_values("Exclude_Side").tail(5)
Out[241]:
In [212]:
selected = data.query("Folder == 'iteration_native_new_4' and Steps == 1").reset_index(drop=True)
sns.scatterplot("Contact", "Exclude_Side", hue="Q", data=selected)
Out[212]:
In [213]:
selected.sort_values("Exclude_Side").tail(1)
Out[213]:
In [214]:
selected = data.query("Folder == 'iteration_native_new_4' and Steps == 0").reset_index(drop=True)
sns.scatterplot("Contact", "Exclude_Side", hue="Q", data=selected)
Out[214]:
In [215]:
selected.sort_values("Exclude_Side").tail(1)
Out[215]:
In [202]:
selected = data.query("Folder == 'iteration_0_cbd' and Steps > 2000").reset_index(drop=True)
sns.scatterplot("Contact", "Exclude_Side", hue="Q", data=selected)
Out[202]:
In [203]:
selected = data.query("Folder == 'iteration_new_4' and Steps > 2000").reset_index(drop=True)
sns.scatterplot("Contact", "Exclude_Side", hue="Q", data=selected)
Out[203]:
In [219]:
# Rows of one folder with Steps > 2000.
selected = data.query("Folder == 'iteration_new_4_without_burial' and Steps > 2000").reset_index(drop=True)
In [220]:
# Per-row ratio of the Contact column to the Fragment column.
selected["ratio"] = selected["Contact"] / selected["Fragment"]
In [217]:
# Hard-coded location of the gamma file to inspect.
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
def get_contact_gamma_info(gammaFile):
    """Tabulate the contact gammas stored in a flat gamma file.

    The file is read with ``np.loadtxt`` and indexed as one flat array. Its
    first 630 entries are consumed as three consecutive groups of 210 values,
    one value per unordered residue pair (i <= j over 20 residue types):
    "Direct" at indices 0-209, "Protein" at 210-419 and "Water" at 420-629.

    Parameters
    ----------
    gammaFile : str
        Path of the text file holding the gamma values.

    Returns
    -------
    pandas.DataFrame
        Columns ``Interaction``, ``Res1``, ``Res2``, ``Index``, ``Gamma``.
        Every pair of different residues appears in both orders (sharing the
        same ``Index`` and ``Gamma``), identical residues appear once, which
        gives 400 rows per interaction. ``Gamma`` is rounded to 3 decimals.

    Raises
    ------
    IndexError
        If the file holds fewer than 630 values.
    """
    gamma = np.loadtxt(gammaFile)
    res_type_map_letters = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G',
                            'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
    n_res = len(res_type_map_letters)
    n_pairs = n_res * (n_res + 1) // 2  # 210 unordered pairs

    def _rows(interaction, i, j, index):
        # One row for (i, j) plus the mirrored row when the residues differ.
        value = round(gamma[index], 3)
        rows = [[interaction, res_type_map_letters[i], res_type_map_letters[j], index, value]]
        if i != j:
            rows.append([interaction, res_type_map_letters[j], res_type_map_letters[i], index, value])
        return rows

    # Upper-triangle enumeration; position in this list is the offset of the
    # pair inside each 210-value group.
    pairs = [(i, j) for i in range(n_res) for j in range(i, n_res)]

    info_ = []
    for c, (i, j) in enumerate(pairs):
        info_.extend(_rows("Direct", i, j, c))
    # Protein and Water rows are interleaved pair by pair; the Water value
    # sits one full group after the Protein value.
    for c, (i, j) in enumerate(pairs, start=n_pairs):
        info_.extend(_rows("Protein", i, j, c))
        info_.extend(_rows("Water", i, j, c + n_pairs))
    contact_gammas = pd.DataFrame(info_, columns=["Interaction", "Res1", "Res2", "Index", "Gamma"])
    return contact_gammas
In [187]:
# Inspect the gamma file: load it and expand it into one record per ordered
# residue pair and interaction type.
gammaFile = "/Users/weilu/Research/server/mar_2020/mass_iterative_optimization/optimization_new_4_withoutBurial/saved_gammas/new_4_cutoff600_impose_Aprime_constraint"
gamma = np.loadtxt(gammaFile)
res_type_map_letters = list("ARNDCQEGHILKMFPSTWYV")
inverse_res_type_map = dict(enumerate(res_type_map_letters))
info_ = []
c = 0
# Direct gammas: one value per unordered pair, starting at index 0.
for i, res_i in enumerate(res_type_map_letters):
    for j in range(i, 20):
        res_j = res_type_map_letters[j]
        # Different residues are listed in both orders, identical ones once.
        orderings = [(res_i, res_j)] if i == j else [(res_i, res_j), (res_j, res_i)]
        info_.extend(["Direct", first, second, c, round(gamma[c], 3)] for first, second in orderings)
        c += 1
# Protein gammas continue from the running index; the matching Water gamma
# sits 210 entries further on.
for i, res_i in enumerate(res_type_map_letters):
    for j in range(i, 20):
        res_j = res_type_map_letters[j]
        orderings = [(res_i, res_j)] if i == j else [(res_i, res_j), (res_j, res_i)]
        info_.extend(["Protein", first, second, c, round(gamma[c], 3)] for first, second in orderings)
        info_.extend(["Water", first, second, c+210, round(gamma[c+210], 3)] for first, second in orderings)
        c += 1
In [188]:
# Number of values loaded from the gamma file.
len(gamma)
Out[188]:
In [193]:
# Turn the collected records into a table.
contact_gammas = pd.DataFrame(info_, columns=["Interaction", "Res1", "Res2", "Index", "Gamma"])
In [216]:
# Rows with the smallest Gamma values.
contact_gammas.sort_values("Gamma").head()
Out[216]:
In [197]:
sns.scatterplot("Contact", "Exclude_Side", hue="Q", data=selected)
Out[197]:
In [222]:
selected.sort_values("ratio").tail(1)
Out[222]:
In [184]:
sns.scatterplot("Contact", "Fragment", data=selected)
Out[184]:
In [168]:
# Inspect the rows of a single protein.
sub_data.query("Protein == '7rsa'")
Out[168]:
In [161]:
# Rows of the current selection recorded at Steps == 1.
a = selected.query("Steps == 1").reset_index(drop=True)
In [162]:
a.sort_values("SideChain")
Out[162]:
In [159]:
# Histogram of the Exclude_Side column.
a.hist("Exclude_Side")
Out[159]:
In [108]:
# Rows of one folder, ordered by Q.
sub_data.query("Folder == 'iteration_new_2'").sort_values("Q")
Out[108]:
In [100]:
sub_data.query("Folder == 'iteration_start_native_iter5'").sort_values("Q")
Out[100]:
In [80]:
# Q distribution per Folder; x/y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x="Folder", y="Q", data=sub_data)
Out[80]:
In [38]:
# Read the `info.dat` table of every (folder, protein, run) combination,
# tag each table with its origin, and cache the concatenation as a dated CSV.
simulationType = "mass_iterative_run"
run_n = 2
# The earlier folder_list assignment was overwritten before use and has been removed.
folder_list = ["iteration_0_cbd", "iteration_start_native", "iteration_start_native_iter2",
               "iteration_start_native_iter3", "iteration_new_1"]
all_data = []
for folder in folder_list:
    for pdb in pdb_list:
        for i in range(run_n):
            pre = f"/Users/weilu/Research/server/mar_2020/{simulationType}/{folder}/{pdb}/{i}"
            info_file = "info.dat"
            location = f"{pre}/{info_file}"
            try:
                # Raw string: "\s" is an invalid escape sequence in a normal literal.
                tmp = pd.read_csv(location, sep=r"\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Folder=folder)
                all_data.append(tmp)
            except Exception:
                # Best effort: report the run that could not be loaded and keep
                # going. `Exception` (not a bare except) lets Ctrl-C through.
                print(pdb, i, folder)
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
# `folder` still holds the last entry of folder_list here, so the cache file
# is named after that folder only.
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)
In [39]:
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_0_02-07.csv", index_col=0)
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_1_02-10.csv", index_col=0)
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_2_02-11.csv", index_col=0)
# Reload a cached summary table; the first CSV column is used as the index.
data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_new_1_03-10.csv", index_col=0)
sub_pdb_list = pdb_list
# Make Protein categorical with the categories ordered as in pdb_list.
data.Protein = pd.Categorical(data.Protein,
categories=sub_pdb_list)
In [40]:
y = "Steps"
d = data
# Index of the row with the largest Steps value for every (Protein, Folder, Run).
t = d.groupby(["Protein", "Folder", "Run"])[y].idxmax().reset_index()
# Despite its name, max_Q_data holds the max-Steps rows here, not the max-Q rows.
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
In [41]:
# Q distribution of the selected rows, one curve per Folder.
g = sns.FacetGrid(sub_data, hue="Folder", height=5, aspect=1.618)
g = g.map(sns.distplot, "Q")
plt.legend()
Out[41]:
In [42]:
# Rows of one folder, ordered by Q.
sub_data.query("Folder == 'iteration_start_native_iter3'").sort_values("Q")
Out[42]:
In [10]:
# Q distribution per Folder; x/y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x="Folder", y="Q", data=sub_data)
Out[10]:
In [4]:
# Read the `info.dat` table of every (folder, protein, run) combination,
# tag each table with its origin, and cache the concatenation as a dated CSV.
simulationType = "mass_iterative_run"
run_n = 1
folder_list = ["iteration_0", "iteration_1", "iteration_2", "iteration_3"]
all_data = []
for folder in folder_list:
    for pdb in pdb_list:
        for i in range(run_n):
            pre = f"/Users/weilu/Research/server/feb_2020/{simulationType}/{folder}/{pdb}/{i}"
            info_file = "info.dat"
            location = f"{pre}/{info_file}"
            try:
                # Raw string: "\s" is an invalid escape sequence in a normal literal.
                tmp = pd.read_csv(location, sep=r"\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Folder=folder)
                all_data.append(tmp)
            except Exception:
                # Best effort: report the run that could not be loaded and keep
                # going. `Exception` (not a bare except) lets Ctrl-C through.
                print(pdb, i, folder)
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
# `folder` still holds the last entry of folder_list here, so the cache file
# is named after that folder only.
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)
In [357]:
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_0_02-07.csv", index_col=0)
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_1_02-10.csv", index_col=0)
# data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_2_02-11.csv", index_col=0)
# Reload a cached summary table; the first CSV column is used as the index.
data = pd.read_csv("/Users/weilu/Research/data/openMM/mass_iterative_run_iteration_3_02-17.csv", index_col=0)
# sub_pdb_list = pdb_list
# data.Protein = pd.Categorical(data.Protein,
# categories=sub_pdb_list)
In [358]:
y = "Steps"
d = data
# Index of the row with the largest Steps value for every (Protein, Folder);
# Run is not part of the grouping in this cell.
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
# Despite its name, max_Q_data holds the max-Steps rows here, not the max-Q rows.
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
In [359]:
def distplot_fig(data, x, hue=None, row=None, col=None, legend=True, hist=False, **kwargs):
    """Draw `sns.distplot` of column `x` on a FacetGrid split by hue/row/col.

    data: frame handed to `sns.FacetGrid`.
    x: name of the column whose distribution is drawn.
    hue, row, col: optional facet variables forwarded to `sns.FacetGrid`.
    legend: add a legend titled `hue` when hue is set and differs from x/row/col.
    hist: forwarded to `sns.distplot`; when true and no `bins` keyword is given,
        shared bin edges are computed from the non-null values of `data[x]`.
    **kwargs: remaining options forwarded to `sns.distplot`.

    Returns the FacetGrid.
    """
    bins = kwargs.pop('bins', None)
    if hist and bins is None:
        # One set of edges for all groups so their histograms are comparable.
        bins = np.histogram_bin_edges(data[x].dropna())

    grid = sns.FacetGrid(data, hue=hue, row=row, col=col)
    grid.map(sns.distplot, x, bins=bins, hist=hist, **kwargs)

    hue_is_distinct = (hue is not None) and (hue not in (x, row, col))
    if legend and hue_is_distinct:
        grid.add_legend(title=hue)
    return grid
In [363]:
# Horizontal box plot of Contact per folder. x/y are passed by keyword because
# seaborn >= 0.12 only accepts `data` positionally.
sns.boxplot(x="Contact", y="Folder", data=sub_data)
Out[363]:
In [19]:
# Overlay the Q distribution of every folder on a single facet, one colour per folder.
g = sns.FacetGrid(
    sub_data,
    hue="Folder",
    height=5,
    aspect=1.618,
).map(sns.distplot, "Q")
plt.legend()
Out[19]:
In [20]:
# Q spread per folder. x/y are passed by keyword because seaborn >= 0.12 only
# accepts `data` positionally; the keyword form works on older releases too.
sns.boxplot(x="Folder", y="Q", data=sub_data)
Out[20]:
In [26]:
# Print the protein ordering; `new_order` is assigned by the plotting cells further down,
# so one of them has to run first.
print(new_order)
In [24]:
# x-axis order: proteins sorted by their Q in the 'iteration_3' rows of max_Q_data.
# list(...) works whether .unique() returns a numpy array (plain string column) or a
# Categorical; the former has no .to_list() method.
new_order = list(max_Q_data.query("Folder == 'iteration_3'").sort_values("Q")["Protein"].unique())
sub_data = max_Q_data.sort_values("Q").reset_index(drop=True).reset_index()
# Go through str first so any earlier categorical ordering is replaced by new_order.
sub_data.Protein = pd.Categorical(sub_data.Protein.astype(str), categories=new_order)
ax = sns.lineplot(x="Protein", y="Q", markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False, sort=True)
In [73]:
# x-axis order: proteins sorted by their Q in the 'iteration_2' rows of max_Q_data.
# list(...) works whether .unique() returns a numpy array (plain string column) or a
# Categorical; the former has no .to_list() method.
new_order = list(max_Q_data.query("Folder == 'iteration_2'").sort_values("Q")["Protein"].unique())
sub_data = max_Q_data.sort_values("Q").reset_index(drop=True).reset_index()
# Go through str first so any earlier categorical ordering is replaced by new_order.
sub_data.Protein = pd.Categorical(sub_data.Protein.astype(str), categories=new_order)
ax = sns.lineplot(x="Protein", y="Q", markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False, sort=True)
In [71]:
# x-axis order: proteins sorted by their Q in the 'iteration_0' rows of max_Q_data.
# list(...) works whether .unique() returns a numpy array (plain string column) or a
# Categorical; the former has no .to_list() method.
new_order = list(max_Q_data.query("Folder == 'iteration_0'").sort_values("Q")["Protein"].unique())
sub_data = max_Q_data.sort_values("Q").reset_index(drop=True).reset_index()
# Go through str first so any earlier categorical ordering is replaced by new_order.
sub_data.Protein = pd.Categorical(sub_data.Protein.astype(str), categories=new_order)
ax = sns.lineplot(x="Protein", y="Q", markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False, sort=True)
In [67]:
# x-axis order: proteins sorted by their Q in the 'iteration_0' rows of max_Q_data.
# list(...) works whether .unique() returns a numpy array (plain string column) or a
# Categorical; the former has no .to_list() method.
new_order = list(max_Q_data.query("Folder == 'iteration_0'").sort_values("Q")["Protein"].unique())
sub_data = max_Q_data.sort_values("Q").reset_index(drop=True).reset_index()
# Go through str first so any earlier categorical ordering is replaced by new_order.
sub_data.Protein = pd.Categorical(sub_data.Protein.astype(str), categories=new_order)
ax = sns.lineplot(x="Protein", y="Q", markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False, sort=True)
In [47]:
# For every (Protein, Folder) pair keep the row with the highest Q.
y = "Q"
d = data
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
# NOTE(review): idxmax yields index labels while iloc is positional; they only agree
# when `data` carries a default 0..n-1 index -- confirm.
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
In [50]:
# Proteins sorted by their Q in the 'iteration_0' rows. list(...) works whether
# .unique() returns a numpy array (which has no .to_list()) or a Categorical.
new_order = list(max_Q_data.query("Folder == 'iteration_0'").sort_values("Q")["Protein"].unique())
In [59]:
# Sort by Q, keep the rank as an "index" column, and impose new_order on Protein.
sub_data = (
    max_Q_data
    .sort_values("Q")
    .reset_index(drop=True)
    .reset_index()
)
# Cast to str first so any earlier categorical ordering is replaced by new_order.
sub_data.Protein = sub_data.Protein.astype(str)
sub_data.Protein = pd.Categorical(sub_data.Protein, categories=new_order)
In [61]:
# One marked line per folder, showing `y` for each protein.
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
    sort=True,
)
In [22]:
# One marked line per folder, showing `y` for each protein in sub_data.
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [10]:
# One marked line per folder, showing `y` for each protein in sub_data.
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [ ]:
In [ ]:
In [165]:
# Collect info.dat of every (folder, protein, run) into one frame and cache it as CSV.
pdb_list = dataset["optimization_cath"]
simulationType = "optimization_database"
run_n = 5
folder_list = ["iter0_gpu"]
all_data = []
for folder in folder_list:
    for pdb in pdb_list:
        for i in range(run_n):
            pre = f"/Users/weilu/Research/server/dec_2019/{simulationType}/{folder}/{pdb}/{i}"
            info_file = "info.dat"
            location = f"{pre}/{info_file}"
            try:
                # Raw string: "\s" is not a valid escape in a normal string literal.
                tmp = pd.read_csv(location, sep=r"\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Folder=folder)
                all_data.append(tmp)
            except Exception:
                # Best effort: report the run that could not be loaded and keep going.
                # `Exception` (not a bare except) lets Ctrl-C still interrupt the loop.
                print(pdb, i, folder)
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
# `folder` still holds the last entry of folder_list here, so the file is named after it.
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)
In [167]:
# Reload the cached table and order Protein as in pdb_list.
data = pd.read_csv(
    "/Users/weilu/Research/data/openMM/optimization_database_iter0_gpu_12-29.csv",
    index_col=0,
)
sub_pdb_list = pdb_list
data["Protein"] = pd.Categorical(data["Protein"], categories=sub_pdb_list)
In [170]:
# Display the per-group selected rows.
max_Q_data
Out[170]:
In [169]:
# Highest-Q row for every (Protein, Folder) pair, drawn as one line per folder.
y = "Q"
d = data
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
# NOTE(review): idxmax yields index labels while iloc is positional; they only agree
# when `data` carries a default 0..n-1 index -- confirm.
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [153]:
# Build pdb_list from the test-set table: lower-cased PDB ids ordered by the Lpdb column.
d = (
    pd.read_csv(
        "/Users/weilu/Research/server/dec_2019/iterative_optimization/original_pdbs/first_test_set.csv",
        index_col=0,
    )
    .sort_values("Lpdb")
    .reset_index(drop=True)
)
pdb_list = d["PDB"].str.lower().to_list()
In [192]:
# Collect info.dat of every (folder, protein, run) into one frame and cache it as CSV.
pdb_list = dataset["optimization"]
simulationType = "iterative_optimization"
# folder = "original"
# folder = "first"
# folder = "second_withoutExclusion"
# folder_list = ["first", "second_withoutExclusion"]
# "first",
folder_list = ["iter7_gpu_long", "iter7_gpu", "iter6_gpu", "iter5_gpu", "iter5_withBiased_gpu", "iter0_gpu_less_beta", "iter4_gpu", "first_cpu2", "first_iter1_cpu4", "first_gpu", "iter2_gpu", "iter2_real_gpu", "iter3_gpu"]
all_data = []
for folder in folder_list:
    for pdb in pdb_list:
        for i in range(10):
            pre = f"/Users/weilu/Research/server/dec_2019/{simulationType}/{folder}/{pdb}/{i}"
            info_file = "info.dat"
            location = f"{pre}/{info_file}"
            try:
                # Raw string: "\s" is not a valid escape in a normal string literal.
                tmp = pd.read_csv(location, sep=r"\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Folder=folder)
                all_data.append(tmp)
            except Exception:
                # Best effort: report the run that could not be loaded and keep going.
                # `Exception` (not a bare except) lets Ctrl-C still interrupt the loop.
                print(pdb, i, folder)
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
# `folder` still holds the last entry of folder_list here, so the file is named after it.
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)
In [4]:
# Reload the cached iterative_optimization table (file named after the last folder, iter3_gpu).
data = pd.read_csv("/Users/weilu/Research/data/openMM/iterative_optimization_iter3_gpu_01-03.csv", index_col=0)
In [194]:
# Order the Protein column as in pdb_list.
sub_pdb_list = pdb_list
data["Protein"] = pd.Categorical(data["Protein"], categories=sub_pdb_list)
In [196]:
# Q distribution over the last 20 rows of every (Protein, Run, Folder) group.
y = "Q"
d = data.query("Folder == 'iter7_gpu_long' or Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu' or Folder == 'iter6_gpu' or Folder == 'iter7_gpu'").reset_index(drop=True)
t = d.groupby(["Protein", "Run", "Folder"]).tail(20)
ax = sns.boxenplot(
    data=t,
    x="Protein",
    y=y,
    hue="Folder",
)
In [5]:
# Highest-Q row per (Protein, Folder) for the selected folders, with 1hcd removed and
# proteins ordered by the "optimization_v2" list.
y = "Q"
# The fresh 0..n-1 index keeps idxmax labels and iloc positions in agreement.
d = data.query("Folder == 'iter7_gpu_long' or Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu' or Folder == 'iter6_gpu' or Folder == 'iter7_gpu'").reset_index(drop=True)
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data.query("Protein != '1hcd'").reset_index(drop=True)
# Registers the ordering in the shared `dataset` dict as a side effect.
dataset["optimization_v2"] = ['1e0m', '1w4e', '1e0g', '2wqg', '1jo8', '1fex', '2l6r', '1c8c', '1g6p', '1mjc', '2jmc', '1hdn', '1st7', '1n88', '1d6o', '2ga5', '1j5u', '3o4d']
sub_data.Protein = pd.Categorical(
    sub_data.Protein,
    categories=dataset["optimization_v2"],
)
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [ ]:
# Display the rows that were plotted.
sub_data
In [195]:
# Highest-Q row per (Protein, Folder) for the selected folders, one line per folder.
y = "Q"
# The fresh 0..n-1 index keeps idxmax labels and iloc positions in agreement.
d = data.query("Folder == 'iter7_gpu_long' or Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu' or Folder == 'iter6_gpu' or Folder == 'iter7_gpu'").reset_index(drop=True)
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [191]:
# Highest-Q row per (Protein, Folder) for the selected folders, one line per folder.
y = "Q"
# The fresh 0..n-1 index keeps idxmax labels and iloc positions in agreement.
d = data.query("Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu' or Folder == 'iter6_gpu' or Folder == 'iter7_gpu'").reset_index(drop=True)
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [185]:
# Highest-Q row per (Protein, Folder) for the selected folders, one line per folder.
y = "Q"
# The fresh 0..n-1 index keeps idxmax labels and iloc positions in agreement.
d = data.query("Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu' or Folder == 'iter6_gpu'").reset_index(drop=True)
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [184]:
# Highest-Q row per (Protein, Folder) for the selected folders, one line per folder.
y = "Q"
# The fresh 0..n-1 index keeps idxmax labels and iloc positions in agreement.
d = data.query("Folder=='iter5_gpu' or Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu'").reset_index(drop=True)
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [160]:
# Highest-Q row per (Protein, Folder) for the two selected folders, one line per folder.
y = "Q"
# The fresh 0..n-1 index keeps idxmax labels and iloc positions in agreement.
d = data.query("Folder=='iter0_gpu_less_beta' or Folder == 'iter4_gpu'").reset_index(drop=True)
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [161]:
# Highest-Q row per (Protein, Folder), excluding iter2_gpu and first_cpu2.
y = "Q"
# The fresh 0..n-1 index keeps idxmax labels and iloc positions in agreement.
d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [149]:
# Q distribution over the last 20 rows of every (Protein, Run, Folder) group,
# excluding iter2_gpu and first_cpu2.
y = "Q"
d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
t = d.groupby(["Protein", "Run", "Folder"]).tail(20)
ax = sns.boxenplot(
    data=t,
    x="Protein",
    y=y,
    hue="Folder",
)
In [144]:
# Highest-Q row per (Protein, Folder), excluding iter2_gpu and first_cpu2.
y = "Q"
# The fresh 0..n-1 index keeps idxmax labels and iloc positions in agreement.
d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [140]:
# Highest-Q row per (Protein, Folder), excluding iter2_gpu.
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
# Re-index after filtering: idxmax returns index labels, and the iloc lookup below is
# positional. Without the reset, the labels left over from `data` point at the wrong
# rows (or past the end) of the filtered frame.
d = data.query("Folder != 'iter2_gpu'").reset_index(drop=True)
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')
In [121]:
# Highest-Q row for every (Protein, Folder) pair, drawn as one line per folder.
y = "Q"
d = data
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
# NOTE(review): idxmax yields index labels while iloc is positional; they only agree
# when `data` carries a default 0..n-1 index -- confirm.
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [ ]:
# Row with the largest "Steps" for every (Protein, Folder, Run); plots that Steps value.
y = "Steps"
d = data
t = (
    d.groupby(["Protein", "Folder", "Run"])[y]
    .idxmax()
    .reset_index()
)
# NOTE(review): idxmax yields index labels while iloc is positional; they only agree
# when `data` carries a default 0..n-1 index -- confirm.
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [ ]:
# Reload the cached table of the "first" folder and order Protein as in pdb_list.
data = pd.read_csv(
    "/Users/weilu/Research/data/openMM/iterative_optimization_first_12-14.csv",
    index_col=0,
)
sub_pdb_list = pdb_list
data["Protein"] = pd.Categorical(data["Protein"], categories=sub_pdb_list)
In [24]:
# Row with the largest "Steps" for every (Protein, Folder, Run); plots that Steps value.
y = "Steps"
d = data
t = (
    d.groupby(["Protein", "Folder", "Run"])[y]
    .idxmax()
    .reset_index()
)
# NOTE(review): idxmax yields index labels while iloc is positional; they only agree
# when `data` carries a default 0..n-1 index -- confirm.
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [21]:
# Highest-Q row for every (Protein, Folder) pair, drawn as one line per folder.
y = "Q"
d = data
t = (
    d.groupby(["Protein", "Folder"])[y]
    .idxmax()
    .reset_index()
)
# NOTE(review): idxmax yields index labels while iloc is positional; they only agree
# when `data` carries a default 0..n-1 index -- confirm.
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [27]:
# Highest-Q row for every (Protein, Folder, Run); each protein then has one point per run.
y = "Q"
d = data
t = (
    d.groupby(["Protein", "Folder", "Run"])[y]
    .idxmax()
    .reset_index()
)
# NOTE(review): idxmax yields index labels while iloc is positional; they only agree
# when `data` carries a default 0..n-1 index -- confirm.
picked_rows = t[y].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [28]:
# Plot Q taken at the row with the largest "Steps" of every (Protein, Folder, Run).
y = "Q"
d = data
t = (
    d.groupby(["Protein", "Folder", "Run"])["Steps"]
    .idxmax()
    .reset_index()
)
# NOTE(review): idxmax yields index labels while iloc is positional; they only agree
# when `data` carries a default 0..n-1 index -- confirm.
picked_rows = t["Steps"].to_list()
max_Q_data = d.iloc[picked_rows].reset_index(drop=True)
sub_data = max_Q_data
ax = sns.lineplot(
    data=sub_data,
    x="Protein",
    y=y,
    hue="Folder",
    style="Folder",
    markers=True,
    dashes=False,
    ms=10,
)
In [16]:
# Display the loaded table.
data
Out[16]:
In [30]:
# Shared Bio.PDB parser instance used by the structure-loading cells below.
parser = PDBParser()
In [35]:
# Path of the DCD trajectory of run 4 of 2wqg in the "first" folder.
movie_dcd = "/Users/weilu/Research/server/dec_2019/iterative_optimization/first/2wqg/4/movie.dcd"
In [36]:
# NOTE(review): movie_dcd points at a .dcd file while `parser` is a PDBParser, which
# reads PDB-format text -- confirm this call is intended (the next cells use movie.pdb).
s = parser.get_structure("X", movie_dcd)
In [31]:
# Path of the PDB-format movie of run 4 of 2wqg in the "first" folder.
movie = "/Users/weilu/Research/server/dec_2019/iterative_optimization/first/2wqg/4/movie.pdb"
In [37]:
# Path of the PDB-format movie of run 0 of 2wqg in the "first" folder.
movie = "/Users/weilu/Research/server/dec_2019/iterative_optimization/first/2wqg/0/movie.pdb"
In [38]:
# Parse the whole movie file into one Bio.PDB structure with id "X".
s = parser.get_structure("X", movie)
In [45]:
# Gather the models of all ten 2wqg runs, in run order, into one flat list.
complete_models = []
for i in range(10):
    movie = f"/Users/weilu/Research/server/dec_2019/iterative_optimization/first/2wqg/{i}/movie.pdb"
    s = parser.get_structure("X", movie)
    complete_models.extend(s.get_models())
In [65]:
# Rows of 2wqg with Steps > 1, with the Q column renamed to Qw.
t = (
    data.query("Protein == '2wqg' and Steps > 1")
    .reset_index(drop=True)
    .rename(columns={"Q": "Qw"})
)
In [49]:
# Number of models collected from the movie files.
len(complete_models)
Out[49]:
In [73]:
# Print the protein ids currently in pdb_list.
print(pdb_list)
In [ ]:
# For every protein: parse the ten movie.pdb files, attach the models to the matching
# rows of `data`, and pickle the last 50 rows of each run as decoys.
folder_list = ["first"]
folder = "first"
pre = f"/scratch/wl45/dec_2019/iterative_optimization/{folder}"
to_folder = "."
os.system(f"mkdir -p {to_folder}/decoys/openMM")
complete_models = []
for pdb in pdb_list:
    # Start from an empty list for each protein: carrying models over from the previous
    # protein makes the list longer than `t`, and the column assignment below fails.
    complete_models = []
    for i in range(10):
        movie = f"{pre}/{pdb}/{i}/movie.pdb"
        s = parser.get_structure("X", movie)
        complete_models += list(s.get_models())
    t = data.query(f"Protein == '{pdb}' and Steps > 1").reset_index(drop=True)
    # Requires len(complete_models) == len(t), i.e. one model per retained row.
    # NOTE(review): this also assumes the rows of `t` are in the same run/frame order
    # as the movies were read -- confirm.
    t["structure"] = complete_models
    t = t.rename(columns={"Q":"Qw"})
    last50 = t.groupby("Run").tail(50).reset_index(drop=True)
    last50.to_pickle(f"{to_folder}/decoys/openMM/{folder}_{pdb}")
In [75]:
# Single-protein version: parse the ten movies of 1w4e and attach the models to its rows.
folder_list = ["first"]
folder = "first"
pdb = "1w4e"
pre = f"/Users/weilu/Research/server/dec_2019/iterative_optimization/{folder}"
complete_models = []
for i in range(10):
    movie = f"{pre}/{pdb}/{i}/movie.pdb"
    s = parser.get_structure("X", movie)
    complete_models.extend(s.get_models())
t = data.query(f"Protein == '{pdb}' and Steps > 1").reset_index(drop=True)
# Requires one model per retained row.
t["structure"] = complete_models
t = t.rename(columns={"Q": "Qw"})
In [76]:
# Sanity check: row count of `t`; it has to equal len(complete_models) for the
# `t["structure"] = complete_models` assignment to succeed.
len(t)
Out[76]:
In [77]:
# Sanity check: number of parsed models, to compare with len(t).
len(complete_models)
Out[77]:
In [112]:
# Load a previously saved decoy table. Unpickling executes code stored in the file,
# so only load pickles from a trusted source.
t = pd.read_pickle("/Users/weilu/Research/server/dec_2019/multiDensityOptimization/optimization_iteration1/optimization/decoys/openMM/1c8c_first.pkl")
In [113]:
# Pull the "structure" column out as a plain list.
structures = t["structure"].to_list()
In [89]:
# Show the printed form of the first stored structure.
print(structures[0])
In [114]:
# Materialise the residues of the first stored structure.
all_res = list(structures[0].get_residues())
In [115]:
# Inspect the first residue.
all_res[0]
Out[115]:
In [116]:
# Apply is_hetero to the first residue.
# NOTE(review): is_hetero is not defined in the visible cells; presumably it comes
# from one of the star imports at the top of the notebook -- confirm.
is_hetero(all_res[0])
Out[116]:
In [ ]:
In [ ]:
# Display the full residue list.
all_res
In [96]:
# First field of the first residue's id tuple.
all_res[0].id[0]
Out[96]:
In [70]:
# Keep the last 50 rows of every run, with a fresh 0..n-1 index.
last50 = t.groupby("Run").tail(50).reset_index(drop=True)
In [ ]:
# Save the decoys under ./decoys/openMM, named by the current `folder` and `pdb` values
# (both set in other cells). The directory must already exist.
to_folder = "."
last50.to_pickle(f"{to_folder}/decoys/openMM/{folder}_{pdb}")
In [55]:
# Attach the parsed models as a column; requires len(complete_models) == len(t).
t["structure"] = complete_models
In [ ]:
# Parse one structure per sampled row. `sampled`, `getStructures` and `all_movies` are
# defined in the following cell, so that cell has to run first.
sampled["structure"] = sampled.apply(getStructures, all_movies=all_movies, axis=1)
In [ ]:
import io
from Bio.PDB.PDBParser import PDBParser
# Sample decoys from a ranked Q table and pickle them together with their parsed structures.
# NOTE(review): `args`, `database_location` and `decoy_n` are not defined in any visible
# cell -- presumably this cell was lifted from a command-line script; confirm before running.
simulation_location, name = args.label.split("__")
simulation_location_name = f"{simulation_location}_{name}"


def getStructures(x, all_movies):
    """Parse the movie frame referenced by one row of the sampled table.

    x: row providing the "index" and "Run" fields.
    all_movies: dict mapping a run id to the list of lines of that run's movie.pdb.

    Reads the module-level `size` (number of lines per frame), so `size` must be
    assigned before this function is applied.

    Returns the Bio.PDB structure parsed from lines [index*size, (index+1)*size) of the
    run's movie, where index is the row's "index" plus one.
    """
    index = int(x["index"]) + 1
    run = int(x["Run"])
    start = index * size
    end = (index + 1) * size
    f = io.StringIO("".join(all_movies[run][start:end]))
    parser = PDBParser()
    return parser.get_structure(f"{index}", f)


a = pd.read_csv(f"{database_location}/Q_{simulation_location_name}", index_col=0).query(f"Rank < {decoy_n*3}")
sampled = a.sample(decoy_n)
all_movies = {}
for i in sampled["Run"].unique():
    with open(f"{database_location}/{simulation_location_name}_{i}/movie.pdb") as f:
        movie = f.readlines()
    all_movies[i] = movie
# Frame length = number of lines up to and including the first ENDMDL of the last movie
# read; every frame of every run is assumed to have this same length.
size = 0
for line in movie:
    size += 1
    if line == "ENDMDL\n":
        break
print(simulation_location_name, size)
sampled["structure"] = sampled.apply(getStructures, all_movies=all_movies, axis=1)
sampled["Qw"] = sampled[" Qw"].round(3)
# drop() returns a new frame; the result must be assigned, otherwise the raw " Qw"
# column is still present in the pickle.
sampled = sampled.drop(" Qw", axis=1)
sampled.to_pickle(f"decoys/lammps/{name}_{simulation_location}.pkl")
In [ ]:
# Display the rank-filtered Q table that the decoys were sampled from.
a