In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
# sys.path.insert(0,'..')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.Polypeptide import one_to_three
from Bio.PDB.Polypeptide import three_to_one
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
from small_script.myFunctions import *
from collections import defaultdict
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180)    #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2

In [2]:
plt.rcParams['figure.figsize'] = np.array([16.18033, 10])    #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100
plt.rcParams.update({'font.size': 22})

In [3]:
dataset = {}
dataset["optimization"] = ['1e0m', '1w4e', '1e0g', '2wqg', '1jo8', '1fex', '2l6r', '1c8c', '1g6p', '1mjc', '2jmc', '1hdn', '1st7', '1n88', '1d6o', '1hcd', '2ga5', '1j5u', '3o4d', '1k0s']
dataset["optimization_cath"] = ['1a75A00', '1bekA01', '1bqbA02', '1cpcB00', '1cscA02', '1cy5A00', '1dv5A00', '1e8yA05', '1evyA02', '1in4A03', '1l1fA03', '1vq8P01', '1xmkA00', '1zcaA02', '2grhA00', '2ii2A04', '2q6fB03', '2wh6A00', '3g0vA00', '3geuA00', '3h99A03', '3hrdD02', '3ju5A01', '3p1wA03', '4cxfA01', '4i2aA01', '4i4tB03', '4i6uB00', '5kn9A02']

In [165]:
pdb_list = dataset["optimization_cath"]
simulationType = "optimization_database"
run_n = 5
folder_list = ["iter0_gpu"]
all_data = []

for folder in folder_list:
    for pdb in pdb_list:
        for i in range(run_n):
                pre = f"/Users/weilu/Research/server/dec_2019/{simulationType}/{folder}/{pdb}/{i}"
                info_file = "info.dat"
                location = f"{pre}/{info_file}"
                try:
                    tmp = pd.read_csv(location, sep="\s+")
                    tmp = tmp.assign(Run=i, Protein=pdb, Folder=folder)
                    all_data.append(tmp)
                except:
                    print(pdb, i, folder)
                    pass
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)


/Users/weilu/Research/data/openMM/optimization_database_iter0_gpu_12-29.csv

In [167]:
data = pd.read_csv("/Users/weilu/Research/data/openMM/optimization_database_iter0_gpu_12-29.csv", index_col=0)
sub_pdb_list = pdb_list
data.Protein = pd.Categorical(data.Protein, 
                      categories=sub_pdb_list)

In [170]:
max_Q_data


Out[170]:
Steps Q Rg Backbone Rama Contact Fragment Membrane ER TBM_Q Beta Pap Helical Total Run Protein Folder
0 366 0.56 11.60 408.51 -574.28 -220.53 -202.01 0.0 0.0 0.0 -24.93 -0.00 0.0 -613.24 0 1a75A00 iter0_gpu
1 478 0.30 15.25 533.60 -758.93 -330.51 -274.89 0.0 0.0 0.0 -27.71 -1.52 0.0 -859.96 0 1bekA01 iter0_gpu
2 271 0.39 14.98 584.51 -793.09 -181.79 -237.88 0.0 0.0 0.0 -30.24 -0.02 0.0 -658.51 4 1bqbA02 iter0_gpu
3 497 0.41 15.49 556.91 -1226.64 -328.39 -345.68 0.0 0.0 0.0 -59.33 -0.00 0.0 -1403.12 2 1cpcB00 iter0_gpu
4 447 0.46 13.25 369.69 -675.83 -179.42 -183.97 0.0 0.0 0.0 -24.79 -0.00 0.0 -694.33 2 1cscA02 iter0_gpu
5 336 0.69 11.34 361.05 -668.22 -163.57 -194.35 0.0 0.0 0.0 -25.60 -0.00 0.0 -690.68 4 1cy5A00 iter0_gpu
6 478 0.59 10.54 231.20 -352.70 -150.07 -142.83 0.0 0.0 0.0 -19.35 -0.00 0.0 -433.75 0 1dv5A00 iter0_gpu
7 181 0.36 17.51 892.96 -1120.93 -340.00 -323.41 0.0 0.0 0.0 -44.09 -1.14 0.0 -936.62 2 1e8yA05 iter0_gpu
8 407 0.40 13.83 512.47 -855.85 -307.81 -259.17 0.0 0.0 0.0 -40.74 -0.21 0.0 -951.30 4 1evyA02 iter0_gpu
9 484 0.82 10.89 221.51 -556.43 -119.58 -171.95 0.0 0.0 0.0 -28.40 -0.01 0.0 -654.85 3 1in4A03 iter0_gpu
10 446 0.68 12.93 153.87 -275.61 -60.38 -85.50 0.0 0.0 0.0 -11.71 0.00 0.0 -279.33 3 1l1fA03 iter0_gpu
11 500 0.83 9.86 141.58 -301.40 -80.48 -107.35 0.0 0.0 0.0 -18.13 -0.47 0.0 -366.24 0 1vq8P01 iter0_gpu
12 406 0.78 11.94 256.65 -457.75 -105.64 -169.22 0.0 0.0 0.0 -29.10 -1.50 0.0 -506.56 0 1xmkA00 iter0_gpu
13 364 0.40 13.69 437.49 -762.61 -195.02 -222.52 0.0 0.0 0.0 -30.81 -0.01 0.0 -773.48 0 1zcaA02 iter0_gpu
14 442 0.40 14.12 468.02 -1043.29 -260.68 -287.82 0.0 0.0 0.0 -40.31 -0.00 0.0 -1164.09 3 2grhA00 iter0_gpu
15 461 0.73 11.17 263.81 -518.62 -128.63 -160.49 0.0 0.0 0.0 -23.37 -0.00 0.0 -567.30 2 2ii2A04 iter0_gpu
16 485 0.63 11.88 341.95 -605.81 -195.26 -200.22 0.0 0.0 0.0 -25.86 -0.80 0.0 -686.00 1 2q6fB03 iter0_gpu
17 182 0.47 15.13 733.34 -1085.91 -270.03 -313.92 0.0 0.0 0.0 -47.65 -0.00 0.0 -984.17 2 2wh6A00 iter0_gpu
18 495 0.68 11.59 238.12 -436.30 -120.51 -153.13 0.0 0.0 0.0 -17.93 -0.00 0.0 -489.75 0 3g0vA00 iter0_gpu
19 168 0.43 20.60 914.93 -1208.80 -281.06 -355.37 0.0 0.0 0.0 -49.50 -0.00 0.0 -979.79 0 3geuA00 iter0_gpu
20 437 0.65 14.47 493.29 -903.02 -266.94 -290.74 0.0 0.0 0.0 -39.15 -0.00 0.0 -1006.56 4 3h99A03 iter0_gpu
21 401 0.72 11.66 270.38 -484.64 -127.43 -156.23 0.0 0.0 0.0 -21.57 -0.00 0.0 -519.48 2 3hrdD02 iter0_gpu
22 296 0.56 13.19 378.41 -341.56 -135.57 -142.35 0.0 0.0 0.0 -14.94 -0.00 0.0 -256.02 4 3ju5A01 iter0_gpu
23 284 0.44 13.10 434.58 -530.43 -172.48 -176.01 0.0 0.0 0.0 -21.73 -0.00 0.0 -466.07 4 3p1wA03 iter0_gpu
24 382 0.80 12.87 283.79 -612.06 -136.34 -182.44 0.0 0.0 0.0 -26.01 -0.01 0.0 -673.07 1 4cxfA01 iter0_gpu
25 490 0.70 12.60 274.80 -556.95 -149.63 -191.75 0.0 0.0 0.0 -20.00 -0.00 0.0 -643.53 3 4i2aA01 iter0_gpu
26 453 0.84 13.36 169.70 -423.72 -59.95 -117.80 0.0 0.0 0.0 -18.61 -0.00 0.0 -450.38 0 4i4tB03 iter0_gpu
27 290 0.76 11.24 326.89 -536.59 -128.89 -161.98 0.0 0.0 0.0 -26.45 -0.00 0.0 -527.03 1 4i6uB00 iter0_gpu
28 478 0.58 12.52 301.63 -751.65 -191.98 -226.75 0.0 0.0 0.0 -30.85 -0.00 0.0 -899.60 2 5kn9A02 iter0_gpu

In [169]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
# d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
# d = data.query("Folder=='iter0_gpu_less_beta' or Folder == 'iter4_gpu'").reset_index(drop=True)
d = data
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [153]:
d = pd.read_csv("/Users/weilu/Research/server/dec_2019/iterative_optimization/original_pdbs/first_test_set.csv", index_col=0)
d = d.sort_values("Lpdb").reset_index(drop=True)
pdb_list = d.PDB.str.lower().to_list()

In [192]:
pdb_list = dataset["optimization"]
simulationType = "iterative_optimization"
# folder = "original"
# folder = "first"
# folder = "second_withoutExclusion"
# folder_list = ["first", "second_withoutExclusion"]
# "first", 
folder_list = ["iter7_gpu_long", "iter7_gpu", "iter6_gpu", "iter5_gpu", "iter5_withBiased_gpu", "iter0_gpu_less_beta", "iter4_gpu", "first_cpu2", "first_iter1_cpu4", "first_gpu", "iter2_gpu", "iter2_real_gpu", "iter3_gpu"]
all_data = []
for folder in folder_list:
    for pdb in pdb_list:
        for i in range(10):
                pre = f"/Users/weilu/Research/server/dec_2019/{simulationType}/{folder}/{pdb}/{i}"
                info_file = "info.dat"
                location = f"{pre}/{info_file}"
                try:
                    tmp = pd.read_csv(location, sep="\s+")
                    tmp = tmp.assign(Run=i, Protein=pdb, Folder=folder)
                    all_data.append(tmp)
                except:
                    print(pdb, i, folder)
                    pass
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)


/Users/weilu/Research/data/openMM/iterative_optimization_iter3_gpu_01-03.csv

In [4]:
data = pd.read_csv("/Users/weilu/Research/data/openMM/iterative_optimization_iter3_gpu_01-03.csv", index_col=0)


/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/numpy/lib/arraysetops.py:522: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

In [194]:
sub_pdb_list = pdb_list
data.Protein = pd.Categorical(data.Protein, 
                      categories=sub_pdb_list)

In [196]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
# d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
d = data.query("Folder == 'iter7_gpu_long' or Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu' or Folder == 'iter6_gpu' or Folder == 'iter7_gpu'").reset_index(drop=True)
#
t = d.groupby(["Protein", "Run", "Folder"]).tail(20)
ax = sns.boxenplot(x="Protein", y=y, hue="Folder", data=t)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [5]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
# d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
# d = data.query("Folder=='iter0_gpu_less_beta' or Folder == 'iter4_gpu'").reset_index(drop=True)
d = data.query("Folder == 'iter7_gpu_long' or Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu' or Folder == 'iter6_gpu' or Folder == 'iter7_gpu'").reset_index(drop=True)
# d = data
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
sub_data = max_Q_data.query("Protein != '1hcd'").reset_index(drop=True)
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
dataset["optimization_v2"] = ['1e0m', '1w4e', '1e0g', '2wqg', '1jo8', '1fex', '2l6r', '1c8c', '1g6p', '1mjc', '2jmc', '1hdn', '1st7', '1n88', '1d6o', '2ga5', '1j5u', '3o4d']

sub_data.Protein = pd.Categorical(sub_data.Protein, 
                      categories=dataset["optimization_v2"])
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [ ]:
sub_data

In [195]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
# d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
# d = data.query("Folder=='iter0_gpu_less_beta' or Folder == 'iter4_gpu'").reset_index(drop=True)
d = data.query("Folder == 'iter7_gpu_long' or Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu' or Folder == 'iter6_gpu' or Folder == 'iter7_gpu'").reset_index(drop=True)
# d = data
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [191]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
# d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
# d = data.query("Folder=='iter0_gpu_less_beta' or Folder == 'iter4_gpu'").reset_index(drop=True)
d = data.query("Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu' or Folder == 'iter6_gpu' or Folder == 'iter7_gpu'").reset_index(drop=True)
# d = data
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [185]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
# d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
# d = data.query("Folder=='iter0_gpu_less_beta' or Folder == 'iter4_gpu'").reset_index(drop=True)
d = data.query("Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu' or Folder == 'iter6_gpu'").reset_index(drop=True)
# d = data
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [184]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
# d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
# d = data.query("Folder=='iter0_gpu_less_beta' or Folder == 'iter4_gpu'").reset_index(drop=True)
d = data.query("Folder=='iter5_gpu' or Folder == 'iter5_withBiased_gpu' or Folder == 'first_gpu'").reset_index(drop=True)
# d = data
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [160]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
# d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
d = data.query("Folder=='iter0_gpu_less_beta' or Folder == 'iter4_gpu'").reset_index(drop=True)
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [161]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
# d = data.query("Folder=='iter0_gpu_less_beta' or Folder == 'iter4_gpu'").reset_index(drop=True)
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [149]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
t = d.groupby(["Protein", "Run", "Folder"]).tail(20)
ax = sns.boxenplot(x="Protein", y=y, hue="Folder", data=t)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [144]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
d = data.query("Folder!='iter2_gpu' and Folder != 'first_cpu2'").reset_index(drop=True)
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [140]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
d = data.query("Folder != 'iter2_gpu'")
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [121]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
d = data
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [ ]:
y = "Steps"
# d = data.query("Steps > 1500").reset_index(drop=True)
d = data
t = d.groupby(["Protein", "Folder", "Run"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')

CPU, thread 1


In [ ]:
data = pd.read_csv("/Users/weilu/Research/data/openMM/iterative_optimization_first_12-14.csv", index_col=0)
sub_pdb_list = pdb_list
data.Protein = pd.Categorical(data.Protein, 
                      categories=sub_pdb_list)

In [24]:
y = "Steps"
# d = data.query("Steps > 1500").reset_index(drop=True)
d = data
t = d.groupby(["Protein", "Folder", "Run"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [21]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
d = data
t = d.groupby(["Protein", "Folder"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [27]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
d = data
t = d.groupby(["Protein", "Folder", "Run"])[y].idxmax().reset_index()
max_Q_data = d.iloc[t[y].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [28]:
y = "Q"
# d = data.query("Steps > 1500").reset_index(drop=True)
d = data
t = d.groupby(["Protein", "Folder", "Run"])["Steps"].idxmax().reset_index()
max_Q_data = d.iloc[t["Steps"].to_list()].reset_index(drop=True)
sub_data = max_Q_data
# sub_data = max_Q_data.query("Scheme in ['hybrid contact', 'contact as in water', 'contact as in membrane']")
# sub_data = max_Q_mem_data
ax = sns.lineplot(x="Protein", y=y, markers=True, ms=10, style="Folder", hue="Folder", data=sub_data, dashes=False)
# _ = ax.set_xticklabels(labels=sub_label_list, rotation=0, ha='center')



In [16]:
data


Out[16]:
Steps Q Rg Backbone Rama Contact Fragment Membrane ER TBM_Q Beta Pap Helical Total Run Protein Folder
0 0 0.05 36.04 68.02 -111.94 -21.21 -13.23 0.0 0.0 0.0 -0.00 0.00 0.0 -78.36 0 1e0m first
1 1 0.10 33.51 1.35 -118.56 -33.90 -21.35 0.0 0.0 0.0 -0.00 0.00 0.0 -172.46 0 1e0m first
2 2 0.20 22.52 193.61 -61.70 -28.71 -37.44 0.0 0.0 0.0 -0.00 0.00 0.0 65.76 0 1e0m first
3 3 0.15 20.12 214.98 -83.38 -29.58 -30.61 0.0 0.0 0.0 -0.00 0.00 0.0 71.41 0 1e0m first
4 4 0.26 17.40 202.66 -80.84 -29.53 -42.52 0.0 0.0 0.0 -9.62 0.00 0.0 40.15 0 1e0m first
5 5 0.32 17.36 237.12 -81.77 -30.40 -43.85 0.0 0.0 0.0 -4.70 -0.01 0.0 76.38 0 1e0m first
6 6 0.35 13.83 193.53 -75.48 -29.74 -48.77 0.0 0.0 0.0 -11.08 -3.34 0.0 25.11 0 1e0m first
7 7 0.31 11.38 209.04 -75.46 -30.34 -39.83 0.0 0.0 0.0 -11.80 -2.85 0.0 48.77 0 1e0m first
8 8 0.28 14.80 201.89 -66.88 -31.54 -42.82 0.0 0.0 0.0 -20.81 -8.83 0.0 31.01 0 1e0m first
9 9 0.30 18.44 213.88 -63.19 -30.29 -48.24 0.0 0.0 0.0 -7.52 -0.00 0.0 64.64 0 1e0m first
10 10 0.33 17.25 224.73 -76.77 -29.93 -48.51 0.0 0.0 0.0 -16.29 -4.45 0.0 48.78 0 1e0m first
11 11 0.32 14.94 216.67 -64.24 -29.71 -46.80 0.0 0.0 0.0 -12.07 -8.74 0.0 55.11 0 1e0m first
12 12 0.31 14.53 216.29 -74.71 -30.91 -42.52 0.0 0.0 0.0 -19.72 -0.00 0.0 48.44 0 1e0m first
13 13 0.31 18.14 200.53 -66.42 -28.62 -46.56 0.0 0.0 0.0 -19.72 -3.88 0.0 35.33 0 1e0m first
14 14 0.38 11.11 201.16 -64.05 -32.61 -46.30 0.0 0.0 0.0 -16.94 -0.00 0.0 41.27 0 1e0m first
15 15 0.33 11.73 213.63 -74.41 -32.29 -40.14 0.0 0.0 0.0 -1.77 -2.00 0.0 63.02 0 1e0m first
16 16 0.50 10.78 193.64 -75.89 -35.98 -50.87 0.0 0.0 0.0 -18.44 -8.71 0.0 3.75 0 1e0m first
17 17 0.45 11.55 215.44 -80.24 -33.30 -52.92 0.0 0.0 0.0 -28.88 -7.40 0.0 12.69 0 1e0m first
18 18 0.57 10.33 179.26 -74.30 -29.23 -54.20 0.0 0.0 0.0 -49.27 -8.80 0.0 -36.54 0 1e0m first
19 19 0.53 9.85 159.16 -86.94 -36.67 -51.08 0.0 0.0 0.0 -30.36 -16.46 0.0 -62.36 0 1e0m first
20 20 0.57 9.70 189.18 -84.97 -34.21 -55.71 0.0 0.0 0.0 -36.75 -13.10 0.0 -35.55 0 1e0m first
21 21 0.53 10.87 207.76 -68.66 -32.55 -55.63 0.0 0.0 0.0 -18.35 -8.42 0.0 24.15 0 1e0m first
22 22 0.59 9.72 203.41 -82.07 -35.50 -57.64 0.0 0.0 0.0 -29.05 -9.63 0.0 -10.48 0 1e0m first
23 23 0.68 9.78 188.21 -72.73 -34.19 -55.21 0.0 0.0 0.0 -35.96 -9.59 0.0 -19.48 0 1e0m first
24 24 0.64 9.96 211.36 -79.77 -33.71 -63.32 0.0 0.0 0.0 -50.02 -14.05 0.0 -29.52 0 1e0m first
25 25 0.65 9.89 222.12 -86.93 -35.93 -58.56 0.0 0.0 0.0 -42.27 -10.08 0.0 -11.65 0 1e0m first
26 26 0.68 9.52 168.49 -83.48 -33.21 -60.33 0.0 0.0 0.0 -45.32 -12.64 0.0 -66.48 0 1e0m first
27 27 0.72 9.07 209.63 -84.38 -36.18 -61.44 0.0 0.0 0.0 -47.44 -12.59 0.0 -32.39 0 1e0m first
28 28 0.54 10.83 226.00 -75.01 -34.56 -56.70 0.0 0.0 0.0 -22.22 -8.22 0.0 29.29 0 1e0m first
29 29 0.69 9.86 206.89 -76.79 -36.37 -64.11 0.0 0.0 0.0 -35.56 -13.65 0.0 -19.59 0 1e0m first
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42376 45 0.19 25.73 780.42 -428.77 -227.53 -234.46 0.0 0.0 0.0 -127.95 -39.57 0.0 -277.85 9 1k0s first
42377 46 0.21 22.63 891.06 -437.32 -234.44 -251.03 0.0 0.0 0.0 -119.45 -42.15 0.0 -193.33 9 1k0s first
42378 47 0.21 20.61 845.80 -442.51 -225.29 -245.23 0.0 0.0 0.0 -125.74 -42.13 0.0 -235.09 9 1k0s first
42379 48 0.18 20.78 872.71 -430.17 -254.79 -236.21 0.0 0.0 0.0 -97.43 -41.01 0.0 -186.89 9 1k0s first
42380 49 0.20 18.76 879.36 -408.75 -260.47 -227.67 0.0 0.0 0.0 -114.41 -38.94 0.0 -170.89 9 1k0s first
42381 50 0.22 18.43 856.17 -439.06 -234.71 -235.84 0.0 0.0 0.0 -118.61 -34.18 0.0 -206.24 9 1k0s first
42382 51 0.22 17.59 796.16 -434.52 -249.91 -237.90 0.0 0.0 0.0 -117.17 -47.37 0.0 -290.70 9 1k0s first
42383 52 0.24 16.10 808.36 -424.59 -264.71 -235.35 0.0 0.0 0.0 -150.40 -50.39 0.0 -317.08 9 1k0s first
42384 53 0.29 14.72 826.68 -431.99 -329.03 -233.93 0.0 0.0 0.0 -113.88 -52.62 0.0 -334.77 9 1k0s first
42385 54 0.30 14.28 842.07 -435.49 -351.71 -234.37 0.0 0.0 0.0 -126.62 -66.65 0.0 -372.78 9 1k0s first
42386 55 0.32 14.28 846.51 -409.18 -363.74 -239.56 0.0 0.0 0.0 -159.77 -71.12 0.0 -396.86 9 1k0s first
42387 56 0.32 14.38 803.47 -452.69 -339.00 -245.28 0.0 0.0 0.0 -146.26 -61.60 0.0 -441.36 9 1k0s first
42388 57 0.32 14.44 866.53 -436.85 -320.88 -249.16 0.0 0.0 0.0 -168.94 -58.65 0.0 -367.94 9 1k0s first
42389 58 0.33 14.94 826.33 -418.03 -313.50 -252.50 0.0 0.0 0.0 -130.43 -48.77 0.0 -336.89 9 1k0s first
42390 59 0.31 14.85 773.55 -395.37 -360.85 -240.87 0.0 0.0 0.0 -143.16 -56.03 0.0 -422.73 9 1k0s first
42391 60 0.31 14.41 774.87 -409.17 -316.67 -249.46 0.0 0.0 0.0 -139.07 -70.98 0.0 -410.48 9 1k0s first
42392 61 0.30 14.63 850.36 -410.72 -370.97 -244.40 0.0 0.0 0.0 -147.42 -63.49 0.0 -386.62 9 1k0s first
42393 62 0.31 14.15 858.83 -427.87 -327.65 -254.44 0.0 0.0 0.0 -123.96 -58.39 0.0 -333.47 9 1k0s first
42394 63 0.35 14.15 810.61 -440.21 -358.33 -252.36 0.0 0.0 0.0 -144.58 -65.40 0.0 -450.27 9 1k0s first
42395 64 0.34 14.54 825.57 -437.31 -320.07 -261.02 0.0 0.0 0.0 -131.32 -63.89 0.0 -388.03 9 1k0s first
42396 65 0.33 14.85 747.03 -435.97 -325.16 -250.14 0.0 0.0 0.0 -145.07 -53.06 0.0 -462.36 9 1k0s first
42397 66 0.36 14.35 790.67 -414.35 -333.56 -260.94 0.0 0.0 0.0 -111.63 -55.45 0.0 -385.27 9 1k0s first
42398 67 0.34 14.58 764.20 -412.78 -327.85 -260.89 0.0 0.0 0.0 -127.18 -56.43 0.0 -420.94 9 1k0s first
42399 68 0.33 14.56 858.28 -417.42 -336.67 -258.01 0.0 0.0 0.0 -149.93 -56.90 0.0 -360.66 9 1k0s first
42400 69 0.34 14.46 813.71 -436.84 -345.83 -258.06 0.0 0.0 0.0 -115.37 -57.28 0.0 -399.67 9 1k0s first
42401 70 0.32 14.96 913.64 -423.38 -331.08 -254.40 0.0 0.0 0.0 -126.57 -66.72 0.0 -288.52 9 1k0s first
42402 71 0.31 15.30 821.02 -422.38 -323.29 -242.96 0.0 0.0 0.0 -134.05 -61.05 0.0 -362.70 9 1k0s first
42403 72 0.31 14.88 770.12 -460.80 -340.11 -265.73 0.0 0.0 0.0 -125.62 -63.45 0.0 -485.60 9 1k0s first
42404 73 0.30 14.77 879.47 -421.11 -338.21 -237.73 0.0 0.0 0.0 -153.01 -65.99 0.0 -336.57 9 1k0s first
42405 74 0.31 14.58 741.65 -436.88 -320.05 -244.28 0.0 0.0 0.0 -146.53 -64.67 0.0 -470.75 9 1k0s first

42406 rows × 17 columns


In [30]:
parser = PDBParser()

In [35]:
movie_dcd = "/Users/weilu/Research/server/dec_2019/iterative_optimization/first/2wqg/4/movie.dcd"

In [36]:
s = parser.get_structure("X", movie_dcd)


---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-36-891398dda1cd> in <module>
----> 1 s = parser.get_structure("X", movie_dcd)

~/anaconda3/envs/py36/lib/python3.6/site-packages/Bio/PDB/PDBParser.py in get_structure(self, id, file)
     84 
     85             with as_handle(file, mode='rU') as handle:
---> 86                 self._parse(handle.readlines())
     87 
     88             self.structure_builder.set_header(self.header)

~/anaconda3/envs/py36/lib/python3.6/codecs.py in decode(self, input, final)
    319         # decode input (taking the buffer into account)
    320         data = self.buffer + input
--> 321         (result, consumed) = self._buffer_decode(data, self.errors, final)
    322         # keep undecoded input until the next call
    323         self.buffer = data[consumed:]

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 20: invalid start byte

In [31]:
movie = "/Users/weilu/Research/server/dec_2019/iterative_optimization/first/2wqg/4/movie.pdb"

In [37]:
movie = "/Users/weilu/Research/server/dec_2019/iterative_optimization/first/2wqg/0/movie.pdb"

In [38]:
s = parser.get_structure("X", movie)

In [45]:
complete_models = []
for i in range(10):
    movie = f"/Users/weilu/Research/server/dec_2019/iterative_optimization/first/2wqg/{i}/movie.pdb"
    s = parser.get_structure("X", movie)
    complete_models += list(s.get_models())

In [65]:
t = data.query("Protein == '2wqg' and Steps > 1").reset_index(drop=True)
t = t.rename(columns={"Q":"Qw"})

In [49]:
len(complete_models)


Out[49]:
3409

In [73]:
print(pdb_list)


['1e0m', '1w4e', '1e0g', '2wqg', '1jo8', '1fex', '2l6r', '1c8c', '1g6p', '1mjc', '2jmc', '1hdn', '1st7', '1n88', '1d6o', '1hcd', '2ga5', '1j5u', '3o4d', '1k0s']

In [ ]:
folder_list = ["first"]
folder = "first"
pre = f"/scratch/wl45/dec_2019/iterative_optimization/{folder}"
to_folder = "."
os.system(f"mkdir -p {to_folder}/decoys/openMM")
complete_models = []
for pdb in pdb_list:
    for i in range(10):
        movie = f"{pre}/{pdb}/{i}/movie.pdb"
        s = parser.get_structure("X", movie)
        complete_models += list(s.get_models())
    t = data.query(f"Protein == '{pdb}' and Steps > 1").reset_index(drop=True)
    t["structure"] = complete_models
    t = t.rename(columns={"Q":"Qw"})
    last50 = t.groupby("Run").tail(50).reset_index(drop=True)
    to_folder = "."
    last50.to_pickle(f"{to_folder}/decoys/openMM/{folder}_{pdb}")

In [75]:
folder_list = ["first"]
folder = "first"
pdb = "1w4e"
pre = f"/Users/weilu/Research/server/dec_2019/iterative_optimization/{folder}"
complete_models = []
for i in range(10):
    movie = f"{pre}/{pdb}/{i}/movie.pdb"
    s = parser.get_structure("X", movie)
    complete_models += list(s.get_models())
t = data.query(f"Protein == '{pdb}' and Steps > 1").reset_index(drop=True)
t["structure"] = complete_models
t = t.rename(columns={"Q":"Qw"})

In [76]:
len(t)


Out[76]:
3756

In [77]:
len(complete_models)


Out[77]:
3756

In [112]:
t = pd.read_pickle("/Users/weilu/Research/server/dec_2019/multiDensityOptimization/optimization_iteration1/optimization/decoys/openMM/1c8c_first.pkl")

In [113]:
structures = t["structure"].to_list()

In [89]:
print(structures[0])


<Model id=172>

In [114]:
all_res = list(structures[0].get_residues())

In [115]:
all_res[0]


Out[115]:
<Residue MET het=  resseq=1 icode= >

In [116]:
is_hetero(all_res[0])


Out[116]:
False

In [ ]:


In [ ]:
all_res

In [96]:
all_res[0].id[0]


Out[96]:
'H_NGP'

In [70]:
last50 = t.groupby("Run").tail(50).reset_index(drop=True)

In [ ]:
to_folder = "."
last50.to_pickle(f"{to_folder}/decoys/openMM/{folder}_{pdb}")

In [55]:
t["structure"] = complete_models

In [ ]:
sampled["structure"] = sampled.apply(getStructures, all_movies=all_movies, axis=1)

In [ ]:
import io
from Bio.PDB.PDBParser import PDBParser
simulation_location, name = args.label.split("__")
simulation_location_name = f"{simulation_location}_{name}"

def getStructures(x, all_movies):
    index = int(x["index"])+1
    run = int(x["Run"])

    start = index * size
    end = (index + 1) * size
    f = io.StringIO("".join(all_movies[run][start:end]))
    parser = PDBParser()
    return parser.get_structure(f"{index}", f)

a = pd.read_csv(f"{database_location}/Q_{simulation_location_name}", index_col=0).query(f"Rank < {decoy_n*3}")
sampled = a.sample(decoy_n)
all_movies = {}
for i in sampled["Run"].unique():
    with open(f"{database_location}/{simulation_location_name}_{i}/movie.pdb") as f:
        movie = f.readlines()
    all_movies[i] = movie
size = 0
for line in movie:
    size += 1
    if line == "ENDMDL\n":
        break
print(simulation_location_name, size)
sampled["structure"] = sampled.apply(getStructures, all_movies=all_movies, axis=1)
sampled["Qw"] = sampled[" Qw"].round(3)
sampled.drop(" Qw", axis=1)
sampled.to_pickle(f"decoys/lammps/{name}_{simulation_location}.pkl")

In [ ]:
a