In [2]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
sys.path.insert(0,'/Users/weilu/Research/opt_server/')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.Polypeptide import one_to_three
from Bio.PDB.Polypeptide import three_to_one
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
from small_script.myFunctions import *
from collections import defaultdict
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180)    #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2

In [3]:
plt.rcParams['figure.figsize'] = 0.5 * np.array([16.18033, 10])    #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100
plt.rcParams.update({'font.size': 22})

In [4]:
figureFolder = "/Users/weilu/Dropbox/openAWSEM/figures/"

In [5]:
parser = PDBParser()

In [6]:
def get_cys_info(fileLocation):
    s = parser.get_structure("X", fileLocation)
    cys_bonds = []
    for model, m in enumerate(s.get_models()):
        all_res = list(m.get_residues())
        n = len(all_res)
        for i in range(n):
            res1 = all_res[i]
            if res1.resname != "CYS":
                continue
            for j in range(i+1, n):
                res2 = all_res[j]
                if res2.resname != "CYS":
                    continue
                r = res1["CB"] - res2["CB"]
                if r < 4.5:
                    cys_bonds.append([model, i, j, r])
                    # print(i, j, r)
    a = pd.DataFrame(cys_bonds, columns=["model", "i", "j", "r"])
    # a.to_csv("cys_bonds.csv")
    return a
def correct_fraction(native_cys_bond, prediction_cys_bond):
    # FN, TP
    TP = FN = FP = 0
    native_cys_bond["pair"] = native_cys_bond["i"].astype(str) + "_" + native_cys_bond["j"].astype(str)
    native_pair_list = native_cys_bond["pair"].to_list()

    prediction_cys_bond["pair"] = prediction_cys_bond["i"].astype(str) + "_" + prediction_cys_bond["j"].astype(str)
    prediction_pair_list = prediction_cys_bond["pair"].to_list()

    for native_pair in native_pair_list:
        if native_pair in prediction_pair_list:
            TP += 1
        else:
            FN += 1
    # FP,
    for prediction_pair in prediction_pair_list:
        if prediction_pair not in native_pair_list:
            FP += 1
    return TP, FN, FP

def correct_fraction_v2(native_cys_bond, prediction_cys_bond):
    TP, FN, FP = correct_fraction(native_cys_bond, prediction_cys_bond)
    f = TP/(TP+FN+FP)
    return f, TP, FN, FP


def correct_fraction_v3(prediction_cys_bond, native_cys_bond=None):
    TP, FN, FP = correct_fraction(native_cys_bond, prediction_cys_bond)
    f = TP/(TP+FN+FP)
    return pd.DataFrame([[f, TP, FN, FP]], columns=["fraction", "TP", "FN", "FP"])

In [211]:
# data = pd.read_csv("/Users/weilu/Research/data/openMM/six_proteins_run2_single_timeStep2_lesser_frag_06-17.csv", index_col=0)
# # data = pd.read_csv("/Users/weilu/Research/data/openMM/six_proteins_run2_single_timeStep2_lesser_frag_06-16.csv", index_col=0)

# a = pd.read_csv("/Users/weilu/Dropbox/openAWSEM/data/length_info_jun10.csv", index_col=0)
# a = a.sort_values("Length").reset_index(drop=True)
# data = data.merge(a, on="Protein")

# # a = a.append({'Protein':'1ppb_H', 'Length':259}, ignore_index=True)
# # a = a.sort_values("Length").reset_index(drop=True)
# # a.to_csv("/Users/weilu/Dropbox/openAWSEM/data/length_info_jun10.csv")

# pdb_list = ["1ppb_H", "1lmm", "1tcg", "1hn4_A", "1fs3", "1bpi"]
# d_list =[]
# for pdb in pdb_list:
#     t = pd.read_csv(f"/Users/weilu/Research/data/openMM/{pdb}_cys_06_17.csv")
#     d_list.append(t)
# d_cys = pd.concat(d_list).reset_index(drop=True)
# d_cys["Steps"] = d_cys["model"] + 2
# data.Submode = data.Submode.astype(int)
# data = data.merge(d_cys, how='left', on=["Steps", "Protein", "Submode", "Run"])
# data["fraction"] = data["fraction"].fillna(0)

# data.to_csv("/Users/weilu/Research/data/openMM/disulfide_jun17.csv")

data = pd.read_csv("/Users/weilu/Research/data/openMM/disulfide_jun17.csv", index_col=0)


/Users/weilu/anaconda/lib/python3.6/site-packages/numpy/lib/arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

In [134]:
data["Protein"].unique()


Out[134]:
array(['1hn4_A', '1ppb_H', '1lmm', '1tcg', '1fs3', '1bpi'], dtype=object)

In [135]:
# data = data.query("Run < 10").reset_index(drop=True)

In [203]:
# scheme_dic = {"330":"k_0", "331":"k_1", "332":"k_2", "333":"k_5", "3300":"original"}
scheme_dic = {"330":"k=0", "331":"k=1", "332":"k=2", "333":"k=5", "3300":"Standard AWSEM"}
data["Submode"] = data["Submode"].astype(str)
data["Scheme"] = data["Submode"].apply(lambda x: scheme_dic[x])
selected = data.query("Steps > 2200 and Protein != '1hn4'").sort_values("Q").groupby(["Protein", "Submode", "Run"]).tail(1)


# a = a.query("Protein != '1hn4'").reset_index(drop=True)
# selected.Protein = pd.Categorical(selected.Protein, categories=a.Protein, ordered=True)

In [212]:
selected.query("Protein =='1lmm' and Submode=='332'")


Out[212]:
Steps Q Rg Backbone Rama Contact Fragment Membrane ER TBM_Q ... Submode Length Scheme Unnamed: 0 model level_4 fraction TP FN FP
667832 2300 0.58 9.83 120.35 -110.38 -54.34 -66.76 0.0 0.0 0.0 ... 332 40 k=2 124073.0 2298.0 0.0 0.250000 1.0 2.0 1.0
697793 2237 0.58 9.44 135.87 -108.87 -63.56 -75.89 0.0 0.0 0.0 ... 332 40 k=2 119338.0 2235.0 0.0 0.666667 2.0 1.0 0.0
670381 2347 0.59 9.86 122.83 -119.28 -55.75 -69.55 0.0 0.0 0.0 ... 332 40 k=2 127619.0 2345.0 0.0 0.333333 1.0 2.0 0.0
693032 2480 0.59 9.83 119.31 -111.04 -62.61 -75.23 0.0 0.0 0.0 ... 332 40 k=2 137799.0 2478.0 0.0 0.000000 0.0 3.0 1.0
685340 2294 0.62 9.84 111.07 -126.12 -62.25 -68.29 0.0 0.0 0.0 ... 332 40 k=2 123651.0 2292.0 0.0 0.666667 2.0 1.0 0.0
672944 2408 0.62 10.35 114.15 -118.57 -48.34 -72.05 0.0 0.0 0.0 ... 332 40 k=2 132217.0 2406.0 0.0 0.666667 2.0 1.0 0.0
690534 2484 0.63 9.86 95.21 -112.81 -60.99 -78.71 0.0 0.0 0.0 ... 332 40 k=2 138103.0 2482.0 0.0 0.000000 0.0 3.0 2.0
675529 2491 0.63 10.24 116.13 -118.17 -52.01 -75.08 0.0 0.0 0.0 ... 332 40 k=2 138616.0 2489.0 0.0 1.000000 3.0 0.0 0.0
652995 2475 0.64 9.71 103.24 -117.64 -56.45 -71.98 0.0 0.0 0.0 ... 332 40 k=2 137347.0 2473.0 0.0 0.333333 1.0 2.0 0.0
657776 2252 0.65 10.47 114.58 -110.40 -43.51 -78.42 0.0 0.0 0.0 ... 332 40 k=2 120408.0 2250.0 0.0 0.250000 1.0 2.0 1.0
680481 2439 0.65 9.79 128.46 -121.38 -59.64 -77.92 0.0 0.0 0.0 ... 332 40 k=2 134590.0 2437.0 0.0 0.666667 2.0 1.0 0.0
678016 2476 0.65 9.71 97.22 -112.49 -58.08 -76.63 0.0 0.0 0.0 ... 332 40 k=2 137468.0 2474.0 0.0 0.666667 2.0 1.0 0.0
688047 2499 0.65 9.96 111.56 -119.90 -59.83 -73.46 0.0 0.0 0.0 ... 332 40 k=2 139243.0 2497.0 0.0 0.333333 1.0 2.0 0.0
660456 2430 0.66 9.43 118.91 -117.41 -64.99 -73.98 0.0 0.0 0.0 ... 332 40 k=2 133895.0 2428.0 0.0 0.500000 2.0 1.0 1.0
665301 2271 0.67 9.90 101.86 -106.44 -55.09 -76.53 0.0 0.0 0.0 ... 332 40 k=2 121864.0 2269.0 0.0 0.666667 2.0 1.0 0.0
662868 2340 0.67 9.48 119.96 -104.21 -51.93 -80.92 0.0 0.0 0.0 ... 332 40 k=2 127103.0 2338.0 0.0 0.000000 0.0 3.0 3.0
700381 2323 0.68 9.80 105.53 -110.64 -55.19 -79.86 0.0 0.0 0.0 ... 332 40 k=2 125858.0 2321.0 0.0 0.333333 1.0 2.0 0.0
695384 2330 0.68 10.20 127.07 -112.10 -54.95 -78.24 0.0 0.0 0.0 ... 332 40 k=2 126392.0 2328.0 0.0 0.250000 1.0 2.0 1.0
682833 2289 0.70 9.97 131.14 -101.81 -50.90 -73.21 0.0 0.0 0.0 ... 332 40 k=2 123275.0 2287.0 0.0 0.666667 2.0 1.0 0.0
655428 2406 0.76 9.94 112.20 -120.66 -56.96 -78.81 0.0 0.0 0.0 ... 332 40 k=2 132027.0 2404.0 0.0 0.000000 0.0 3.0 2.0

20 rows × 27 columns


In [213]:
a = selected.groupby(["Protein", "Submode"])["fraction"].max().reset_index()
a["Scheme"] = a["Submode"].apply(lambda x: scheme_dic[x])

In [214]:
# a = a.sort_values(["Scheme"]).reset_index()
a.Scheme = pd.Categorical(a.Scheme, categories=['k=5', 'k=2', 'k=1', 'k=0', 'Standard AWSEM'], ordered=True)
a = a.sort_values(["Scheme"]).reset_index()

a.Protein = pd.Categorical(a.Protein, categories=['1tcg', '1lmm', '1bpi', '1fs3', '1ppb_H', '1hn4_A'], ordered=True)
ax = sns.lineplot("Protein", "fraction", hue="Scheme", style="Scheme", markers=True, data=a, dashes=True)
leg = ax.legend(loc = 'center left', bbox_to_anchor = (1.0, 0.5))
labels = [item.get_text() for item in ax.get_xticklabels()]
labels[0] = '1tcg\n(3)'
labels[1] = '1lmm\n(3)'
labels[2] = '1bpi\n(3)'
labels[3] = '1fs3\n(4)'
labels[4] = '1ppb\n(4)'
labels[5] = '1hn4\n(7)'
ax.set_xticklabels(labels)

plt.xlabel("Protein(number of disulfide bonds)")
plt.ylabel("Fraction")

plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_fraction_v2.png", dpi=300, bbox_inches='tight')



In [ ]:


In [221]:


In [222]:
data["Submode"] = data["Submode"].astype(str)
data["Scheme"] = data["Submode"].apply(lambda x: scheme_dic[x])
selected = data.query("Steps > 2200 and Protein != '1hn4'").sort_values("Q").groupby(["Protein", "Submode"]).tail(1)

selected.Scheme = pd.Categorical(selected.Scheme, categories=['k=5', 'k=2', 'k=1', 'k=0', 'Standard AWSEM'], ordered=True)
selected = selected.sort_values(["Scheme"]).reset_index()

selected.Protein = pd.Categorical(selected.Protein, categories=['1tcg', '1lmm', '1bpi', '1fs3', '1hn4_A', '1ppb_H'], ordered=True)
ax = sns.lineplot("Protein", "Q", hue="Scheme", style="Scheme", markers=True, data=selected.sort_values(["Scheme", "Protein"]).reset_index(drop=True), dashes=False)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Best Q")
plt.xlabel("Protein(Length)")
labels = [item.get_text() for item in ax.get_xticklabels()]
labels[0] = '1tcg\n(22)'
labels[1] = '1lmm\n(40)'
labels[2] = '1bpi\n(58)'
labels[3] = '1fs3\n(124)'
labels[4] = '1hn4\n(129)'
labels[5] = '1ppb\n(259)'
ax.set_xticklabels(labels)
plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_bestQ_v2.png", dpi=300, bbox_inches='tight')



In [225]:
data["k_cys"] = data["Scheme"]
filtered_data = data.query("Steps > 2200")

# filtered_data = filtered_data.query("k_cys != 'original'")
Q_max = filtered_data.sort_values('Q').groupby(["Protein", "k_cys", "Run"]).tail(1).sort_values(["Protein", 'k_cys', "Q"]).reset_index(drop=True)
Q_max["Annealing Index"] = Q_max.groupby(["Protein", "k_cys"])["Q"].rank(method="first", ascending=False).astype(int)
Q_max.k_cys = pd.Categorical(Q_max.k_cys,  ['k=5', 'k=2', 'k=1', 'k=0', 'Standard AWSEM'], ordered=True)
Q_max["Scheme"] = Q_max["k_cys"]

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(1.5*16.18033, 2*10))

protein = '1tcg'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[0][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1bpi'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[0][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1lmm'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[1][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")



protein = '1fs3'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[1][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1hn4_A'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[2][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"1hn4")


protein = '1ppb_H'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[2][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")


# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc='center right', bbox_to_anchor=(0.7, 0.15))
fig.legend((['k=0', 'k=1', 'k=2', 'k=5', 'Standard AWSEM']), bbox_to_anchor=(0.98, 0.5), loc=2, borderpad=1.5)
# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc="lower left", ncol=5)
# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc='lower center')
# axes[-1, -1].axis('off')

plt.tight_layout()
plt.savefig(f"{figureFolder}/seperate_run_best_include_1ppb_v2.png", dpi=300, bbox_inches='tight')



In [115]:
selected = selected.reset_index(drop=True)

In [116]:
selected.to_csv("/Users/weilu/Research/server/jun_week3_2020/disulfide_bond/selected_jun17.csv")

In [118]:
# read cys bond information.

fraction = []
for i, line in selected.iterrows():
    run = line["Run"]
    protein = line["Protein"]
    frame = line["Steps"] - 2
    submode = line["Submode"]
    print(run, protein, frame, submode)
    fileLocation = f"/Users/weilu/Research/server/jun_week3_2020/disulfide_bond/chosen_frames_jun17/{protein}_{submode}.pdb"
    prediction_cys_bond = get_cys_info(fileLocation)
    fileLocation = f"/Users/weilu/Research/server/jun_week3_2020/disulfide_bond/setups/{protein}/{protein}.pdb"
    native_cys_bond = get_cys_info(fileLocation)
    TP, FN, FP = correct_fraction(native_cys_bond, prediction_cys_bond)
    fraction.append([TP, FN, FP])

selected = selected.reset_index(drop=True)
fraction_ = pd.DataFrame(fraction, columns=["TP", "FN", "FP"])
selected = pd.concat([selected, fraction_], axis=1)
selected["fraction"] = selected["TP"] / (selected["TP"] + selected["FN"] + selected["FP"])


4 1ppb_H 2359 3300
14 1ppb_H 2421 330
4 1ppb_H 2345 332
19 1ppb_H 2450 333
15 1ppb_H 2369 331
8 1fs3 2303 330
8 1fs3 2404 3300
13 1fs3 2321 331
3 1hn4_A 2384 330
13 1hn4_A 2487 331
6 1hn4_A 2414 3300
19 1lmm 2485 330
17 1hn4_A 2396 332
15 1hn4_A 2476 333
10 1lmm 2386 331
7 1fs3 2304 332
1 1lmm 2404 332
17 1lmm 2440 3300
6 1fs3 2434 333
12 1lmm 2420 333
19 1tcg 2424 330
12 1tcg 2397 331
9 1bpi 2434 3300
9 1bpi 2443 331
3 1bpi 2345 330
18 1bpi 2359 332
7 1bpi 2329 333
17 1tcg 2348 3300
3 1tcg 2399 332
10 1tcg 2487 333

In [119]:
selected.Protein = pd.Categorical(selected.Protein, categories=['1tcg', '1lmm', '1bpi', '1fs3', '1ppb_H', '1hn4_A'], ordered=True)
plot_data = selected.query("Protein != '1hn4'").sort_values(["Scheme"])
ax = sns.lineplot("Protein", "fraction", hue="Scheme", style="Scheme", markers=True, data=plot_data, dashes=True)



In [122]:
selected.query("Protein=='1lmm'")


Out[122]:
Steps Q Rg Backbone Rama Contact Fragment Membrane ER TBM_Q ... Total Run Protein Submode Length Scheme TP FN FP fraction
11 2487 0.70 10.12 107.39 -120.78 -56.87 -75.21 0.0 0.0 0.0 ... -164.40 19 1lmm 330 40 k_0 1 2 0 0.333333
14 2388 0.72 9.78 118.14 -110.12 -55.62 -76.94 0.0 0.0 0.0 ... -146.10 10 1lmm 331 40 k_1 2 1 0 0.666667
16 2406 0.76 9.94 112.20 -120.66 -56.96 -78.81 0.0 0.0 0.0 ... -167.24 1 1lmm 332 40 k_2 0 3 2 0.000000
17 2442 0.77 10.00 104.09 -122.12 -59.83 -78.34 0.0 0.0 0.0 ... -178.56 17 1lmm 3300 40 original 2 1 1 0.500000
19 2422 0.78 10.01 98.63 -109.36 -51.39 -79.55 0.0 0.0 0.0 ... -166.96 12 1lmm 333 40 k_5 3 0 0 1.000000

5 rows × 24 columns


In [120]:
selected.Protein = pd.Categorical(selected.Protein, categories=['1tcg', '1lmm', '1bpi', '1fs3', '1ppb_H', '1hn4_A'], ordered=True)
plot_data = selected.query("Protein != '1hn4'").sort_values(["Scheme"])
ax = sns.lineplot("Protein", "fraction", hue="Scheme", style="Scheme", markers=True, data=plot_data, dashes=True)
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
leg = ax.legend(loc = 'center left', bbox_to_anchor = (1.0, 0.5))

labels = [item.get_text() for item in ax.get_xticklabels()]
labels[0] = '1tcg\n(3)'
labels[1] = '1lmm\n(3)'
labels[2] = '1bpi\n(3)'
labels[3] = '1fs3\n(4)'
labels[4] = '1ppb\n(4)'
labels[5] = '1hn4\n(7)'
ax.set_xticklabels(labels)

plt.xlabel("Protein(number of disulfide bonds)")
plt.ylabel("Fraction")
# plt.tight_layout()
# plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_fraction.png", dpi=300, bbox_inches='tight')


Out[120]:
Text(0, 0.5, 'Fraction')

In [97]:
# data_cys = pd.read_csv("/Users/weilu/Research/data/openMM/six_proteins_run2_single_timeStep2_lesser_frag_06-16.csv", index_col=0)
data_cys = pd.read_csv("/Users/weilu/Research/data/openMM/cys_all_run2_single_timeStep2_lesser_frag_06-17.csv", index_col=0)


/Users/weilu/anaconda/lib/python3.6/site-packages/numpy/lib/arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

In [98]:
# pdb_list = ["1ppb_H", "1lmm", "1tcg", "1hn4_A", "1fs3", "1bpi"]
# # pdb_list = ["1ppb", "1lmm", "1tcg", "1hn4_A", "1bpi"]
# for pdb in pdb_list:
#     print(pdb)
#     protein = pdb
#     fileLocation = f"/Users/weilu/Research/server/jun_week3_2020/disulfide_bond/setups/{protein}/{protein}.pdb"
#     native_cys_bond = get_cys_info(fileLocation)

#     data_selected = data_cys.query(f"Protein=='{protein}'")
#     result = data_selected.groupby(["model", "Run", "Protein", "Submode"]).apply(correct_fraction_v3, native_cys_bond=native_cys_bond).reset_index()
#     result.to_csv(f"/Users/weilu/Research/data/openMM/{protein}_cys_06_17.csv")


1ppb_H
1lmm
1tcg
1hn4_A
1fs3
1bpi

In [99]:
# pdb_list = ["1ppb_H", "1lmm", "1tcg", "1hn4_A", "1fs3", "1bpi"]
# d_list =[]
# for pdb in pdb_list:
#     t = pd.read_csv(f"/Users/weilu/Research/data/openMM/{pdb}_cys_06_17.csv")
#     d_list.append(t)
# d_cys = pd.concat(d_list).reset_index(drop=True)
# d_cys["Steps"] = d_cys["model"] + 2
# data.Submode = data.Submode.astype(int)
# data = data.merge(d_cys, how='left', on=["Steps", "Protein", "Submode", "Run"])
# data["fraction"] = data["fraction"].fillna(0)

In [178]:


In [179]:



Out[179]:
Steps Q Rg Backbone Rama Contact Fragment Membrane ER TBM_Q ... Submode Length Scheme Unnamed: 0 model level_4 fraction TP FN FP
0 0 1.00 14.40 98.80 -591.89 -171.71 -392.08 0.0 0.0 0.0 ... 3300 129 original NaN NaN NaN 0.00 NaN NaN NaN
1 1 0.99 14.37 11.99 -651.52 -202.55 -382.90 0.0 0.0 0.0 ... 3300 129 original NaN NaN NaN 0.00 NaN NaN NaN
2 2 0.14 30.37 687.03 -393.95 -117.08 -176.17 0.0 0.0 0.0 ... 3300 129 original NaN NaN NaN 0.00 NaN NaN NaN
3 3 0.13 31.32 750.42 -426.97 -115.67 -193.09 0.0 0.0 0.0 ... 3300 129 original NaN NaN NaN 0.00 NaN NaN NaN
4 4 0.15 32.49 713.22 -457.57 -117.92 -198.58 0.0 0.0 0.0 ... 3300 129 original 58.0 2.0 0.0 0.00 0.0 7.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1501195 2497 0.60 10.83 146.94 -161.35 -91.95 -121.35 0.0 0.0 0.0 ... 333 58 k_5 101921.0 2495.0 0.0 1.00 3.0 0.0 0.0
1501196 2498 0.70 10.52 152.68 -157.84 -94.66 -121.72 0.0 0.0 0.0 ... 333 58 k_5 101993.0 2496.0 0.0 1.00 3.0 0.0 0.0
1501197 2499 0.68 10.61 172.81 -168.02 -93.50 -132.26 0.0 0.0 0.0 ... 333 58 k_5 102077.0 2497.0 0.0 1.00 3.0 0.0 0.0
1501198 2500 0.61 10.56 179.79 -172.66 -91.88 -125.71 0.0 0.0 0.0 ... 333 58 k_5 102158.0 2498.0 0.0 1.00 3.0 0.0 0.0
1501199 2501 0.62 10.39 147.10 -153.08 -102.00 -126.94 0.0 0.0 0.0 ... 333 58 k_5 102236.0 2499.0 0.0 0.75 3.0 0.0 1.0

1501200 rows × 27 columns


In [176]:
a.fillna(0)


Out[176]:
Steps Q Rg Backbone Rama Contact Fragment Membrane ER TBM_Q Beta Pap Helical Disulfide Total Run Protein Submode Length Scheme
0 0 1.00 14.40 98.80 -591.89 -171.71 -392.08 0.0 0.0 0.0 -45.19 -0.77 0.0 0.0 -1102.85 0 1hn4_A 3300 129 original
1 1 0.99 14.37 11.99 -651.52 -202.55 -382.90 0.0 0.0 0.0 -46.76 -0.89 0.0 0.0 -1272.63 0 1hn4_A 3300 129 original
2 2 0.14 30.37 687.03 -393.95 -117.08 -176.17 0.0 0.0 0.0 -12.22 -0.49 0.0 0.0 -12.89 0 1hn4_A 3300 129 original
3 3 0.13 31.32 750.42 -426.97 -115.67 -193.09 0.0 0.0 0.0 -11.06 -0.00 0.0 0.0 3.64 0 1hn4_A 3300 129 original
4 4 0.15 32.49 713.22 -457.57 -117.92 -198.58 0.0 0.0 0.0 -19.18 -0.00 0.0 0.0 -80.03 0 1hn4_A 3300 129 original

In [ ]:


In [152]:
a = data.head()

In [129]:
d_cys


Out[129]:
Unnamed: 0 model Run Protein Submode level_4 fraction TP FN FP Steps
0 0 1 4 1ppb_H 333 0 0.333333 1 2 0 3
1 1 1 5 1ppb_H 333 0 0.333333 1 2 0 3
2 2 1 10 1ppb_H 332 0 0.000000 0 3 1 3
3 3 2 4 1ppb_H 333 0 0.333333 1 2 0 4
4 4 2 5 1ppb_H 333 0 0.333333 1 2 0 4
... ... ... ... ... ... ... ... ... ... ... ...
751064 102232 2499 18 1bpi 332 0 0.500000 2 1 1 2501
751065 102233 2499 18 1bpi 333 0 1.000000 3 0 0 2501
751066 102234 2499 19 1bpi 331 0 0.333333 1 2 0 2501
751067 102235 2499 19 1bpi 332 0 0.000000 0 3 2 2501
751068 102236 2499 19 1bpi 333 0 0.750000 3 0 1 2501

751069 rows × 11 columns


In [172]:


In [174]:


In [108]:
d_cys.query("Protein=='1ppb_H' and Submode=='3300' and Run==8 and model	 > 2140")


Out[108]:
Unnamed: 0 model Run Protein Submode level_4 fraction TP FN FP
96261 96261 2152 8 1ppb_H 3300 0 0.333333 1 2 0
96657 96657 2158 8 1ppb_H 3300 0 0.333333 1 2 0
96854 96854 2161 8 1ppb_H 3300 0 0.333333 1 2 0
97045 97045 2164 8 1ppb_H 3300 0 0.333333 1 2 0
97500 97500 2171 8 1ppb_H 3300 0 0.333333 1 2 0
... ... ... ... ... ... ... ... ... ... ...
117776 117776 2472 8 1ppb_H 3300 0 0.333333 1 2 0
117990 117990 2475 8 1ppb_H 3300 0 0.333333 1 2 0
118205 118205 2478 8 1ppb_H 3300 0 0.333333 1 2 0
118979 118979 2489 8 1ppb_H 3300 0 0.333333 1 2 0
119268 119268 2493 8 1ppb_H 3300 0 0.333333 1 2 0

88 rows × 10 columns


In [102]:
selected


Out[102]:
Steps Q Rg Backbone Rama Contact Fragment Membrane ER TBM_Q ... Total Run Protein Submode Length Scheme TP FN FP fraction
0 2142 0.27 16.99 820.47 -812.05 -604.67 -540.43 0.0 0.0 0.0 ... -1220.14 8 1ppb_H 3300 259 original 0 3 0 0.000000
1 2482 0.30 16.19 730.98 -842.40 -709.34 -532.61 0.0 0.0 0.0 ... -1486.54 19 1ppb_H 333 259 k_5 3 0 0 1.000000
2 2468 0.30 16.50 707.17 -828.36 -733.26 -531.28 0.0 0.0 0.0 ... -1496.30 4 1ppb_H 332 259 k_2 0 3 0 0.000000
3 2029 0.30 16.39 828.12 -782.02 -622.85 -533.87 0.0 0.0 0.0 ... -1209.29 14 1ppb_H 330 259 k_0 0 3 0 0.000000
4 2019 0.35 17.10 873.43 -789.46 -634.53 -515.25 0.0 0.0 0.0 ... -1185.94 15 1ppb_H 331 259 k_1 1 2 0 0.333333
5 2040 0.61 14.90 415.47 -591.89 -192.31 -281.73 0.0 0.0 0.0 ... -724.04 8 1fs3 330 124 k_0 1 3 0 0.250000
6 2046 0.65 14.76 395.02 -577.79 -194.74 -296.06 0.0 0.0 0.0 ... -747.24 16 1fs3 331 124 k_1 1 3 0 0.250000
7 2386 0.66 14.31 405.55 -572.62 -205.48 -313.38 0.0 0.0 0.0 ... -720.40 3 1hn4_A 330 129 k_0 3 4 0 0.428571
8 2238 0.67 14.94 385.31 -569.85 -187.93 -293.72 0.0 0.0 0.0 ... -734.04 7 1fs3 3300 124 original 1 3 0 0.250000
9 2416 0.67 13.59 376.04 -579.00 -229.16 -331.14 0.0 0.0 0.0 ... -798.73 6 1hn4_A 3300 129 original 3 4 0 0.428571
10 2273 0.68 13.96 371.44 -594.96 -206.90 -320.25 0.0 0.0 0.0 ... -788.08 10 1hn4_A 331 129 k_1 5 2 0 0.714286
11 2487 0.70 10.12 107.39 -120.78 -56.87 -75.21 0.0 0.0 0.0 ... -164.40 19 1lmm 330 40 k_0 1 2 0 0.333333
12 2398 0.71 14.24 347.70 -583.72 -212.66 -319.42 0.0 0.0 0.0 ... -809.60 17 1hn4_A 332 129 k_2 5 2 0 0.714286
13 2306 0.73 14.30 392.96 -576.44 -190.69 -304.84 0.0 0.0 0.0 ... -760.45 7 1fs3 332 124 k_2 3 1 0 0.750000
14 2185 0.73 14.39 423.70 -586.07 -207.24 -323.28 0.0 0.0 0.0 ... -766.56 7 1hn4_A 333 129 k_5 7 0 0 1.000000
15 2406 0.76 9.94 112.20 -120.66 -56.96 -78.81 0.0 0.0 0.0 ... -167.24 1 1lmm 332 40 k_2 0 3 2 0.000000
16 2436 0.77 14.36 350.26 -544.59 -190.38 -320.47 0.0 0.0 0.0 ... -807.87 6 1fs3 333 124 k_5 4 0 0 1.000000
17 2442 0.77 10.00 104.09 -122.12 -59.83 -78.34 0.0 0.0 0.0 ... -178.56 17 1lmm 3300 40 original 2 1 1 0.500000
18 2184 0.77 9.74 105.98 -113.72 -56.87 -87.09 0.0 0.0 0.0 ... -169.71 15 1lmm 331 40 k_1 2 1 0 0.666667
19 2314 0.78 7.00 65.97 -38.61 -21.10 -38.32 0.0 0.0 0.0 ... -32.23 2 1tcg 330 22 k_0 2 1 1 0.500000
20 2249 0.80 7.00 59.76 -34.77 -24.15 -40.18 0.0 0.0 0.0 ... -39.66 8 1tcg 331 22 k_1 2 1 1 0.500000
21 2347 0.81 10.79 155.96 -169.58 -83.09 -132.83 0.0 0.0 0.0 ... -256.05 3 1bpi 330 58 k_0 1 2 0 0.333333
22 2445 0.81 10.80 166.00 -165.91 -86.49 -129.70 0.0 0.0 0.0 ... -238.84 9 1bpi 331 58 k_1 1 2 0 0.333333
23 2293 0.81 10.62 172.03 -155.10 -90.74 -129.71 0.0 0.0 0.0 ... -221.82 1 1bpi 3300 58 original 1 2 0 0.333333
24 2284 0.83 10.01 115.34 -115.81 -55.51 -84.87 0.0 0.0 0.0 ... -172.20 12 1lmm 333 40 k_5 3 0 0 1.000000
25 2361 0.83 10.74 147.24 -174.74 -85.10 -138.19 0.0 0.0 0.0 ... -285.85 18 1bpi 332 58 k_2 2 1 0 0.666667
26 2350 0.84 6.96 78.74 -35.11 -25.70 -41.93 0.0 0.0 0.0 ... -29.16 17 1tcg 3300 22 original 1 2 2 0.200000
27 2331 0.84 10.56 191.58 -154.43 -94.30 -135.11 0.0 0.0 0.0 ... -235.48 7 1bpi 333 58 k_5 3 0 0 1.000000
28 2401 0.86 7.09 58.29 -37.30 -22.88 -43.68 0.0 0.0 0.0 ... -51.80 3 1tcg 332 22 k_2 3 0 1 0.750000
29 2068 0.87 7.00 77.68 -38.76 -21.85 -42.94 0.0 0.0 0.0 ... -42.35 10 1tcg 333 22 k_5 3 0 1 0.750000

30 rows × 24 columns


In [ ]:


In [ ]:
plot_data = selected.query("Protein !='1ppb_H' and Protein != '1hn4'").sort_values(["Scheme"])
ax = sns.lineplot("Protein", "fraction", hue="Scheme", style="Scheme", markers=True, data=plot_data, dashes=True)
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
leg = ax.legend(loc = 'center left', bbox_to_anchor = (1.0, 0.5))

labels = [item.get_text() for item in ax.get_xticklabels()]
labels[0] = '1tcg\n(3)'
labels[1] = '1lmm\n(3)'
labels[2] = '1bpi\n(3)'
labels[3] = '1fs3\n(4)'
labels[4] = '1ppb\n(4)'
labels[5] = '1hn4\n(7)'
ax.set_xticklabels(labels)

plt.xlabel("Protein(number of disulfide bonds)")
plt.ylabel("Fraction")
# plt.tight_layout()
# plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_fraction.png", dpi=300, bbox_inches='tight')

In [60]:
native_pair = ['65_72', '40_95', '58_110', '26_84']
# group they by pair
t = data_cys.query("Protein=='1fs3' and Submode=='3300'").reset_index(drop=True)
t["pair"] = (1 + t["i"]).astype(str) + "_" + (1 + t["j"]).astype(str)

# tt = t.query("Run==5").reset_index(drop=True)
tt = t
pair_list = tt["pair"].unique().tolist()


n_pair = len(pair_list)

pair_dict = {}
for i, pair in enumerate(pair_list):
    pair_dict[pair] = i

pair_timeline = np.zeros((n_pair, 2500))

for i, line in tt.iterrows():
    frame = line["model"]
    pair = line["pair"]
    if pair in native_pair:
        pair_timeline[pair_dict[pair]][frame] = 1
    else:
        pair_timeline[pair_dict[pair]][frame] = 2

In [61]:
f = plt.figure(figsize = (20,10))
ax = f.add_subplot(111)
ax.yaxis.tick_right()
plt.imshow(pair_timeline, aspect='auto', cmap="hot_r")
_ = plt.yticks(ticks=range(n_pair), labels=pair_list)
#  yaxis.tick_right()
plt.xlabel("Frames")
plt.ylabel("Formation of disulfide bonds")
# plt.savefig(f"{figureFolder}/1fs3_run3_mode_33_trajectory.png", dpi=300, bbox_inches='tight')


Out[61]:
Text(0, 0.5, 'Formation of disulfide bonds')

In [66]:
native_pair = ['65_72', '40_95', '58_110', '26_84']
# group they by pair
t = data_cys.query("Protein=='1fs3' and Submode=='333'").reset_index(drop=True)
t["pair"] = (1 + t["i"]).astype(str) + "_" + (1 + t["j"]).astype(str)

# tt = t.query("Run==5").reset_index(drop=True)
tt = t
pair_list = tt["pair"].unique().tolist()


n_pair = len(pair_list)

pair_dict = {}
for i, pair in enumerate(pair_list):
    pair_dict[pair] = i

pair_timeline = np.zeros((n_pair, 2500))

for i, line in tt.iterrows():
    frame = line["model"]
    pair = line["pair"]
    if pair in native_pair:
        pair_timeline[pair_dict[pair]][frame] = 1
    else:
        pair_timeline[pair_dict[pair]][frame] = 2

In [67]:
f = plt.figure(figsize = (20,10))
ax = f.add_subplot(111)
ax.yaxis.tick_right()
plt.imshow(pair_timeline, aspect='auto', cmap="hot_r")
_ = plt.yticks(ticks=range(n_pair), labels=pair_list)
#  yaxis.tick_right()
plt.xlabel("Frames")
plt.ylabel("Formation of disulfide bonds")
# plt.colorbar()
# plt.savefig(f"{figureFolder}/1fs3_run3_mode_33_trajectory.png", dpi=300, bbox_inches='tight')


Out[67]:
Text(0, 0.5, 'Formation of disulfide bonds')

In [63]:
f = plt.figure(figsize = (20,10))
ax = f.add_subplot(111)
ax.yaxis.tick_right()
plt.imshow(pair_timeline, aspect='auto', cmap="hot_r")
_ = plt.yticks(ticks=range(n_pair), labels=pair_list)
#  yaxis.tick_right()
plt.xlabel("Frames")
plt.ylabel("Formation of disulfide bonds")
# plt.colorbar()
# plt.savefig(f"{figureFolder}/1fs3_run3_mode_33_trajectory.png", dpi=300, bbox_inches='tight')


Out[63]:
Text(0, 0.5, 'Formation of disulfide bonds')

In [51]:
f = plt.figure(figsize = (20,10))
ax = f.add_subplot(111)
ax.yaxis.tick_right()
plt.imshow(pair_timeline, aspect='auto', cmap="hot_r")
_ = plt.yticks(ticks=range(n_pair), labels=pair_list)
#  yaxis.tick_right()
plt.xlabel("Frames")
plt.ylabel("Formation of disulfide bonds")
# plt.colorbar()
# plt.savefig(f"{figureFolder}/1fs3_run3_mode_33_trajectory.png", dpi=300, bbox_inches='tight')


Out[51]:
Text(0, 0.5, 'Formation of disulfide bonds')

In [ ]:


In [10]:
selected.Scheme = pd.Categorical(selected.Scheme,  ["original", "k_0", "k_1", "k_2", "k_5"], ordered=True)
ax = sns.lineplot("Protein", "Q", hue="Scheme", style="Scheme", markers=True, data=selected.sort_values(["Scheme", "Protein"]).reset_index(drop=True), dashes=False)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Best Q")
plt.xlabel("Protein(Length)")
labels = [item.get_text() for item in ax.get_xticklabels()]
labels[0] = '1tcg\n(22)'
labels[1] = '1lmm\n(40)'
labels[2] = '1bpi\n(58)'
labels[3] = '1fs3\n(124)'
labels[4] = '1hn4\n(129)'
labels[5] = '1ppb\n(259)'
ax.set_xticklabels(labels)
# plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_bestQ.png", dpi=300, bbox_inches='tight')


Out[10]:
[Text(0, 0, '1tcg\n(22)'),
 Text(0, 0, '1lmm\n(40)'),
 Text(0, 0, '1bpi\n(58)'),
 Text(0, 0, '1fs3\n(124)'),
 Text(0, 0, '1hn4\n(129)'),
 Text(0, 0, '1ppb\n(259)'),
 Text(0, 0, '')]

In [41]:
data = pd.read_csv("/Users/weilu/Research/data/openMM/six_proteins_run2_single_timeStep2_less_frag_06-10.csv", index_col=0)

a = pd.read_csv("/Users/weilu/Dropbox/openAWSEM/data/length_info_jun10.csv", index_col=0)
a = a.sort_values("Length").reset_index(drop=True)
data = data.merge(a, on="Protein")

# a = a.append({'Protein':'1ppb_H', 'Length':259}, ignore_index=True)
# a = a.sort_values("Length").reset_index(drop=True)
# a.to_csv("/Users/weilu/Dropbox/openAWSEM/data/length_info_jun10.csv")


/Users/weilu/anaconda/lib/python3.6/site-packages/numpy/lib/arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

In [42]:
data["Protein"].unique()


Out[42]:
array(['1hn4_A', '1ppb_H', '1lmm', '1tcg', '1fs3', '1bpi'], dtype=object)

In [43]:
scheme_dic = {"230":"k_0", "231":"k_1", "232":"k_2", "233":"k_5", "2300":"original"}
data["Submode"] = data["Submode"].astype(str)
data["Scheme"] = data["Submode"].apply(lambda x: scheme_dic[x])
selected = data.query("Steps > 1500 and Protein != '1hn4'").sort_values("Q").groupby(["Protein", "Submode"]).tail(1)


a = a.query("Protein != '1hn4'").reset_index(drop=True)
selected.Protein = pd.Categorical(selected.Protein, categories=a.Protein, ordered=True)

In [46]:
selected.Scheme = pd.Categorical(selected.Scheme,  ["original", "k_0", "k_1", "k_2", "k_5"], ordered=True)
ax = sns.lineplot("Protein", "Q", hue="Scheme", style="Scheme", markers=True, data=selected.sort_values(["Scheme", "Protein"]).reset_index(drop=True), dashes=False)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Best Q")
plt.xlabel("Protein(Length)")
labels = [item.get_text() for item in ax.get_xticklabels()]
labels[0] = '1tcg\n(22)'
labels[1] = '1lmm\n(40)'
labels[2] = '1bpi\n(58)'
labels[3] = '1fs3\n(124)'
labels[4] = '1hn4\n(129)'
labels[5] = '1ppb\n(259)'
ax.set_xticklabels(labels)
# plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_bestQ.png", dpi=300, bbox_inches='tight')


Out[46]:
[Text(0, 0, '1tcg\n(22)'),
 Text(0, 0, '1lmm\n(40)'),
 Text(0, 0, '1bpi\n(58)'),
 Text(0, 0, '1fs3\n(124)'),
 Text(0, 0, '1hn4\n(129)'),
 Text(0, 0, '1ppb\n(259)'),
 Text(0, 0, '')]

In [48]:
data["k_cys"] = data["Scheme"]
filtered_data = data.query("Steps > 1500")

# filtered_data = filtered_data.query("k_cys != 'original'")
Q_max = filtered_data.sort_values('Q').groupby(["Protein", "k_cys", "Run"]).tail(1).sort_values(["Protein", 'k_cys', "Q"]).reset_index(drop=True)
Q_max["Annealing Index"] = Q_max.groupby(["Protein", "k_cys"])["Q"].rank(method="first", ascending=False).astype(int)
Q_max.k_cys = pd.Categorical(Q_max.k_cys,  ["original", "k_0", "k_1", "k_2", "k_5"], ordered=True)
Q_max["Scheme"] = Q_max["k_cys"]

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(1.5*16.18033, 2*10))

protein = '1tcg'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[0][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1bpi'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[0][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1lmm'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[1][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")



protein = '1fs3'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[1][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1hn4_A'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[2][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"1hn4")


protein = '1ppb_H'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[2][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")


# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc='center right', bbox_to_anchor=(0.7, 0.15))
fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), bbox_to_anchor=(0.98, 0.5), loc=2, borderpad=1.5)
# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc="lower left", ncol=5)
# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc='lower center')
# axes[-1, -1].axis('off')

plt.tight_layout()
# plt.savefig(f"{figureFolder}/seperate_run_best_include_1ppb.png", dpi=300, bbox_inches='tight')



In [ ]:


In [16]:
data = pd.read_csv("/Users/weilu/Research/data/openMM/six_proteins_run2_single_timeStep2_06-06.csv", index_col=0)

a = pd.read_csv("/Users/weilu/Dropbox/openAWSEM/data/length_info_may21.csv", index_col=0)
a = a.sort_values("Length").reset_index(drop=True)
data = data.merge(a, on="Protein")


/Users/weilu/anaconda/lib/python3.6/site-packages/numpy/lib/arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

In [17]:
scheme_dic = {"130":"k_0", "131":"k_1", "132":"k_2", "133":"k_5", "1300":"original"}
data["Submode"] = data["Submode"].astype(str)
data["Scheme"] = data["Submode"].apply(lambda x: scheme_dic[x])
selected = data.query("Steps > 1500 and Protein != '1ppb_H' and Protein != '1hn4'").sort_values("Q").groupby(["Protein", "Submode"]).tail(1)


a = a.query("Protein != '1hn4'").reset_index(drop=True)
selected.Protein = pd.Categorical(selected.Protein, categories=a.Protein, ordered=True)

In [18]:
selected.Scheme = pd.Categorical(selected.Scheme,  ["original", "k_0", "k_1", "k_2", "k_5"], ordered=True)
ax = sns.lineplot("Protein", "Q", hue="Scheme", style="Scheme", markers=True, data=selected.sort_values(["Scheme", "Protein"]).reset_index(drop=True), dashes=False)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Best Q")
plt.xlabel("Protein(Length)")
labels = [item.get_text() for item in ax.get_xticklabels()]
labels[0] = '1tcg\n(22)'
labels[1] = '1lmm\n(40)'
labels[2] = '1bpi\n(58)'
labels[3] = '1fs3\n(124)'
labels[4] = '1hn4\n(129)'
labels[5] = '1ppb\n(295)'
ax.set_xticklabels(labels)
# plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_bestQ.png", dpi=300, bbox_inches='tight')


Out[18]:
[Text(0, 0, '1tcg\n(22)'),
 Text(0, 0, '1lmm\n(40)'),
 Text(0, 0, '1bpi\n(58)'),
 Text(0, 0, '1fs3\n(124)'),
 Text(0, 0, '1hn4\n(129)'),
 Text(0, 0, '1ppb\n(295)')]

In [20]:
data["k_cys"] = data["Scheme"]
filtered_data = data.query("Steps > 1500")

# filtered_data = filtered_data.query("k_cys != 'original'")
Q_max = filtered_data.sort_values('Q').groupby(["Protein", "k_cys", "Run"]).tail(1).sort_values(["Protein", 'k_cys', "Q"]).reset_index(drop=True)
Q_max["Annealing Index"] = Q_max.groupby(["Protein", "k_cys"])["Q"].rank(method="first", ascending=False).astype(int)
Q_max.k_cys = pd.Categorical(Q_max.k_cys,  ["original", "k_0", "k_1", "k_2", "k_5"], ordered=True)
Q_max["Scheme"] = Q_max["k_cys"]

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(1.5*16.18033, 2*10))

protein = '1tcg'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[0][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1bpi'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[0][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1lmm'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[1][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")



protein = '1fs3'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[1][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1hn4_A'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[2][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"1hn4")


protein = '1ppb'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[2][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")


# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc='center right', bbox_to_anchor=(0.7, 0.15))
fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), bbox_to_anchor=(0.98, 0.5), loc=2, borderpad=1.5)
# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc="lower left", ncol=5)
# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc='lower center')
# axes[-1, -1].axis('off')

plt.tight_layout()
# plt.savefig(f"{figureFolder}/seperate_run_best_include_1ppb.png", dpi=300, bbox_inches='tight')



In [19]:
plot_data = selected.query("Protein !='1ppb_H' and Protein != '1hn4'").sort_values(["Scheme"])
ax = sns.lineplot("Protein", "fraction", hue="Scheme", style="Scheme", markers=True, data=plot_data, dashes=True)
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
leg = ax.legend(loc = 'center left', bbox_to_anchor = (1.0, 0.5))

labels = [item.get_text() for item in ax.get_xticklabels()]
labels[0] = '1tcg\n(3)'
labels[1] = '1lmm\n(3)'
labels[2] = '1bpi\n(3)'
labels[3] = '1fs3\n(4)'
labels[4] = '1ppb\n(4)'
labels[5] = '1hn4\n(7)'
ax.set_xticklabels(labels)

plt.xlabel("Protein(number of disulfide bonds)")
plt.ylabel("Fraction")
# plt.tight_layout()
# plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_fraction.png", dpi=300, bbox_inches='tight')


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-19-286921c3ed36> in <module>()
      1 
      2 plot_data = selected.query("Protein !='1ppb_H' and Protein != '1hn4'").sort_values(["Scheme"])
----> 3 ax = sns.lineplot("Protein", "fraction", hue="Scheme", style="Scheme", markers=True, data=plot_data, dashes=True)
      4 # plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
      5 leg = ax.legend(loc = 'center left', bbox_to_anchor = (1.0, 0.5))

~/anaconda/lib/python3.6/site-packages/seaborn/relational.py in lineplot(x, y, hue, size, style, data, palette, hue_order, hue_norm, sizes, size_order, size_norm, dashes, markers, style_order, units, estimator, ci, n_boot, seed, sort, err_style, err_kws, legend, ax, **kwargs)
   1129         dashes=dashes, markers=markers, style_order=style_order,
   1130         units=units, estimator=estimator, ci=ci, n_boot=n_boot, seed=seed,
-> 1131         sort=sort, err_style=err_style, err_kws=err_kws, legend=legend,
   1132     )
   1133 

~/anaconda/lib/python3.6/site-packages/seaborn/relational.py in __init__(self, x, y, hue, size, style, data, palette, hue_order, hue_norm, sizes, size_order, size_norm, dashes, markers, style_order, units, estimator, ci, n_boot, seed, sort, err_style, err_kws, legend)
    698 
    699         plot_data = self.establish_variables(
--> 700             x, y, hue, size, style, units, data
    701         )
    702 

~/anaconda/lib/python3.6/site-packages/seaborn/relational.py in establish_variables(self, x, y, hue, size, style, units, data)
    140                 if isinstance(var, str):
    141                     err = "Could not interpret input '{}'".format(var)
--> 142                     raise ValueError(err)
    143 
    144             # Extract variable names

ValueError: Could not interpret input 'fraction'

In [13]:
# data = pd.read_csv("/Users/weilu/Research/data/openMM/six_proteins_run1_ho_05-18.csv", index_col=0)
data = pd.read_csv("/Users/weilu/Research/data/openMM/six_proteins_run1_ho_05-21.csv", index_col=0)

# a = pd.read_csv("/Users/weilu/Dropbox/openAWSEM/data/length_info_12-01.csv", index_col=0)
# a = a.append({"Protein":"1ppb", "Length":295}, ignore_index=True)
# a = a.append({"Protein":"1hn4_A", "Length":129}, ignore_index=True)
# a.to_csv("/Users/weilu/Dropbox/openAWSEM/data/length_info_may21.csv")
# data = data.merge(a, on="Protein")

a = pd.read_csv("/Users/weilu/Dropbox/openAWSEM/data/length_info_may21.csv", index_col=0)
a = a.sort_values("Length").reset_index(drop=True)
data = data.merge(a, on="Protein")


/Users/weilu/anaconda/lib/python3.6/site-packages/numpy/lib/arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

In [14]:
scheme_dic = {"30":"k_0", "31":"k_1", "32":"k_2", "33":"k_5", "300":"original"}
data["Submode"] = data["Submode"].astype(str)
data["Scheme"] = data["Submode"].apply(lambda x: scheme_dic[x])
selected = data.query("Steps > 1500 and Protein != '1ppb_H' and Protein != '1hn4'").sort_values("Q").groupby(["Protein", "Submode"]).tail(1)


a = a.query("Protein != '1hn4'").reset_index(drop=True)
selected.Protein = pd.Categorical(selected.Protein, categories=a.Protein, ordered=True)

In [15]:
selected.Scheme = pd.Categorical(selected.Scheme,  ["original", "k_0", "k_1", "k_2", "k_5"], ordered=True)
ax = sns.lineplot("Protein", "Q", hue="Scheme", style="Scheme", markers=True, data=selected.sort_values(["Scheme", "Protein"]).reset_index(drop=True), dashes=False)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Best Q")
plt.xlabel("Protein(Length)")
labels = [item.get_text() for item in ax.get_xticklabels()]
labels[0] = '1tcg\n(22)'
labels[1] = '1lmm\n(40)'
labels[2] = '1bpi\n(58)'
labels[3] = '1fs3\n(124)'
labels[4] = '1hn4\n(129)'
labels[5] = '1ppb\n(295)'
ax.set_xticklabels(labels)
plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_bestQ.png", dpi=300, bbox_inches='tight')



In [103]:
# selected.to_csv("/Users/weilu/Dropbox/openAWSEM/data/selected_may21.csv")
selected = pd.read_csv("/Users/weilu/Dropbox/openAWSEM/data/selected_may21.csv")
selected.Protein = pd.Categorical(selected.Protein, categories=['1tcg', '1lmm', '1bpi', '1fs3', '1ppb', '1hn4_A'], ordered=True)

In [11]:
plot_data = selected.query("Protein !='1ppb_H' and Protein != '1hn4'").sort_values(["Scheme"])
ax = sns.lineplot("Protein", "fraction", hue="Scheme", style="Scheme", markers=True, data=plot_data, dashes=True)
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
leg = ax.legend(loc = 'center left', bbox_to_anchor = (1.0, 0.5))

labels = [item.get_text() for item in ax.get_xticklabels()]
labels[0] = '1tcg\n(3)'
labels[1] = '1lmm\n(3)'
labels[2] = '1bpi\n(3)'
labels[3] = '1fs3\n(4)'
labels[4] = '1ppb\n(4)'
labels[5] = '1hn4\n(7)'
ax.set_xticklabels(labels)

plt.xlabel("Protein(number of disulfide bonds)")
plt.ylabel("Fraction")
# plt.tight_layout()
plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_fraction.png", dpi=300, bbox_inches='tight')



In [16]:
data["k_cys"] = data["Scheme"]
filtered_data = data.query("Steps > 1500")

# filtered_data = filtered_data.query("k_cys != 'original'")
Q_max = filtered_data.sort_values('Q').groupby(["Protein", "k_cys", "Run"]).tail(1).sort_values(["Protein", 'k_cys', "Q"]).reset_index(drop=True)
Q_max["Annealing Index"] = Q_max.groupby(["Protein", "k_cys"])["Q"].rank(method="first", ascending=False).astype(int)
Q_max.k_cys = pd.Categorical(Q_max.k_cys,  ["original", "k_0", "k_1", "k_2", "k_5"], ordered=True)
Q_max["Scheme"] = Q_max["k_cys"]

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(1.5*16.18033, 2*10))

protein = '1tcg'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[0][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1bpi'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[0][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1lmm'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[1][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")



protein = '1fs3'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[1][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")

protein = '1hn4_A'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[2][0])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"1hn4")


protein = '1ppb'
Q_max_one = Q_max.query(f"Protein == '{protein}'")
ax = sns.lineplot("Annealing Index", "Q", hue="Scheme", data=Q_max_one, legend=False,
                  markers=True, style="Scheme", dashes=False, ax=axes[2][1])
_ = ax.set_xticks(np.arange(1, 21, 1))
ax.set_ylabel("Best Q")
ax.set_title(f"{protein}")


# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc='center right', bbox_to_anchor=(0.7, 0.15))
fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), bbox_to_anchor=(0.98, 0.5), loc=2, borderpad=1.5)
# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc="lower left", ncol=5)
# fig.legend((['k_0', 'k_1', 'k_2', 'k_5', 'original']), loc='lower center')
# axes[-1, -1].axis('off')

plt.tight_layout()
plt.savefig(f"{figureFolder}/seperate_run_best_include_1ppb.png", dpi=300, bbox_inches='tight')



In [ ]:


In [ ]:


In [ ]:


In [105]:
selected.sort_values("FP")


Out[105]:
Unnamed: 0 Steps Q Rg Backbone Rama Contact Fragment Membrane ER ... Run Protein Submode Scheme k_cys Length TP FN FP fraction
0 0 2492 0.24 165.17 840.82 -895.09 -766.97 -565.18 0.0 0.0 ... 0 1ppb 32 k_2 k_2 295 1 3 0 0.250000
32 32 2349 0.78 10.77 162.36 -174.75 -87.66 -221.84 0.0 0.0 ... 19 1bpi 31 k_1 k_1 58 1 2 0 0.333333
31 31 2166 0.77 10.62 195.49 -167.63 -94.99 -222.81 0.0 0.0 ... 8 1bpi 30 k_0 k_0 58 1 2 0 0.333333
30 30 2394 0.76 10.73 138.21 -175.36 -96.23 -226.00 0.0 0.0 ... 16 1bpi 300 original original 58 0 3 0 0.000000
29 29 2384 0.73 14.55 403.46 -587.54 -205.03 -887.50 0.0 0.0 ... 15 1hn4_A 33 k_5 k_5 129 7 0 0 1.000000
28 28 2100 0.73 14.28 403.14 -585.84 -209.67 -880.02 0.0 0.0 ... 2 1hn4_A 32 k_2 k_2 129 6 1 0 0.857143
27 27 2412 0.72 14.12 378.97 -606.38 -194.19 -870.77 0.0 0.0 ... 7 1hn4_A 31 k_1 k_1 129 4 3 0 0.571429
26 26 1979 0.72 90.12 817.87 -1235.45 -409.34 -1731.11 0.0 0.0 ... 6 NaN 33 k_5 k_5 129 14 0 0 1.000000
25 25 2040 0.67 14.14 438.27 -589.05 -223.19 -853.55 0.0 0.0 ... 9 1hn4_A 30 k_0 k_0 129 4 3 0 0.571429
24 24 2129 0.67 14.39 402.30 -579.06 -200.07 -856.77 0.0 0.0 ... 2 1hn4_A 300 original original 129 2 5 0 0.285714
23 23 2064 0.65 120.05 895.24 -1211.73 -403.56 -1708.51 0.0 0.0 ... 17 NaN 32 k_2 k_2 129 12 2 0 0.857143
22 22 2283 0.63 163.20 796.64 -1261.25 -399.85 -1721.90 0.0 0.0 ... 10 NaN 300 original original 129 4 10 0 0.285714
21 21 2416 0.62 14.50 334.19 -565.53 -200.57 -443.81 0.0 0.0 ... 3 1fs3 33 k_5 k_5 124 3 1 0 0.750000
20 20 1992 0.62 162.68 957.08 -1225.58 -400.00 -1700.90 0.0 0.0 ... 9 NaN 31 k_1 k_1 129 7 7 0 0.500000
33 33 2446 0.81 10.51 159.66 -155.93 -95.24 -225.04 0.0 0.0 ... 19 1bpi 32 k_2 k_2 58 3 0 0 1.000000
17 17 2027 0.57 10.05 141.57 -110.70 -62.44 -12.39 0.0 0.0 ... 1 1lmm 33 k_5 k_5 40 2 1 0 0.666667
8 8 2433 0.44 15.74 404.01 -595.69 -197.88 -437.26 0.0 0.0 ... 3 1fs3 31 k_1 k_1 124 1 3 0 0.250000
1 1 2325 0.25 161.57 910.52 -874.89 -749.50 -558.36 0.0 0.0 ... 2 1ppb 31 k_1 k_1 295 1 3 0 0.250000
2 2 2459 0.25 16.95 859.19 -879.74 -874.45 -558.29 0.0 0.0 ... 5 1ppb 33 k_5 k_5 295 3 1 0 0.750000
3 3 1892 0.25 82.82 1066.44 -818.97 -692.67 -571.70 0.0 0.0 ... 3 1ppb 30 k_0 k_0 295 0 4 0 0.000000
4 4 2140 0.25 131.85 956.48 -857.20 -714.33 -554.70 0.0 0.0 ... 10 1ppb 300 original original 295 1 3 0 0.250000
5 5 2068 0.40 15.28 412.83 -577.66 -193.95 -433.05 0.0 0.0 ... 16 1fs3 300 original original 124 1 3 0 0.250000
6 6 2116 0.43 17.41 405.01 -593.92 -172.69 -438.99 0.0 0.0 ... 14 1fs3 30 k_0 k_0 124 1 3 0 0.250000
7 7 2334 0.44 7.45 69.74 -37.29 -17.55 0.00 0.0 0.0 ... 14 1tcg 30 k_0 k_0 22 0 3 0 0.000000
34 34 2215 0.83 10.59 172.12 -161.87 -81.89 -225.93 0.0 0.0 ... 14 1bpi 33 k_5 k_5 58 3 0 0 1.000000
15 15 2481 0.55 9.53 126.35 -125.34 -66.35 -12.91 0.0 0.0 ... 10 1lmm 31 k_1 k_1 40 0 3 1 0.000000
10 10 2363 0.51 6.73 77.73 -40.15 -21.43 0.00 0.0 0.0 ... 11 1tcg 33 k_5 k_5 22 1 2 1 0.250000
11 11 2063 0.51 7.47 69.77 -13.25 -24.53 0.00 0.0 0.0 ... 2 1tcg 300 original original 22 0 3 1 0.000000
12 12 2324 0.51 6.83 72.51 -39.43 -21.23 0.00 0.0 0.0 ... 6 1tcg 32 k_2 k_2 22 1 2 1 0.250000
13 13 2297 0.55 10.16 130.34 -119.23 -55.31 -11.64 0.0 0.0 ... 8 1lmm 32 k_2 k_2 40 1 2 1 0.250000
19 19 2272 0.61 168.51 800.97 -1251.45 -407.42 -1736.69 0.0 0.0 ... 2 NaN 30 k_0 k_0 129 8 6 1 0.533333
18 18 1962 0.58 9.77 159.01 -114.59 -52.45 -12.37 0.0 0.0 ... 9 1lmm 300 original original 40 0 3 1 0.000000
14 14 2364 0.55 9.89 113.80 -110.44 -60.06 -11.57 0.0 0.0 ... 12 1lmm 30 k_0 k_0 40 0 3 1 0.000000
9 9 2142 0.45 7.34 70.03 -42.74 -22.12 0.00 0.0 0.0 ... 12 1tcg 31 k_1 k_1 22 0 3 1 0.000000
16 16 2151 0.57 15.47 462.86 -576.96 -193.77 -442.14 0.0 0.0 ... 4 1fs3 32 k_2 k_2 124 2 2 2 0.333333

35 rows × 26 columns


In [17]:
# read all cys information.
# pdb_list = ["1tt8", "6btc", "6g57", "6q64", "pex22", "sos", "unk2", "6jdq", "5uak"]
pdb_list = ["1hn4_A", "1ppb", "1lmm", "1tcg", "1fs3", "1bpi"]
subMode_list = []
# subMode_list += [100, 101, 102, 103, 104, 105, 106]
# subMode_list += [110, 111, 112, 113, 114, 115, 116]
# subMode_list += [120, 121, 122, 123, 124, 125, 126]
# subMode_list += [0, 1, 2, 3, 4, 5]
# subMode_list += [30, 31, 32, 33, 34, 35]
subMode_list += [300, 30, 31, 32, 33]
# subMode_list += [130, 131, 132, 133, 134, 135] 
simulationType = "six_proteins"
subFolder = "run1_ho"
pre_base = "/Users/weilu/Research/server/may_week2_2020/disulfide_bond/"
info = []
all_data = []
for pdb in pdb_list:
    for submode in subMode_list:
        for i in range(20):
            pre = f"{pre_base}/{subFolder}/{pdb}/{submode}_{i}"
            location = f"{pre}/cys_bonds.csv"
            try:
                tmp = pd.read_csv(location, index_col=0)
                tmp = tmp.assign(Run=i, Protein=pdb, Submode=submode)
                all_data.append(tmp)
            except Exception as e:
                print(pdb, i, submode, location)
                print(e)
                pass
data = pd.concat(all_data, sort=False)

# info = pd.DataFrame(info)

today = datetime.today().strftime('%m-%d')
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{subFolder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)


/Users/weilu/Research/data/openMM/six_proteins_run1_ho_05-21.csv

In [18]:
today = datetime.today().strftime('%m-%d')
outFile = f"/Users/weilu/Research/data/openMM/cys_info_{subFolder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)


/Users/weilu/Research/data/openMM/cys_info_run1_ho_05-21.csv

In [333]:
data_cys = pd.read_csv("/Users/weilu/Research/data/openMM/cys_info_run1_ho_05-21.csv", index_col=0)


/Users/weilu/anaconda/lib/python3.6/site-packages/numpy/lib/arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

In [ ]:


In [409]:
# group they by pair
t = data_cys.query("Protein=='1fs3' and Submode=='33'").reset_index(drop=True)
t["pair"] = (1 + t["i"]).astype(str) + "_" + (1 + t["j"]).astype(str)

In [ ]:
# pair_list = ['65_72', '40_95', '58_110', '26_84', '26_40', '26_72', '58_72', '26_58', '84_95', '40_65', '26_110', '40_110', '40_72', '72_110', '65_110', '26_65', '65_84', '65_95', '72_84', '58_84', '26_95', '40_58', '58_65', '40_84', '58_95']

In [445]:
print(pair_list)


['26_40', '26_72', '58_72', '26_58', '65_72', '84_95', '40_65', '40_95', '26_110', '40_110', '40_72', '72_110', '65_110', '26_65', '65_84', '65_95', '58_110', '72_84', '58_84', '26_95', '40_58', '26_84', '58_65', '40_84', '58_95']

In [456]:
tt = t.query("Run==12").reset_index(drop=True)
pair_list = tt["pair"].unique().tolist()


n_pair = len(pair_list)

pair_dict = {}
for i, pair in enumerate(pair_list):
    pair_dict[pair] = i

pair_timeline = np.zeros((n_pair, 2500))

for i, line in tt.iterrows():
    frame = line["model"]
    pair = line["pair"]
    pair_timeline[pair_dict[pair]][frame] = 1

In [457]:
f = plt.figure(figsize = (20,10))
ax = f.add_subplot(111)
ax.yaxis.tick_right()
plt.imshow(pair_timeline, aspect='auto')
_ = plt.yticks(ticks=range(n_pair), labels=pair_list)
#  yaxis.tick_right()
# plt.savefig(f"{figureFolder}/1fs3_run3_mode_33_trajectory.png", dpi=300, bbox_inches='tight')



In [458]:
native_pair = ['65_72', '40_95', '58_110', '26_84']

In [461]:
tt = t.query("Run==8").reset_index(drop=True)
pair_list = tt["pair"].unique().tolist()


n_pair = len(pair_list)

pair_dict = {}
for i, pair in enumerate(pair_list):
    pair_dict[pair] = i

pair_timeline = np.zeros((n_pair, 2500))

for i, line in tt.iterrows():
    frame = line["model"]
    pair = line["pair"]
    if pair in native_pair:
        pair_timeline[pair_dict[pair]][frame] = 2
    else:
        pair_timeline[pair_dict[pair]][frame] = 1

f = plt.figure(figsize = (20,10))
ax = f.add_subplot(111)
ax.yaxis.tick_right()
plt.imshow(pair_timeline, aspect='auto')
_ = plt.yticks(ticks=range(n_pair), labels=pair_list)
#  yaxis.tick_right()
# plt.savefig(f"{figureFolder}/1fs3_run3_mode_33_trajectory.png", dpi=300, bbox_inches='tight')



In [473]:
native_pair = ['65_72', '40_95', '58_110', '26_84']
tt = t.query("Run==3").reset_index(drop=True)
pair_list = tt["pair"].unique().tolist()


n_pair = len(pair_list)

pair_dict = {}
for i, pair in enumerate(pair_list):
    pair_dict[pair] = i

pair_timeline = np.zeros((n_pair, 2500))

for i, line in tt.iterrows():
    frame = line["model"]
    pair = line["pair"]
    if pair in native_pair:
        pair_timeline[pair_dict[pair]][frame] = 1
    else:
        pair_timeline[pair_dict[pair]][frame] = 2

In [477]:
f = plt.figure(figsize = (20,10))
ax = f.add_subplot(111)
ax.yaxis.tick_right()
plt.imshow(pair_timeline, aspect='auto', cmap="hot_r")
_ = plt.yticks(ticks=range(n_pair), labels=pair_list)
#  yaxis.tick_right()
plt.xlabel("Frames")
plt.ylabel("Formation of disulfide bonds")
plt.savefig(f"{figureFolder}/1fs3_run3_mode_33_trajectory.png", dpi=300, bbox_inches='tight')



In [455]:
f = plt.figure(figsize = (20,10))
ax = f.add_subplot(111)
ax.yaxis.tick_right()
plt.imshow(pair_timeline, aspect='auto')
_ = plt.yticks(ticks=range(n_pair), labels=pair_list)
#  yaxis.tick_right()
plt.savefig(f"{figureFolder}/1fs3_run3_mode_33_trajectory.png", dpi=300, bbox_inches='tight')



In [350]:
t.query("Run==3").groupby("pair").count().sort_values("r")


Out[350]:
model i j r Run Protein Submode
pair
58_65 1 1 1 1 1 1 1
40_58 1 1 1 1 1 1 1
58_95 4 4 4 4 4 4 4
58_84 5 5 5 5 5 5 5
72_84 5 5 5 5 5 5 5
65_84 9 9 9 9 9 9 9
65_95 9 9 9 9 9 9 9
40_110 9 9 9 9 9 9 9
40_72 12 12 12 12 12 12 12
40_65 23 23 23 23 23 23 23
26_65 26 26 26 26 26 26 26
26_58 51 51 51 51 51 51 51
26_72 73 73 73 73 73 73 73
65_110 132 132 132 132 132 132 132
26_84 136 136 136 136 136 136 136
40_95 136 136 136 136 136 136 136
26_40 153 153 153 153 153 153 153
40_84 247 247 247 247 247 247 247
72_110 271 271 271 271 271 271 271
26_95 353 353 353 353 353 353 353
84_95 373 373 373 373 373 373 373
65_72 382 382 382 382 382 382 382
26_110 424 424 424 424 424 424 424
58_72 429 429 429 429 429 429 429
58_110 700 700 700 700 700 700 700

In [368]:
t.query("model > 2400 and Run==3").groupby("pair").count().sort_values("r")


Out[368]:
model i j r Run Protein Submode
pair
58_72 11 11 11 11 11 11 11
58_110 44 44 44 44 44 44 44
72_110 62 62 62 62 62 62 62
26_84 94 94 94 94 94 94 94
40_95 94 94 94 94 94 94 94

In [353]:
t.query("model > 2000 and Run==3").groupby("pair").count().sort_values("r")


Out[353]:
model i j r Run Protein Submode
pair
26_40 1 1 1 1 1 1 1
58_72 28 28 28 28 28 28 28
72_110 84 84 84 84 84 84 84
26_84 128 128 128 128 128 128 128
40_95 128 128 128 128 128 128 128
40_84 190 190 190 190 190 190 190
65_72 239 239 239 239 239 239 239
26_95 323 323 323 323 323 323 323
58_110 414 414 414 414 414 414 414

In [389]:
a = data_cys.query("model > 2000").groupby(["Run", "Protein", "Submode", "model", "j"])["r"].count().reset_index().sort_values("r")
b = a.query("r==3").groupby(["Protein", "Submode"])["r"].count().reset_index()
b


Out[389]:
Protein Submode r
0 1fs3 31 1
1 1fs3 300 1
2 1hn4_A 30 8
3 1hn4_A 31 4
4 1hn4_A 32 1
5 1hn4_A 300 4
6 1lmm 30 2
7 1lmm 31 3
8 1lmm 32 6
9 1lmm 300 6
10 1tcg 31 3
11 1tcg 33 1
12 1tcg 300 1

In [398]:
a = data_cys.query("model > 2000 and r < 4").groupby(["Run", "Protein", "Submode", "model", "j"])["r"].count().reset_index().sort_values("r")
b = a.query("r==2").groupby(["Protein", "Submode"])["r"].count().reset_index()
b


Out[398]:
Protein Submode r
0 1bpi 30 73
1 1bpi 31 17
2 1bpi 32 22
3 1bpi 33 2
4 1bpi 300 59
5 1fs3 30 33
6 1fs3 31 87
7 1fs3 32 41
8 1fs3 33 35
9 1fs3 300 25
10 1hn4_A 30 366
11 1hn4_A 31 132
12 1hn4_A 32 58
13 1hn4_A 33 8
14 1hn4_A 300 91
15 1lmm 30 56
16 1lmm 31 34
17 1lmm 32 24
18 1lmm 33 16
19 1lmm 300 33
20 1ppb 30 13
21 1ppb 31 1
22 1ppb 32 9
23 1ppb 33 1
24 1tcg 30 6
25 1tcg 31 4
26 1tcg 32 5
27 1tcg 33 7
28 1tcg 300 10

In [405]:
a = data_cys.query("model > 2000 and i==25 and j==83")

In [407]:
a.groupby("Submode").count()


Out[407]:
model i j r Run Protein
Submode
30 3 3 3 3 3 3
31 12 12 12 12 12 12
32 15 15 15 15 15 15
33 518 518 518 518 518 518
300 4 4 4 4 4 4

In [281]:
t = data_cys.query("Protein=='1fs3' and Submode=='33' and Run==3")

In [195]:
tt = t.query("model==2414").reset_index(drop=True)

In [256]:
protein = "1fs3"
fileLocation = f"/Users/weilu/Research/server/may_week2_2020/disulfide_bond/setups/{protein}/{protein}.pdb"
native_cys_bond = get_cys_info(fileLocation)
correct_fraction_v2(native_cys_bond, tt)


Out[256]:
(0.75, 3, 1, 0)

In [224]:
pd.DataFrame([[f, TP, FN, FP]], columns=["f", "TP", "FN", "FP"])


Out[224]:
f TP FN FP
0 0.4 2 2 1

In [226]:
a = t.groupby(["model", "Run", "Protein", "Submode"]).apply(correct_fraction_v3, native_cys_bond)

In [282]:
tt = t.query("model==3").reset_index(drop=True)

In [283]:
correct_fraction_v2(native_cys_bond, tt)


Out[283]:
(0.0, 0, 3, 1)

In [284]:
correct_fraction(native_cys_bond, tt)


Out[284]:
(0, 3, 1)

In [308]:
native_cys_bond


Out[308]:
model i j r pair
0 0 25 83 3.627835 25_83
1 0 39 94 3.783132 39_94
2 0 57 109 3.748634 57_109
3 0 64 71 4.101942 64_71

In [309]:
correct_fraction_v3(tt, native_cys_bond)


Out[309]:
fraction TP FN FP
0 0.0 0 4 1

In [304]:


In [317]:
data_selected = data_cys.query(f"Protein=='{protein}' and Submode=='33' and Run==3").head(4)
data_selected.groupby(["model", "Run", "Protein", "Submode"]).apply(correct_fraction_v3, native_cys_bond=native_cys_bond).reset_index()


Out[317]:
model Run Protein Submode level_4 fraction TP FN FP
0 3 3 1fs3 33 0 0.0 0 4 1
1 22 3 1fs3 33 0 0.0 0 4 1
2 65 3 1fs3 33 0 0.0 0 4 1
3 68 3 1fs3 33 0 0.0 0 4 1

In [311]:
# from functools import partial
# correct_fraction_v3_one_protein = partial(correct_fraction_v3, native_cys_bond=native_cys_bond)

In [318]:
pdb_list = ["1ppb", "1lmm", "1tcg", "1hn4_A", "1fs3", "1bpi"]
# pdb_list = ["1ppb", "1lmm", "1tcg", "1hn4_A", "1bpi"]
for pdb in pdb_list:
    print(pdb)
    protein = pdb
    fileLocation = f"/Users/weilu/Research/server/may_week2_2020/disulfide_bond/setups/{protein}/{protein}.pdb"
    native_cys_bond = get_cys_info(fileLocation)

    data_selected = data_cys.query(f"Protein=='{protein}'")
    result = data_selected.groupby(["model", "Run", "Protein", "Submode"]).apply(correct_fraction_v3, native_cys_bond=native_cys_bond).reset_index()
    result.to_csv(f"/Users/weilu/Research/data/openMM/{protein}_cys_05-22.csv")


1ppb
1lmm
1tcg
1hn4_A
1fs3
1bpi

In [319]:
pdb_list = ["1ppb", "1lmm", "1tcg", "1hn4_A", "1fs3", "1bpi"]
d_list =[]
for pdb in pdb_list:
    t = pd.read_csv(f"/Users/weilu/Research/data/openMM/{pdb}_cys_05-22.csv")
    d_list.append(t)
d_cys = pd.concat(d_list).reset_index(drop=True)

In [320]:


In [321]:
d_cys.query("Protein=='1fs3' and Submode=='33' and Run==3")


Out[321]:
Unnamed: 0 model Run Protein Submode level_4 fraction TP FN FP
474275 13 3 3 1fs3 33 0 0.000000 0 4 1
474384 122 22 3 1fs3 33 0 0.000000 0 4 1
474771 509 65 3 1fs3 33 0 0.000000 0 4 1
474807 545 68 3 1fs3 33 0 0.000000 0 4 1
474905 643 77 3 1fs3 33 0 0.000000 0 4 1
... ... ... ... ... ... ... ... ... ... ...
561394 87132 2495 3 1fs3 33 0 0.400000 2 2 1
561448 87186 2496 3 1fs3 33 0 0.500000 3 1 2
561503 87241 2497 3 1fs3 33 0 0.400000 2 2 1
561560 87298 2498 3 1fs3 33 0 0.333333 2 2 2
561614 87352 2499 3 1fs3 33 0 0.400000 2 2 1

1958 rows × 10 columns


In [322]:
d_cys.query("Protein=='1fs3' and Submode=='33' and Run==3")


Out[322]:
Unnamed: 0 model Run Protein Submode level_4 fraction TP FN FP
474275 13 3 3 1fs3 33 0 0.000000 0 4 1
474384 122 22 3 1fs3 33 0 0.000000 0 4 1
474771 509 65 3 1fs3 33 0 0.000000 0 4 1
474807 545 68 3 1fs3 33 0 0.000000 0 4 1
474905 643 77 3 1fs3 33 0 0.000000 0 4 1
... ... ... ... ... ... ... ... ... ... ...
561394 87132 2495 3 1fs3 33 0 0.400000 2 2 1
561448 87186 2496 3 1fs3 33 0 0.500000 3 1 2
561503 87241 2497 3 1fs3 33 0 0.400000 2 2 1
561560 87298 2498 3 1fs3 33 0 0.333333 2 2 2
561614 87352 2499 3 1fs3 33 0 0.400000 2 2 1

1958 rows × 10 columns


In [174]:


In [249]:
scheme_dic = {"30":"k_0", "31":"k_1", "32":"k_2", "33":"k_5", "300":"original"}
data["Submode"] = data["Submode"].astype(str)
data["Scheme"] = data["Submode"].apply(lambda x: scheme_dic[x])
selected = data.query("Steps > 1500 and Protein != '1ppb_H' and Protein != '1hn4'").sort_values("Q").groupby(["Protein", "Submode", "Run"]).tail(1)

In [252]:
selected.query("Protein=='1fs3' and Submode=='300'")


Out[252]:
Steps Q Rg Backbone Rama Contact Fragment Membrane ER TBM_Q Beta Pap Helical Disulfide Total Run Protein Submode Length Scheme
1267962 1950 0.32 15.33 461.25 -559.52 -202.28 -426.93 0.0 0.0 0.0 -48.48 -14.88 0.0 0.0 -790.84 6 1fs3 300 124 original
1278496 2476 0.32 14.13 341.30 -576.19 -228.30 -428.14 0.0 0.0 0.0 -74.76 -19.19 0.0 0.0 -985.29 10 1fs3 300 124 original
1283050 2026 0.32 15.48 445.52 -566.58 -206.81 -416.75 0.0 0.0 0.0 -40.78 -17.72 0.0 0.0 -803.12 12 1fs3 300 124 original
1281010 2488 0.33 15.29 350.13 -582.34 -216.84 -413.87 0.0 0.0 0.0 -48.14 -12.42 0.0 0.0 -923.47 11 1fs3 300 124 original
1300507 1969 0.33 15.71 458.44 -561.40 -199.57 -435.02 0.0 0.0 0.0 -52.27 -13.47 0.0 0.0 -803.28 19 1fs3 300 124 original
1295558 2024 0.34 17.60 430.70 -565.39 -160.42 -430.64 0.0 0.0 0.0 -54.15 -19.15 0.0 0.0 -799.05 17 1fs3 300 124 original
1265849 2339 0.34 16.20 372.51 -577.70 -193.96 -427.36 0.0 0.0 0.0 -45.72 -17.79 0.0 0.0 -890.02 5 1fs3 300 124 original
1263503 2495 0.34 14.08 385.20 -594.96 -229.00 -426.24 0.0 0.0 0.0 -59.84 -17.60 0.0 0.0 -942.45 4 1fs3 300 124 original
1253177 2177 0.34 16.55 445.91 -559.13 -189.74 -437.66 0.0 0.0 0.0 -52.52 -19.37 0.0 0.0 -812.51 0 1fs3 300 124 original
1272908 1892 0.34 17.13 475.97 -550.95 -176.11 -435.38 0.0 0.0 0.0 -48.52 -16.51 0.0 0.0 -751.49 8 1fs3 300 124 original
1290085 1555 0.34 15.03 518.94 -567.40 -179.76 -416.36 0.0 0.0 0.0 -24.81 -11.09 0.0 0.0 -680.47 15 1fs3 300 124 original
1287929 1901 0.35 15.35 481.79 -554.92 -186.65 -426.26 0.0 0.0 0.0 -50.07 -10.55 0.0 0.0 -746.65 14 1fs3 300 124 original
1285887 2361 0.35 14.61 379.44 -574.14 -213.94 -426.09 0.0 0.0 0.0 -55.84 -13.76 0.0 0.0 -904.33 13 1fs3 300 124 original
1255525 2023 0.35 15.44 444.61 -575.63 -182.55 -428.04 0.0 0.0 0.0 -61.84 -17.08 0.0 0.0 -820.52 1 1fs3 300 124 original
1257837 1833 0.35 15.06 430.32 -535.34 -196.46 -421.37 0.0 0.0 0.0 -43.48 -11.82 0.0 0.0 -778.14 2 1fs3 300 124 original
1297877 1841 0.36 15.35 478.04 -549.40 -190.36 -409.01 0.0 0.0 0.0 -39.69 -7.95 0.0 0.0 -718.37 18 1fs3 300 124 original
1260537 2031 0.37 15.31 440.57 -579.37 -197.69 -426.24 0.0 0.0 0.0 -55.25 -18.74 0.0 0.0 -836.73 3 1fs3 300 124 original
1270512 1998 0.37 15.97 409.48 -552.61 -191.63 -400.83 0.0 0.0 0.0 -31.33 -11.76 0.0 0.0 -778.67 7 1fs3 300 124 original
1275118 1600 0.38 15.35 489.66 -541.08 -183.56 -414.71 0.0 0.0 0.0 -39.35 -9.64 0.0 0.0 -698.68 9 1fs3 300 124 original
1293100 2068 0.40 15.28 412.83 -577.66 -193.95 -433.05 0.0 0.0 0.0 -38.18 -16.36 0.0 0.0 -846.36 16 1fs3 300 124 original

In [ ]:


In [329]:
d_cys.query("Protein=='1fs3' and Submode=='33'").groupby("Run")[["TP", "FN", "FP", "fraction"]].sum()


Out[329]:
TP FN FP fraction
Run
0 1135 6905 1975 240.816667
1 357 7771 2827 83.983333
2 1155 6961 2839 246.025000
3 1354 6478 2610 279.261905
4 1126 6810 2858 229.946429
5 333 7259 2133 77.233333
6 1122 7142 2853 243.052381
7 556 7328 2657 129.733333
8 393 7667 2416 92.533333
9 553 7543 2351 123.616667
10 377 7667 2691 87.650000
11 775 7113 2071 178.266667
12 1590 6662 2064 353.492857
13 772 7424 3006 174.061905
14 516 7184 2362 118.333333
15 408 7356 2065 96.050000
16 632 7772 2296 144.683333
17 388 7576 2316 88.433333
18 1002 6986 2095 222.733333
19 774 6714 1799 174.533333

In [330]:
d_cys.query("model > 2000 and Protein=='1fs3' and Submode=='300'").sort_values("FP")


Out[330]:
Unnamed: 0 model Run Protein Submode level_4 fraction TP FN FP
547720 73458 2257 16 1fs3 300 0 0.25 1 3 0
550805 76543 2310 14 1fs3 300 0 0.25 1 3 0
550788 76526 2310 9 1fs3 300 0 0.25 1 3 0
550751 76489 2309 16 1fs3 300 0 0.25 1 3 0
537474 63212 2076 16 1fs3 300 0 0.25 1 3 0
... ... ... ... ... ... ... ... ... ... ...
559941 85679 2469 16 1fs3 300 0 0.00 0 4 2
555740 81478 2396 9 1fs3 300 0 0.00 0 4 3
539178 64916 2107 9 1fs3 300 0 0.00 0 4 3
551120 76858 2316 0 1fs3 300 0 0.00 0 4 3
558218 83956 2439 9 1fs3 300 0 0.00 0 4 4

2162 rows × 10 columns


In [332]:
d_cys.query("Protein=='1fs3' and Submode=='300'").sort_values("FP")


Out[332]:
Unnamed: 0 model Run Protein Submode level_4 fraction TP FN FP
526129 51867 1861 16 1fs3 300 0 0.25 1 3 0
495106 20844 1074 1 1fs3 300 0 0.25 1 3 0
495102 20840 1073 19 1fs3 300 0 0.25 1 3 0
548933 74671 2278 7 1fs3 300 0 0.25 1 3 0
515789 41527 1640 11 1fs3 300 0 0.25 1 3 0
... ... ... ... ... ... ... ... ... ... ...
555740 81478 2396 9 1fs3 300 0 0.00 0 4 3
500112 25850 1231 15 1fs3 300 0 0.00 0 4 3
494891 20629 1067 1 1fs3 300 0 0.00 0 4 3
539178 64916 2107 9 1fs3 300 0 0.00 0 4 3
558218 83956 2439 9 1fs3 300 0 0.00 0 4 4

6244 rows × 10 columns


In [358]:


In [400]:
a = pd.DataFrame(range(1, 2501), columns=["Steps"])
t = d_cys.query("Run==3 and Protein=='1fs3' and Submode=='33'").reset_index(drop=True)
t["Steps"] = t["model"] + 1

b = a.merge(t,how="left").fillna(0)
plt.plot(b["fraction"])
plt.xlabel("Frame")
plt.ylabel("Fraction")
plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/fraction_trajectory_run3_1fs3_33.png", dpi=300, bbox_inches='tight')



In [237]:
t = d_cys_2.query("Run==3 and Protein=='1fs3' and Submode=='33'").reset_index(drop=True)
plt.plot(t["fraction"])


Out[237]:
[<matplotlib.lines.Line2D at 0x16f0093c8>]

In [242]:
t = d_cys_2.query("Run==12 and Protein=='1fs3' and Submode=='33'").reset_index(drop=True)
plt.plot(t["fraction"])


Out[242]:
[<matplotlib.lines.Line2D at 0x1350af240>]

In [ ]:
# read cys bond information.
selected = data.query("Steps > 1500").sort_values("Q").groupby(["Protein", "Submode"]).tail(1)


# selected = pd.read_csv("/Users/weilu/Research/server/may_week2_2020/disulfide_bond/selected.csv")
fraction = []
for i, line in selected.iterrows():
    run = line["Run"]
    protein = line["Protein"]
    frame = line["Steps"] - 2
    submode = line["Submode"]
    print(run, protein, frame, submode)
    fileLocation = f"/Users/weilu/Research/server/may_week2_2020/disulfide_bond/chosen_frames/{protein}_{submode}.pdb"
    prediction_cys_bond = get_cys_info(fileLocation)
    fileLocation = f"/Users/weilu/Research/server/may_week2_2020/disulfide_bond/setups/{protein}/{protein}.pdb"
    native_cys_bond = get_cys_info(fileLocation)
    TP, FN, FP = correct_fraction(native_cys_bond, prediction_cys_bond)
    fraction.append([TP, FN, FP])

selected = selected.reset_index(drop=True)
fraction_ = pd.DataFrame(fraction, columns=["TP", "FN", "FP"])
selected = pd.concat([selected, fraction_], axis=1)
selected["fraction"] = selected["TP"] / (selected["TP"] + selected["FN"] + selected["FP"])

In [ ]:


In [164]:
data.query("model > 2000 and Protein=='1fs3'").sort_values("FP")


Out[164]:
Unnamed: 0 model Run Protein Submode fraction TP FN FP
503222 28960 2091 1 1fs3 32 1.00 4 0 0
557590 83328 2303 17 1fs3 33 0.75 3 0 1
549628 75366 2299 13 1fs3 33 0.60 3 1 1
555606 81344 2310 16 1fs3 33 0.75 3 0 1
555572 81310 2276 16 1fs3 33 0.50 3 2 1
... ... ... ... ... ... ... ... ... ...
483287 9025 2159 6 1fs3 30 0.00 0 1 4
510990 36728 2071 9 1fs3 32 0.00 0 2 4
510989 36727 2069 9 1fs3 32 0.00 0 2 4
510987 36725 2066 9 1fs3 32 0.00 0 1 4
511130 36868 2413 9 1fs3 32 0.00 0 1 4

28417 rows × 9 columns


In [ ]:


In [142]:
data.query("model > 2000").sort_values("FP").query("FP == 7")


Out[142]:
Unnamed: 0 model Run Protein Submode fraction TP FN FP
411427 132335 2238 13 1hn4_A 32 0.0 0 1 7
451845 172753 2396 10 1hn4_A 33 0.0 0 2 7
417876 138784 2216 16 1hn4_A 32 0.0 0 1 7
395949 116857 2188 6 1hn4_A 32 0.0 0 1 7
396023 116931 2262 6 1hn4_A 32 0.0 0 1 7
... ... ... ... ... ... ... ... ... ...
378447 99355 2185 18 1hn4_A 31 0.0 0 1 7
292712 13620 2408 8 1hn4_A 300 0.0 0 1 7
292711 13619 2407 8 1hn4_A 300 0.0 0 1 7
378561 99469 2300 18 1hn4_A 31 0.0 0 1 7
294339 15247 2486 9 1hn4_A 300 0.0 0 1 7

4661 rows × 9 columns


In [150]:


In [ ]:


In [ ]:


In [ ]:
scheme_dic = {"30":"k_0", "31":"k_1", "32":"k_2", "33":"k_5", "300":"original"}
data["Submode"] = data["Submode"].astype(str)
data["Scheme"] = data["Submode"].apply(lambda x: scheme_dic[x])
data["k_cys"] = data["Scheme"]

In [142]:
# selected = pd.read_csv("/Users/weilu/Research/server/may_week2_2020/disulfide_bond/selected.csv")
fraction = []
for i, line in selected.iterrows():
    run = line["Run"]
    protein = line["Protein"]
    frame = line["Steps"] - 2
    submode = line["Submode"]
    print(run, protein, frame, submode)
    fileLocation = f"/Users/weilu/Research/server/may_week2_2020/disulfide_bond/chosen_frames/{protein}_{submode}.pdb"
    prediction_cys_bond = get_cys_info(fileLocation)
    fileLocation = f"/Users/weilu/Research/server/may_week2_2020/disulfide_bond/setups/{protein}/{protein}.pdb"
    native_cys_bond = get_cys_info(fileLocation)
    TP, FN, FP = correct_fraction(native_cys_bond, prediction_cys_bond)
    fraction.append([TP, FN, FP])


2 1ppb 1569 32
5 1ppb 2457 33
2 1ppb 2187 31
5 1ppb 2172 300
3 1ppb 1890 30
16 1fs3 2066 300
14 1fs3 2114 30
3 1fs3 2431 31
14 1tcg 2332 30
12 1tcg 2140 31
6 1tcg 2322 32
11 1tcg 2361 33
9 1tcg 2028 300
12 1lmm 2362 30
10 1lmm 2479 31
8 1lmm 2295 32
1 1lmm 2025 33
4 1fs3 2149 32
7 1lmm 2146 300
2 1hn4 2270 30
3 1fs3 2414 33
9 1hn4 1990 31
10 1hn4 2281 300
17 1hn4 2062 32
6 1hn4 1977 33
16 1bpi 2392 300
8 1bpi 2164 30
19 1bpi 2347 31
19 1bpi 2444 32
14 1bpi 2213 33

In [157]:
# pdb_list = ["1ppb", "1lmm", "1tcg", "1hn4_A", "1fs3", "1bpi"]
# # pdb_list = ["1ppb", "1lmm", "1tcg", "1hn4_A", "1bpi"]
# for pdb in pdb_list:
#     print(pdb)
#     protein = pdb
#     fileLocation = f"/Users/weilu/Research/server/may_week2_2020/disulfide_bond/setups/{protein}/{protein}.pdb"
#     native_cys_bond = get_cys_info(fileLocation)

#     data_selected = data.query(f"Protein=='{protein}'")
#     result = data_selected[["model", "Run", "Protein", "Submode"]].groupby(["model", "Run", "Protein", "Submode"]).head(1).reset_index(drop=True)
#     result['fraction'], result['TP'], result['FN'], result['FP'] = zip(*data_selected.groupby(["model", "Run", "Protein", "Submode"]).apply(correct_fraction_v2, native_cys_bond).reset_index(drop=True))
#     result.to_csv(f"/Users/weilu/Research/data/openMM/{protein}_cys_05-21.csv")
# pdb_list = ["1ppb", "1lmm", "1tcg", "1hn4_A", "1fs3", "1bpi"]
# d_list =[]
# for pdb in pdb_list:
#     t = pd.read_csv(f"/Users/weilu/Research/data/openMM/{pdb}_cys_05-21.csv")
#     d_list.append(t)
# data = pd.concat(d_list).reset_index(drop=True)
# d_cys = data

# protein = "1fs3"
# fileLocation = f"/Users/weilu/Research/server/may_week2_2020/disulfide_bond/setups/{protein}/{protein}.pdb"
# native_cys_bond = get_cys_info(fileLocation)

# data_selected = data.query(f"Protein=='{protein}'")
# result = data_selected[["model", "Run", "Protein", "Submode"]].groupby(["model", "Run", "Protein", "Submode"]).head(1).reset_index(drop=True)
# result['fraction'], result['TP'], result['FN'], result['FP'] = zip(*data_selected.groupby(["model", "Run", "Protein", "Submode"]).apply(correct_fraction_v2, native_cys_bond).reset_index(drop=True))
# result.to_csv(f"/Users/weilu/Research/data/openMM/{protein}_cys_05-21.csv")

In [159]:
# selected.to_csv("/Users/weilu/Research/data/openMM/six_proteins_run1_ho_05-20.csv")

In [151]:
t = data.query("Run==9 and Protein=='1hn4_A' and Submode=='300'").reset_index(drop=True)
plt.plot(t["fraction"])


Out[151]:
[<matplotlib.lines.Line2D at 0x12e0bbdd8>]

In [153]:
t.head(20)


Out[153]:
Unnamed: 0 model Run Protein Submode fraction TP FN FP
0 13673 3 9 1hn4_A 300 0.285714 2 0 5
1 13674 8 9 1hn4_A 300 0.142857 1 0 6
2 13675 11 9 1hn4_A 300 0.571429 4 0 3
3 13676 12 9 1hn4_A 300 0.142857 1 0 6
4 13677 14 9 1hn4_A 300 0.000000 0 1 7
5 13678 24 9 1hn4_A 300 0.111111 1 2 6
6 13679 25 9 1hn4_A 300 0.142857 1 0 6
7 13680 28 9 1hn4_A 300 0.250000 2 1 5
8 13681 30 9 1hn4_A 300 0.142857 1 0 6
9 13682 31 9 1hn4_A 300 0.000000 0 1 7
10 13683 32 9 1hn4_A 300 0.000000 0 1 7
11 13684 35 9 1hn4_A 300 0.857143 6 0 1
12 13685 36 9 1hn4_A 300 0.250000 2 1 5
13 13686 39 9 1hn4_A 300 0.428571 3 0 4
14 13687 47 9 1hn4_A 300 0.285714 2 0 5
15 13688 48 9 1hn4_A 300 0.142857 1 0 6
16 13689 52 9 1hn4_A 300 0.142857 1 0 6
17 13690 69 9 1hn4_A 300 0.142857 1 0 6
18 13691 70 9 1hn4_A 300 0.142857 1 0 6
19 13692 73 9 1hn4_A 300 0.571429 4 0 3

In [ ]:


In [114]:
data.head(10)["Protein"].unique()[0]


Out[114]:
'1hn4_A'

In [ ]:
result

In [116]:
result = t[["model", "Run", "Protein", "Submode"]].groupby("model").head(1).reset_index(drop=True)
result['fraction'], result['TP'], result['FN'], result['FP'] = zip(*t.groupby("model").apply(correct_fraction_v2, native_cys_bond).reset_index(drop=True))

In [117]:
result


Out[117]:
model Run Protein Submode fraction TP FN FP
0 3 3 1fs3 33 0.000000 0 1 4
1 22 3 1fs3 33 0.000000 0 1 4
2 65 3 1fs3 33 0.000000 0 1 4
3 68 3 1fs3 33 0.000000 0 1 4
4 77 3 1fs3 33 0.000000 0 1 4
... ... ... ... ... ... ... ... ...
1953 2495 3 1fs3 33 0.400000 2 1 2
1954 2496 3 1fs3 33 0.500000 3 2 1
1955 2497 3 1fs3 33 0.400000 2 1 2
1956 2498 3 1fs3 33 0.333333 2 2 2
1957 2499 3 1fs3 33 0.400000 2 1 2

1958 rows × 8 columns


In [52]:
f, TP, FN, FP =  correct_fraction_v2(native_cys_bond, tt)

In [100]:
t.groupby("model").apply(correct_fraction_v2, native_cys_bond).reset_index(drop=True)


Out[100]:
0                      (0.0, 0, 1, 4)
1                      (0.0, 0, 1, 4)
2                      (0.0, 0, 1, 4)
3                      (0.0, 0, 1, 4)
4                      (0.0, 0, 1, 4)
                    ...              
1953                   (0.4, 2, 1, 2)
1954                   (0.5, 3, 2, 1)
1955                   (0.4, 2, 1, 2)
1956    (0.3333333333333333, 2, 2, 2)
1957                   (0.4, 2, 1, 2)
Length: 1958, dtype: object

In [ ]:
result.apply()

In [76]:
result = t.groupby("model").apply(correct_fraction_v2, native_cys_bond).reset_index(drop=True)

In [98]:
result


Out[98]:
model Run Protein Submode fraction TP FN FP
0 3 3 1fs3 33 0.000000 0 1 4
1 22 3 1fs3 33 0.000000 0 1 4
2 65 3 1fs3 33 0.000000 0 1 4
3 68 3 1fs3 33 0.000000 0 1 4
4 77 3 1fs3 33 0.000000 0 1 4
... ... ... ... ... ... ... ... ...
1953 2495 3 1fs3 33 0.400000 2 1 2
1954 2496 3 1fs3 33 0.500000 3 2 1
1955 2497 3 1fs3 33 0.400000 2 1 2
1956 2498 3 1fs3 33 0.333333 2 2 2
1957 2499 3 1fs3 33 0.400000 2 1 2

1958 rows × 8 columns


In [101]:
plt.plot(result["TP"])


Out[101]:
[<matplotlib.lines.Line2D at 0x131451ac8>]

In [99]:
plt.plot(result["fraction"])


Out[99]:
[<matplotlib.lines.Line2D at 0x131613b00>]

In [ ]:


In [64]:
result = result.to_frame(name="result")

In [106]:
data


Out[106]:
model i j r Run Protein Submode
0 11 48 49 4.415734 0 1hn4_A 300
1 13 88 100 4.236747 0 1hn4_A 300
2 15 15 48 3.744685 0 1hn4_A 300
3 24 55 102 3.940284 0 1hn4_A 300
4 25 65 102 3.700861 0 1hn4_A 300
... ... ... ... ... ... ... ...
1237625 2497 29 50 3.663368 19 1bpi 33
1237626 2498 13 37 3.453648 19 1bpi 33
1237627 2498 29 54 3.628839 19 1bpi 33
1237628 2499 13 37 3.571851 19 1bpi 33
1237629 2499 29 54 3.515576 19 1bpi 33

1237630 rows × 7 columns


In [49]:
tt


Out[49]:
model i j r Run Protein Submode pair
0 2499 25 83 3.519857 3 1fs3 33 25_83
1 2499 39 94 3.707108 3 1fs3 33 39_94
2 2499 71 109 3.992830 3 1fs3 33 71_109

In [320]:
selected = pd.read_csv("/Users/weilu/Research/data/openMM/six_proteins_run1_ho_05-20.csv", index_col=0)

In [174]:
selected = selected.reset_index(drop=True)
fraction_ = pd.DataFrame(fraction, columns=["TP", "FN", "FP"])
selected = pd.concat([selected, fraction_], axis=1)
selected["fraction"] = selected["TP"] / (selected["TP"] + selected["FN"] + selected["FP"])


# selected["Submode"] = selected["Submode"].astype(str)
# # selected["Scheme"] = "mode" + selected["Submode"]
# selected["Scheme"] = selected["Submode"].apply(lambda x: scheme_dic[x])
ax = sns.lineplot("Protein", "fraction", hue="Scheme", style="Scheme", markers=True, data=selected, dashes=True)
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
leg = ax.legend(loc = 'center left', bbox_to_anchor = (1.0, 0.5))
# plt.tight_layout()
plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_fraction.png", dpi=300, bbox_inches='tight')



In [ ]:


In [ ]:


In [43]:
s = parser.get_structure("X", "/Users/weilu/Research/server/may_week2_2020/disulfide_bond/setups/1lmm/1lmm.pdb")
m = s[0]
all_res = list(m.get_residues())

In [52]:
n = len(all_res)
native_cys_bond = []
for i in range(n):
    res1 = all_res[i]
    if res1.resname != "CYS":
        continue
    for j in range(i+1, n):
        res2 = all_res[j]
        if res2.resname != "CYS":
            continue
        r = res1["CB"] - res2["CB"]
        # print(i, j, r)
        if r < 4.5:
            # print(i, j, r)
            native_cys_bond.append([i, j, r])
native_cys_bond = pd.DataFrame(native_cys_bond, columns=["i", "j", "r"])

In [53]:
native_cys_bond


Out[53]:
i j r
0 2 17 3.431250
1 9 22 4.407601
2 16 32 3.717292

In [55]:
s = parser.get_structure("X", "/Users/weilu/Research/server/may_week2_2020/disulfide_bond/run1_ho/1lmm/33_0/lastFrame.pdb")

m = s[0]
all_res = list(m.get_residues())

In [56]:
n = len(all_res)
for i in range(n):
    res1 = all_res[i]
    if res1.resname != "CYS":
        continue
    for j in range(i+1, n):
        res2 = all_res[j]
        if res2.resname != "CYS":
            continue
        r = res1["CB"] - res2["CB"]
        if r < 4.5:
            print(i, j, r)


9 17 3.5390723
22 32 4.0912676

In [98]:
native_cys_bond


Out[98]:
i j r
0 2 17 3.431250
1 9 22 4.407601
2 16 32 3.717292

In [80]:
# pdb_list = ["1tt8", "6btc", "6g57", "6q64", "pex22", "sos", "unk2", "6jdq", "5uak"]
pdb_list = ["1hn4_A", "1ppb_H", "1lmm", "1tcg", "1fs3", "1bpi"]
subMode_list = []
# subMode_list += [100, 101, 102, 103, 104, 105, 106]
# subMode_list += [110, 111, 112, 113, 114, 115, 116]
# subMode_list += [120, 121, 122, 123, 124, 125, 126]
# subMode_list += [0, 1, 2, 3, 4, 5]
# subMode_list += [30, 31, 32, 33, 34, 35]
# subMode_list += [1300, 130, 131, 132, 133]
subMode_list += [3300, 330, 331, 332, 333]
# subMode_list += [130, 131, 132, 133, 134, 135] 
simulationType = "six_proteins"
subFolder = "run2_single_timeStep2_lesser_frag"
pre_base = "/Users/weilu/Research/server/jun_week1_2020//disulfide_bond/"
info = []
all_data = []
for pdb in pdb_list:
    for submode in subMode_list:
        for i in range(20):
            pre = f"{pre_base}/{subFolder}/{pdb}/{submode}_{i}"
            location = f"{pre}/info.dat"
            try:
                tmp = pd.read_csv(location, sep="\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Submode=submode)
                all_data.append(tmp)
            except Exception as e:
                print(pdb, i, submode, location)
                print(e)
                pass
data = pd.concat(all_data, sort=False)

# info = pd.DataFrame(info)

today = datetime.today().strftime('%m-%d')
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{subFolder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)


/Users/weilu/Research/data/openMM/six_proteins_run2_single_timeStep2_lesser_frag_06-17.csv

In [28]:
# pdb_list = ["1tt8", "6btc", "6g57", "6q64", "pex22", "sos", "unk2", "6jdq", "5uak"]
pdb_list = ["1hn4_A", "1ppb_H", "1lmm", "1tcg", "1fs3", "1bpi"]
subMode_list = []
# subMode_list += [100, 101, 102, 103, 104, 105, 106]
# subMode_list += [110, 111, 112, 113, 114, 115, 116]
# subMode_list += [120, 121, 122, 123, 124, 125, 126]
# subMode_list += [0, 1, 2, 3, 4, 5]
# subMode_list += [30, 31, 32, 33, 34, 35]
# subMode_list += [1300, 130, 131, 132, 133]
subMode_list += [2300, 230, 231, 232, 233]
# subMode_list += [130, 131, 132, 133, 134, 135] 
simulationType = "six_proteins"
subFolder = "run2_single_timeStep2_less_frag"
pre_base = "/Users/weilu/Research/server/jun_week1_2020//disulfide_bond/"
info = []
all_data = []
for pdb in pdb_list:
    for submode in subMode_list:
        for i in range(20):
            pre = f"{pre_base}/{subFolder}/{pdb}/{submode}_{i}"
            location = f"{pre}/info.dat"
            try:
                tmp = pd.read_csv(location, sep="\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Submode=submode)
                all_data.append(tmp)
            except Exception as e:
                print(pdb, i, submode, location)
                print(e)
                pass
data = pd.concat(all_data, sort=False)

# info = pd.DataFrame(info)

today = datetime.today().strftime('%m-%d')
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{subFolder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)


1ppb_H 1 2300 /Users/weilu/Research/server/jun_week1_2020//disulfide_bond//run2_single_timeStep2_less_frag/1ppb_H/2300_1/info.dat
[Errno 2] File b'/Users/weilu/Research/server/jun_week1_2020//disulfide_bond//run2_single_timeStep2_less_frag/1ppb_H/2300_1/info.dat' does not exist: b'/Users/weilu/Research/server/jun_week1_2020//disulfide_bond//run2_single_timeStep2_less_frag/1ppb_H/2300_1/info.dat'
1lmm 1 230 /Users/weilu/Research/server/jun_week1_2020//disulfide_bond//run2_single_timeStep2_less_frag/1lmm/230_1/info.dat
[Errno 2] File b'/Users/weilu/Research/server/jun_week1_2020//disulfide_bond//run2_single_timeStep2_less_frag/1lmm/230_1/info.dat' does not exist: b'/Users/weilu/Research/server/jun_week1_2020//disulfide_bond//run2_single_timeStep2_less_frag/1lmm/230_1/info.dat'
/Users/weilu/Research/data/openMM/six_proteins_run2_single_timeStep2_less_frag_06-10.csv

In [3]:
# pdb_list = ["1tt8", "6btc", "6g57", "6q64", "pex22", "sos", "unk2", "6jdq", "5uak"]
pdb_list = ["1hn4_A", "1ppb", "1lmm", "1tcg", "1fs3", "1bpi"]
subMode_list = []
# subMode_list += [100, 101, 102, 103, 104, 105, 106]
# subMode_list += [110, 111, 112, 113, 114, 115, 116]
# subMode_list += [120, 121, 122, 123, 124, 125, 126]
# subMode_list += [0, 1, 2, 3, 4, 5]
# subMode_list += [30, 31, 32, 33, 34, 35]
subMode_list += [1300, 130, 131, 132, 133]
# subMode_list += [130, 131, 132, 133, 134, 135] 
simulationType = "six_proteins"
subFolder = "run2_single_timeStep2"
pre_base = "/Users/weilu/Research/server/jun_week1_2020//disulfide_bond/"
info = []
all_data = []
for pdb in pdb_list:
    for submode in subMode_list:
        for i in range(20):
            pre = f"{pre_base}/{subFolder}/{pdb}/{submode}_{i}"
            location = f"{pre}/info.dat"
            try:
                tmp = pd.read_csv(location, sep="\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Submode=submode)
                all_data.append(tmp)
            except Exception as e:
                print(pdb, i, submode, location)
                print(e)
                pass
data = pd.concat(all_data, sort=False)

# info = pd.DataFrame(info)

today = datetime.today().strftime('%m-%d')
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{subFolder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)


/Users/weilu/Research/data/openMM/six_proteins_run2_single_timeStep2_06-06.csv

In [ ]:
pdb_list = ["1ppb", "1lmm", "1tcg", "1hn4_A", "1fs3", "1bpi"]
# pdb_list = ["1ppb", "1lmm", "1tcg", "1hn4_A", "1bpi"]
for pdb in pdb_list:
    print(pdb)
    protein = pdb
    fileLocation = f"/Users/weilu/Research/server/jun_week1_2020//disulfide_bond/setups/{protein}/{protein}.pdb"
    native_cys_bond = get_cys_info(fileLocation)

    data_selected = data_cys.query(f"Protein=='{protein}'")
    result = data_selected.groupby(["model", "Run", "Protein", "Submode"]).apply(correct_fraction_v3, native_cys_bond=native_cys_bond).reset_index()
    result.to_csv(f"/Users/weilu/Research/data/openMM/{protein}_cys_06-06.csv")

In [19]:
# pdb_list = ["1tt8", "6btc", "6g57", "6q64", "pex22", "sos", "unk2", "6jdq", "5uak"]
pdb_list = ["1hn4_A", "1ppb_H", "1ppb", "1lmm", "1tcg", "1hn4", "1fs3", "1bpi"]
subMode_list = []
# subMode_list += [100, 101, 102, 103, 104, 105, 106]
# subMode_list += [110, 111, 112, 113, 114, 115, 116]
# subMode_list += [120, 121, 122, 123, 124, 125, 126]
# subMode_list += [0, 1, 2, 3, 4, 5]
# subMode_list += [30, 31, 32, 33, 34, 35]
subMode_list += [300, 30, 31, 32, 33]
# subMode_list += [130, 131, 132, 133, 134, 135] 
simulationType = "six_proteins"
subFolder = "run1_ho"
pre_base = "/Users/weilu/Research/server/may_week2_2020/disulfide_bond/"
info = []
all_data = []
for pdb in pdb_list:
    for submode in subMode_list:
        for i in range(20):
            pre = f"{pre_base}/{subFolder}/{pdb}/{submode}_{i}"
            location = f"{pre}/info.dat"
            try:
                tmp = pd.read_csv(location, sep="\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Submode=submode)
                all_data.append(tmp)
            except Exception as e:
                print(pdb, i, submode, location)
                print(e)
                pass
data = pd.concat(all_data, sort=False)

# info = pd.DataFrame(info)

today = datetime.today().strftime('%m-%d')
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{subFolder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)


/Users/weilu/Research/data/openMM/six_proteins_run1_ho_05-21.csv

In [ ]:
a = pd.read_csv("/Users/weilu/Research/data/openMM/length_info_12-01.csv", index_col=0)
data = data.merge(a, on="Protein")

In [17]:
filtered_data = data.query("Steps > 1500 and Protein == '1lmm'")
Q_max = filtered_data.sort_values('Q').groupby(["Submode", "Run"]).tail(1).sort_values(['Submode', "Q"])
Q_max["Annealing Index"] = Q_max.groupby(["Submode"])["Q"].rank(method="first", ascending=False).astype(int)
sns.lineplot("Annealing Index", "Q", hue="Submode", markers=True, data=Q_max, style="Submode", dashes=False)


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x122d39a20>

In [ ]:


In [88]:
selected = data.query("Steps > 1500").sort_values("Q").groupby(["Protein", "Submode"]).tail(1)
selected["Submode"] = selected["Submode"].astype(str)
selected["Scheme"] = "mode" + selected["Submode"]
ax = sns.lineplot("Protein", "Q", hue="Scheme", markers=True, data=selected, dashes=False)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)


Out[88]:
<matplotlib.legend.Legend at 0x189ed32b0>

In [74]:
selected = data.query("Steps > 1500")
sns.boxenplot("Protein", "Q", hue="Submode", data=selected)


Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x12b83e630>

In [ ]:
selected.to_csv("/Users/weilu/Research/server/may_week2_2020/disulfide_bond/selected.csv")

In [ ]:
memory = 'ho'
filtered_data = data.query(f"memory == '{memory}'").query("k_cys != 'k_10' and k_cys != 'k_20'")
# filtered_data = data.query(f"Protein == '{protein}'").query(f"memory == '{memory}'").query("k_cys != 'k_10' and k_cys != 'k_20'")
Q_max = filtered_data.sort_values('Q').groupby(["Submode", "Protein"]).tail(1).sort_values(['Submode', "Q"])
Q_max.Protein = pd.Categorical(Q_max.Protein, a.sort_values("Length").Protein.values)
Q_max.k_cys = pd.Categorical(Q_max.k_cys, ["k_0", "k_1", "k_2", "k_5"])
# Q_max["Annealing Index"] = Q_max.groupby(["Submode"])["Q"].rank(method="first", ascending=False).astype(int)
# last = data.query("Steps > 350 and Folder in ['eighth_2_cys']")

# sns.lineplot("Annealing Index", "Q", markers=True, style="k_cys", data=Q_max, hue="k_cys", dashes=False)
sns.lineplot("Protein", "Q", hue="k_cys", markers=True, data=Q_max.sort_values("Length").reset_index(drop=True), style="k_cys", dashes=False)
plt.ylabel("Best Q")
# _ = plt.xticks(np.arange(1, 21, 1))
plt.title(f"structure prediction results of 5 Disufide rich proteins")
# plt.savefig("/Users/weilu/Dropbox/openAWSEM/figures/k_cys_original_gamma.png", dpi=300)

In [28]:
a = np.load("/Users/weilu/Research/server/may_week3_2020/T1024/T1024.npz")

In [33]:
a["dist"].shape


Out[33]:
(408, 408, 37)

In [31]:
b = np.load("/Users/weilu/Research/server/may_week3_2020/T1024/dist.npz")

In [34]:
b["distspline"].shape


Out[34]:
(408, 408, 35)

In [ ]:
def get_frame(file="movie.pdb", to="last_frame.pdb", frame=-1):
    # default is last frame.
    # if you want first, please set frame to 1.
    a = open(file).read().split("ENDMDL")
    assert a[-1] == "\nEND\n"
    with open(to, "w") as out:
        out.write(a[frame-1])

In [ ]:


In [ ]:


In [ ]:


In [31]:
# read all cys information.
# pdb_list = ["1tt8", "6btc", "6g57", "6q64", "pex22", "sos", "unk2", "6jdq", "5uak"]
pdb_list = ["1hn4_A", "1ppb_H", "1lmm", "1tcg", "1fs3", "1bpi"]
subMode_list = []
# subMode_list += [100, 101, 102, 103, 104, 105, 106]
# subMode_list += [110, 111, 112, 113, 114, 115, 116]
# subMode_list += [120, 121, 122, 123, 124, 125, 126]
# subMode_list += [0, 1, 2, 3, 4, 5]
# subMode_list += [30, 31, 32, 33, 34, 35]
# subMode_list += [300, 30, 31, 32, 33]
subMode_list += [3300, 330, 331, 332, 333]
# subMode_list += [130, 131, 132, 133, 134, 135] 
simulationType = "six_proteins"
subFolder = "run2_single_timeStep2_lesser_frag"
pre_base = "/Users/weilu/Research/server/jun_week1_2020/disulfide_bond/"
info = []
all_data = []
for pdb in pdb_list:
    for submode in subMode_list:
        for i in range(15):
            pre = f"{pre_base}/{subFolder}/{pdb}/{submode}_{i}"
            location = f"{pre}/cys_bonds.csv"
            try:
                tmp = pd.read_csv(location, index_col=0)
                tmp = tmp.assign(Run=i, Protein=pdb, Submode=submode)
                all_data.append(tmp)
            except Exception as e:
                print(pdb, i, submode, location)
                print(e)
                pass
data = pd.concat(all_data, sort=False)

# info = pd.DataFrame(info)

today = datetime.today().strftime('%m-%d')
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{subFolder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)


/Users/weilu/Research/data/openMM/six_proteins_run2_single_timeStep2_lesser_frag_06-16.csv

In [79]:
# read all cys information.
# pdb_list = ["1tt8", "6btc", "6g57", "6q64", "pex22", "sos", "unk2", "6jdq", "5uak"]
pdb_list = ["1hn4_A", "1ppb_H", "1lmm", "1tcg", "1fs3", "1bpi"]
subMode_list = []
# subMode_list += [100, 101, 102, 103, 104, 105, 106]
# subMode_list += [110, 111, 112, 113, 114, 115, 116]
# subMode_list += [120, 121, 122, 123, 124, 125, 126]
# subMode_list += [0, 1, 2, 3, 4, 5]
# subMode_list += [30, 31, 32, 33, 34, 35]
# subMode_list += [300, 30, 31, 32, 33]
subMode_list += [3300, 330, 331, 332, 333]
# subMode_list += [130, 131, 132, 133, 134, 135] 
simulationType = "cys_all"
subFolder = "run2_single_timeStep2_lesser_frag"
pre_base = "/Users/weilu/Research/server/jun_week1_2020/disulfide_bond/"
info = []
all_data = []
for pdb in pdb_list:
    for submode in subMode_list:
        for i in range(20):
            pre = f"{pre_base}/{subFolder}/{pdb}/{submode}_{i}"
            location = f"{pre}/cys_bonds.csv"
            try:
                tmp = pd.read_csv(location, index_col=0)
                tmp = tmp.assign(Run=i, Protein=pdb, Submode=submode)
                all_data.append(tmp)
            except Exception as e:
                print(pdb, i, submode, location)
                print(e)
                pass
data = pd.concat(all_data, sort=False)

# info = pd.DataFrame(info)

today = datetime.today().strftime('%m-%d')
outFile = f"/Users/weilu/Research/data/openMM/{simulationType}_{subFolder}_{today}.csv"
data.reset_index(drop=True).to_csv(outFile)
print(outFile)


/Users/weilu/Research/data/openMM/cys_all_run2_single_timeStep2_lesser_frag_06-17.csv

In [ ]: