In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

sns.set(rc={'figure.figsize':(10,6.180)})
sns.set_style("whitegrid")

%matplotlib inline

In [2]:
plt.rcParams['figure.figsize'] = [16.18033, 10]    #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100

In [31]:
pdb_list = ["2bg9", "1j4n", "1py6_SD", "2bl2", "1rhz", "1iwg", "2ic8", "1pv6", "1occ", "1kpl", "2bs2", "1py6", "1u19"]

simulationType = "openMM_membrane_structure_prediction"
folder = "test_2"

location = f"/Users/weilu/Research/server/jun_2019/{simulationType}/{folder}/energy_new_no_burial.dat"
new = pd.read_csv(location, sep="\s+").assign(Folder="new")
location = f"/Users/weilu/Research/server/jun_2019/{simulationType}/{folder}/energy_no_burial.dat"
original = pd.read_csv(location, sep="\s+").assign(Folder="original")
# d = pd.concat([new, original])
plt.scatter(new["Qc"], new["Contact"])
plt.scatter(original["Qc"], original["Contact"], c="red")
# original.plot.scatter("Qc", "Contact")


Out[31]:
<matplotlib.collections.PathCollection at 0x1a1ce28128>

In [32]:
pdb_list = ["2bg9", "1j4n", "1py6_SD", "2bl2", "1rhz", "1iwg", "2ic8", "1pv6", "1occ", "1kpl", "2bs2", "1py6", "1u19"]

simulationType = "openMM_membrane_structure_prediction"
folder = "test_1_1"

location = f"/Users/weilu/Research/server/jun_2019/{simulationType}/{folder}/energy_new_no_burial.dat"
new = pd.read_csv(location, sep="\s+").assign(Folder="new")
location = f"/Users/weilu/Research/server/jun_2019/{simulationType}/{folder}/energy_no_burial.dat"
original = pd.read_csv(location, sep="\s+").assign(Folder="original")
# d = pd.concat([new, original])
plt.scatter(new["Qc"], new["Contact"])
plt.scatter(original["Qc"], original["Contact"], c="red")
# original.plot.scatter("Qc", "Contact")


Out[32]:
<matplotlib.collections.PathCollection at 0x1a1bebae10>

In [30]:



Out[30]:
<matplotlib.collections.PathCollection at 0x1a1ce840f0>

In [ ]:
pdb_list = ["2bg9", "1j4n", "1py6_SD", "2bl2", "1rhz", "1iwg", "2ic8", "1pv6", "1occ", "1kpl", "2bs2", "1py6", "1u19"]

simulationType = "openMM_membrane_structure_prediction"
folder = "test_2"
n_run = 1
all_data = []
for pdb in pdb_list:
    for i in range(n_run):
        for restart in range(2):
            location = f"/Users/weilu/Research/server/jun_2019/{simulationType}/{folder}/{pdb}/{i}_{restart}/info.dat"
            try:
                tmp = pd.read_csv(location, sep="\s+")
                tmp = tmp.assign(Run=i, Protein=pdb, Restart=restart)
                all_data.append(tmp)
            except:
                print(pdb, i, restart)
                pass
data = pd.concat(all_data)
today = datetime.today().strftime('%m-%d')
data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/openMM/{simulationType}_{folder}_{today}.csv")

In [3]:
def my_reorder(a, first):
    # move first to the top. and keep the rest
    new_order = first.copy()
    for col in a:
        if col not in first:
            new_order.append(col)
    return new_order

def read_pdb(pre, name, run=30, rerun=2):
    all_data = []
    if run == -1:
        run_list = ["native"]
    else:
        run_list = list(range(run))
    for i in run_list:
        if rerun == -1:
            rerun_list = ["rerun"]
        else:
            rerun_list = list(range(rerun))
        for j in rerun_list:
            # pre = "/Users/weilu/Research/server/nov_2018/iterative_optimization_4/all_simulations/"
            location = pre + f"{name}/simulation/{i}/{j}/"
            try:
                wham = pd.read_csv(location+"wham.dat")
            except:
                print(f"PDB: {name}, Run: {i}, Rerun: {j} not exist")
                print(location+"wham.dat")
                continue
            wham.columns = wham.columns.str.strip()
            remove_columns = ['Tc', 'Energy']
            wham = wham.drop(remove_columns, axis=1)
            energy = pd.read_csv(location+"energy.dat")
            energy.columns = energy.columns.str.strip()
            remove_columns = ['Steps', 'Shake', 'Excluded', 'Helix', 'AMH-Go', 'Vec_FM', 'SSB']
            energy = energy.drop(remove_columns, axis=1)
            data = pd.concat([wham, energy], axis=1).assign(Repeat=i, Run=j)
            all_data.append(data)
    data = pd.concat(all_data).reset_index(drop=True)
    data = data.reindex(columns=my_reorder(data.columns, ["Steps", "Qw", "VTotal", "Run", "Repeat"]))
    print(name, len(data))
    return data

In [5]:
dataset = {"old":"1R69, 1UTG, 3ICB, 256BA, 4CPV, 1CCR, 2MHR, 1MBA, 2FHA".split(", "),
            "new":"1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", "),
            "test":["t089", "t120", "t251", "top7", "1ubq", "t0766", "t0778", "t0782", "t0792", "t0803", "t0815", "t0833", "t0842", "t0844"]}
dataset["combined"] = dataset["old"] + dataset["new"]
#     folder = "first"
#     steps = 30
def get_complete_data(pre, folder_list, pdb_list, formatName=True, **kwargs):
    complete_all_data = []
    for folder in folder_list:
        # pre = "/Users/weilu/Research/server/april_2019/iterative_optimization_old_set/"
        pre_folder = f"{pre}{folder}/"
        all_data = []
        for p in pdb_list:
            if formatName:
                name = p.lower()[:4]
            else:
                name = p
            tmp = read_pdb(pre_folder, name, **kwargs)
            all_data.append(tmp.assign(Name=name))
        data = pd.concat(all_data)
        complete_all_data.append(data.assign(Folder=folder))
    data = pd.concat(complete_all_data)
    data = data.reindex(columns=my_reorder(data.columns, ["Name", "Folder"]))
    return data

In [ ]:


In [7]:
pre = "/Users/weilu/Research/server/jun_2019/membrane_potein_prediction/"
# folder_list = ["multi_iter0", "original"]
folder_list = ["first"]
# pdb_list = ['T0759-D1', 'T0953s2-D1', 'T0943-D1', 'T0773-D1', 'T0816-D1', 'T0854-D2', 'T0767-D1', 'T0853-D1', 'T0958-D1', 'T0834-D2', 'T0960-D3', 'T0862-D1', 'T0912-D3', 'T0898-D1', 'T0824-D1', 'T0782-D1', 'T0830-D2', 'T0761-D2', 'T0968s1-D1', 'T0870-D1', 'T0838-D1', 'T0803-D1']
pdb_list = ["2bg9", "1j4n", "1py6_SD", "2bl2", "1rhz", "1iwg", "2ic8", "1pv6", "1occ", "1kpl", "2bs2", "1py6", "1u19"]

# data = get_complete_data(pre, folder_list, pdb_list, run=30, rerun=-1, formatName=True)
data = get_complete_data(pre, folder_list, pdb_list, run=10, rerun=1, formatName=False)
data.Steps = data.Steps.astype(int)
data["Contact"] = data["Water"] + data["Burial"]
subset_data = data.query("Steps % 80000 == 0 and Steps != 0")
a = data


2bg9 7500
1j4n 7500
1py6_SD 7500
2bl2 6980
1rhz 5847
1iwg 5621
2ic8 4811
1pv6 4590
1occ 5748
1kpl 6148
2bs2 5585
1py6 5799
1u19 3818

In [10]:


In [12]:
data = get_complete_data(pre, folder_list, pdb_list, run=-1, rerun=-1, formatName=False)
data.Steps = data.Steps.astype(int)
data["Contact"] = data["Water"] + data["Burial"]
native = data


2bg9 27
1j4n 27
1py6_SD 27
2bl2 18
1rhz 14
1iwg 15
2ic8 13
1pv6 12
1occ 13
1kpl 11
2bs2 12
1py6 10
1u19 7

In [14]:
native_energy = native.groupby(["Name", "Folder"]).head(1)
prediction_energy = data.groupby(["Name", "Repeat", "Folder"]).head(1)
native_energy["Contact"] = native_energy["Water"] + native_energy["Burial"]


/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until

In [17]:
native_energy


Out[17]:
Name Folder Steps Qw VTotal Run Repeat Rg Chain Chi Rama DSSP P_AP Water Burial Frag_Mem Membrane Ebond Epair Contact
0 2bg9 first 0 0.838101 -948.414961 rerun native 15.220424 5.257505 2.125330 -472.736420 -1.546839e-23 -4.132152 -8.159816 -80.603627 -365.448652 -24.717129 0.793329 0.132028 -88.763444
0 1j4n first 0 0.944231 -1192.264636 rerun native 16.984969 9.934251 2.884753 -510.291123 -2.347511e-13 -24.266347 -27.095550 -98.797371 -540.186041 -4.447209 1.302925 0.633765 -125.892921
0 1py6_SD first 0 0.977503 -1219.178215 rerun native 15.567489 8.020894 3.168102 -591.291468 -1.095754e-11 -16.638590 -22.423293 -106.655333 -483.004036 -10.354490 1.454516 0.604241 -129.078627
0 2bl2 first 0 0.977152 -1536.032752 rerun native 17.583618 16.732055 4.580090 -699.557997 -5.731210e-10 -65.592803 -66.780346 -118.704081 -604.826281 -1.883389 2.953080 3.200684 -185.484427
0 1rhz first 0 0.966949 -1499.395452 rerun native 17.144802 11.648333 3.504758 -635.301058 -7.307994e-04 -22.519471 -55.247615 -141.073410 -644.937690 -15.468568 2.024466 1.035943 -196.321025
0 1iwg first 0 0.956191 -1857.954118 rerun native 17.329547 11.202651 4.863830 -778.514078 -4.249604e-11 -38.894420 -50.473604 -143.637969 -847.096406 -15.404122 1.626904 1.058777 -194.111573
0 2ic8 first 0 0.952585 -1757.486692 rerun native 15.822472 18.132672 4.939960 -786.166908 -3.804962e-03 -33.210745 -87.592790 -152.161484 -703.599720 -17.823872 2.555108 1.677647 -239.754274
0 1pv6 first 0 0.940864 -1846.435169 rerun native 17.026593 21.073074 5.362404 -819.864103 -2.768316e-11 -40.909274 -74.163136 -165.492078 -730.664706 -41.777349 3.741343 2.057372 -239.655214
0 1occ first 0 0.970189 -1837.646139 rerun native 19.121399 11.386351 4.668925 -899.021427 -1.793563e-07 -23.079568 -52.129511 -166.342742 -694.731792 -18.396375 1.836573 0.707416 -218.472253
0 1kpl first 0 0.964803 -2002.395921 rerun native 18.085826 21.454679 5.906635 -886.466975 -3.566527e-08 -83.594964 -98.804139 -160.413143 -813.059274 12.581260 3.918806 2.448429 -259.217282
0 2bs2 first 0 0.974153 -2205.956854 rerun native 19.221433 16.678803 5.804319 -1051.844953 -2.294539e-12 -21.937901 -49.441027 -190.786283 -898.109580 -16.320231 2.604627 1.035547 -240.227310
0 1py6 first 0 0.952403 -2277.245532 rerun native 17.864933 16.636881 5.498604 -1060.222114 -1.901159e-06 -59.327782 -72.139637 -197.394441 -894.790264 -15.506777 2.496800 1.040674 -269.534078
0 1u19 first 0 0.955318 -2718.316935 rerun native 19.730365 25.071359 7.666655 -1253.849934 -8.827194e+00 -55.135343 -118.934786 -240.287595 -1040.758396 -33.261701 4.131762 2.224099 -359.222381

In [15]:
y_show = "Contact"
g = sns.FacetGrid(a, col="Name",col_wrap=4,  hue="Folder", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Qw", y_show, alpha=0.5).add_legend())
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'multi_iter0_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'original_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="orange", linewidth=4)
for ax in g.axes:
    name= ax.title.get_text().split(" ")[-1]
    # print(name)
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
    ax.axhline(energy, ls="--", color="blue", linewidth=4)
#     energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[1]
#     ax.axhline(energy, ls="--", color="orange", linewidth=4)



In [ ]:
subset_data.tail()

In [ ]:


In [84]:
a = subset_data.query("Folder != 'multi_iter0_A_norm'")

In [85]:
y_show = "Contact"
g = sns.FacetGrid(a, col="Name",col_wrap=4,  hue="Folder", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Qw", y_show, alpha=0.5).add_legend())
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'multi_iter0_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'original_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="orange", linewidth=4)
for ax in g.axes:
    name= ax.title.get_text().split(" ")[-1]
    # print(name)
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
    ax.axhline(energy, ls="--", color="blue", linewidth=4)
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[1]
    ax.axhline(energy, ls="--", color="orange", linewidth=4)



In [40]:
data.to_csv("/Users/weilu/Research/data/optimization/may08.csv")

In [41]:
last_frame = data.groupby(["Name", "Repeat", "Folder"]).tail(1)

In [55]:
def do(cmd):
    return subprocess.Popen(cmd, shell=True).wait()

In [51]:
import subprocess

In [ ]:


In [57]:
for i, line in last_frame.iterrows():
    frame = int(int(line["Steps"]) / 4000)
    folder = line["Folder"]
    name = line["Name"]
    repeat = int(line["Repeat"])
    source = f"/Users/weilu/Research/server/may_2019/database/{folder}_{name}_{repeat}/frame{frame}.pdb"
    pre = "/Users/weilu/Research/server/may_2019/single_memory/chosen_structures/"
    do(f"mkdir -p {pre}{name}")
    target = f"{pre}{name}/{folder}_{repeat}.pdb"
    a = do(f"cp {source} {target}")
    # print(a)
    # print(frame, line)
    # break

In [65]:


In [61]:
name_list = [a.lower()[:4] for a in pdb_list]

In [69]:
name = name_list[0]
for name in name_list:
    out_file = f'''
load cleaned_pdbs/{name}.pdb
load {name}/original_0.pdb
load {name}/multi_iter0_0.pdb
alignto {name},
orient
util.color_deep("gray80", '{name}', 0)
util.color_deep("green", 'original_0', 0)
util.color_deep("red", 'multi_iter0_0', 0)
'''
    with open(f"/Users/weilu/Research/server/may_2019/single_memory/chosen_structures/{name}.pml", "w") as out:
        out.write(out_file)

In [25]:
subset_data = subset_data.query("Steps % 80000 == 0 and Steps != 0")

In [30]:
subset_data.head()


Out[30]:
Name Folder Steps Qw VTotal Run Repeat Rg Chain Chi Rama DSSP P_AP Water Burial Frag_Mem Membrane Ebond Epair
20 1r69 original 80000 0.260807 -95.226489 rerun 0 12.152029 89.649368 22.310868 -95.028886 -5.027819e-08 -2.392265e+00 -8.481156 -52.398134 -48.886283 0 182.881847 2.415284
40 1r69 original 160000 0.289319 -153.975015 rerun 0 11.343404 80.901303 23.956897 -129.096091 -4.829285e-05 -4.281021e+00 -17.782898 -54.834392 -52.838764 0 152.662868 9.564202
60 1r69 original 240000 0.269704 -109.853812 rerun 0 11.993795 93.107580 39.120580 -120.314117 -4.341149e-08 -1.480912e-07 -12.983571 -54.297658 -54.486626 0 153.819585 6.610288
80 1r69 original 320000 0.323581 -145.143185 rerun 0 11.123848 84.775473 27.871982 -110.119621 -3.622934e-13 -9.983451e-01 -28.144780 -53.935452 -64.592443 0 161.938049 11.678037
100 1r69 original 400000 0.283331 -135.358589 rerun 0 11.343645 95.362166 19.067645 -105.841878 -2.887054e-07 -4.868810e+00 -22.096093 -53.450463 -63.531155 0 165.933341 4.365326

In [31]:
subset_data["Contact"] = subset_data["Water"] + subset_data["Burial"]

In [21]:
data = get_complete_data(pre, folder_list, pdb_list, run=-1, rerun=-1, formatName=True)
data.Steps = data.Steps.astype(int)
data["Contact"] = data["Water"] + data["Burial"]
native = data


1r69 26
1utg 26
3icb 26
256b 26
4cpv 26
1ccr 26
2mhr 26
1mba 26
2fha 26
1fc2 26
1enh 26
2gb1 26
2cro 26
1ctf 26
4icb 26
1r69 26
1utg 26
3icb 26
256b 26
4cpv 26
1ccr 26
2mhr 26
1mba 26
2fha 26
1fc2 26
1enh 26
2gb1 26
2cro 26
1ctf 26
4icb 26

In [7]:
subset_data.head()


Out[7]:
Name Folder Steps Qw VTotal Run Repeat Rg Chain Chi Rama DSSP P_AP Water Burial Frag_Mem Membrane Ebond Epair
0 1r69 original 0 0.005951 -189.021357 rerun 0 57.809329 3.906152 0.925549 -137.479793 0.000000e+00 0.000000e+00 0.000000 -51.827012 -4.546254 0 0.933290 0.000000
20 1r69 original 80000 0.260807 -95.226489 rerun 0 12.152029 89.649368 22.310868 -95.028886 -5.027819e-08 -2.392265e+00 -8.481156 -52.398134 -48.886283 0 182.881847 2.415284
40 1r69 original 160000 0.289319 -153.975015 rerun 0 11.343404 80.901303 23.956897 -129.096091 -4.829285e-05 -4.281021e+00 -17.782898 -54.834392 -52.838764 0 152.662868 9.564202
60 1r69 original 240000 0.269704 -109.853812 rerun 0 11.993795 93.107580 39.120580 -120.314117 -4.341149e-08 -1.480912e-07 -12.983571 -54.297658 -54.486626 0 153.819585 6.610288
80 1r69 original 320000 0.323581 -145.143185 rerun 0 11.123848 84.775473 27.871982 -110.119621 -3.622934e-13 -9.983451e-01 -28.144780 -53.935452 -64.592443 0 161.938049 11.678037

In [23]:
native_energy = native.groupby(["Name", "Folder"]).head(1)
prediction_energy = data.groupby(["Name", "Repeat", "Folder"]).head(1)
native_energy["Contact"] = native_energy["Water"] + native_energy["Burial"]

In [11]:


In [70]:
y_show = "VTotal"
g = sns.FacetGrid(subset_data, col="Name",col_wrap=4,  hue="Folder", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Qw", y_show, alpha=0.5).add_legend())
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'multi_iter0_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'original_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="orange", linewidth=4)
for ax in g.axes:
    name= ax.title.get_text().split(" ")[-1]
    # print(name)
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
    ax.axhline(energy, ls="--", color="blue", linewidth=4)
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[1]
    ax.axhline(energy, ls="--", color="orange", linewidth=4)



In [35]:
y_show = "Contact"
g = sns.FacetGrid(subset_data, col="Name",col_wrap=4,  hue="Folder", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Qw", y_show, alpha=0.5).add_legend())
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'multi_iter0_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'original_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="orange", linewidth=4)
for ax in g.axes:
    name= ax.title.get_text().split(" ")[-1]
    # print(name)
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
    ax.axhline(energy, ls="--", color="blue", linewidth=4)
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[1]
    ax.axhline(energy, ls="--", color="orange", linewidth=4)



In [29]:
y_show = "Water"
g = sns.FacetGrid(subset_data, col="Name",col_wrap=4,  hue="Folder", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Qw", y_show, alpha=0.5).add_legend())
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'multi_iter0_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'original_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="orange", linewidth=4)
for ax in g.axes:
    name= ax.title.get_text().split(" ")[-1]
    # print(name)
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
    ax.axhline(energy, ls="--", color="blue", linewidth=4)
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[1]
    ax.axhline(energy, ls="--", color="orange", linewidth=4)



In [36]:
y_show = "Frag_Mem"
g = sns.FacetGrid(subset_data, col="Name",col_wrap=4,  hue="Folder", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Qw", y_show, alpha=0.5).add_legend())
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'multi_iter0_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'original_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="orange", linewidth=4)
for ax in g.axes:
    name= ax.title.get_text().split(" ")[-1]
    # print(name)
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
    ax.axhline(energy, ls="--", color="blue", linewidth=4)
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[1]
    ax.axhline(energy, ls="--", color="orange", linewidth=4)



In [ ]: