In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

# Default figure size for every plot: 10 x 6.18 (width / height is about the golden ratio).
sns.set(rc={'figure.figsize':(10,6.180)})
# Switch the seaborn style to "whitegrid" (called after sns.set so it is not reset by it).
sns.set_style("whitegrid")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [106]:
def my_reorder(a, first):
    """Return the labels of ``a`` with those listed in ``first`` moved to the front.

    Parameters
    ----------
    a : iterable
        Labels to reorder (for example ``DataFrame.columns``).
    first : iterable
        Labels that must come first, in the given order. Any iterable is
        accepted (list, tuple, pandas Index, ...) and it is never mutated.

    Returns
    -------
    list
        Every entry of ``first`` followed by the entries of ``a`` that are not
        in ``first``, in their original order. Entries of ``first`` that do not
        occur in ``a`` are still part of the result.
    """
    # list(first) copies any iterable; first.copy() + .append() only worked for lists.
    leading = list(first)
    return leading + [col for col in a if col not in leading]

def read_pdb(pre, name, run=30, rerun=2):
    """Load and merge the ``wham.dat`` / ``energy.dat`` tables of one protein.

    Files are looked up in ``{pre}{name}/simulation/{i}/{j}/`` for every ``i``
    in ``range(run)`` and every ``j`` in ``range(rerun)``. Directories whose
    ``wham.dat`` cannot be read are reported on stdout and skipped.

    Parameters
    ----------
    pre : str
        Path prefix; it is concatenated as-is, so it must end with a separator.
    name : str
        Name of the protein sub-directory.
    run : int
        Number of first-level simulation directories to scan.
    rerun : int
        Number of second-level directories to scan inside each of them.

    Returns
    -------
    pandas.DataFrame
        All tables stacked with a fresh 0..n-1 index. Columns start with
        ``Steps, Qw, VTotal, Run, Repeat``, followed by the remaining columns.

    Raises
    ------
    ValueError
        If no ``wham.dat`` could be read for any (i, j) combination.
    """
    wham_drop = ['Tc', 'Energy']
    # 'Steps' is dropped from energy.dat; the 'Steps' column requested below is
    # presumably the one provided by wham.dat -- confirm against the file format.
    energy_drop = ['Steps', 'Shake', 'Excluded', 'Helix', 'AMH-Go', 'Vec_FM', 'SSB']

    all_data = []
    for i in range(run):
        for j in range(rerun):
            location = pre + f"{name}/simulation/{i}/{j}/"
            try:
                wham = pd.read_csv(location+"wham.dat")
            except Exception:
                # Best effort: skip unreadable runs. `Exception` instead of a bare
                # `except` so that Ctrl-C / kernel interrupt still stops the loop.
                print(f"PDB: {name}, Run: {i}, Rerun: {j} not exist")
                print(location+"wham.dat")
                continue
            # Header names are stripped of surrounding whitespace before use.
            wham.columns = wham.columns.str.strip()
            wham = wham.drop(wham_drop, axis=1)

            energy = pd.read_csv(location+"energy.dat")
            energy.columns = energy.columns.str.strip()
            energy = energy.drop(energy_drop, axis=1)

            # NOTE(review): the outer `run` index is stored as "Repeat" and the
            # inner `rerun` index as "Run" -- confirm this naming is intended.
            data = pd.concat([wham, energy], axis=1).assign(Repeat=i, Run=j)
            all_data.append(data)

    if not all_data:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"no wham.dat could be read for '{name}' under '{pre}' "
            f"(run={run}, rerun={rerun})"
        )

    data = pd.concat(all_data).reset_index(drop=True)
    data = data.reindex(columns=my_reorder(data.columns, ["Steps", "Qw", "VTotal", "Run", "Repeat"]))
    print(name, len(data))
    return data

In [49]:
# Protein name lists for each set; "combined" is the "old" set followed by the "new" set.
dataset = {
    "old": ["1R69", "1UTG", "3ICB", "256BA", "4CPV", "1CCR", "2MHR", "1MBA", "2FHA"],
    "new": ["1FC2C", "1ENH", "2GB1", "2CRO", "1CTF", "4ICB"],
    "test": [
        "t089", "t120", "t251", "top7", "1ubq",
        "t0766", "t0778", "t0782", "t0792", "t0803",
        "t0815", "t0833", "t0842", "t0844",
    ],
}
dataset["combined"] = [*dataset["old"], *dataset["new"]]

def get_complete_data(pre, folder_list, pdb_list, **kwargs):
    """Collect the ``read_pdb`` tables of several proteins across several folders.

    For each ``folder`` in ``folder_list`` the prefix ``{pre}{folder}/`` is
    handed to ``read_pdb`` once per entry of ``pdb_list``. Each entry is
    lower-cased and cut to its first four characters before use (NOTE(review):
    five-character names such as "t0766" are shortened too -- confirm the
    directory names match). ``kwargs`` are forwarded to ``read_pdb``.

    Returns a DataFrame whose first two columns are ``Name`` and ``Folder``;
    the row index of each per-protein table is kept as is.
    """
    per_folder = []
    for folder in folder_list:
        pre_folder = f"{pre}{folder}/"
        # Lazily derived so names are processed in the same order as reads.
        short_names = (p.lower()[:4] for p in pdb_list)
        frames = [
            read_pdb(pre_folder, name, **kwargs).assign(Name=name)
            for name in short_names
        ]
        per_folder.append(pd.concat(frames).assign(Folder=folder))

    combined = pd.concat(per_folder)
    column_order = my_reorder(combined.columns, ["Name", "Folder"])
    return combined.reindex(columns=column_order)

In [107]:
# Load 2lep from the "strengthen_beta" folder (run=20, rerun=1) and keep a named handle.
pre = "/Users/weilu/Research/server/april_2019/globular_2xov_named_2lep/"
folder_list = ["strengthen_beta"]
pdb_list = ["2lep"]
data_strength_beta = data = get_complete_data(
    pre, folder_list, pdb_list, run=20, rerun=1
)


2lep 40000

In [108]:
# Load 2lep from the "longerRun" folder (run=20, rerun=2) and keep a named handle.
pre = "/Users/weilu/Research/server/april_2019/globular_2xov_named_2lep/"
folder_list = ["longerRun"]
pdb_list = ["2lep"]
data_longerRun = data = get_complete_data(
    pre, folder_list, pdb_list, run=20, rerun=2
)


2lep 80000

In [97]:
# Load the "longerRun" simulations of 2lep and snapshot them to a CSV named after
# today's month-day.
pre = "/Users/weilu/Research/server/april_2019/globular_2xov_named_2lep/"
folder_list = ["longerRun"]
pdb_list = ["2lep"]
data = get_complete_data(
    pre, folder_list, pdb_list, run=20, rerun=2
)

today = datetime.datetime.today().strftime('%m-%d')
print(today)
# The index is reset so the stored first column is a clean 0..n-1 row number.
data.reset_index(drop=True).to_csv(
    f"/Users/weilu/Research/data/optimization/energy_{today}.csv"
)


2lep 80000
04-11

In [101]:
# Stale assignment removed: when the notebook runs top to bottom, `data` at this
# point is the "longerRun" frame loaded a few cells earlier, so rebinding
# `data_strength_beta = data` here would overwrite the "strengthen_beta" frame
# (bound in the cell that loads that folder) and the concat below would contain
# the longerRun data twice.

In [99]:
# Keep the current `data` frame under the name `data_longerRun`.
# NOTE(review): only correct while `data` still holds the "longerRun" load -- verify when re-running cells out of order.
data_longerRun = data

In [109]:
# Stack both frames row-wise; the original row indices are kept (no reset_index).
data = pd.concat([data_strength_beta, data_longerRun])

The native water energy is about -53.


In [110]:
# Show the first row only, to check which columns are present.
data.head(1)


Out[110]:
Name Folder Steps Qw VTotal Run Repeat Rg Chain Chi Rama DSSP P_AP Water Burial Frag_Mem Membrane Ebond Epair
0 2lep strengthen_beta 4000 0.095645 -284.816011 0 0 30.959568 97.453826 19.810282 -103.119297 0.0 0.0 0.05606 -53.375831 -245.64105 0 160.906666 2.88755

In [ ]:
# DSSP against Qw: one panel per Name (4 per row), one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Qw", "DSSP", alpha=0.1).add_legend()

In [111]:
# DSSP against Qw: one panel per Name (4 per row), one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Qw", "DSSP", alpha=0.1).add_legend()



In [104]:
# Water against Qw (alpha 0.1): one panel per Name, one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Qw", "Water", alpha=0.1).add_legend()



In [98]:
# Water against Qw (alpha 0.5): one panel per Name, one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Qw", "Water", alpha=0.5).add_legend()



In [62]:
# Load the combined training set from "iter2_normalized_noFrag" (run=10, rerun=2),
# keep the rows whose Steps is a multiple of 80000, and snapshot the full table
# to a CSV named after today's month-day.
pre = "/Users/weilu/Research/server/april_2019/iterative_optimization_combined_train_set/"
folder_list = ["iter2_normalized_noFrag"]
pdb_list = dataset["combined"]
data = get_complete_data(
    pre, folder_list, pdb_list, run=10, rerun=2
)
subset_data = data.query("Steps % 80000 == 0")

today = datetime.datetime.today().strftime('%m-%d')
print(today)
data.reset_index(drop=True).to_csv(
    f"/Users/weilu/Research/data/optimization/energy_{today}.csv"
)


1r69 20000
1utg 20000
3icb 20000
256b 20000
4cpv 20000
1ccr 20000
2mhr 20000
1mba 15000
2fha 14421
1fc2 40000
1enh 40000
2gb1 40000
2cro 40000
1ctf 40000
4icb 40000
04-01

In [72]:
# Reload the energy_04-01.csv snapshot; its first column is used as the row index.
data = pd.read_csv("/Users/weilu/Research/data/optimization/energy_04-01.csv", index_col=0)


/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/numpy/lib/arraysetops.py:522: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

In [76]:
# (rows, columns) of the reloaded table.
data.shape


Out[76]:
(1228195, 17)

In [84]:
# Load the combined training set from "iter6_normalized_noFrag" (run=10, rerun=2),
# keep the rows whose Steps is a multiple of 80000, and snapshot the full table
# to a month-day stamped CSV carrying a "_1" suffix.
pre = "/Users/weilu/Research/server/april_2019/iterative_optimization_combined_train_set_with_frag/"
folder_list = ["iter6_normalized_noFrag"]
pdb_list = dataset["combined"]
data = get_complete_data(
    pre, folder_list, pdb_list, run=10, rerun=2
)
subset_data = data.query("Steps % 80000 == 0")

today = datetime.datetime.today().strftime('%m-%d')
print(today)
data.reset_index(drop=True).to_csv(
    f"/Users/weilu/Research/data/optimization/energy_{today}_1.csv"
)


1r69 20000
1utg 20000
3icb 20000
256b 20000
4cpv 20000
1ccr 20000
2mhr 20000
1mba 15000
2fha 15000
1fc2 40000
1enh 40000
2gb1 40000
2cro 40000
1ctf 40000
4icb 40000
04-08

In [83]:
# Load the combined training set from "iter0" (run=30, rerun=1) and snapshot the
# full table to a CSV named after today's month-day. `subset_data` is left untouched.
pre = "/Users/weilu/Research/server/april_2019/iterative_optimization_combined_train_set_with_frag/"
folder_list = ["iter0"]
pdb_list = dataset["combined"]
data = get_complete_data(
    pre, folder_list, pdb_list, run=30, rerun=1
)

today = datetime.datetime.today().strftime('%m-%d')
print(today)
data.reset_index(drop=True).to_csv(
    f"/Users/weilu/Research/data/optimization/energy_{today}.csv"
)


1r69 30000
1utg 30000
3icb 30000
256b 30000
4cpv 30000
1ccr 30000
2mhr 30000
1mba 22500
2fha 22500
1fc2 60000
1enh 60000
2gb1 60000
2cro 60000
1ctf 60000
4icb 60000
04-08

In [112]:
# Reload the energy_04-08.csv snapshot; its first column is used as the row index.
data = pd.read_csv("/Users/weilu/Research/data/optimization/energy_04-08.csv", index_col=0)

In [113]:
# Keep the current `subset_data` under an iter6-specific name.
# NOTE(review): relies on `subset_data` still being the subset built from the
# "iter6_normalized_noFrag" load -- verify when re-running cells out of order.
subset_iter6 = subset_data

In [115]:
# First rows of the reloaded snapshot.
data.head()


Out[115]:
Name Folder Steps Qw VTotal Run Repeat Rg Chain Chi Rama P_AP Water Burial Frag_Mem Ebond Epair
0 1r69 iter0 4000 0.342615 -114.472189 0 0 11.061036 157.535308 21.793236 -48.797913 0 -10.796545 -2.331958 -231.874317 284.228717 11.066021
1 1r69 iter0 8000 0.278288 -203.720363 0 0 12.400950 117.297890 38.255093 -91.625458 0 -10.943550 -4.078143 -252.626195 214.008935 8.430595
2 1r69 iter0 12000 0.279709 -286.708326 0 0 11.648768 68.085923 22.697867 -105.052346 0 -6.586444 -2.945326 -262.907999 181.378195 3.043456
3 1r69 iter0 16000 0.283629 -331.560857 0 0 11.789625 99.347878 19.623003 -157.677660 0 -13.977479 -3.824403 -275.052196 194.861902 8.094861
4 1r69 iter0 20000 0.302869 -341.545376 0 0 11.664210 86.151590 18.731492 -153.732794 0 -7.723566 -4.621929 -280.350169 162.867865 3.861190

In [116]:
# Water against Frag_Mem: one panel per Name (4 per row), one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Frag_Mem", "Water", alpha=0.5).add_legend()



In [87]:
# Keep only the rows whose Steps value is a multiple of 80000.
subset_iter0_with_rg = data.query("Steps % 80000 == 0")

In [92]:
# Stack the two subsets row-wise for side-by-side plotting; note that this rebinds `data`.
data = pd.concat([subset_iter0_with_rg, subset_iter6])

In [ ]:

With the Rg constraint, we are able to find those non-native collapsed structures.

I will add iter6 gamma, with constraint below.


In [95]:
# Water against Qw: one panel per Name (4 per row), one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Qw", "Water", alpha=0.5).add_legend()



In [93]:
# Rg against Steps: one panel per Name (4 per row), one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Steps", "Rg", alpha=0.5).add_legend()



In [89]:
# Rg against Steps for subset_iter0_with_rg: one panel per Name, one colour per
# Folder, axes not shared between panels.
g = sns.FacetGrid(
    subset_iter0_with_rg,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Steps", "Rg", alpha=0.5).add_legend()



In [88]:
# Rg against Steps for subset_iter6: one panel per Name, one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    subset_iter6,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Steps", "Rg", alpha=0.5).add_legend()



In [82]:
# Rg against Steps for subset_data: one panel per Name, one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    subset_data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Steps", "Rg", alpha=0.5).add_legend()



In [78]:
# Rg against Steps for subset_data: one panel per Name, one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    subset_data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Steps", "Rg", alpha=0.5).add_legend()



In [75]:
# Water against Qw for subset_data: one panel per Name, one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    subset_data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Qw", "Water", alpha=0.5).add_legend()



In [64]:
# Water against Qw for subset_data: one panel per Name, one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    subset_data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Qw", "Water", alpha=0.5).add_legend()



In [50]:
# Load the combined training set from the iter0 and iter1 "normalized_noFrag"
# folders (run=10, rerun=2), keep the rows whose Steps is a multiple of 80000,
# and snapshot the full table to a CSV named after today's month-day.
pre = "/Users/weilu/Research/server/april_2019/iterative_optimization_combined_train_set/"
folder_list = [
    "iter0_normalized_noFrag",
    "iter1_normalized_noFrag",
]
pdb_list = dataset["combined"]
data = get_complete_data(
    pre, folder_list, pdb_list, run=10, rerun=2
)
subset_data = data.query("Steps % 80000 == 0")

today = datetime.datetime.today().strftime('%m-%d')
print(today)
data.reset_index(drop=True).to_csv(
    f"/Users/weilu/Research/data/optimization/energy_{today}.csv"
)


1r69 20000
1utg 20000
3icb 20000
256b 20000
4cpv 20000
1ccr 20000
2mhr 20000
1mba 15000
2fha 14738
1fc2 40000
1enh 40000
2gb1 40000
2cro 40000
1ctf 40000
4icb 40000
1r69 20000
1utg 20000
3icb 20000
256b 20000
4cpv 20000
1ccr 20000
2mhr 20000
1mba 15000
2fha 14036
1fc2 40000
1enh 40000
2gb1 40000
2cro 40000
1ctf 40000
4icb 40000

In [52]:
# Write the current `data` frame, with a fresh 0..n-1 index, to a CSV named
# after today's month-day.
today = datetime.datetime.today().strftime('%m-%d')
print(today)
data.reset_index(drop=True).to_csv(
    f"/Users/weilu/Research/data/optimization/energy_{today}.csv"
)


03-31

In [57]:
# Water against Steps for subset_data: one panel per Name, one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    subset_data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Steps", "Water", alpha=0.5).add_legend()



In [58]:
# Water against Qw for subset_data: one panel per Name, one colour per Folder,
# axes not shared between panels.
g = sns.FacetGrid(
    subset_data,
    col="Name",
    col_wrap=4,
    hue="Folder",
    sharey=False,
    sharex=False,
)
g = g.map(plt.scatter, "Qw", "Water", alpha=0.5).add_legend()



In [ ]:


In [31]:
# NOTE(review): this rebinds `dataset` so that each value is a (name list, integer)
# tuple instead of a plain list, and it defines no "combined" key; cells that read
# dataset["combined"] must not be re-run after this one. The integer is unpacked
# into `steps` below -- presumably a step count, confirm.
dataset = {"old":("1R69, 1UTG, 3ICB, 256BA, 4CPV, 1CCR, 2MHR, 1MBA, 2FHA".split(", "), 40),
            "new":("1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", "), 80),
            "test":(["t089", "t120", "t251", "top7", "1ubq", "t0766", "t0778", "t0782", "t0792", "t0803", "t0815", "t0833", "t0842", "t0844"], 40)}
pdb_list, steps = dataset["old"]

# Load the "old" set from the "single" and "noFrag" folders (run=30, rerun=1).
# get_complete_data performs exactly the per-folder / per-protein loop, the
# Name / Folder tagging and the column reordering that used to be spelled out here.
pre = "/Users/weilu/Research/server/april_2019/iterative_optimization_old_set/"
data = get_complete_data(pre, ["single", "noFrag"], pdb_list, run=30, rerun=1)
# Keep only the rows whose Steps value is a multiple of 80000.
data_subset = data.query("Steps % 80000 == 0")


1r69 30000
1utg 30000
3icb 30000
256b 30000
4cpv 30000
1ccr 30000
2mhr 30000
1mba 23146
2fha 19885
1r69 30000
1utg 30000
3icb 30000
256b 30000
4cpv 30000
1ccr 30000
2mhr 30000
1mba 25680
2fha 22113

In [32]:
# First rows of the combined "single" / "noFrag" table.
data.head()


Out[32]:
Name Folder Steps Qw VTotal Run Repeat Rg Chain Chi Rama P_AP Water Burial Frag_Mem Ebond Epair
0 1r69 single 4000 0.087288 -97.248076 0 0 30.620022 84.805297 28.447449 -76.387581 0 -1.560166 -51.343641 -81.209434 185.385749 4.497708
1 1r69 single 8000 0.179975 -183.634765 0 0 22.565086 111.754883 16.184323 -113.969007 0 -4.159232 -53.089330 -140.356402 177.924318 14.831873
2 1r69 single 12000 0.235191 -220.810608 0 0 16.929506 95.902273 21.480374 -113.593651 0 -7.269499 -53.867215 -163.462890 153.082652 8.920596
3 1r69 single 16000 0.377747 -255.999037 0 0 11.232515 95.598735 16.251947 -133.302030 0 -22.810907 -54.899661 -156.837122 178.698420 4.564993
4 1r69 single 20000 0.205453 -211.454610 0 0 18.008032 91.545729 19.502155 -116.180706 0 -10.169564 -52.802971 -143.349253 164.482353 10.297213

In [33]:
# Keep only the rows whose Steps value is a multiple of 80000.
data_subset = data.query("Steps % 80000 == 0")

In [40]:
# Water against Qw for data_subset: one panel per Name (4 per row), one colour per Folder.
g = sns.FacetGrid(
    data_subset,
    col="Name",
    col_wrap=4,
    hue="Folder",
)
g = g.map(plt.scatter, "Qw", "Water", alpha=0.5).add_legend()



In [39]:
# Water against Steps for data_subset: one panel per Name (4 per row), one colour per Folder.
g = sns.FacetGrid(
    data_subset,
    col="Name",
    col_wrap=4,
    hue="Folder",
)
g = g.map(plt.scatter, "Steps", "Water", alpha=0.5).add_legend()



In [27]:
# Frag_Mem against Steps for data_subset: one panel per Name, 4 panels per row.
g = sns.FacetGrid(
    data_subset,
    col="Name",
    col_wrap=4,
).map(plt.scatter, "Steps", "Frag_Mem")



In [29]:
# Qw against Steps for data_subset: one panel per Name, 4 panels per row.
g = sns.FacetGrid(
    data_subset,
    col="Name",
    col_wrap=4,
).map(plt.scatter, "Steps", "Qw")



In [16]:
# Qw against Steps for the full `data` table: one panel per Name, 4 panels per row.
g = sns.FacetGrid(
    data,
    col="Name",
    col_wrap=4,
).map(plt.scatter, "Steps", "Qw")



In [ ]: