In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
# from small_script.myFunctions import *



%matplotlib inline
%load_ext autoreload
%autoreload 2

In [32]:
# Default figure size; width:height is the golden ratio (16.18 : 10).
plt.rcParams['figure.figsize'] = [16.18033, 10]

# Named protein sets. Each value is a (list of protein ids, integer) pair; cells
# further down unpack the pair as `pdb_list, steps = dataset[...]`.
dataset = {"old":("1R69, 1UTG, 3ICB, 256BA, 4CPV, 1CCR, 2MHR, 1MBA, 2FHA".split(", "), 40),
            "new":("1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", "), 80),
            "test":(['t089', 't120', 't251', 'top7', '1ubq', 't0766', 't0778', 't0782',
                       't0792', 't0803', 't0815', 't0833', 't0842', 't0844'], 40)}
# pdb_list, steps = dataset["old"]


def get_data(pre, pdb_list, simType="all_simulations", n_rum=30, formatName=True,
             n_frames=50, n_rerun=1):
    """Collect the trailing rows of every run's ``wham.dat`` into one DataFrame.

    For every protein in ``pdb_list``, every run index in ``range(n_rum)`` and
    every rerun index in ``range(n_rerun)``, the file
    ``{pre}{simType}/{name}/simulation/{run}/{rerun}/wham.dat`` is read with
    ``pd.read_csv`` and its last ``n_frames`` rows are kept.

    Parameters
    ----------
    pre : str
        Path prefix. It is concatenated directly with the rest of the path, so
        it must already end with a path separator.
    pdb_list : iterable of str
        Protein identifiers.
    simType : str
        Name of the sub-folder of ``pre`` that holds the simulations.
    n_rum : int
        Number of runs to look for per protein (spelling kept so existing
        keyword callers keep working).
    formatName : bool
        If True, each identifier is lower-cased and cut to its first four
        characters before being used as folder name; otherwise it is used as is.
    n_frames : int
        How many trailing rows of each file to keep.
    n_rerun : int
        Number of rerun sub-folders to look for inside each run folder.

    Returns
    -------
    pandas.DataFrame
        The kept rows of all readable files, with whitespace stripped from the
        column names, an ``index`` column holding each row's position in its
        source file, and the added columns ``Run`` (``"Run<i>"``), ``Name``
        and ``Rerun``.

    Raises
    ------
    ValueError
        If not a single file could be read.
    """
    frames = []
    for p in pdb_list:
        name = p.lower()[:4] if formatName else p
        for i in range(n_rum):
            for ii in range(n_rerun):
                location = pre + f"{simType}/{name}/simulation/{i}/{ii}/wham.dat"
                try:
                    tmp = pd.read_csv(location).tail(n_frames).reset_index()
                    tmp.columns = tmp.columns.str.strip()
                    frames.append(tmp.assign(Run=i, Name=name, Rerun=ii))
                except Exception as e:
                    # Best effort: a missing or unreadable run is reported and
                    # skipped so the remaining runs are still collected.
                    print(e)
    if not frames:
        # pd.concat([]) would fail with an unhelpful message; say what went wrong.
        raise ValueError(f"get_data: no wham.dat file could be read under '{pre}{simType}/'")
    data = pd.concat(frames)
    data["Run"] = "Run" + data["Run"].astype(str)
    return data

# pre = "/Users/weilu/Research/server/feb_2019/optimization_iter1/database/2gb1/"
# fileName = "movie.pdb"
def splitPDB(pre, fileName):
    """Split a multi-frame PDB file into one ``frame<i>.pdb`` file per frame.

    The file ``fileName`` inside directory ``pre`` is read line by line. Every
    line whose content is ``END`` (trailing whitespace or a missing final
    newline is tolerated) closes the current frame, which is written, including
    that ``END`` line, to ``frame0.pdb``, ``frame1.pdb``, ... in the same
    directory. Lines after the last ``END`` are not written anywhere.

    Parameters
    ----------
    pre : str
        Directory holding the input file; the frame files are written there
        too, whether or not ``pre`` ends with a path separator.
    fileName : str
        Name of the multi-frame PDB file inside ``pre``.
    """
    location = os.path.join(pre, fileName)
    frame_index = 0
    frame_lines = []
    with open(location, "r") as f:
        for line in f:
            frame_lines.append(line)
            if line.rstrip() == "END":
                # Join with `pre` the same way as for the input, so the frames
                # always land inside the directory rather than next to it.
                with open(os.path.join(pre, f"frame{frame_index}.pdb"), "w") as out:
                    out.writelines(frame_lines)
                frame_index += 1
                frame_lines = []


import subprocess
def getFromTerminal(CMD):
    """Run ``CMD`` through the shell and return what it wrote to stdout, decoded to text."""
    process = subprocess.Popen(CMD, shell=True, stdout=subprocess.PIPE)
    stdout_bytes, _ = process.communicate()
    return stdout_bytes.decode()
def getSize(p, preL="/Users/weilu/Research/server/feb_2019/iterative_optimization_test_set/all_simulations/"):
    """Return the number of lines in a protein's ``ssweight`` file.

    The protein folder name is ``p`` lower-cased and cut to four characters;
    the file read is ``{preL}{protein}/{protein}/ssweight``.

    Parameters
    ----------
    p : str
        Protein identifier.
    preL : str
        Path prefix, concatenated directly (it must end with a path separator).

    Returns
    -------
    int
        Line count of the file. A last line without a trailing newline is
        counted as well.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    protein = p.lower()[:4]
    ssweight_path = preL + f"{protein}/{protein}/ssweight"
    # Count in Python instead of shelling out to `wc`: no dependence on an
    # external tool and no path interpolated into a shell command line.
    with open(ssweight_path, "rb") as f:
        return sum(1 for _ in f)

In [36]:
def getSize(p, preL="/Users/weilu/Research/server/feb_2019/iterative_optimization_test_set/all_simulations/"):
    """Return the number of lines in a protein's ``ssweight`` file.

    The protein folder name is ``p`` lower-cased and cut to four characters;
    the file read is ``{preL}{protein}/{protein}/ssweight``.

    Parameters
    ----------
    p : str
        Protein identifier.
    preL : str
        Path prefix, concatenated directly (it must end with a path separator).

    Returns
    -------
    int
        Line count of the file. A last line without a trailing newline is
        counted as well.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    protein = p.lower()[:4]
    ssweight_path = preL + f"{protein}/{protein}/ssweight"
    # Count in Python instead of shelling out to `wc`: no dependence on an
    # external tool and no path interpolated into a shell command line.
    with open(ssweight_path, "rb") as f:
        return sum(1 for _ in f)

In [81]:
# Build a table with one row per training protein: its name, its Length (line
# count of its ssweight file, via getSize) and an Rg value read from wham.dat.
#
# The id lists live in their own dict instead of rebinding `dataset`: other
# cells unpack `dataset[...]` as a (pdb_list, steps) pair, which plain lists
# would break when the notebook is run top to bottom.
protein_sets = {"old":"1R69, 1UTG, 3ICB, 256BA, 4CPV, 1CCR, 2MHR, 1MBA, 2FHA".split(", "),
                "new":"1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", "),
                "test":["t089", "t120", "t251", "top7", "1ubq", "t0766", "t0778", "t0782", "t0792", "t0803", "t0815", "t0833", "t0842", "t0844"]}
protein_sets["combined"] = protein_sets["old"] + protein_sets["new"]

pre = "/Users/weilu/Research/server/april_2019/iterative_optimization_combined_train_set/native/"
records = []
for p in protein_sets["combined"]:
    protein = p.lower()[:4]
    location = pre + f"{protein}/simulation/0/rerun/wham.dat"
    # First data row, third column of wham.dat; stored as Rg below.
    # NOTE(review): confirm that this column really is the radius of gyration.
    rg = pd.read_csv(location).values[0][2]
    records.append({"Protein": protein, "Length": getSize(p, preL=pre), "Rg": rg})

# One record per protein gives correctly oriented columns from the start; the
# explicit cast pins the numeric dtypes.
data = pd.DataFrame(records, columns=["Protein", "Length", "Rg"]).astype({"Length": int, "Rg": float})

In [83]:
# Add Rg divided by Length as a new column (modifies `data` in place).
data["Rg_normalized_by_length"] = data["Rg"] / data["Length"]

In [82]:
# Scatter plot with Length on the x axis and Rg on the y axis, one point per row.
data.plot.scatter("Length", "Rg")


Out[82]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a294fff28>

In [87]:
# Save the table to CSV; the DataFrame index is written as the first column.
data.to_csv("/Users/weilu/Research/server/april_2019/protein_info.csv")

In [89]:
# Reload the saved table, using the first CSV column as the index.
data = pd.read_csv("/Users/weilu/Research/server/april_2019/protein_info.csv", index_col=0)

In [94]:
# Look up the Rg stored for protein 1r69 (first matching row of the table).
rg = data.query("Protein == '1r69'")["Rg"].values[0]


Out[94]:
10.1147733666746

In [95]:
# Scratch string for trying out str.replace.
a = "sddsfsdf"

In [96]:
# str.replace substitutes every occurrence of "sd" and returns a new string; `a` itself is unchanged.
a.replace("sd", "a")


Out[96]:
'adsfaf'

In [86]:
# Display the current contents of `data`.
data


Out[86]:
Protein Length Rg Rg_normalized_by_length
0 1r69 63 10.114773 0.160552
1 1utg 70 12.705198 0.181503
2 3icb 75 11.189342 0.149191
3 256b 106 14.173721 0.133714
4 4cpv 108 12.543948 0.116148
5 1ccr 111 12.978877 0.116927
6 2mhr 118 13.775409 0.116741
7 1mba 146 14.725247 0.100858
8 2fha 172 18.408701 0.107027
9 1fc2 44 9.620456 0.218647
10 1enh 54 10.112475 0.187268
11 2gb1 56 10.165557 0.181528
12 2cro 65 10.115199 0.155618
13 1ctf 68 10.552933 0.155190
14 4icb 76 11.338816 0.149195

In [85]:
# Display the table ordered by Length (a sorted copy; `data` keeps its order).
data.sort_values("Length")


Out[85]:
Protein Length Rg Rg_normalized_by_length
9 1fc2 44 9.620456 0.218647
10 1enh 54 10.112475 0.187268
11 2gb1 56 10.165557 0.181528
0 1r69 63 10.114773 0.160552
12 2cro 65 10.115199 0.155618
13 1ctf 68 10.552933 0.155190
1 1utg 70 12.705198 0.181503
2 3icb 75 11.189342 0.149191
14 4icb 76 11.338816 0.149195
3 256b 106 14.173721 0.133714
4 4cpv 108 12.543948 0.116148
5 1ccr 111 12.978877 0.116927
6 2mhr 118 13.775409 0.116741
7 1mba 146 14.725247 0.100858
8 2fha 172 18.408701 0.107027

In [64]:
# Show the current value of `rg` rounded to two decimal places.
round(rg, 2)


Out[64]:
11.34

In [62]:
# Display the current contents of `data`.
data


Out[62]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
Protein 1r69 1utg 3icb 256b 4cpv 1ccr 2mhr 1mba 2fha 1fc2 1enh 2gb1 2cro 1ctf 4icb
Length 63 70 75 106 108 111 118 146 172 44 54 56 65 68 76
Rg 10.1148 12.7052 11.1893 14.1737 12.5439 12.9789 13.7754 14.7252 18.4087 9.62046 10.1125 10.1656 10.1152 10.5529 11.3388

In [58]:
# Read a single value for protein 1ccr: first data row, third column of its
# rerun wham.dat, kept in `rg`.
pre = "/Users/weilu/Research/server/april_2019/iterative_optimization_combined_train_set/native/"
location = pre + "1ccr/simulation/0/rerun/wham.dat"
rg= pd.read_csv(location).values[0][2]

In [60]:
# Display the value read from wham.dat into `rg`. `rg` is used rather than the
# scratch name `a`, which this notebook binds to a plain string (no `.values`).
rg


Out[60]:
12.978876511958699

In [52]:
# Display the current contents of `data`.
data


Out[52]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
Protein 1r69 1utg 3icb 256b 4cpv 1ccr 2mhr 1mba 2fha 1fc2 1enh 2gb1 2cro 1ctf 4icb
Length 63 70 75 106 108 111 118 146 172 44 54 56 65 68 76

In [51]:
# Display the current contents of `data`.
data


Out[51]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
Protein 1r69 1utg 3icb 256b 4cpv 1ccr 2mhr 1mba 2fha 1fc2 1enh 2gb1 2cro 1ctf 4icb
Length 63 70 75 106 108 111 118 146 172 44 54 56 65 68 76

In [27]:
# Load the test-set info table, using the first CSV column as the index.
data = pd.read_csv("/Users/weilu/Research/library/test_set_info.csv", index_col=0)

In [30]:
# Show the table ordered by Length, transposed so that each row of `data` becomes a column.
data.sort_values("Length").T


Out[30]:
4 8 0 3 10 5 6 11 7 2 1 12 9 13 14
Name 1ubq t0792 t089 top7 t0815 t0766 t0778 t0833 t0782 t251 t120 t0842 t0803 t0844 t0846
Length 76 80 81 92 106 108 108 108 110 111 115 120 134 137 243

In [35]:
# Compare the Qw distributions of two saved result sets, protein by protein.
# Each CSV is tagged with a Scheme label before the two are stacked, so the
# label can drive the hue of the box plot.
data = pd.read_csv("/Users/weilu/Research/data/optimization/single_iterative_optimization_test_03-29.csv", index_col=0)
data2 = pd.read_csv("/Users/weilu/Research/data/optimization/iter7_2_iterative_optimization_test_03-29.csv", index_col=0)
d = pd.concat([
    data.assign(Scheme="singleMemory"),
    data2.assign(Scheme="iter7"),
])
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.boxplot(x="Name", y="Qw", hue="Scheme", data=d)
# Horizontal variant: sns.boxplot(x="Qw", y="Name", hue="Scheme", data=d)


Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2722ffd0>

In [33]:
# Collect the trailing frames of the "single" simulations for the test set,
# save them to a CSV stamped with today's month-day, and plot Qw per protein.
pre = "/Users/weilu/Research/server/april_2019/"
folder = "iterative_optimization_test"
pre = pre + folder + "/"
simulationType = "single"
today = datetime.datetime.today().strftime('%m-%d')
# Relies on `dataset["test"]` being a (pdb_list, steps) pair.
pdb_list, steps = dataset["test"]
# pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
data = get_data(pre, pdb_list, simType=simulationType, n_rum=10, formatName=False)
data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/optimization/{simulationType}_{folder}_{today}.csv")
# data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/optimization/withoutContact_{today}.csv")
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.boxplot(x="Name", y="Qw", data=data)


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a25ac3dd8>

In [34]:
# Collect the trailing frames of the "iter7_2" simulations for the test set,
# save them to a CSV stamped with today's month-day, and plot Qw per protein.
pre = "/Users/weilu/Research/server/april_2019/"
folder = "iterative_optimization_test"
pre = pre + folder + "/"
simulationType = "iter7_2"
today = datetime.datetime.today().strftime('%m-%d')
# Relies on `dataset["test"]` being a (pdb_list, steps) pair.
pdb_list, steps = dataset["test"]
# pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
data = get_data(pre, pdb_list, simType=simulationType, n_rum=10, formatName=False)
data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/optimization/{simulationType}_{folder}_{today}.csv")
# data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/optimization/withoutContact_{today}.csv")
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.boxplot(x="Name", y="Qw", data=data)


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a271b9390>

In [ ]: