notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
# from small_script.myFunctions import *



%matplotlib inline
%load_ext autoreload
%autoreload 2



In [32]:

    
# Default size for every figure drawn in this notebook (width 16.18033, height 10).
plt.rcParams['figure.figsize'] = [16.18033, 10]

# Protein sets keyed by set name. Each value is a pair: the list of protein ids
# and an integer that the cells consuming this mapping unpack as `steps`.
dataset = dict(
    old=("1R69, 1UTG, 3ICB, 256BA, 4CPV, 1CCR, 2MHR, 1MBA, 2FHA".split(", "), 40),
    new=("1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", "), 80),
    test=(
        ['t089', 't120', 't251', 'top7', '1ubq', 't0766', 't0778', 't0782',
         't0792', 't0803', 't0815', 't0833', 't0842', 't0844'],
        40,
    ),
)
# pdb_list, steps = dataset["old"]


def get_data(pre, pdb_list, simType="all_simulations", n_rum=30, formatName=True, n_frames=50):
    """Collect the last frames of every run's ``wham.dat`` into one DataFrame.

    For each protein in ``pdb_list`` and each run index in ``range(n_rum)`` the
    file ``{pre}{simType}/{name}/simulation/{run}/0/wham.dat`` is read with
    ``pd.read_csv`` and its last ``n_frames`` rows are kept.

    Parameters
    ----------
    pre : str
        Path prefix. It is concatenated directly with the rest of the path, so
        it must already end with a path separator.
    pdb_list : iterable of str
        Protein identifiers.
    simType : str
        Sub-folder of ``pre`` that holds the per-protein folders.
    n_rum : int
        Number of runs to read per protein (run indices ``0 .. n_rum - 1``).
        The spelling is kept because callers pass it by keyword.
    formatName : bool
        If true, each identifier is lower-cased and cut to its first four
        characters before being used as the folder name.
    n_frames : int
        Number of trailing rows kept from each ``wham.dat`` (default 50).

    Returns
    -------
    pandas.DataFrame
        The kept rows of every readable file, with whitespace stripped from the
        column names, an ``index`` column holding each row's position in its
        source file, and the added columns ``Run`` (``"Run<i>"``), ``Name`` and
        ``Rerun``.

    Raises
    ------
    ValueError
        If no ``wham.dat`` file could be read at all.
    """
    frames = []
    for p in pdb_list:
        name = p.lower()[:4] if formatName else p
        for i in range(n_rum):
            for ii in range(1):  # only rerun index 0 is read
                location = pre + f"{simType}/{name}/simulation/{i}/{ii}/wham.dat"
                try:
                    tmp = pd.read_csv(location).tail(n_frames).reset_index()
                    tmp.columns = tmp.columns.str.strip()
                    frames.append(tmp.assign(Run=i, Name=name, Rerun=ii))
                except Exception as e:
                    # Best effort: a missing or unreadable run is reported and
                    # skipped so the remaining runs are still collected.
                    print(e)
    if not frames:
        # pd.concat([]) would raise a ValueError that does not say what went wrong.
        raise ValueError(
            f"no wham.dat could be read under {pre}{simType}/ for {list(pdb_list)}"
        )
    data = pd.concat(frames)
    data["Run"] = "Run" + data["Run"].astype(str)
    return data

# pre = "/Users/weilu/Research/server/feb_2019/optimization_iter1/database/2gb1/"
# fileName = "movie.pdb"
def splitPDB(pre, fileName):
    """Split a multi-frame PDB file into one file per frame.

    The file ``fileName`` inside directory ``pre`` is read line by line. Every
    time a line consisting of ``END`` (surrounding whitespace ignored) is
    reached, the lines accumulated so far are written to
    ``frame<i>.pdb`` in the same directory, with ``i`` counting from 0.
    Lines after the last ``END`` record are not written anywhere.

    Parameters
    ----------
    pre : str
        Directory that holds the input file and receives the frame files.
        A trailing path separator is optional.
    fileName : str
        Name of the multi-frame PDB file inside ``pre``.

    Returns
    -------
    int
        Number of frame files written.
    """
    source = os.path.join(pre, fileName)
    with open(source, "r") as f:
        lines = f.readlines()
    frame_count = 0
    current = []
    for line in lines:
        current.append(line)
        # strip() so that space-padded END records, or a final END without a
        # newline, still close a frame; "ENDMDL" does not match.
        if line.strip() == "END":
            # Written next to the input file whether or not `pre` ends with "/".
            with open(os.path.join(pre, f"frame{frame_count}.pdb"), "w") as out:
                out.writelines(current)
            frame_count += 1
            current = []
    return frame_count


import subprocess
def getFromTerminal(CMD):
    """Run ``CMD`` through the shell and return its standard output decoded to text."""
    process = subprocess.Popen(CMD, stdout=subprocess.PIPE, shell=True)
    stdout_bytes, _ = process.communicate()
    return stdout_bytes.decode()
def getSize(p, preL="/Users/weilu/Research/server/feb_2019/iterative_optimization_test_set/all_simulations/"):
    """Return the number of lines in a protein's ``ssweight`` file.

    The file is looked up at ``{preL}{protein}/{protein}/ssweight`` where
    ``protein`` is ``p`` lower-cased and cut to its first four characters.
    Lines are counted in Python rather than by shelling out to ``wc``, so
    paths containing spaces work and a missing file raises
    ``FileNotFoundError`` instead of an ``IndexError`` on empty output.
    A final line without a trailing newline is counted as a line.

    NOTE: a later cell defines ``getSize`` again; after both cells have run,
    that later definition is the one in effect.

    Parameters
    ----------
    p : str
        Protein identifier.
    preL : str
        Path prefix holding the per-protein folders; it is concatenated
        directly, so it must end with a path separator.

    Returns
    -------
    int
        Line count of the ``ssweight`` file.
    """
    protein = p.lower()[:4]
    ssweight_path = preL + f"{protein}/{protein}/ssweight"
    with open(ssweight_path, "r") as f:
        return sum(1 for _ in f)



In [36]:

    
def getSize(p, preL="/Users/weilu/Research/server/feb_2019/iterative_optimization_test_set/all_simulations/"):
    """Return the number of lines in a protein's ``ssweight`` file.

    The file is looked up at ``{preL}{protein}/{protein}/ssweight`` where
    ``protein`` is ``p`` lower-cased and cut to its first four characters.
    Lines are counted in Python rather than by shelling out to ``wc``, so
    paths containing spaces work and a missing file raises
    ``FileNotFoundError`` instead of an ``IndexError`` on empty output.
    A final line without a trailing newline is counted as a line.

    Parameters
    ----------
    p : str
        Protein identifier.
    preL : str
        Path prefix holding the per-protein folders; it is concatenated
        directly, so it must end with a path separator.

    Returns
    -------
    int
        Line count of the ``ssweight`` file.
    """
    protein = p.lower()[:4]
    ssweight_path = preL + f"{protein}/{protein}/ssweight"
    with open(ssweight_path, "r") as f:
        return sum(1 for _ in f)



In [81]:

    
# Protein id lists for this table, kept under their own name. Rebinding
# `dataset` here would replace the (pdb_list, steps) pairs defined in the setup
# cell, and the cells that unpack `pdb_list, steps = dataset["test"]` would then
# fail when the notebook is run top to bottom.
protein_sets = {"old":"1R69, 1UTG, 3ICB, 256BA, 4CPV, 1CCR, 2MHR, 1MBA, 2FHA".split(", "),
                "new":"1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", "),
                "test":["t089", "t120", "t251", "top7", "1ubq", "t0766", "t0778", "t0782", "t0792", "t0803", "t0815", "t0833", "t0842", "t0844"]}
protein_sets["combined"] = protein_sets["old"] + protein_sets["new"]

pre = "/Users/weilu/Research/server/april_2019/iterative_optimization_combined_train_set/native/"

# One record per protein: short name, ssweight line count (used as Length) and
# the value stored as Rg.
records = []
for p in protein_sets["combined"]:
    protein = p.lower()[:4]
    location = pre + f"{protein}/simulation/0/rerun/wham.dat"
    # Rg is taken from the first data row, column index 2, of the rerun wham.dat.
    rg = pd.read_csv(location).values[0][2]
    records.append({"Protein": protein, "Length": getSize(p, preL=pre), "Rg": rg})

# Built from records so each column gets its own dtype directly; the explicit
# casts keep Length integer and Rg float regardless of what was read.
data = pd.DataFrame(records, columns=["Protein", "Length", "Rg"])
data.Length = data.Length.astype(int)
data.Rg = data.Rg.astype(float)



In [83]:

    
# Add a column holding Rg divided by Length, row by row.
data["Rg_normalized_by_length"] = data["Rg"].div(data["Length"])



In [82]:

    
# Scatter plot of Rg (y axis) against Length (x axis).
data.plot.scatter(x="Length", y="Rg")









    Out[82]:





<matplotlib.axes._subplots.AxesSubplot at 0x1a294fff28>



In [87]:

    
# Save the protein table to CSV; with to_csv's defaults the DataFrame index is
# written as the first column.
data.to_csv("/Users/weilu/Research/server/april_2019/protein_info.csv")



In [89]:

    
# Reload the protein table from CSV, using the first CSV column as the index.
data = pd.read_csv("/Users/weilu/Research/server/april_2019/protein_info.csv", index_col=0)



In [94]:

    
# Rg of protein 1r69: first matching row of `data`.
rg = data.loc[data["Protein"] == "1r69", "Rg"].values[0]









    Out[94]:





10.1147733666746



In [95]:

    
# Scratch string used by the next cell to try out str.replace.
a = "sddsfsdf"



In [96]:

    
# str.replace substitutes every occurrence of "sd" and returns a new string;
# `a` itself is left unchanged.
a.replace("sd", "a")









    Out[96]:





'adsfaf'



In [86]:

    
# Display the whole `data` table.
data









    Out[86]:







  
    
      
      Protein
      Length
      Rg
      Rg_normalized_by_length
    
  
  
    
      0
      1r69
      63
      10.114773
      0.160552
    
    
      1
      1utg
      70
      12.705198
      0.181503
    
    
      2
      3icb
      75
      11.189342
      0.149191
    
    
      3
      256b
      106
      14.173721
      0.133714
    
    
      4
      4cpv
      108
      12.543948
      0.116148
    
    
      5
      1ccr
      111
      12.978877
      0.116927
    
    
      6
      2mhr
      118
      13.775409
      0.116741
    
    
      7
      1mba
      146
      14.725247
      0.100858
    
    
      8
      2fha
      172
      18.408701
      0.107027
    
    
      9
      1fc2
      44
      9.620456
      0.218647
    
    
      10
      1enh
      54
      10.112475
      0.187268
    
    
      11
      2gb1
      56
      10.165557
      0.181528
    
    
      12
      2cro
      65
      10.115199
      0.155618
    
    
      13
      1ctf
      68
      10.552933
      0.155190
    
    
      14
      4icb
      76
      11.338816
      0.149195



In [85]:

    
# Display the table ordered by ascending Length; `data` itself is not modified.
data.sort_values(by="Length")









    Out[85]:







  
    
      
      Protein
      Length
      Rg
      Rg_normalized_by_length
    
  
  
    
      9
      1fc2
      44
      9.620456
      0.218647
    
    
      10
      1enh
      54
      10.112475
      0.187268
    
    
      11
      2gb1
      56
      10.165557
      0.181528
    
    
      0
      1r69
      63
      10.114773
      0.160552
    
    
      12
      2cro
      65
      10.115199
      0.155618
    
    
      13
      1ctf
      68
      10.552933
      0.155190
    
    
      1
      1utg
      70
      12.705198
      0.181503
    
    
      2
      3icb
      75
      11.189342
      0.149191
    
    
      14
      4icb
      76
      11.338816
      0.149195
    
    
      3
      256b
      106
      14.173721
      0.133714
    
    
      4
      4cpv
      108
      12.543948
      0.116148
    
    
      5
      1ccr
      111
      12.978877
      0.116927
    
    
      6
      2mhr
      118
      13.775409
      0.116741
    
    
      7
      1mba
      146
      14.725247
      0.100858
    
    
      8
      2fha
      172
      18.408701
      0.107027



In [64]:

    
# Show the current value of `rg` rounded to two decimals.
round(rg, 2)









    Out[64]:





11.34



In [62]:

    
# Display the current contents of `data`.
data



In [58]:

    
# Read a single value for protein 1ccr from its rerun wham.dat and store it as
# `rg`: first data row, column index 2.
pre = "/Users/weilu/Research/server/april_2019/iterative_optimization_combined_train_set/native/"
location = pre + "1ccr/simulation/0/rerun/wham.dat"
rg= pd.read_csv(location).values[0][2]



In [60]:

    
# NOTE(review): the only visible notebook-level assignment to `a` binds it to a
# string, which has no `.values`. This cell presumably ran while `a` still held
# a DataFrame from a cell that is no longer in the notebook — confirm before
# re-running, as it will fail on a fresh kernel.
a.values[0][2]









    Out[60]:





12.978876511958699



In [52]:

    
# Display the current contents of `data`.
data



In [51]:

    
# Display the current contents of `data`.
data



In [27]:

    
# Rebind `data` to the table stored in test_set_info.csv, using the first CSV
# column as the index.
data = pd.read_csv("/Users/weilu/Research/library/test_set_info.csv", index_col=0)



In [30]:

    
# Display the table ordered by ascending Length, transposed so that each
# original row becomes a column.
data.sort_values(by="Length").T



In [35]:

    
# Load the saved results of the two runs, tag each with a Scheme label and
# compare the Qw distribution per protein (Name) side by side.
data = pd.read_csv("/Users/weilu/Research/data/optimization/single_iterative_optimization_test_03-29.csv", index_col=0)
data2 = pd.read_csv("/Users/weilu/Research/data/optimization/iter7_2_iterative_optimization_test_03-29.csv", index_col=0)
d = pd.concat([
    data.assign(Scheme="singleMemory"),
    data2.assign(Scheme="iter7"),
])
# x and y are passed by keyword: seaborn 0.12 and later accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.boxplot(x="Name", y="Qw", hue="Scheme", data=d)
# Horizontal alternative:
# sns.boxplot(x="Qw", y="Name", hue="Scheme", data=d)









    Out[35]:





<matplotlib.axes._subplots.AxesSubplot at 0x1a2722ffd0>



In [33]:

    
# Collect the last frames of the "single" simulations for the test set, save
# them to a dated CSV and plot the Qw distribution per protein.
pre = "/Users/weilu/Research/server/april_2019/"
folder = "iterative_optimization_test"
pre = pre + folder + "/"
simulationType = "single"
today = datetime.datetime.today().strftime('%m-%d')
# dataset["test"] must be a (pdb_list, steps) pair here; only pdb_list is used below.
pdb_list, steps = dataset["test"]
# pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
data = get_data(pre, pdb_list, simType=simulationType, n_rum=10, formatName=False)
data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/optimization/{simulationType}_{folder}_{today}.csv")
# data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/optimization/withoutContact_{today}.csv")
# x and y are passed by keyword: seaborn 0.12 and later accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.boxplot(x="Name", y="Qw", data=data)









    Out[33]:





<matplotlib.axes._subplots.AxesSubplot at 0x1a25ac3dd8>



In [34]:

    
# Collect the last frames of the "iter7_2" simulations for the test set, save
# them to a dated CSV and plot the Qw distribution per protein.
pre = "/Users/weilu/Research/server/april_2019/"
folder = "iterative_optimization_test"
pre = pre + folder + "/"
simulationType = "iter7_2"
today = datetime.datetime.today().strftime('%m-%d')
# dataset["test"] must be a (pdb_list, steps) pair here; only pdb_list is used below.
pdb_list, steps = dataset["test"]
# pdb_list = "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", ")
data = get_data(pre, pdb_list, simType=simulationType, n_rum=10, formatName=False)
data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/optimization/{simulationType}_{folder}_{today}.csv")
# data.reset_index(drop=True).to_csv(f"/Users/weilu/Research/data/optimization/withoutContact_{today}.csv")
# x and y are passed by keyword: seaborn 0.12 and later accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.boxplot(x="Name", y="Qw", data=data)









    Out[34]:





<matplotlib.axes._subplots.AxesSubplot at 0x1a271b9390>



In [ ]:

	Protein	Length	Rg	Rg_normalized_by_length
0	1r69	63	10.114773	0.160552
1	1utg	70	12.705198	0.181503
2	3icb	75	11.189342	0.149191
3	256b	106	14.173721	0.133714
4	4cpv	108	12.543948	0.116148
5	1ccr	111	12.978877	0.116927
6	2mhr	118	13.775409	0.116741
7	1mba	146	14.725247	0.100858
8	2fha	172	18.408701	0.107027
9	1fc2	44	9.620456	0.218647
10	1enh	54	10.112475	0.187268
11	2gb1	56	10.165557	0.181528
12	2cro	65	10.115199	0.155618
13	1ctf	68	10.552933	0.155190
14	4icb	76	11.338816	0.149195

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14
Protein	1r69	1utg	3icb	256b	4cpv	1ccr	2mhr	1mba	2fha	1fc2	1enh	2gb1	2cro	1ctf	4icb
Length	63	70	75	106	108	111	118	146	172	44	54	56	65	68	76
Rg	10.1148	12.7052	11.1893	14.1737	12.5439	12.9789	13.7754	14.7252	18.4087	9.62046	10.1125	10.1656	10.1152	10.5529	11.3388

	4	8	0	3	10	5	6	11	7	2	1	12	9	13	14
Name	1ubq	t0792	t089	top7	t0815	t0766	t0778	t0833	t0782	t251	t120	t0842	t0803	t0844	t0846
Length	76	80	81	92	106	108	108	108	110	111	115	120	134	137	243