In [2]:
# written in python3
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime
import seaborn as sns
%matplotlib inline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Sklearn pipeline step that selects a subset of DataFrame columns.

    transform() returns the selected columns as a plain numpy array so the
    downstream StandardScaler / PolynomialFeatures steps can consume them.
    """
    def __init__(self, attribute_names):
        # attribute_names: list of column names to extract.
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self
    def transform(self, X):
        return X[self.attribute_names].values

plt.rcParams['figure.figsize'] = (10,6.180)    # default figure size; 10 / 6.18 approximates the golden ratio


def my_transform(data, label, degree, FEATURES):
    """Scale FEATURES, expand them polynomially, and append the label column.

    Returns a numpy array whose columns are the standardized polynomial
    features followed by the raw `label` column as the last column.
    """
    feature_pipeline = Pipeline([
        ('selector', DataFrameSelector(FEATURES)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
    ])
    # The label passes through untouched; it is only selected.
    label_pipeline = Pipeline([
        ('selector', DataFrameSelector([label])),
    ])
    combined = FeatureUnion(transformer_list=[
        ("num_pipeline", feature_pipeline),
        ("cat_pipeline", label_pipeline),
    ])
    return combined.fit_transform(data)

Helper functions for reading simulation data from disk.


In [5]:
# read energy, rw, bias, rmsd data from location
def read_data(name, base_location="/Users/weilu/Research/server/dec_2018/structure_selection"):
    """Read energy, Rw, bias, RMSD and Qw tables for one simulation folder.

    Parameters
    ----------
    name : str
        Folder name, e.g. "tr872".
    base_location : str, optional
        Directory containing the per-protein folders. Previously hard-coded;
        the default preserves the old behavior for existing callers.

    Returns
    -------
    pd.DataFrame
        One row per frame; 'folder' holds the original row index and 'name'
        identifies the protein.
    """
    name_list = ["Step", "Chain", "Shake", "Chi", "Rama", "Excluded", "DSSP",
                 "P_AP", "Water", "Burial", "Helix", "AMH_Go", "Frag_Mem",
                 "Vec_FM", "Membrane", "SSB", "VTotal"]
    location = f"{base_location}/{name}/"
    # Raw strings for the whitespace-regex separator avoid the invalid
    # escape-sequence deprecation on "\s+".
    RMSD = pd.read_table(location+"rmsd.xvg", names=["i", "RMSD"], sep=r"\s+")
    bias = pd.read_table(location+"bias.log", names=["i", "biasQ", "bias"], sep=r"\s+").drop("i", axis=1)
    awsem = pd.read_table(location+"awsem.log", names=name_list)
    rw = pd.read_table(location+"rwplusScore.txt", names=["i", "Rw"], sep=r"\s+").drop("i", axis=1)
    qw = pd.read_table(location+"Qw.out", names=["i", "Qw"], sep=r"\s+").drop("i", axis=1)
    pc = pd.read_table(location+"pcarmsd_scaled.txt", names=["i", "pc", "pc2"], sep=r"\s+", comment="#").drop("i", axis=1)
    raw_data = pd.concat([RMSD, rw, bias, qw, awsem, pc], axis=1)
    return raw_data.assign(name=name).reset_index().rename(columns={"index": "folder"})

def choose_top(data,col="RMSD", n=5, ascending=True):
    """Add a boolean 'chosen' column marking the top-n rows ranked by `col`.

    Dense ranking is used, so tied values share a rank and more than n rows
    may be chosen when values repeat.
    """
    ranks = data[col].rank(ascending=ascending, method='dense')
    return data.assign(chosen=ranks <= n)

# read the pmf, rc. 
# def read_data_2(name):
# #     name = "tr894"
# #     location = f"/Users/weilu/Research/server/sep_2018/03_week/{name}/"
# #     location = f"/Users/weilu/Research/server/oct_2018/01_week/{name}/"
#     location = f"/Users/weilu/Research/server/dec_2018/structure_selection/{name}/"
#     rw = pd.read_table(location+"rc_rwplus", names=["pc","rw"], sep="\s+")
#     rmsd = pd.read_table(location+"rc_rmsdlowerBound", names=["pc", "rmsd"], sep="\s+")
#     awsem = pd.read_table(location+"rc_awsemEne", names=["pc", "awsem"], sep="\s+")
#     qw = pd.read_table(location+"rc_QwhigherBound", names=["pc", "qw"], sep="\s+")
#     freeE = pd.read_table(location+"pmf3000"
#                           , names=["pc", "f", "remove1", "remove2"], sep="\s+").drop(["remove1", "remove2"], axis=1)
#     raw_data = freeE.merge(rw, on="pc").merge(awsem, on="pc").merge(qw, on="pc").merge(rmsd, on="pc").assign(name=name)
#     return raw_data

def read_data_2(name, base_location="/Users/weilu/Research/server/dec_2018/structure_selection"):
    """Read the free-energy profile and reaction-coordinate averages for one folder.

    Merges the pmf, Rw, AWSEM energy, Qw and RMSD tables on the 'pc'
    reaction coordinate and tags every row with the protein name.

    Parameters
    ----------
    name : str
        Folder name, e.g. "tr894".
    base_location : str, optional
        Directory containing the per-protein folders. Previously hard-coded;
        the default preserves the old behavior for existing callers.
    """
    location = f"{base_location}/{name}/"
    # Raw strings avoid invalid-escape warnings on the "\s+" regex separator.
    rw = pd.read_table(location+"rc_rwplus", names=["pc", "rw"], sep=r"\s+")
    rmsd = pd.read_table(location+"rc_rmsdlowerBound", names=["pc", "rmsd"], sep=r"\s+")
    awsem = pd.read_table(location+"rc_awsemEne", names=["pc", "awsem"], sep=r"\s+")
    qw = pd.read_table(location+"rc_QwhigherBound", names=["pc", "qw"], sep=r"\s+")
    freeE = pd.read_table(location+"pmf3000",
                          names=["pc", "f", "remove1", "remove2"],
                          sep=r"\s+").drop(["remove1", "remove2"], axis=1)
    raw_data = freeE.merge(rw, on="pc").merge(awsem, on="pc").merge(qw, on="pc").merge(rmsd, on="pc").assign(name=name)
    return raw_data

Train a selector based on free energy, Rw score, and AWSEM energy.


In [6]:
# Read free-energy / reaction-coordinate data for every target folder and
# mark the lowest-rmsd bin per target.
# NOTE(review): the first folder_list assignment below is dead code — it is
# immediately overwritten by the second assignment.
# folder_list = ["tr894", "tr882", "tr594", "tr898", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
folder_list = ["tr884-halfDIHE", "tr872-halfDIHE", "tr948-halfDIHE", "tr898", "tr947", "tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
folder_list = ["tr884-halfDIHE", "tr872-halfDIHE", "tr898", "tr947", "tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]


# folder_list = [ "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# "tr877","tr884", "tr922"
# "tr869"
# folder_list = ["tr894"]
# read all data

# tr884-halfDIHE
# tr872-halfDIHE
# tr948-halfDIHE
# NOTE(review): the saved traceback below shows rc_QwhigherBound missing for
# tr884-halfDIHE, so this loop failed with the halfDIHE folders included.
data_list = []
for name in folder_list:
    tmp = read_data_2(name)
    data_list.append(tmp)
raw_data_all = pd.concat(data_list)
n = 1
# 'chosen' marks the n lowest-rmsd rows per protein (dense ranking).
raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(choose_top, n=n, col="rmsd").reset_index(drop=True)


# train_name_list = ["tr872", "tr885", "tr948"]
# train_name_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]

# train_name_list = ["tr870"]
# train_name_list = ["tr891"]
# train_name_list = ["tr882"]
# train_name_list = ["tr894"]
# train_name_list = ["tr872"]
# train_name_list = ["tr869"]
# train_name_list = ["tr884"]
# train_name_list = ["tr866", "tr884"]
# train_name_list = ["tr870", "tr872"]
# train_name_list = ["tr866", "tr947"]
# train_name_list = ["tr872"]
# train_name_list = ["tr884", "tr872"]
train_name_list = ["tr866"]
# train_name_list = ["tr947"]
# select for training.
raw_data = raw_data_all.reset_index(drop=True).query(f'name in {train_name_list}')


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-6-85c2cf7e15c2> in <module>()
     16 data_list = []
     17 for name in folder_list:
---> 18     tmp = read_data_2(name)
     19     data_list.append(tmp)
     20 raw_data_all = pd.concat(data_list)

<ipython-input-5-2f3855ee1f37> in read_data_2(name)
     31     rmsd = pd.read_table(location+"rc_rmsdlowerBound", names=["pc", "rmsd"], sep="\s+")
     32     awsem = pd.read_table(location+"rc_awsemEne", names=["pc", "awsem"], sep="\s+")
---> 33     qw = pd.read_table(location+"rc_QwhigherBound", names=["pc", "qw"], sep="\s+")
     34     freeE = pd.read_table(location+"pmf3000"
     35                           , names=["pc", "f", "remove1", "remove2"], sep="\s+").drop(["remove1", "remove2"], axis=1)

~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)
    676                     skip_blank_lines=skip_blank_lines)
    677 
--> 678         return _read(filepath_or_buffer, kwds)
    679 
    680     parser_f.__name__ = name

~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    438 
    439     # Create the parser.
--> 440     parser = TextFileReader(filepath_or_buffer, **kwds)
    441 
    442     if chunksize or iterator:

~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    785             self.options['has_index_names'] = kwds['has_index_names']
    786 
--> 787         self._make_engine(self.engine)
    788 
    789     def close(self):

~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1012     def _make_engine(self, engine='c'):
   1013         if engine == 'c':
-> 1014             self._engine = CParserWrapper(self.f, **self.options)
   1015         else:
   1016             if engine == 'python':

~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1706         kwds['usecols'] = self.usecols
   1707 
-> 1708         self._reader = parsers.TextReader(src, **kwds)
   1709 
   1710         passed_names = self.names is None

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

FileNotFoundError: File b'/Users/weilu/Research/server/dec_2018/structure_selection/tr884-halfDIHE/rc_QwhigherBound' does not exist

In [4]:
# Feature/label configuration for the free-energy-based regression below.
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = ["f",
    'rw',
     'awsem',
#     'RMSD', # test
#      'Burial',
#      'Water',
#      'Rama',
#      'DSSP',
#      'P_AP',
#      'Helix',
#      'Frag_Mem'
               ]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
# LABEL = "rmsd"   # dead store removed: it was immediately overwritten by "qw"
LABEL = "qw"
DEGREE = 1

def pred_from_raw(a, clf):
    # Transform one protein's frame with the global FEATURES/LABEL/DEGREE
    # settings and attach the regressor's prediction as a new column.
    # NOTE(review): "prediceted_rmsd" is a typo for "predicted_rmsd", but the
    # column name is kept because later cells select on this exact name.
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:,-1]   # label column (not used here)
    test_set = data[:,:-1]
    prediceted_rmsd= clf.predict(test_set)
    return a.assign(prediceted_rmsd=prediceted_rmsd)

def assign_lowest_f(a):
    """Attach the group's minimum free energy as a constant 'lowest_f' column."""
    # .min() replaces sort_values().iloc[0]: O(n) instead of O(n log n),
    # same result (both skip NaN values).
    return a.assign(lowest_f=a["f"].min())

In [5]:
raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(assign_lowest_f).reset_index(drop=True)

In [6]:
# # data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# # data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
# data = np.concatenate(raw_data.groupby('name').apply(my_transform, 
#                                                      label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
# train_y = data[:,-1]
# train_set = data[:,:-1]
# from sklearn import svm
# # clf = svm.SVC(probability=True)
# clf = LinearRegression()
# clf.fit(train_set, train_y)
# y_pred_svm = clf.predict(train_set)

# raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(pred_from_raw, clf).reset_index(drop=True)
# # raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(assign_lowest_f).reset_index(drop=True)



# picked_n = 1
# best = raw_data_all.groupby("name").apply(choose_top, col="rmsd"
#                                             , n=picked_n, ascending=True).reset_index(drop=True).query("chosen==True")
# picked = raw_data_all.groupby("name").apply(choose_top, col="prediceted_rmsd"
#                                             , n=picked_n, ascending=True).reset_index(drop=True).query("chosen==True")
# # init = raw_data_all.query("i == 0.0")
# all_results = pd.concat([best.assign(result='best'), 
#                          picked.assign(result='picked')])

# picked_keep = picked.copy()

In [7]:
from scipy.interpolate import interp1d

# Build one interpolator of the free-energy profile f(pc) per protein,
# extrapolating outside the sampled pc range.
f_dic = {}
for name in folder_list:
    profile = raw_data_all.query(f"name == '{name}'")
    f_dic[name] = interp1d(profile["pc"].values, profile["f"].values,
                           fill_value="extrapolate")

In [8]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "f")
# plt.ylim([0,1])

In [9]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "f")
# g = g.map(plt.plot, "pc", "prediceted_rmsd")

In [10]:
# raw_data_all.query("name == 'tr594'").plot("pc", "f")

In [11]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "rmsd")
# plt.ylim([0,1])

Use picked_keep to filter the complete data and select again.


In [12]:
f_dic["tr594"](raw_data_all["pc"]).shape


Out[12]:
(526,)

In [13]:
def choose_top(data,col="RMSD", n=5, ascending=True):
    """Add a boolean 'chosen' column marking the top-n rows ranked by `col`.

    Ties are broken by row order (method='first'), so exactly n rows are
    chosen even when values repeat. This intentionally redefines the earlier
    choose_top, which used dense ranking.
    """
    rank_by_order = data[col].rank(ascending=ascending, method='first')
    return data.assign(chosen=rank_by_order <= n)


# WIDTH = 100
# WIDTH = 0.1
# WIDTH = 1
# WIDTH = 0.2
# def with_in_range(data, width=WIDTH):
#     return data.assign(inrange= (data["pc"] < (data["pc_center"]+width)) & (data["pc"] > (data["pc_center"]-width)))

def with_in_range(data, width=5):
    """Flag rows whose interpolated free energy lies strictly in (0, width).

    Looks up the per-protein interpolator in the global f_dic; assumes every
    row in `data` belongs to a single protein (true under groupby("name")).
    Adds a boolean 'inrange' column.
    """
    name = data["name"].iloc[0]
    # Evaluate the interpolator once instead of twice (the original called
    # f_dic[name](data["pc"]) for each side of the comparison).
    f_values = f_dic[name](data["pc"])
    return data.assign(inrange=(0 < f_values) & (f_values < width))

In [14]:
# Read per-frame simulation data for every target and mark the top-n frames
# by the chosen quality metric.
folder_list = ["tr898", "tr869", "tr947", "tr894", "tr882", "tr594", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# "tr898"
# folder_list = ["tr894", "tr882", "tr594", "tr898", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = [ "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# "tr877","tr884", "tr922"
# "tr869"
# folder_list = ["tr894"]
# folder_list = ["tr866"]


# define top based on RMSD or Qw
# best_metric = "RMSD"
best_metric = "Qw"
# Higher Qw is better, lower RMSD is better — hence the ranking direction.
if best_metric == "Qw":
    isAscending = False
else:
    isAscending = True
data_list = []
for name in folder_list:
    tmp = read_data(name)
    data_list.append(tmp)
raw_data_all_2 = pd.concat(data_list).dropna()
n = 25
# 'chosen' marks the 25 best frames per protein by best_metric.
raw_data_all_2 = raw_data_all_2.reset_index(drop=True).groupby("name").\
        apply(choose_top, n=n, col=best_metric, ascending=isAscending).reset_index(drop=True)


raw_data = raw_data_all_2.reset_index(drop=True).query(f'name in {train_name_list}').dropna()
# a = raw_data_all_2.dropna().merge(picked_keep[["pc", "name"]].rename(columns={"pc":"pc_center"}),on="name")
a = raw_data_all_2.dropna()
# Keep only frames whose interpolated free energy lies in (0, width).
filtered = a.groupby("name").apply(with_in_range).query("inrange == True").reset_index(drop=True)

In [15]:
filtered.shape


Out[15]:
(38834, 29)

In [16]:
a.shape


Out[16]:
(50096, 28)

In [17]:
# Train a logistic-regression frame selector on the training targets, score
# every in-range frame of every target, then compare selection strategies
# against the best / worst / initial frames.
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = [
    "biasQ",
    'Rw',
     'VTotal',
#     'RMSD', # test
#     'Qw',
#      'Burial',
#      'Water',
#      'Rama',
#      'DSSP',
#      'P_AP',
#      'Helix',
#      'Frag_Mem'
               ]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "chosen"
DEGREE = 1

def pred_from_raw(a):
    # NOTE(review): redefines the earlier pred_from_raw(a, clf); this
    # classifier variant reads `clf` from the global namespace instead.
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:,-1]   # label column (not used here)
    test_set = data[:,:-1]
    prob= clf.predict_proba(test_set)[:,1]   # P(frame is a "chosen" top frame)
    return a.assign(prob=prob)

# data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
# Transform each training protein separately so scaling is per-protein.
data = np.concatenate(raw_data.groupby('name').apply(my_transform, 
                                                     label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
train_y = data[:,-1]
train_set = data[:,:-1]

# clf = svm.SVC(probability=True)
# p = 0.01
# clf = LogisticRegression(random_state=27, class_weight={0:p, 1:(1-p)})
clf = LogisticRegression(random_state=27)
clf.fit(train_set, train_y)

filtered = filtered.reset_index(drop=True).groupby("name").apply(pred_from_raw).reset_index(drop=True)


picked_n = 1   # NOTE(review): unused below; n=1 / n=5 are hard-coded instead
best = raw_data_all_2.groupby("name").apply(choose_top, col="RMSD"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
# if True:
# Strategy 1: single frame with the highest predicted probability.
picked_1 = filtered.groupby("name").apply(choose_top, col="prob"
                                        , n=1, ascending=False).reset_index(drop=True).query("chosen==True")

# if False:
# Strategy 2: top 5 by probability, then the best true RMSD among those 5
# (uses the real RMSD, so it is an oracle-assisted upper bound).
picked = filtered.groupby("name").apply(choose_top, col="prob"
                                            , n=5, ascending=False).reset_index(drop=True).query("chosen==True")
picked = picked.groupby("name").apply(choose_top, col="RMSD"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
worst = filtered.groupby("name").apply(choose_top, col="RMSD"
                                            , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
init = raw_data_all_2.groupby("name").apply(choose_top, col="i"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'), 
                         picked.assign(result='picked'), init.assign(result='init')
                        , worst.assign(result='worst')
                        ], sort=False)
# all_results = pd.concat([best.assign(result='best'), 
#                          picked.assign(result='picked')])
# picked.to_csv("/Users/weilu/Desktop/picked.csv

# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
# fg.set(ylim=(0, 10))


Out[17]:
<seaborn.axisgrid.FacetGrid at 0x103929160>

In [23]:
# Combine every selection strategy into one table for comparison/export.
all_results = pd.concat([picked_1.assign(result="picked_top1"), picked.assign(result='picked_top5'),
                         best.assign(result='best'), 
                          init.assign(result='init'),
                         worst.assign(result='worst'), 
                        ], sort=False)

In [25]:
picked.shape


Out[25]:
(20, 30)

In [32]:
all_results.reindex(columns=my_reorder(all_results.columns, ["name", "RMSD", "result"])) .reset_index(drop=True).to_csv("/Users/weilu/Desktop/selection_result.csv")

In [28]:
def my_reorder(a, first):
    """Return the names in `first` followed by the remaining names of `a`.

    Relative order is preserved: `first` keeps its own order, and the rest
    of `a` follows in original order. Neither input is mutated.
    """
    leading = list(first)
    trailing = [col for col in a if col not in first]
    return leading + trailing

In [31]:
all_results.reindex(columns=my_reorder(all_results.columns, ["name", "RMSD", "result"]))


Out[31]:
name RMSD result folder i Rw biasQ bias Qw Step ... Frag_Mem Vec_FM Membrane SSB VTotal pc pc2 chosen inrange prob
1917 tr594 5.71845 picked_top1 2028 2400.0 -9655.812091 0.562631 38.258352 0.397447 0.0 ... -542.733446 0.0 0.0 0.0 -976.169211 1.783749 -0.897284 True True 0.336318
3098 tr862 6.21427 picked_top1 716 21500.0 -14017.862442 0.795115 8.395546 0.461948 0.0 ... -860.309802 0.0 0.0 0.0 -1467.578457 -1.316991 -2.854982 True True 0.184471
5040 tr866 1.65094 picked_top1 436 43600.0 -17053.726578 0.628532 27.597682 0.834830 0.0 ... -730.772207 0.0 0.0 0.0 -1442.122133 -0.087063 3.699315 True True 0.220490
7964 tr868 2.55439 picked_top1 1009 700.0 -19002.022160 0.515777 46.894413 0.759775 0.0 ... -1184.431369 0.0 0.0 0.0 -1896.962431 0.139004 -0.381850 True True 0.419466
9181 tr869 12.30950 picked_top1 0 0.0 -17101.278236 0.327521 90.445532 0.340709 0.0 ... -745.292489 0.0 0.0 0.0 -1364.887693 -0.000178 -0.048953 True True 0.287940
11347 tr870 8.98989 picked_top1 27 2700.0 -18766.397302 0.366368 80.297996 0.325325 0.0 ... -919.447094 0.0 0.0 0.0 -1663.622985 -1.937195 -0.884915 True True 0.021734
13892 tr872 3.88397 picked_top1 130 13000.0 -12597.510264 0.673965 21.259789 0.654242 0.0 ... -546.317159 0.0 0.0 0.0 -1131.645842 0.635999 -1.004592 True True 0.232302
15145 tr877 3.00711 picked_top1 0 0.0 -24114.108027 0.568868 37.174966 0.648838 0.0 ... -821.245402 0.0 0.0 0.0 -1699.528374 -0.055574 -0.014200 True True 0.773073
18607 tr882 2.75337 picked_top1 1585 8200.0 -12227.807526 0.648724 24.679000 0.790488 0.0 ... -630.005603 0.0 0.0 0.0 -1123.346564 -0.326972 0.008144 True True 0.257559
20532 tr884 5.42798 picked_top1 1448 44600.0 -9418.042083 0.530488 44.088316 0.496556 0.0 ... -492.254269 0.0 0.0 0.0 -924.112214 0.739004 -2.026302 True True 0.231153
22564 tr885 3.18402 picked_top1 1065 6300.0 -19678.210041 0.713528 16.413273 0.723373 0.0 ... -879.121171 0.0 0.0 0.0 -1671.835269 0.400841 -0.236901 True True 0.125852
25261 tr891 1.91208 picked_top1 1709 20600.0 -18636.734867 0.645838 25.086161 0.814518 0.0 ... -811.315247 0.0 0.0 0.0 -1634.043036 0.290187 0.469162 True True 0.623671
26508 tr894 2.00769 picked_top1 904 40300.0 -6759.374211 0.667682 22.087109 0.764641 0.0 ... -339.543577 0.0 0.0 0.0 -671.963552 -0.318981 -0.765288 True True 0.385394
28827 tr895 4.87177 picked_top1 992 49100.0 -20488.493033 0.646876 24.939243 0.571973 0.0 ... -901.913395 0.0 0.0 0.0 -1650.549327 -0.933751 0.277437 True True 0.225033
29502 tr896 8.62961 picked_top1 3 300.0 -11703.621241 0.461786 57.934848 0.405691 0.0 ... -534.883801 0.0 0.0 0.0 -989.943449 -1.047856 0.015758 True True 0.497582
31988 tr898 13.46090 picked_top1 729 22800.0 -19042.795684 0.568624 37.217128 0.398104 0.0 ... -783.469742 0.0 0.0 0.0 -1469.264433 -2.546374 -2.851364 True True 0.170833
33058 tr921 3.24877 picked_top1 828 32700.0 -21881.858026 0.680476 20.419091 0.624908 0.0 ... -1871.646575 0.0 0.0 0.0 -2764.790089 0.245506 -0.892383 True True 0.165306
34861 tr922 3.68668 picked_top1 1011 900.0 -9636.845094 0.651477 24.293703 0.606792 0.0 ... -620.135642 0.0 0.0 0.0 -1010.273657 -0.056487 1.712977 True True 0.241481
36552 tr947 9.56921 picked_top1 481 48100.0 -27891.079857 0.610006 30.418991 0.482850 0.0 ... -1263.925386 0.0 0.0 0.0 -2108.005624 -2.046083 0.185091 True True 0.209786
37373 tr948 6.42138 picked_top1 810 30900.0 -28698.131138 0.581505 35.027568 0.603420 0.0 ... -1488.304814 0.0 0.0 0.0 -2589.281395 -0.825228 0.477354 True True 0.596875
0 tr594 5.71845 picked_top5 2028 2400.0 -9655.812091 0.562631 38.258352 0.397447 0.0 ... -542.733446 0.0 0.0 0.0 -976.169211 1.783749 -0.897284 True True 0.336318
8 tr862 5.91362 picked_top5 840 33900.0 -14298.952979 0.792131 8.641868 0.457493 0.0 ... -857.960775 0.0 0.0 0.0 -1460.246571 -0.883616 -2.363559 True True 0.176140
12 tr866 1.52312 picked_top5 296 29600.0 -16825.468015 0.627247 27.789014 0.867941 0.0 ... -729.065574 0.0 0.0 0.0 -1438.226706 -0.323353 4.008893 True True 0.149921
16 tr868 1.94233 picked_top5 772 27100.0 -18689.078432 0.534715 43.298055 0.804022 0.0 ... -1144.918529 0.0 0.0 0.0 -1868.985796 -0.338582 -0.103741 True True 0.328877
20 tr869 12.30950 picked_top5 0 0.0 -17101.278236 0.327521 90.445532 0.340709 0.0 ... -745.292489 0.0 0.0 0.0 -1364.887693 -0.000178 -0.048953 True True 0.287940
26 tr870 8.81563 picked_top5 129 12900.0 -20157.706928 0.630429 27.316528 0.324614 80000000.0 ... -985.958008 0.0 0.0 0.0 -1635.238984 -1.121479 -0.345961 True True 0.020210
31 tr872 3.38775 picked_top5 141 14100.0 -12447.029453 0.676140 20.977116 0.667752 0.0 ... -546.638457 0.0 0.0 0.0 -1131.691792 0.641274 -0.860320 True True 0.220697
35 tr877 3.00711 picked_top5 0 0.0 -24114.108027 0.568868 37.174966 0.648838 0.0 ... -821.245402 0.0 0.0 0.0 -1699.528374 -0.055574 -0.014200 True True 0.773073
44 tr882 2.28804 picked_top5 1965 46200.0 -11916.930776 0.636862 26.373853 0.759764 0.0 ... -633.406934 0.0 0.0 0.0 -1132.496525 -0.322323 0.313275 True True 0.212781
49 tr884 4.60271 picked_top5 1851 34800.0 -9350.865722 0.537420 42.796012 0.571265 0.0 ... -492.137189 0.0 0.0 0.0 -916.994515 0.261957 -0.655306 True True 0.138139
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
25047 tr885 2.35937 init 0 0.0 -20598.856085 0.685012 19.843456 0.830205 0.0 ... -870.801921 0.0 0.0 0.0 -1645.527867 -0.037832 0.039669 True NaN NaN
27552 tr891 1.60893 init 0 0.0 -18747.275083 0.620600 28.788908 0.860321 0.0 ... -820.247037 0.0 0.0 0.0 -1600.853027 -0.184893 -0.098650 True NaN NaN
30057 tr894 2.24606 init 0 0.0 -6810.073249 0.556208 39.390241 0.731370 0.0 ... -330.532789 0.0 0.0 0.0 -668.558313 -0.065067 -0.283718 True NaN NaN
32562 tr895 4.33382 init 0 0.0 -22028.450058 0.618778 29.065981 0.654401 0.0 ... -889.343989 0.0 0.0 0.0 -1623.103778 -0.010141 -0.051103 True NaN NaN
35067 tr896 8.22359 init 0 0.0 -12136.482066 0.466766 56.867616 0.443834 0.0 ... -525.593994 0.0 0.0 0.0 -947.681062 -0.058300 -0.054690 True NaN NaN
37572 tr898 13.99650 init 0 0.0 -18590.677602 0.528118 44.534476 0.357930 0.0 ... -782.373728 0.0 0.0 0.0 -1446.453004 0.042324 -0.167047 True NaN NaN
40077 tr921 3.55243 init 0 0.0 -22225.052777 0.676982 20.868098 0.633795 0.0 ... -1814.239425 0.0 0.0 0.0 -2679.102572 -0.101087 0.070372 True NaN NaN
42582 tr922 2.55776 init 0 0.0 -9494.179526 0.646615 24.976143 0.778997 0.0 ... -619.584850 0.0 0.0 0.0 -999.248416 0.115988 0.147087 True NaN NaN
45087 tr947 12.92220 init 0 0.0 -26347.737275 0.602246 31.641680 0.498832 0.0 ... -1262.954342 0.0 0.0 0.0 -2074.380569 1.187467 -0.301399 True NaN NaN
47592 tr948 6.72126 init 0 0.0 -29784.167159 0.534029 43.425714 0.623263 0.0 ... -1442.266587 0.0 0.0 0.0 -2474.731767 -0.029092 0.051793 True NaN NaN
2281 tr594 11.66420 worst 2392 38800.0 -7800.180486 0.405731 70.631048 0.293334 0.0 ... -528.484319 0.0 0.0 0.0 -945.059692 2.492546 0.800841 True True 0.000514
2842 tr862 8.64546 worst 459 45900.0 -13675.272843 0.744725 13.033074 0.552184 0.0 ... -836.017899 0.0 0.0 0.0 -1416.508005 -2.115153 0.313762 True True 0.004266
5982 tr866 5.49268 worst 1436 43400.0 -16316.523037 0.460162 58.285112 0.523121 0.0 ... -706.324188 0.0 0.0 0.0 -1342.476061 0.189714 -0.228174 True True 0.000023
8884 tr868 5.50126 worst 2067 6300.0 -18165.315305 0.465461 57.146452 0.608624 0.0 ... -1085.027900 0.0 0.0 0.0 -1783.826243 0.645704 -2.754277 True True 0.004290
9679 tr869 14.43870 worst 598 9700.0 -16097.470589 0.218214 122.237944 0.311339 0.0 ... -729.588674 0.0 0.0 0.0 -1323.833114 -0.808695 -0.587784 True True 0.001463
12776 tr870 11.43280 worst 1484 48200.0 -18635.189014 0.630429 27.316528 0.311343 80000000.0 ... -985.958008 0.0 0.0 0.0 -1635.238984 -1.256596 2.241660 True True 0.006724
15120 tr872 7.00137 worst 2011 700.0 -11677.938128 0.670047 21.773797 0.571122 0.0 ... -545.404119 0.0 0.0 0.0 -1101.906710 0.895718 0.631344 True True 0.007777
16772 tr877 5.97014 worst 1819 31600.0 -23656.864393 0.497143 50.573068 0.572003 0.0 ... -819.115369 0.0 0.0 0.0 -1632.289229 0.220377 3.612046 True True 0.001921
18310 tr882 4.10836 worst 1201 19900.0 -11669.955472 0.613084 29.940872 0.673967 0.0 ... -617.772188 0.0 0.0 0.0 -1084.858365 -0.434148 -0.384838 True True 0.000567
21428 tr884 8.92547 worst 2345 34100.0 -8695.952091 0.456176 59.148866 0.482172 0.0 ... -479.625249 0.0 0.0 0.0 -875.415909 2.096840 0.496279 True True 0.000063
22233 tr885 5.25420 worst 733 23200.0 -19040.407218 0.663800 22.606043 0.762638 0.0 ... -879.497813 0.0 0.0 0.0 -1663.557353 0.193633 -1.313889 True True 0.007403
26049 tr891 4.07153 worst 2497 49300.0 -17308.511883 0.599059 32.150760 0.652094 0.0 ... -791.156171 0.0 0.0 0.0 -1550.446442 2.179495 1.449662 True True 0.000239
27079 tr894 5.09216 worst 1485 48300.0 -6089.793333 0.505490 48.908058 0.487889 0.0 ... -326.511714 0.0 0.0 0.0 -661.998634 -0.766482 -2.990092 True True 0.003307
27974 tr895 7.98054 worst 43 4300.0 -18903.580144 0.583194 34.745474 0.565594 0.0 ... -892.775956 0.0 0.0 0.0 -1630.348760 -1.605658 3.039932 True True 0.001931
30562 tr896 11.04840 worst 1207 20500.0 -10977.597770 0.402719 71.348928 0.333853 0.0 ... -538.589762 0.0 0.0 0.0 -956.501625 -0.912589 -2.112027 True True 0.008942
31528 tr898 15.98600 worst 132 13200.0 -17054.063013 0.483393 53.376536 0.348017 0.0 ... -785.866058 0.0 0.0 0.0 -1457.509460 -2.655997 0.094145 True True 0.005441
32402 tr921 4.78319 worst 164 16400.0 -20379.263146 0.629018 27.525487 0.588459 0.0 ... -1754.727325 0.0 0.0 0.0 -2577.565026 -1.471915 -4.113387 True True 0.000025
36178 tr922 8.54551 worst 2429 42500.0 -8689.220583 0.406243 70.509502 0.408714 0.0 ... -507.886517 0.0 0.0 0.0 -879.592569 1.571574 4.837135 True True 0.000040
36219 tr947 15.48280 worst 15 1500.0 -25124.991722 0.569790 37.016088 0.459361 0.0 ... -1249.896273 0.0 0.0 0.0 -2041.418239 -1.986278 -0.695540 True True 0.000236
38817 tr948 10.08460 worst 2465 46100.0 -26585.016143 0.517020 46.653959 0.518408 0.0 ... -1433.128337 0.0 0.0 0.0 -2468.110974 0.743237 -1.782562 True True 0.000068

100 rows × 31 columns


In [ ]:
picked

In [19]:
# Per-target comparison: did the top-5-then-best-RMSD pick beat the initial
# structure? Prints one line per target and a summary at the end.
count = 0
total = 0
for name in folder_list:
    rmsd_init = init.query(f"name == '{name}'")["RMSD"].iloc[0]
    rmsd_pick = picked.query(f"name == '{name}'")["RMSD"].iloc[0]
    gain = rmsd_init - rmsd_pick
    improved = rmsd_pick < rmsd_init
    print(name, rmsd_init, rmsd_pick, round(gain, 3), improved)
    total += gain
    count += improved  # bool adds as 0/1
print("improved: ", count, len(folder_list), total)


tr898 13.9965 13.4609 0.536 True
tr869 12.3095 12.3095 0.0 False
tr947 12.9222 8.10622 4.816 True
tr894 2.24606 1.6856900000000001 0.56 True
tr882 2.34977 2.28804 0.062 True
tr594 4.65665 5.71845 -1.062 False
tr862 5.53469 5.91362 -0.379 False
tr877 3.00711 3.00711 0.0 False
tr872 5.66699 3.38775 2.279 True
tr885 2.35937 2.72348 -0.364 False
tr866 3.23637 1.52312 1.713 True
tr868 1.97032 1.94233 0.028 True
tr884 3.81972 4.60271 -0.783 False
tr895 4.333819999999999 4.87177 -0.538 False
tr896 8.22359 8.01121 0.212 True
tr870 7.65488 8.81563 -1.161 False
tr921 3.55243 3.17155 0.381 True
tr922 2.55776 3.28628 -0.729 False
tr891 1.60893 1.91208 -0.303 False
tr948 6.721260000000001 5.56396 1.157 True
improved:  10 20 6.42652

In [26]:
# Same comparison as the previous evaluation cell, but for the pure
# top-1-probability picks (picked_1).
count = 0
total = 0
for name in folder_list:
    rmsd_init = init.query(f"name == '{name}'")["RMSD"].iloc[0]
    rmsd_pick = picked_1.query(f"name == '{name}'")["RMSD"].iloc[0]
    gain = rmsd_init - rmsd_pick
    improved = rmsd_pick < rmsd_init
    print(name, rmsd_init, rmsd_pick, round(gain, 3), improved)
    total += gain
    count += improved  # bool adds as 0/1
print("improved: ", count, len(folder_list), total)


tr898 13.9965 13.4609 0.536 True
tr869 12.3095 12.3095 0.0 False
tr947 12.9222 9.56921 3.353 True
tr894 2.24606 2.00769 0.238 True
tr882 2.34977 2.75337 -0.404 False
tr594 4.65665 5.71845 -1.062 False
tr862 5.53469 6.21427 -0.68 False
tr877 3.00711 3.00711 0.0 False
tr872 5.66699 3.8839699999999997 1.783 True
tr885 2.35937 3.1840200000000003 -0.825 False
tr866 3.23637 1.65094 1.585 True
tr868 1.97032 2.55439 -0.584 False
tr884 3.81972 5.427980000000001 -1.608 False
tr895 4.333819999999999 4.87177 -0.538 False
tr896 8.22359 8.629610000000001 -0.406 False
tr870 7.65488 8.989889999999999 -1.335 False
tr921 3.55243 3.24877 0.304 True
tr922 2.55776 3.6866800000000004 -1.129 False
tr891 1.60893 1.91208 -0.303 False
tr948 6.721260000000001 6.42138 0.3 True
improved:  7 20 -0.7740600000000017

In [181]:
filtered = a
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = [
    "biasQ",
    'Rw',
     'VTotal',
#     'RMSD', # test
#     'Qw',
#      'Burial',
#      'Water',
#      'Rama',
#      'DSSP',
#      'P_AP',
#      'Helix',
#      'Frag_Mem'
               ]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "chosen"
DEGREE = 1

def pred_from_raw(a):
    """Score one group of frames with the fitted classifier.

    Parameters
    ----------
    a : pandas.DataFrame
        Frames for a single target; must contain the FEATURES and LABEL
        columns expected by ``my_transform``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``a`` with an added ``prob`` column holding the predicted
        probability of class 1 from the module-level ``clf``.

    Relies on the module-level names ``clf``, ``LABEL``, ``DEGREE``,
    ``FEATURES`` and ``my_transform``.
    """
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    # The last column is the label appended by the cat_pipeline; strip it
    # before scoring.  (The original also sliced out the label column into an
    # unused `test_y` variable, which is removed here.)
    feature_matrix = data[:, :-1]
    prob = clf.predict_proba(feature_matrix)[:, 1]
    return a.assign(prob=prob)

# Transform each target's frames separately (so scaling is per-target),
# then stack everything into a single training matrix.
per_target = raw_data.groupby('name').apply(
    my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
data = np.concatenate(per_target.values)
train_y = data[:, -1]
train_set = data[:, :-1]

# Plain logistic regression; SVC(probability=True) and a class-weighted
# variant were also tried.
clf = LogisticRegression(random_state=27)
clf.fit(train_set, train_y)

# Attach the predicted probability to every frame, one target at a time.
filtered = (filtered.reset_index(drop=True)
            .groupby("name")
            .apply(pred_from_raw)
            .reset_index(drop=True))


picked_n = 1  # how many top-probability frames to keep per target

# Oracle: the single lowest-RMSD frame per target (upper bound on quality).
best = raw_data_all_2.groupby("name").apply(
    choose_top, col="RMSD", n=1, ascending=True
).reset_index(drop=True).query("chosen==True")

# Classifier selection: the frame with the highest predicted probability.
# (The original cell guarded this behind `if True:` and kept an unreachable
# `if False:` alternative — take the top 5 by probability, then the best
# RMSD among them — which has been removed as dead code.)
picked = filtered.groupby("name").apply(
    choose_top, col="prob", n=1, ascending=False
).reset_index(drop=True).query("chosen==True")

# Worst case: the highest-RMSD frame per target.
worst = filtered.groupby("name").apply(
    choose_top, col="RMSD", n=1, ascending=False
).reset_index(drop=True).query("chosen==True")

# Initial structure: the frame with the smallest index i per target.
init = raw_data_all_2.groupby("name").apply(
    choose_top, col="i", n=1, ascending=True
).reset_index(drop=True).query("chosen==True")

all_results = pd.concat([best.assign(result='best'),
                         picked.assign(result='picked'),
                         init.assign(result='init'),
                         worst.assign(result='worst')], sort=False)

# Overlay the four selections per target on one figure.
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
# fg.set(ylim=(0, 10))


Out[181]:
<seaborn.axisgrid.FacetGrid at 0x1a20ed8e10>

In [182]:
# Same per-target report as the earlier cell, but against `picked`.
# NOTE(review): this duplicates the loop above — a shared helper function
# taking the two frames would remove the copy-paste.
count = 0
total = 0
for name in folder_list:
    init_tmp = init.query(f"name == '{name}'")["RMSD"].iloc[0]
    picked_tmp = picked.query(f"name == '{name}'")["RMSD"].iloc[0]
    improved = picked_tmp < init_tmp
    diff = init_tmp - picked_tmp
    print(name, init_tmp, picked_tmp, round(diff, 3), improved)
    total += diff
    count += improved
print("improved: ", count, len(folder_list), total)


tr898 13.9965 12.8408 1.156 True
tr869 12.3095 12.3095 0.0 False
tr947 12.9222 12.1452 0.777 True
tr894 2.24606 2.00769 0.238 True
tr882 2.34977 2.75337 -0.404 False
tr594 4.65665 5.71845 -1.062 False
tr862 5.53469 5.91362 -0.379 False
tr877 3.00711 3.00711 0.0 False
tr872 5.66699 4.53573 1.131 True
tr885 2.35937 3.1840200000000003 -0.825 False
tr866 3.23637 1.65094 1.585 True
tr868 1.97032 2.55439 -0.584 False
tr884 3.81972 4.81 -0.99 False
tr895 4.333819999999999 5.72387 -1.39 False
tr896 8.22359 8.629610000000001 -0.406 False
tr870 7.65488 9.11107 -1.456 False
tr921 3.55243 3.24877 0.304 True
tr922 2.55776 3.6866800000000004 -1.129 False
tr891 1.60893 1.91208 -0.303 False
tr948 6.721260000000001 6.42138 0.3 True
improved:  7 20 -3.436360000000002

In [339]:
all_results.query("name == 'tr594'")


Out[339]:
folder i RMSD Rw biasQ bias Qw Step Chain Shake ... SSB VTotal pc pc2 name chosen result pc_center inrange prob
0 0 0.0 4.65665 -10838.987781 0.590522 33.534424 0.464193 0.0 14.571923 0.0 ... 0.0 -940.887277 -0.086922 -0.059985 tr594 True best NaN NaN NaN
0 0 0.0 4.65665 -10838.987781 0.590522 33.534424 0.464193 0.0 14.571923 0.0 ... 0.0 -940.887277 -0.086922 -0.059985 tr594 True picked 0.703 True 0.042606
0 0 0.0 4.65665 -10838.987781 0.590522 33.534424 0.464193 0.0 14.571923 0.0 ... 0.0 -940.887277 -0.086922 -0.059985 tr594 True init NaN NaN NaN
1020 2380 37600.0 11.55900 -8163.823349 0.491720 51.669743 0.349195 0.0 4.380186 0.0 ... 0.0 -960.110987 1.421570 -0.122716 tr594 True worst 0.703 True 0.003102

4 rows × 32 columns


In [35]:
clf.coef_


Out[35]:
array([[ 0.24185815, -0.37308731, -0.43718463]])

In [221]:
# Metric to visualise.  "Qw" is a similarity score, so the best frame has the
# LARGEST value (sort descending); a distance metric like "RMSD" would need
# ascending sorting.
Plot_Metric = "Qw"
# BUG FIX: the original `if Plot_Metric:` tested string truthiness, which is
# True for ANY non-empty metric name, so isAscending could never become True.
# Compare against the metric name instead (behavior unchanged for "Qw").
if Plot_Metric == "Qw":
    isAscending = False
else:
    isAscending = True

picked_n = 1
# Oracle pick: the best frame per target according to Plot_Metric.
best = raw_data_all_2.groupby("name").apply(choose_top, col=Plot_Metric
                                            , n=1, ascending=isAscending).reset_index(drop=True).query("chosen==True")
# Classifier pick: highest predicted probability per target.
picked = filtered.groupby("name").apply(choose_top, col="prob"
                                            , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# NOTE(review): for Qw, ascending=False selects the HIGHEST Qw — that is the
# best frame, not the worst; confirm the intended direction.  (`worst` is
# computed but excluded from the plot below.)
worst = filtered.groupby("name").apply(choose_top, col=Plot_Metric
                                            , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# Initial structure: the frame with the smallest index i per target.
init = raw_data_all_2.groupby("name").apply(choose_top, col="i"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'),
                         picked.assign(result='picked'), init.assign(result='init')
#                         , worst.assign(result='worst')
                        ], sort=False)

fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', Plot_Metric).add_legend(fontsize=20)
fg.set(ylim=(0, 1))


Out[221]:
<seaborn.axisgrid.FacetGrid at 0x1a2d7cfd30>

In [170]:
# Build a compact comparison table: picked RMSD vs. initial RMSD per target.
init_rmsd = init["RMSD"].values
picked["init_RMSD"] = init_rmsd
picked["diff_RMSD"] = init_rmsd - picked["RMSD"].values
out = picked[["name", "RMSD", "init_RMSD", "diff_RMSD", "folder"]].reset_index(drop=True)

In [206]:
# Scatter Qw against RMSD, coloured by target, to see how the two metrics
# relate across all frames.
fg = sns.FacetGrid(data=filtered, hue='name', height=8, aspect=1.63)
fg.map(plt.scatter, 'Qw', 'RMSD').add_legend(fontsize=20)


Out[206]:
<seaborn.axisgrid.FacetGrid at 0x1a2c9cb780>

In [35]:
filtered.plot.scatter("prob", "RMSD")


Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a27569080>

In [25]:
out


Out[25]:
name RMSD init_RMSD diff_RMSD folder
0 tr894 1.36968 2.24606 0.87638 980

In [13]:
raw_data_all_2.plot("RMSD", "Rw")


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a23707f60>

In [14]:
raw_data_all_2.plot("RMSD", "pc")


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a23783828>

In [15]:
out


Out[15]:
name RMSD init_RMSD diff_RMSD folder
0 tr894 1.36968 2.24606 0.87638 980

In [16]:
out


Out[16]:
name RMSD init_RMSD diff_RMSD folder
0 tr894 1.36968 2.24606 0.87638 980

In [70]:
all_results


Out[70]:
folder i RMSD Rw biasQ bias Step Chain Shake Chi ... SSB VTotal pc pc2 name chosen result pc_center inrange prob
980 980 47900.0 1.36968 -6834.949362 0.645772 25.095468 0 3.685220 0.0 0.588609 ... 0.0 -656.184416 0.944122 -0.804111 tr894 True best NaN NaN NaN
981 981 48000.0 1.57229 -6850.297788 0.673878 21.271087 0 4.750551 0.0 0.512601 ... 0.0 -659.909501 0.762314 -0.531746 tr894 True picked 2.03 True 0.523816
0 0 0.0 2.24606 -6810.073249 0.556208 39.390241 0 5.546380 0.0 1.160888 ... 0.0 -668.558313 -0.065067 -0.283718 tr894 True init NaN NaN NaN

3 rows × 31 columns


In [13]:
# out.to_csv("/Users/weilu/Desktop/picked_3.csv")

In [26]:
clf.coef_


Out[26]:
array([[ 0.31865771, -0.24574338, -0.00429271, -0.15621297,  0.12086065,
         0.03529636,  0.05114406,  0.06779384,  0.23049113, -0.0941187 ]])

In [14]:
clf.coef_


Out[14]:
array([[ 0.20157408, -0.69485223,  0.04456798]])

In [15]:
# Overlay RMSD per target for each selection strategy, clipped to 0-10 A.
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
fg.set(ylim=(0, 10))


Out[15]:
<seaborn.axisgrid.FacetGrid at 0x1a22576f60>

In [16]:
filtered["name"].unique().shape


Out[16]:
(17,)

In [17]:
picked[["RMSD", "name"]]


Out[17]:
RMSD name
0 5.14331 tr594
1136 4.31091 tr862
1890 3.14792 tr866
2586 1.61058 tr868
3123 8.69691 tr870
3576 2.81351 tr872
4796 3.07719 tr877
5446 2.25775 tr882
6372 4.06919 tr884
6989 2.35937 tr885
7922 1.60893 tr891
9265 1.18208 tr894
10424 4.67184 tr895
10781 8.22359 tr896
14297 3.25991 tr921
14558 3.84193 tr922
16140 5.40097 tr948

In [18]:
# picked.to_csv("/Users/weilu/Desktop/picked_2.csv")

In [19]:
# Load all per-frame metrics for one target and merge them into one frame.
name = "tr894"
# Column order of awsem.log (AWSEM energy terms).
name_list = ["Step", "Chain", "Shake", "Chi", "Rama", "Excluded", "DSSP",
             "P_AP", "Water", "Burial", "Helix", "AMH_Go", "Frag_Mem",
             "Vec_FM", "Membrane", "SSB", "VTotal"]

# You probably want to change the location below.
# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR at the top of the notebook.
#     location = f"/Users/weilu/Research/server/sep_2018/03_week/02_week/{name}/"
location = f"/Users/weilu/Research/server/nov_2018/structure_selection/{name}/"
# Use raw strings for the regex separator: "\s+" in a plain string is an
# invalid escape sequence and warns (eventually errors) on newer Pythons.
RMSD = pd.read_table(location + "rmsd-angstrom.xvg", names=["i", "RMSD"], sep=r"\s+")
bias = pd.read_table(location + "bias.log", names=["i", "biasQ", "bias"], sep=r"\s+").drop("i", axis=1)
awsem = pd.read_table(location + "awsem.log", names=name_list)
rw = pd.read_table(location + "rwplusScore.txt", names=["i", "Rw"], sep=r"\s+").drop("i", axis=1)
# pc location
#     location = f"/Users/weilu/Research/server/sep_2018/03_week/{name}/"
#     location = f"/Users/weilu/Research/server/oct_2018/01_week/{name}/"
pc = pd.read_table(location + "pcarmsd_scaled.txt", names=["i", "pc", "pc2"], sep=r"\s+", comment="#").drop("i", axis=1)
raw_data = pd.concat([RMSD, rw, bias, awsem, pc], axis=1)
# Display with a "folder" column derived from the row index
# (the displayed frame is not stored back into raw_data).
raw_data.assign(name=name).reset_index().rename(columns={"index": "folder"})


Out[19]:
folder i RMSD Rw biasQ bias Step Chain Shake Chi ... Helix AMH_Go Frag_Mem Vec_FM Membrane SSB VTotal pc pc2 name
0 0 0.0 2.24606 -6810.073249 0.556208 39.390241 0 5.546380 0.0 1.160888 ... -11.494550 0.0 -330.532789 0.0 0.0 0.0 -668.558313 -0.065067 -0.283718 tr894
1 1 100.0 3.23173 -6286.940968 0.551209 40.282725 0 2.470745 0.0 0.544233 ... -12.663186 0.0 -332.365779 0.0 0.0 0.0 -661.611254 -2.030413 -1.872569 tr894
2 2 200.0 3.21661 -6289.430903 0.497013 50.599184 0 4.764399 0.0 0.562483 ... -13.188086 0.0 -332.765589 0.0 0.0 0.0 -658.149297 -2.028462 -1.644785 tr894
3 3 300.0 3.15813 -6190.501191 0.528574 44.448412 0 2.255989 0.0 0.468350 ... -13.341891 0.0 -333.111932 0.0 0.0 0.0 -662.850984 -1.765059 -1.880311 tr894
4 4 400.0 3.48106 -6070.736394 0.507673 48.477271 0 4.563115 0.0 0.358890 ... -13.054297 0.0 -332.096669 0.0 0.0 0.0 -655.505067 -2.050421 -0.862438 tr894
5 5 500.0 3.21603 -6207.393138 0.507665 48.478663 0 3.079867 0.0 0.562905 ... -12.983445 0.0 -334.065069 0.0 0.0 0.0 -654.514531 -2.024328 -0.505672 tr894
6 6 600.0 3.26742 -6270.166177 0.515849 46.880384 0 3.598324 0.0 0.689476 ... -12.473662 0.0 -329.046856 0.0 0.0 0.0 -661.071468 -2.225011 -1.368251 tr894
7 7 700.0 3.45545 -6275.141716 0.521223 45.845485 0 3.142972 0.0 0.471721 ... -12.879093 0.0 -330.526926 0.0 0.0 0.0 -660.649964 -1.917079 -0.934712 tr894
8 8 800.0 2.84899 -6353.542762 0.521744 45.745735 0 4.776124 0.0 0.537145 ... -12.874610 0.0 -330.107068 0.0 0.0 0.0 -666.482035 -1.836351 -1.092572 tr894
9 9 900.0 3.43413 -6114.807297 0.567139 37.473775 0 3.616298 0.0 0.553106 ... -13.392355 0.0 -332.670193 0.0 0.0 0.0 -664.643019 -1.984028 -1.763320 tr894
10 10 1000.0 3.39099 -6010.453477 0.536807 42.909507 0 3.596885 0.0 0.391673 ... -13.353049 0.0 -332.451981 0.0 0.0 0.0 -659.813810 -2.215176 0.442220 tr894
11 11 1100.0 3.48153 -6114.438719 0.581163 35.084917 0 2.720963 0.0 0.310693 ... -13.200931 0.0 -331.501808 0.0 0.0 0.0 -664.442638 -2.008016 0.629552 tr894
12 12 1200.0 3.23210 -6101.638205 0.519535 46.169300 0 5.605657 0.0 0.721162 ... -10.424754 0.0 -332.022908 0.0 0.0 0.0 -656.524265 -1.985955 -0.727112 tr894
13 13 1300.0 2.92439 -6059.757243 0.529431 44.287121 0 2.323098 0.0 0.547973 ... -13.361106 0.0 -334.500030 0.0 0.0 0.0 -660.984355 -2.111932 -1.343508 tr894
14 14 1400.0 2.90091 -6306.977750 0.545174 41.373291 0 3.194121 0.0 0.466709 ... -13.974103 0.0 -333.122664 0.0 0.0 0.0 -664.931939 -1.862049 -0.307235 tr894
15 15 1500.0 3.44498 -6054.804323 0.481283 53.813425 0 4.393513 0.0 0.413912 ... -10.416986 0.0 -329.519729 0.0 0.0 0.0 -640.413959 -1.964237 -0.545008 tr894
16 16 1600.0 3.72309 -5924.173405 0.517176 46.623769 0 4.403761 0.0 0.530923 ... -12.445575 0.0 -332.027049 0.0 0.0 0.0 -652.482812 -2.177049 -0.822812 tr894
17 17 1700.0 2.94234 -6170.545926 0.605790 31.080307 0 3.403110 0.0 0.394039 ... -13.070255 0.0 -334.764821 0.0 0.0 0.0 -662.858390 -1.977594 -0.605940 tr894
18 18 1800.0 3.14126 -6104.911588 0.565975 37.675492 0 2.148955 0.0 0.416200 ... -14.408324 0.0 -332.427927 0.0 0.0 0.0 -657.525813 -2.012379 -0.666930 tr894
19 19 1900.0 3.03640 -6248.712283 0.549830 40.530526 0 3.192358 0.0 0.373018 ... -12.509658 0.0 -331.034803 0.0 0.0 0.0 -652.148108 -1.860132 -0.951803 tr894
20 20 2000.0 2.80779 -6441.963711 0.556436 39.349743 0 2.222043 0.0 0.423124 ... -12.922997 0.0 -334.503676 0.0 0.0 0.0 -661.244525 -1.888069 -0.825117 tr894
21 21 2100.0 3.31259 -6410.346460 0.513748 47.288295 0 6.191025 0.0 0.604926 ... -12.253130 0.0 -332.203648 0.0 0.0 0.0 -651.160377 -2.001888 -0.592104 tr894
22 22 2200.0 2.91335 -6308.221083 0.593916 32.980882 0 3.234118 0.0 0.664409 ... -13.207077 0.0 -332.990401 0.0 0.0 0.0 -658.201126 -2.152202 -0.717611 tr894
23 23 2300.0 3.19933 -6374.911948 0.539413 42.428153 0 2.538853 0.0 0.341001 ... -13.408200 0.0 -331.572188 0.0 0.0 0.0 -662.264580 -2.190752 -0.847559 tr894
24 24 2400.0 2.94005 -6354.892279 0.603693 31.411886 0 1.888008 0.0 0.590554 ... -13.759548 0.0 -334.756167 0.0 0.0 0.0 -673.536860 -2.020588 -0.800902 tr894
25 25 2500.0 3.24435 -6448.101987 0.570527 36.889392 0 2.534727 0.0 0.389001 ... -12.327469 0.0 -332.104002 0.0 0.0 0.0 -664.744860 -1.970641 -1.751093 tr894
26 26 2600.0 3.21349 -6356.016826 0.552069 40.128420 0 3.240573 0.0 0.340624 ... -14.062783 0.0 -331.951360 0.0 0.0 0.0 -669.486651 -2.017166 -1.114400 tr894
27 27 2700.0 3.08817 -6015.780813 0.543925 41.600811 0 2.849521 0.0 0.563807 ... -11.707489 0.0 -333.438918 0.0 0.0 0.0 -656.923779 -2.079045 -1.713251 tr894
28 28 2800.0 2.91854 -6266.414740 0.570673 36.864257 0 2.305444 0.0 0.542672 ... -13.504228 0.0 -336.875255 0.0 0.0 0.0 -661.990310 -1.959546 -1.258906 tr894
29 29 2900.0 3.51806 -6095.122990 0.575797 35.989700 0 2.400738 0.0 0.481813 ... -12.959278 0.0 -333.359501 0.0 0.0 0.0 -662.430279 -2.148657 -0.842376 tr894
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2475 2475 47100.0 3.04801 -6584.548221 0.528828 44.400564 0 3.443490 0.0 0.689302 ... -8.527353 0.0 -328.333283 0.0 0.0 0.0 -656.799558 1.489679 -1.073833 tr894
2476 2476 47200.0 3.06231 -6524.537951 0.501176 49.765102 0 4.955148 0.0 0.473575 ... -7.623936 0.0 -324.707280 0.0 0.0 0.0 -648.310152 1.433943 -0.830940 tr894
2477 2477 47300.0 3.28581 -6530.410132 0.574789 36.160841 0 4.166641 0.0 0.671689 ... -9.461912 0.0 -327.996908 0.0 0.0 0.0 -661.264516 1.390436 -1.208435 tr894
2478 2478 47400.0 3.23796 -6505.011426 0.455993 59.188681 0 7.585024 0.0 0.630827 ... -6.166685 0.0 -318.313982 0.0 0.0 0.0 -633.543536 1.532387 -1.189597 tr894
2479 2479 47500.0 3.47343 -6412.675255 0.509082 48.200074 0 4.337123 0.0 0.533898 ... -8.175169 0.0 -321.367115 0.0 0.0 0.0 -641.175122 1.170308 -0.850157 tr894
2480 2480 47600.0 3.22195 -6600.284916 0.546451 41.141258 0 4.499017 0.0 0.531768 ... -8.919736 0.0 -328.276578 0.0 0.0 0.0 -656.319864 1.024558 -1.633348 tr894
2481 2481 47700.0 3.29628 -6594.057433 0.450815 60.320799 0 5.516787 0.0 0.532782 ... -6.014002 0.0 -324.793170 0.0 0.0 0.0 -643.313596 1.289813 -1.317772 tr894
2482 2482 47800.0 3.22039 -6538.294794 0.492895 51.431169 0 4.771708 0.0 0.643649 ... -10.536290 0.0 -327.516119 0.0 0.0 0.0 -654.529729 0.546995 -1.410352 tr894
2483 2483 47900.0 3.71419 -6429.439945 0.486252 52.787339 0 4.245413 0.0 0.423871 ... -4.541977 0.0 -327.570965 0.0 0.0 0.0 -650.587467 0.814162 -1.430980 tr894
2484 2484 48000.0 3.19131 -6477.936224 0.522025 45.692109 0 4.294322 0.0 0.558186 ... -7.184502 0.0 -326.483925 0.0 0.0 0.0 -651.265310 1.539578 -1.387833 tr894
2485 2485 48100.0 3.07537 -6498.204684 0.521062 45.876390 0 4.032237 0.0 0.661476 ... -9.748129 0.0 -325.586983 0.0 0.0 0.0 -651.190938 1.369766 -0.825550 tr894
2486 2486 48200.0 3.37094 -6374.215012 0.539856 42.346474 0 3.503117 0.0 0.476007 ... -6.907826 0.0 -326.781573 0.0 0.0 0.0 -650.287189 1.779587 -0.777044 tr894
2487 2487 48300.0 3.28947 -6462.479014 0.529538 44.266976 0 4.450656 0.0 0.778137 ... -9.720084 0.0 -327.426579 0.0 0.0 0.0 -655.430363 1.632520 -1.162894 tr894
2488 2488 48400.0 3.32840 -6504.263804 0.487773 52.475263 0 3.823074 0.0 0.541481 ... -9.621493 0.0 -328.796522 0.0 0.0 0.0 -648.395792 0.591862 -1.299183 tr894
2489 2489 48500.0 3.43751 -6447.190797 0.507673 48.477255 0 5.101223 0.0 0.543194 ... -8.555008 0.0 -326.452097 0.0 0.0 0.0 -648.549022 0.661395 -1.650177 tr894
2490 2490 48600.0 3.82345 -6365.246694 0.495035 50.997870 0 3.198458 0.0 0.629422 ... -9.256821 0.0 -327.410325 0.0 0.0 0.0 -647.431837 -0.195077 -2.383967 tr894
2491 2491 48700.0 3.62716 -6430.212535 0.492129 51.586661 0 4.407783 0.0 0.580651 ... -8.285745 0.0 -323.436894 0.0 0.0 0.0 -646.712158 0.334757 -1.955750 tr894
2492 2492 48800.0 3.25856 -6733.348524 0.528495 44.463321 0 4.332744 0.0 0.787561 ... -10.086368 0.0 -326.230732 0.0 0.0 0.0 -653.059823 0.900004 -1.994071 tr894
2493 2493 48900.0 3.31050 -6597.765990 0.501219 49.756485 0 4.567093 0.0 0.523651 ... -9.982728 0.0 -325.830854 0.0 0.0 0.0 -652.938934 1.253588 -1.430779 tr894
2494 2494 49000.0 3.41650 -6464.407265 0.496882 50.625585 0 4.832824 0.0 0.524553 ... -8.364190 0.0 -325.062802 0.0 0.0 0.0 -650.974420 1.921487 -1.013259 tr894
2495 2495 49100.0 3.10904 -6387.091755 0.548666 40.740404 0 3.316020 0.0 0.581348 ... -10.967092 0.0 -328.988742 0.0 0.0 0.0 -659.262743 1.190546 -1.329242 tr894
2496 2496 49200.0 3.08787 -6544.665697 0.542510 41.859331 0 5.020698 0.0 0.864652 ... -9.712455 0.0 -326.961086 0.0 0.0 0.0 -653.968884 1.103267 -1.117054 tr894
2497 2497 49300.0 3.08903 -6565.835977 0.519321 46.210480 0 4.056949 0.0 0.782764 ... -10.167152 0.0 -325.731478 0.0 0.0 0.0 -649.584897 1.347216 -1.148235 tr894
2498 2498 49400.0 2.86923 -6705.263150 0.478287 54.436844 0 5.751120 0.0 0.550526 ... -8.520623 0.0 -326.361710 0.0 0.0 0.0 -647.484556 0.969805 -1.272144 tr894
2499 2499 49500.0 3.75897 -6463.716890 0.445395 61.517373 0 6.437783 0.0 0.522887 ... -8.618340 0.0 -320.399867 0.0 0.0 0.0 -635.290484 1.147018 -1.649743 tr894
2500 2500 49600.0 3.39286 -6501.433057 0.485665 52.908148 0 4.294195 0.0 0.505823 ... -10.501468 0.0 -323.573172 0.0 0.0 0.0 -647.184033 1.221193 -1.348389 tr894
2501 2501 49700.0 3.32162 -6391.616071 0.532840 43.647727 0 4.949311 0.0 0.640128 ... -9.125481 0.0 -329.320986 0.0 0.0 0.0 -656.508771 0.993919 -0.986276 tr894
2502 2502 49800.0 3.28345 -6563.961634 0.519311 46.212290 0 5.139785 0.0 0.632014 ... -9.679081 0.0 -325.602523 0.0 0.0 0.0 -647.098837 0.616816 -0.979727 tr894
2503 2503 49900.0 3.04609 -6472.654667 0.524059 45.303913 0 5.039938 0.0 0.796512 ... -9.205125 0.0 -324.496863 0.0 0.0 0.0 -647.414385 1.401103 -0.722207 tr894
2504 2504 50000.0 3.99691 -6555.354241 0.470289 56.118701 0 3.803515 0.0 0.692461 ... -8.788567 0.0 -329.318347 0.0 0.0 0.0 -652.710108 0.573119 -1.284276 tr894

2505 rows × 26 columns


In [ ]: