In [2]:
# written in python3
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime
import seaborn as sns
%matplotlib inline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Sklearn pipeline step that selects a subset of DataFrame columns.

    transform() returns the selected columns as a plain numpy array so the
    downstream StandardScaler / PolynomialFeatures steps can consume them.
    """
    def __init__(self, attribute_names):
        # attribute_names: list of column names to extract.
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self
    def transform(self, X):
        return X[self.attribute_names].values

plt.rcParams['figure.figsize'] = (10,6.180)    # default figure size; 10 / 6.18 approximates the golden ratio


def my_transform(data, label, degree, FEATURES):
    """Scale FEATURES, expand them polynomially, and append the label column.

    Returns a numpy array whose columns are the standardized polynomial
    features followed by the raw `label` column as the last column.
    """
    feature_pipeline = Pipeline([
        ('selector', DataFrameSelector(FEATURES)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
    ])
    # The label passes through untouched; it is only selected.
    label_pipeline = Pipeline([
        ('selector', DataFrameSelector([label])),
    ])
    combined = FeatureUnion(transformer_list=[
        ("num_pipeline", feature_pipeline),
        ("cat_pipeline", label_pipeline),
    ])
    return combined.fit_transform(data)

Helper functions for reading simulation data from disk.


In [5]:
# read energy, rw, bias, rmsd data from location
def read_data(name, base_location="/Users/weilu/Research/server/dec_2018/structure_selection"):
    """Read energy, Rw, bias, RMSD and Qw tables for one simulation folder.

    Parameters
    ----------
    name : str
        Folder name, e.g. "tr872".
    base_location : str, optional
        Directory containing the per-protein folders. Previously hard-coded;
        the default preserves the old behavior for existing callers.

    Returns
    -------
    pd.DataFrame
        One row per frame; 'folder' holds the original row index and 'name'
        identifies the protein.
    """
    name_list = ["Step", "Chain", "Shake", "Chi", "Rama", "Excluded", "DSSP",
                 "P_AP", "Water", "Burial", "Helix", "AMH_Go", "Frag_Mem",
                 "Vec_FM", "Membrane", "SSB", "VTotal"]
    location = f"{base_location}/{name}/"
    # Raw strings for the whitespace-regex separator avoid the invalid
    # escape-sequence deprecation on "\s+".
    RMSD = pd.read_table(location+"rmsd.xvg", names=["i", "RMSD"], sep=r"\s+")
    bias = pd.read_table(location+"bias.log", names=["i", "biasQ", "bias"], sep=r"\s+").drop("i", axis=1)
    awsem = pd.read_table(location+"awsem.log", names=name_list)
    rw = pd.read_table(location+"rwplusScore.txt", names=["i", "Rw"], sep=r"\s+").drop("i", axis=1)
    qw = pd.read_table(location+"Qw.out", names=["i", "Qw"], sep=r"\s+").drop("i", axis=1)
    pc = pd.read_table(location+"pcarmsd_scaled.txt", names=["i", "pc", "pc2"], sep=r"\s+", comment="#").drop("i", axis=1)
    raw_data = pd.concat([RMSD, rw, bias, qw, awsem, pc], axis=1)
    return raw_data.assign(name=name).reset_index().rename(columns={"index": "folder"})

def choose_top(data,col="RMSD", n=5, ascending=True):
    """Add a boolean 'chosen' column marking the top-n rows ranked by `col`.

    Dense ranking is used, so tied values share a rank and more than n rows
    may be chosen when values repeat.
    """
    ranks = data[col].rank(ascending=ascending, method='dense')
    return data.assign(chosen=ranks <= n)

# read the pmf, rc. 
# def read_data_2(name):
# #     name = "tr894"
# #     location = f"/Users/weilu/Research/server/sep_2018/03_week/{name}/"
# #     location = f"/Users/weilu/Research/server/oct_2018/01_week/{name}/"
#     location = f"/Users/weilu/Research/server/dec_2018/structure_selection/{name}/"
#     rw = pd.read_table(location+"rc_rwplus", names=["pc","rw"], sep="\s+")
#     rmsd = pd.read_table(location+"rc_rmsdlowerBound", names=["pc", "rmsd"], sep="\s+")
#     awsem = pd.read_table(location+"rc_awsemEne", names=["pc", "awsem"], sep="\s+")
#     qw = pd.read_table(location+"rc_QwhigherBound", names=["pc", "qw"], sep="\s+")
#     freeE = pd.read_table(location+"pmf3000"
#                           , names=["pc", "f", "remove1", "remove2"], sep="\s+").drop(["remove1", "remove2"], axis=1)
#     raw_data = freeE.merge(rw, on="pc").merge(awsem, on="pc").merge(qw, on="pc").merge(rmsd, on="pc").assign(name=name)
#     return raw_data

def read_data_2(name, base_location="/Users/weilu/Research/server/dec_2018/structure_selection"):
    """Read the free-energy profile and reaction-coordinate averages for one folder.

    Merges the pmf, Rw, AWSEM energy, Qw and RMSD tables on the 'pc'
    reaction coordinate and tags every row with the protein name.

    Parameters
    ----------
    name : str
        Folder name, e.g. "tr894".
    base_location : str, optional
        Directory containing the per-protein folders. Previously hard-coded;
        the default preserves the old behavior for existing callers.
    """
    location = f"{base_location}/{name}/"
    # Raw strings avoid invalid-escape warnings on the "\s+" regex separator.
    rw = pd.read_table(location+"rc_rwplus", names=["pc", "rw"], sep=r"\s+")
    rmsd = pd.read_table(location+"rc_rmsdlowerBound", names=["pc", "rmsd"], sep=r"\s+")
    awsem = pd.read_table(location+"rc_awsemEne", names=["pc", "awsem"], sep=r"\s+")
    qw = pd.read_table(location+"rc_QwhigherBound", names=["pc", "qw"], sep=r"\s+")
    freeE = pd.read_table(location+"pmf3000",
                          names=["pc", "f", "remove1", "remove2"],
                          sep=r"\s+").drop(["remove1", "remove2"], axis=1)
    raw_data = freeE.merge(rw, on="pc").merge(awsem, on="pc").merge(qw, on="pc").merge(rmsd, on="pc").assign(name=name)
    return raw_data

Train a selector based on free energy, Rw score, and AWSEM energy.


In [6]:
# Read free-energy / reaction-coordinate data for every target folder and
# mark the lowest-rmsd bin per target.
# NOTE(review): the first folder_list assignment below is dead code — it is
# immediately overwritten by the second assignment.
# folder_list = ["tr894", "tr882", "tr594", "tr898", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
folder_list = ["tr884-halfDIHE", "tr872-halfDIHE", "tr948-halfDIHE", "tr898", "tr947", "tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
folder_list = ["tr884-halfDIHE", "tr872-halfDIHE", "tr898", "tr947", "tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]


# folder_list = [ "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# "tr877","tr884", "tr922"
# "tr869"
# folder_list = ["tr894"]
# read all data

# tr884-halfDIHE
# tr872-halfDIHE
# tr948-halfDIHE
# NOTE(review): the saved traceback below shows rc_QwhigherBound missing for
# tr884-halfDIHE, so this loop failed with the halfDIHE folders included.
data_list = []
for name in folder_list:
    tmp = read_data_2(name)
    data_list.append(tmp)
raw_data_all = pd.concat(data_list)
n = 1
# 'chosen' marks the n lowest-rmsd rows per protein (dense ranking).
raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(choose_top, n=n, col="rmsd").reset_index(drop=True)


# train_name_list = ["tr872", "tr885", "tr948"]
# train_name_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]

# train_name_list = ["tr870"]
# train_name_list = ["tr891"]
# train_name_list = ["tr882"]
# train_name_list = ["tr894"]
# train_name_list = ["tr872"]
# train_name_list = ["tr869"]
# train_name_list = ["tr884"]
# train_name_list = ["tr866", "tr884"]
# train_name_list = ["tr870", "tr872"]
# train_name_list = ["tr866", "tr947"]
# train_name_list = ["tr872"]
# train_name_list = ["tr884", "tr872"]
train_name_list = ["tr866"]
# train_name_list = ["tr947"]
# select for training.
raw_data = raw_data_all.reset_index(drop=True).query(f'name in {train_name_list}')


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-6-85c2cf7e15c2> in <module>()
     16 data_list = []
     17 for name in folder_list:
---> 18     tmp = read_data_2(name)
     19     data_list.append(tmp)
     20 raw_data_all = pd.concat(data_list)

<ipython-input-5-2f3855ee1f37> in read_data_2(name)
     31     rmsd = pd.read_table(location+"rc_rmsdlowerBound", names=["pc", "rmsd"], sep="\s+")
     32     awsem = pd.read_table(location+"rc_awsemEne", names=["pc", "awsem"], sep="\s+")
---> 33     qw = pd.read_table(location+"rc_QwhigherBound", names=["pc", "qw"], sep="\s+")
     34     freeE = pd.read_table(location+"pmf3000"
     35                           , names=["pc", "f", "remove1", "remove2"], sep="\s+").drop(["remove1", "remove2"], axis=1)

~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)
    676                     skip_blank_lines=skip_blank_lines)
    677 
--> 678         return _read(filepath_or_buffer, kwds)
    679 
    680     parser_f.__name__ = name

~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    438 
    439     # Create the parser.
--> 440     parser = TextFileReader(filepath_or_buffer, **kwds)
    441 
    442     if chunksize or iterator:

~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    785             self.options['has_index_names'] = kwds['has_index_names']
    786 
--> 787         self._make_engine(self.engine)
    788 
    789     def close(self):

~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1012     def _make_engine(self, engine='c'):
   1013         if engine == 'c':
-> 1014             self._engine = CParserWrapper(self.f, **self.options)
   1015         else:
   1016             if engine == 'python':

~/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1706         kwds['usecols'] = self.usecols
   1707 
-> 1708         self._reader = parsers.TextReader(src, **kwds)
   1709 
   1710         passed_names = self.names is None

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

FileNotFoundError: File b'/Users/weilu/Research/server/dec_2018/structure_selection/tr884-halfDIHE/rc_QwhigherBound' does not exist

In [4]:
# Feature/label configuration for the free-energy-based regression below.
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = ["f",
    'rw',
     'awsem',
#     'RMSD', # test
#      'Burial',
#      'Water',
#      'Rama',
#      'DSSP',
#      'P_AP',
#      'Helix',
#      'Frag_Mem'
               ]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
# LABEL = "rmsd"   # dead store removed: it was immediately overwritten by "qw"
LABEL = "qw"
DEGREE = 1

def pred_from_raw(a, clf):
    # Transform one protein's frame with the global FEATURES/LABEL/DEGREE
    # settings and attach the regressor's prediction as a new column.
    # NOTE(review): "prediceted_rmsd" is a typo for "predicted_rmsd", but the
    # column name is kept because later cells select on this exact name.
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:,-1]   # label column (not used here)
    test_set = data[:,:-1]
    prediceted_rmsd= clf.predict(test_set)
    return a.assign(prediceted_rmsd=prediceted_rmsd)

def assign_lowest_f(a):
    """Attach the group's minimum free energy as a constant 'lowest_f' column."""
    # .min() replaces sort_values().iloc[0]: O(n) instead of O(n log n),
    # same result (both skip NaN values).
    return a.assign(lowest_f=a["f"].min())

In [5]:
raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(assign_lowest_f).reset_index(drop=True)

In [6]:
# # data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# # data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
# data = np.concatenate(raw_data.groupby('name').apply(my_transform, 
#                                                      label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
# train_y = data[:,-1]
# train_set = data[:,:-1]
# from sklearn import svm
# # clf = svm.SVC(probability=True)
# clf = LinearRegression()
# clf.fit(train_set, train_y)
# y_pred_svm = clf.predict(train_set)

# raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(pred_from_raw, clf).reset_index(drop=True)
# # raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(assign_lowest_f).reset_index(drop=True)



# picked_n = 1
# best = raw_data_all.groupby("name").apply(choose_top, col="rmsd"
#                                             , n=picked_n, ascending=True).reset_index(drop=True).query("chosen==True")
# picked = raw_data_all.groupby("name").apply(choose_top, col="prediceted_rmsd"
#                                             , n=picked_n, ascending=True).reset_index(drop=True).query("chosen==True")
# # init = raw_data_all.query("i == 0.0")
# all_results = pd.concat([best.assign(result='best'), 
#                          picked.assign(result='picked')])

# picked_keep = picked.copy()

In [7]:
from scipy.interpolate import interp1d

# Build one interpolator of the free-energy profile f(pc) per protein,
# extrapolating outside the sampled pc range.
f_dic = {}
for name in folder_list:
    profile = raw_data_all.query(f"name == '{name}'")
    f_dic[name] = interp1d(profile["pc"].values, profile["f"].values,
                           fill_value="extrapolate")

In [8]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "f")
# plt.ylim([0,1])

In [9]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "f")
# g = g.map(plt.plot, "pc", "prediceted_rmsd")

In [10]:
# raw_data_all.query("name == 'tr594'").plot("pc", "f")

In [11]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "rmsd")
# plt.ylim([0,1])

Use picked_keep to filter the complete data and select again.


In [12]:
f_dic["tr594"](raw_data_all["pc"]).shape


Out[12]:
(526,)

In [13]:
def choose_top(data,col="RMSD", n=5, ascending=True):
    """Add a boolean 'chosen' column marking the top-n rows ranked by `col`.

    Ties are broken by row order (method='first'), so exactly n rows are
    chosen even when values repeat. This intentionally redefines the earlier
    choose_top, which used dense ranking.
    """
    rank_by_order = data[col].rank(ascending=ascending, method='first')
    return data.assign(chosen=rank_by_order <= n)


# WIDTH = 100
# WIDTH = 0.1
# WIDTH = 1
# WIDTH = 0.2
# def with_in_range(data, width=WIDTH):
#     return data.assign(inrange= (data["pc"] < (data["pc_center"]+width)) & (data["pc"] > (data["pc_center"]-width)))

def with_in_range(data, width=5):
    """Flag rows whose interpolated free energy lies strictly in (0, width).

    Looks up the per-protein interpolator in the global f_dic; assumes every
    row in `data` belongs to a single protein (true under groupby("name")).
    Adds a boolean 'inrange' column.
    """
    name = data["name"].iloc[0]
    # Evaluate the interpolator once instead of twice (the original called
    # f_dic[name](data["pc"]) for each side of the comparison).
    f_values = f_dic[name](data["pc"])
    return data.assign(inrange=(0 < f_values) & (f_values < width))

In [14]:
# Read per-frame simulation data for every target and mark the top-n frames
# by the chosen quality metric.
folder_list = ["tr898", "tr869", "tr947", "tr894", "tr882", "tr594", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# "tr898"
# folder_list = ["tr894", "tr882", "tr594", "tr898", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = [ "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# "tr877","tr884", "tr922"
# "tr869"
# folder_list = ["tr894"]
# folder_list = ["tr866"]


# define top based on RMSD or Qw
# best_metric = "RMSD"
best_metric = "Qw"
# Higher Qw is better, lower RMSD is better — hence the ranking direction.
if best_metric == "Qw":
    isAscending = False
else:
    isAscending = True
data_list = []
for name in folder_list:
    tmp = read_data(name)
    data_list.append(tmp)
raw_data_all_2 = pd.concat(data_list).dropna()
n = 25
# 'chosen' marks the 25 best frames per protein by best_metric.
raw_data_all_2 = raw_data_all_2.reset_index(drop=True).groupby("name").\
        apply(choose_top, n=n, col=best_metric, ascending=isAscending).reset_index(drop=True)


raw_data = raw_data_all_2.reset_index(drop=True).query(f'name in {train_name_list}').dropna()
# a = raw_data_all_2.dropna().merge(picked_keep[["pc", "name"]].rename(columns={"pc":"pc_center"}),on="name")
a = raw_data_all_2.dropna()
# Keep only frames whose interpolated free energy lies in (0, width).
filtered = a.groupby("name").apply(with_in_range).query("inrange == True").reset_index(drop=True)

In [15]:
filtered.shape


Out[15]:
(38834, 29)

In [16]:
a.shape


Out[16]:
(50096, 28)

In [17]:
# Train a logistic-regression frame selector on the training targets, score
# every in-range frame of every target, then compare selection strategies
# against the best / worst / initial frames.
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = [
    "biasQ",
    'Rw',
     'VTotal',
#     'RMSD', # test
#     'Qw',
#      'Burial',
#      'Water',
#      'Rama',
#      'DSSP',
#      'P_AP',
#      'Helix',
#      'Frag_Mem'
               ]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "chosen"
DEGREE = 1

def pred_from_raw(a):
    # NOTE(review): redefines the earlier pred_from_raw(a, clf); this
    # classifier variant reads `clf` from the global namespace instead.
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:,-1]   # label column (not used here)
    test_set = data[:,:-1]
    prob= clf.predict_proba(test_set)[:,1]   # P(frame is a "chosen" top frame)
    return a.assign(prob=prob)

# data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
# Transform each training protein separately so scaling is per-protein.
data = np.concatenate(raw_data.groupby('name').apply(my_transform, 
                                                     label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
train_y = data[:,-1]
train_set = data[:,:-1]

# clf = svm.SVC(probability=True)
# p = 0.01
# clf = LogisticRegression(random_state=27, class_weight={0:p, 1:(1-p)})
clf = LogisticRegression(random_state=27)
clf.fit(train_set, train_y)

filtered = filtered.reset_index(drop=True).groupby("name").apply(pred_from_raw).reset_index(drop=True)


picked_n = 1   # NOTE(review): unused below; n=1 / n=5 are hard-coded instead
best = raw_data_all_2.groupby("name").apply(choose_top, col="RMSD"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
# if True:
# Strategy 1: single frame with the highest predicted probability.
picked_1 = filtered.groupby("name").apply(choose_top, col="prob"
                                        , n=1, ascending=False).reset_index(drop=True).query("chosen==True")

# if False:
# Strategy 2: top 5 by probability, then the best true RMSD among those 5
# (uses the real RMSD, so it is an oracle-assisted upper bound).
picked = filtered.groupby("name").apply(choose_top, col="prob"
                                            , n=5, ascending=False).reset_index(drop=True).query("chosen==True")
picked = picked.groupby("name").apply(choose_top, col="RMSD"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
worst = filtered.groupby("name").apply(choose_top, col="RMSD"
                                            , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
init = raw_data_all_2.groupby("name").apply(choose_top, col="i"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'), 
                         picked.assign(result='picked'), init.assign(result='init')
                        , worst.assign(result='worst')
                        ], sort=False)
# all_results = pd.concat([best.assign(result='best'), 
#                          picked.assign(result='picked')])
# picked.to_csv("/Users/weilu/Desktop/picked.csv

# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
# fg.set(ylim=(0, 10))


Out[17]:
<seaborn.axisgrid.FacetGrid at 0x103929160>

In [23]:
# Combine every selection strategy into one table for comparison/export.
all_results = pd.concat([picked_1.assign(result="picked_top1"), picked.assign(result='picked_top5'),
                         best.assign(result='best'), 
                          init.assign(result='init'),
                         worst.assign(result='worst'), 
                        ], sort=False)

In [25]:
picked.shape


Out[25]:
(20, 30)

In [32]:
all_results.reindex(columns=my_reorder(all_results.columns, ["name", "RMSD", "result"])) .reset_index(drop=True).to_csv("/Users/weilu/Desktop/selection_result.csv")

In [28]:
def my_reorder(a, first):
    """Return the names in `first` followed by the remaining names of `a`.

    Relative order is preserved: `first` keeps its own order, and the rest
    of `a` follows in original order. Neither input is mutated.
    """
    leading = list(first)
    trailing = [col for col in a if col not in first]
    return leading + trailing

In [31]:
all_results.reindex(columns=my_reorder(all_results.columns, ["name", "RMSD", "result"]))


Out[31]:
name RMSD result folder i Rw biasQ bias Qw Step ... Frag_Mem Vec_FM Membrane SSB VTotal pc pc2 chosen inrange prob
1917 tr594 5.71845 picked_top1 2028 2400.0 -9655.812091 0.562631 38.258352 0.397447 0.0 ... -542.733446 0.0 0.0 0.0 -976.169211 1.783749 -0.897284 True True 0.336318
3098 tr862 6.21427 picked_top1 716 21500.0 -14017.862442 0.795115 8.395546 0.461948 0.0 ... -860.309802 0.0 0.0 0.0 -1467.578457 -1.316991 -2.854982 True True 0.184471
5040 tr866 1.65094 picked_top1 436 43600.0 -17053.726578 0.628532 27.597682 0.834830 0.0 ... -730.772207 0.0 0.0 0.0 -1442.122133 -0.087063 3.699315 True True 0.220490
7964 tr868 2.55439 picked_top1 1009 700.0 -19002.022160 0.515777 46.894413 0.759775 0.0 ... -1184.431369 0.0 0.0 0.0 -1896.962431 0.139004 -0.381850 True True 0.419466
9181 tr869 12.30950 picked_top1 0 0.0 -17101.278236 0.327521 90.445532 0.340709 0.0 ... -745.292489 0.0 0.0 0.0 -1364.887693 -0.000178 -0.048953 True True 0.287940
11347 tr870 8.98989 picked_top1 27 2700.0 -18766.397302 0.366368 80.297996 0.325325 0.0 ... -919.447094 0.0 0.0 0.0 -1663.622985 -1.937195 -0.884915 True True 0.021734
13892 tr872 3.88397 picked_top1 130 13000.0 -12597.510264 0.673965 21.259789 0.654242 0.0 ... -546.317159 0.0 0.0 0.0 -1131.645842 0.635999 -1.004592 True True 0.232302
15145 tr877 3.00711 picked_top1 0 0.0 -24114.108027 0.568868 37.174966 0.648838 0.0 ... -821.245402 0.0 0.0 0.0 -1699.528374 -0.055574 -0.014200 True True 0.773073
18607 tr882 2.75337 picked_top1 1585 8200.0 -12227.807526 0.648724 24.679000 0.790488 0.0 ... -630.005603 0.0 0.0 0.0 -1123.346564 -0.326972 0.008144 True True 0.257559
20532 tr884 5.42798 picked_top1 1448 44600.0 -9418.042083 0.530488 44.088316 0.496556 0.0 ... -492.254269 0.0 0.0 0.0 -924.112214 0.739004 -2.026302 True True 0.231153
22564 tr885 3.18402 picked_top1 1065 6300.0 -19678.210041 0.713528 16.413273 0.723373 0.0 ... -879.121171 0.0 0.0 0.0 -1671.835269 0.400841 -0.236901 True True 0.125852
25261 tr891 1.91208 picked_top1 1709 20600.0 -18636.734867 0.645838 25.086161 0.814518 0.0 ... -811.315247 0.0 0.0 0.0 -1634.043036 0.290187 0.469162 True True 0.623671
26508 tr894 2.00769 picked_top1 904 40300.0 -6759.374211 0.667682 22.087109 0.764641 0.0 ... -339.543577 0.0 0.0 0.0 -671.963552 -0.318981 -0.765288 True True 0.385394
28827 tr895 4.87177 picked_top1 992 49100.0 -20488.493033 0.646876 24.939243 0.571973 0.0 ... -901.913395 0.0 0.0 0.0 -1650.549327 -0.933751 0.277437 True True 0.225033
29502 tr896 8.62961 picked_top1 3 300.0 -11703.621241 0.461786 57.934848 0.405691 0.0 ... -534.883801 0.0 0.0 0.0 -989.943449 -1.047856 0.015758 True True 0.497582
31988 tr898 13.46090 picked_top1 729 22800.0 -19042.795684 0.568624 37.217128 0.398104 0.0 ... -783.469742 0.0 0.0 0.0 -1469.264433 -2.546374 -2.851364 True True 0.170833
33058 tr921 3.24877 picked_top1 828 32700.0 -21881.858026 0.680476 20.419091 0.624908 0.0 ... -1871.646575 0.0 0.0 0.0 -2764.790089 0.245506 -0.892383 True True 0.165306
34861 tr922 3.68668 picked_top1 1011 900.0 -9636.845094 0.651477 24.293703 0.606792 0.0 ... -620.135642 0.0 0.0 0.0 -1010.273657 -0.056487 1.712977 True True 0.241481
36552 tr947 9.56921 picked_top1 481 48100.0 -27891.079857 0.610006 30.418991 0.482850 0.0 ... -1263.925386 0.0 0.0 0.0 -2108.005624 -2.046083 0.185091 True True 0.209786
37373 tr948 6.42138 picked_top1 810 30900.0 -28698.131138 0.581505 35.027568 0.603420 0.0 ... -1488.304814 0.0 0.0 0.0 -2589.281395 -0.825228 0.477354 True True 0.596875
0 tr594 5.71845 picked_top5 2028 2400.0 -9655.812091 0.562631 38.258352 0.397447 0.0 ... -542.733446 0.0 0.0 0.0 -976.169211 1.783749 -0.897284 True True 0.336318
8 tr862 5.91362 picked_top5 840 33900.0 -14298.952979 0.792131 8.641868 0.457493 0.0 ... -857.960775 0.0 0.0 0.0 -1460.246571 -0.883616 -2.363559 True True 0.176140
12 tr866 1.52312 picked_top5 296 29600.0 -16825.468015 0.627247 27.789014 0.867941 0.0 ... -729.065574 0.0 0.0 0.0 -1438.226706 -0.323353 4.008893 True True 0.149921
16 tr868 1.94233 picked_top5 772 27100.0 -18689.078432 0.534715 43.298055 0.804022 0.0 ... -1144.918529 0.0 0.0 0.0 -1868.985796 -0.338582 -0.103741 True True 0.328877
20 tr869 12.30950 picked_top5 0 0.0 -17101.278236 0.327521 90.445532 0.340709 0.0 ... -745.292489 0.0 0.0 0.0 -1364.887693 -0.000178 -0.048953 True True 0.287940
26 tr870 8.81563 picked_top5 129 12900.0 -20157.706928 0.630429 27.316528 0.324614 80000000.0 ... -985.958008 0.0 0.0 0.0 -1635.238984 -1.121479 -0.345961 True True 0.020210
31 tr872 3.38775 picked_top5 141 14100.0 -12447.029453 0.676140 20.977116 0.667752 0.0 ... -546.638457 0.0 0.0 0.0 -1131.691792 0.641274 -0.860320 True True 0.220697
35 tr877 3.00711 picked_top5 0 0.0 -24114.108027 0.568868 37.174966 0.648838 0.0 ... -821.245402 0.0 0.0 0.0 -1699.528374 -0.055574 -0.014200 True True 0.773073
44 tr882 2.28804 picked_top5 1965 46200.0 -11916.930776 0.636862 26.373853 0.759764 0.0 ... -633.406934 0.0 0.0 0.0 -1132.496525 -0.322323 0.313275 True True 0.212781
49 tr884 4.60271 picked_top5 1851 34800.0 -9350.865722 0.537420 42.796012 0.571265 0.0 ... -492.137189 0.0 0.0 0.0 -916.994515 0.261957 -0.655306 True True 0.138139
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
25047 tr885 2.35937 init 0 0.0 -20598.856085 0.685012 19.843456 0.830205 0.0 ... -870.801921 0.0 0.0 0.0 -1645.527867 -0.037832 0.039669 True NaN NaN
27552 tr891 1.60893 init 0 0.0 -18747.275083 0.620600 28.788908 0.860321 0.0 ... -820.247037 0.0 0.0 0.0 -1600.853027 -0.184893 -0.098650 True NaN NaN
30057 tr894 2.24606 init 0 0.0 -6810.073249 0.556208 39.390241 0.731370 0.0 ... -330.532789 0.0 0.0 0.0 -668.558313 -0.065067 -0.283718 True NaN NaN
32562 tr895 4.33382 init 0 0.0 -22028.450058 0.618778 29.065981 0.654401 0.0 ... -889.343989 0.0 0.0 0.0 -1623.103778 -0.010141 -0.051103 True NaN NaN
35067 tr896 8.22359 init 0 0.0 -12136.482066 0.466766 56.867616 0.443834 0.0 ... -525.593994 0.0 0.0 0.0 -947.681062 -0.058300 -0.054690 True NaN NaN
37572 tr898 13.99650 init 0 0.0 -18590.677602 0.528118 44.534476 0.357930 0.0 ... -782.373728 0.0 0.0 0.0 -1446.453004 0.042324 -0.167047 True NaN NaN
40077 tr921 3.55243 init 0 0.0 -22225.052777 0.676982 20.868098 0.633795 0.0 ... -1814.239425 0.0 0.0 0.0 -2679.102572 -0.101087 0.070372 True NaN NaN
42582 tr922 2.55776 init 0 0.0 -9494.179526 0.646615 24.976143 0.778997 0.0 ... -619.584850 0.0 0.0 0.0 -999.248416 0.115988 0.147087 True NaN NaN
45087 tr947 12.92220 init 0 0.0 -26347.737275 0.602246 31.641680 0.498832 0.0 ... -1262.954342 0.0 0.0 0.0 -2074.380569 1.187467 -0.301399 True NaN NaN
47592 tr948 6.72126 init 0 0.0 -29784.167159 0.534029 43.425714 0.623263 0.0 ... -1442.266587 0.0 0.0 0.0 -2474.731767 -0.029092 0.051793 True NaN NaN
2281 tr594 11.66420 worst 2392 38800.0 -7800.180486 0.405731 70.631048 0.293334 0.0 ... -528.484319 0.0 0.0 0.0 -945.059692 2.492546 0.800841 True True 0.000514
2842 tr862 8.64546 worst 459 45900.0 -13675.272843 0.744725 13.033074 0.552184 0.0 ... -836.017899 0.0 0.0 0.0 -1416.508005 -2.115153 0.313762 True True 0.004266
5982 tr866 5.49268 worst 1436 43400.0 -16316.523037 0.460162 58.285112 0.523121 0.0 ... -706.324188 0.0 0.0 0.0 -1342.476061 0.189714 -0.228174 True True 0.000023
8884 tr868 5.50126 worst 2067 6300.0 -18165.315305 0.465461 57.146452 0.608624 0.0 ... -1085.027900 0.0 0.0 0.0 -1783.826243 0.645704 -2.754277 True True 0.004290
9679 tr869 14.43870 worst 598 9700.0 -16097.470589 0.218214 122.237944 0.311339 0.0 ... -729.588674 0.0 0.0 0.0 -1323.833114 -0.808695 -0.587784 True True 0.001463
12776 tr870 11.43280 worst 1484 48200.0 -18635.189014 0.630429 27.316528 0.311343 80000000.0 ... -985.958008 0.0 0.0 0.0 -1635.238984 -1.256596 2.241660 True True 0.006724
15120 tr872 7.00137 worst 2011 700.0 -11677.938128 0.670047 21.773797 0.571122 0.0 ... -545.404119 0.0 0.0 0.0 -1101.906710 0.895718 0.631344 True True 0.007777
16772 tr877 5.97014 worst 1819 31600.0 -23656.864393 0.497143 50.573068 0.572003 0.0 ... -819.115369 0.0 0.0 0.0 -1632.289229 0.220377 3.612046 True True 0.001921
18310 tr882 4.10836 worst 1201 19900.0 -11669.955472 0.613084 29.940872 0.673967 0.0 ... -617.772188 0.0 0.0 0.0 -1084.858365 -0.434148 -0.384838 True True 0.000567
21428 tr884 8.92547 worst 2345 34100.0 -8695.952091 0.456176 59.148866 0.482172 0.0 ... -479.625249 0.0 0.0 0.0 -875.415909 2.096840 0.496279 True True 0.000063
22233 tr885 5.25420 worst 733 23200.0 -19040.407218 0.663800 22.606043 0.762638 0.0 ... -879.497813 0.0 0.0 0.0 -1663.557353 0.193633 -1.313889 True True 0.007403
26049 tr891 4.07153 worst 2497 49300.0 -17308.511883 0.599059 32.150760 0.652094 0.0 ... -791.156171 0.0 0.0 0.0 -1550.446442 2.179495 1.449662 True True 0.000239
27079 tr894 5.09216 worst 1485 48300.0 -6089.793333 0.505490 48.908058 0.487889 0.0 ... -326.511714 0.0 0.0 0.0 -661.998634 -0.766482 -2.990092 True True 0.003307
27974 tr895 7.98054 worst 43 4300.0 -18903.580144 0.583194 34.745474 0.565594 0.0 ... -892.775956 0.0 0.0 0.0 -1630.348760 -1.605658 3.039932 True True 0.001931
30562 tr896 11.04840 worst 1207 20500.0 -10977.597770 0.402719 71.348928 0.333853 0.0 ... -538.589762 0.0 0.0 0.0 -956.501625 -0.912589 -2.112027 True True 0.008942
31528 tr898 15.98600 worst 132 13200.0 -17054.063013 0.483393 53.376536 0.348017 0.0 ... -785.866058 0.0 0.0 0.0 -1457.509460 -2.655997 0.094145 True True 0.005441
32402 tr921 4.78319 worst 164 16400.0 -20379.263146 0.629018 27.525487 0.588459 0.0 ... -1754.727325 0.0 0.0 0.0 -2577.565026 -1.471915 -4.113387 True True 0.000025
36178 tr922 8.54551 worst 2429 42500.0 -8689.220583 0.406243 70.509502 0.408714 0.0 ... -507.886517 0.0 0.0 0.0 -879.592569 1.571574 4.837135 True True 0.000040
36219 tr947 15.48280 worst 15 1500.0 -25124.991722 0.569790 37.016088 0.459361 0.0 ... -1249.896273 0.0 0.0 0.0 -2041.418239 -1.986278 -0.695540 True True 0.000236
38817 tr948 10.08460 worst 2465 46100.0 -26585.016143 0.517020 46.653959 0.518408 0.0 ... -1433.128337 0.0 0.0 0.0 -2468.110974 0.743237 -1.782562 True True 0.000068

100 rows × 31 columns


In [ ]:
picked

In [19]:
# Per-target comparison: did the top-5-then-best-RMSD pick beat the initial
# structure? Prints one line per target and a summary at the end.
count = 0
total = 0
for name in folder_list:
    rmsd_init = init.query(f"name == '{name}'")["RMSD"].iloc[0]
    rmsd_pick = picked.query(f"name == '{name}'")["RMSD"].iloc[0]
    gain = rmsd_init - rmsd_pick
    improved = rmsd_pick < rmsd_init
    print(name, rmsd_init, rmsd_pick, round(gain, 3), improved)
    total += gain
    count += improved  # bool adds as 0/1
print("improved: ", count, len(folder_list), total)


tr898 13.9965 13.4609 0.536 True
tr869 12.3095 12.3095 0.0 False
tr947 12.9222 8.10622 4.816 True
tr894 2.24606 1.6856900000000001 0.56 True
tr882 2.34977 2.28804 0.062 True
tr594 4.65665 5.71845 -1.062 False
tr862 5.53469 5.91362 -0.379 False
tr877 3.00711 3.00711 0.0 False
tr872 5.66699 3.38775 2.279 True
tr885 2.35937 2.72348 -0.364 False
tr866 3.23637 1.52312 1.713 True
tr868 1.97032 1.94233 0.028 True
tr884 3.81972 4.60271 -0.783 False
tr895 4.333819999999999 4.87177 -0.538 False
tr896 8.22359 8.01121 0.212 True
tr870 7.65488 8.81563 -1.161 False
tr921 3.55243 3.17155 0.381 True
tr922 2.55776 3.28628 -0.729 False
tr891 1.60893 1.91208 -0.303 False
tr948 6.721260000000001 5.56396 1.157 True
improved:  10 20 6.42652

In [26]:
# Same comparison as the previous evaluation cell, but for the pure
# top-1-probability picks (picked_1).
count = 0
total = 0
for name in folder_list:
    rmsd_init = init.query(f"name == '{name}'")["RMSD"].iloc[0]
    rmsd_pick = picked_1.query(f"name == '{name}'")["RMSD"].iloc[0]
    gain = rmsd_init - rmsd_pick
    improved = rmsd_pick < rmsd_init
    print(name, rmsd_init, rmsd_pick, round(gain, 3), improved)
    total += gain
    count += improved  # bool adds as 0/1
print("improved: ", count, len(folder_list), total)


tr898 13.9965 13.4609 0.536 True
tr869 12.3095 12.3095 0.0 False
tr947 12.9222 9.56921 3.353 True
tr894 2.24606 2.00769 0.238 True
tr882 2.34977 2.75337 -0.404 False
tr594 4.65665 5.71845 -1.062 False
tr862 5.53469 6.21427 -0.68 False
tr877 3.00711 3.00711 0.0 False
tr872 5.66699 3.8839699999999997 1.783 True
tr885 2.35937 3.1840200000000003 -0.825 False
tr866 3.23637 1.65094 1.585 True
tr868 1.97032 2.55439 -0.584 False
tr884 3.81972 5.427980000000001 -1.608 False
tr895 4.333819999999999 4.87177 -0.538 False
tr896 8.22359 8.629610000000001 -0.406 False
tr870 7.65488 8.989889999999999 -1.335 False
tr921 3.55243 3.24877 0.304 True
tr922 2.55776 3.6866800000000004 -1.129 False
tr891 1.60893 1.91208 -0.303 False
tr948 6.721260000000001 6.42138 0.3 True
improved:  7 20 -0.7740600000000017

In [181]:
filtered = a
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = [
    "biasQ",
    'Rw',
     'VTotal',
#     'RMSD', # test
#     'Qw',
#      'Burial',
#      'Water',
#      'Rama',
#      'DSSP',
#      'P_AP',
#      'Helix',
#      'Frag_Mem'
               ]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "chosen"
DEGREE = 1

def pred_from_raw(a):
    """Score one group of frames with the fitted classifier.

    Parameters
    ----------
    a : pandas.DataFrame
        Frames for a single target; must contain the FEATURES and LABEL
        columns expected by ``my_transform``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``a`` with an added ``prob`` column holding the predicted
        probability of class 1 from the module-level ``clf``.

    Relies on the module-level names ``clf``, ``LABEL``, ``DEGREE``,
    ``FEATURES`` and ``my_transform``.
    """
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    # The last column is the label appended by the cat_pipeline; strip it
    # before scoring.  (The original also sliced out the label column into an
    # unused `test_y` variable, which is removed here.)
    feature_matrix = data[:, :-1]
    prob = clf.predict_proba(feature_matrix)[:, 1]
    return a.assign(prob=prob)

# Transform each target's frames separately (so scaling is per-target),
# then stack everything into a single training matrix.
per_target = raw_data.groupby('name').apply(
    my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
data = np.concatenate(per_target.values)
train_y = data[:, -1]
train_set = data[:, :-1]

# Plain logistic regression; SVC(probability=True) and a class-weighted
# variant were also tried.
clf = LogisticRegression(random_state=27)
clf.fit(train_set, train_y)

# Attach the predicted probability to every frame, one target at a time.
filtered = (filtered.reset_index(drop=True)
            .groupby("name")
            .apply(pred_from_raw)
            .reset_index(drop=True))


picked_n = 1  # how many top-probability frames to keep per target

# Oracle: the single lowest-RMSD frame per target (upper bound on quality).
best = raw_data_all_2.groupby("name").apply(
    choose_top, col="RMSD", n=1, ascending=True
).reset_index(drop=True).query("chosen==True")

# Classifier selection: the frame with the highest predicted probability.
# (The original cell guarded this behind `if True:` and kept an unreachable
# `if False:` alternative — take the top 5 by probability, then the best
# RMSD among them — which has been removed as dead code.)
picked = filtered.groupby("name").apply(
    choose_top, col="prob", n=1, ascending=False
).reset_index(drop=True).query("chosen==True")

# Worst case: the highest-RMSD frame per target.
worst = filtered.groupby("name").apply(
    choose_top, col="RMSD", n=1, ascending=False
).reset_index(drop=True).query("chosen==True")

# Initial structure: the frame with the smallest index i per target.
init = raw_data_all_2.groupby("name").apply(
    choose_top, col="i", n=1, ascending=True
).reset_index(drop=True).query("chosen==True")

all_results = pd.concat([best.assign(result='best'),
                         picked.assign(result='picked'),
                         init.assign(result='init'),
                         worst.assign(result='worst')], sort=False)

# Overlay the four selections per target on one figure.
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
# fg.set(ylim=(0, 10))


Out[181]:
<seaborn.axisgrid.FacetGrid at 0x1a20ed8e10>

In [182]:
# Same per-target report as the earlier cell, but against `picked`.
# NOTE(review): this duplicates the loop above — a shared helper function
# taking the two frames would remove the copy-paste.
count = 0
total = 0
for name in folder_list:
    init_tmp = init.query(f"name == '{name}'")["RMSD"].iloc[0]
    picked_tmp = picked.query(f"name == '{name}'")["RMSD"].iloc[0]
    improved = picked_tmp < init_tmp
    diff = init_tmp - picked_tmp
    print(name, init_tmp, picked_tmp, round(diff, 3), improved)
    total += diff
    count += improved
print("improved: ", count, len(folder_list), total)


tr898 13.9965 12.8408 1.156 True
tr869 12.3095 12.3095 0.0 False
tr947 12.9222 12.1452 0.777 True
tr894 2.24606 2.00769 0.238 True
tr882 2.34977 2.75337 -0.404 False
tr594 4.65665 5.71845 -1.062 False
tr862 5.53469 5.91362 -0.379 False
tr877 3.00711 3.00711 0.0 False
tr872 5.66699 4.53573 1.131 True
tr885 2.35937 3.1840200000000003 -0.825 False
tr866 3.23637 1.65094 1.585 True
tr868 1.97032 2.55439 -0.584 False
tr884 3.81972 4.81 -0.99 False
tr895 4.333819999999999 5.72387 -1.39 False
tr896 8.22359 8.629610000000001 -0.406 False
tr870 7.65488 9.11107 -1.456 False
tr921 3.55243 3.24877 0.304 True
tr922 2.55776 3.6866800000000004 -1.129 False
tr891 1.60893 1.91208 -0.303 False
tr948 6.721260000000001 6.42138 0.3 True
improved:  7 20 -3.436360000000002

In [339]:
all_results.query("name == 'tr594'")


Out[339]:
folder i RMSD Rw biasQ bias Qw Step Chain Shake ... SSB VTotal pc pc2 name chosen result pc_center inrange prob
0 0 0.0 4.65665 -10838.987781 0.590522 33.534424 0.464193 0.0 14.571923 0.0 ... 0.0 -940.887277 -0.086922 -0.059985 tr594 True best NaN NaN NaN
0 0 0.0 4.65665 -10838.987781 0.590522 33.534424 0.464193 0.0 14.571923 0.0 ... 0.0 -940.887277 -0.086922 -0.059985 tr594 True picked 0.703 True 0.042606
0 0 0.0 4.65665 -10838.987781 0.590522 33.534424 0.464193 0.0 14.571923 0.0 ... 0.0 -940.887277 -0.086922 -0.059985 tr594 True init NaN NaN NaN
1020 2380 37600.0 11.55900 -8163.823349 0.491720 51.669743 0.349195 0.0 4.380186 0.0 ... 0.0 -960.110987 1.421570 -0.122716 tr594 True worst 0.703 True 0.003102

4 rows × 32 columns


In [35]:
clf.coef_


Out[35]:
array([[ 0.24185815, -0.37308731, -0.43718463]])

In [221]:
# Metric to visualise.  "Qw" is a similarity score, so the best frame has the
# LARGEST value (sort descending); a distance metric like "RMSD" would need
# ascending sorting.
Plot_Metric = "Qw"
# BUG FIX: the original `if Plot_Metric:` tested string truthiness, which is
# True for ANY non-empty metric name, so isAscending could never become True.
# Compare against the metric name instead (behavior unchanged for "Qw").
if Plot_Metric == "Qw":
    isAscending = False
else:
    isAscending = True

picked_n = 1
# Oracle pick: the best frame per target according to Plot_Metric.
best = raw_data_all_2.groupby("name").apply(choose_top, col=Plot_Metric
                                            , n=1, ascending=isAscending).reset_index(drop=True).query("chosen==True")
# Classifier pick: highest predicted probability per target.
picked = filtered.groupby("name").apply(choose_top, col="prob"
                                            , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# NOTE(review): for Qw, ascending=False selects the HIGHEST Qw — that is the
# best frame, not the worst; confirm the intended direction.  (`worst` is
# computed but excluded from the plot below.)
worst = filtered.groupby("name").apply(choose_top, col=Plot_Metric
                                            , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# Initial structure: the frame with the smallest index i per target.
init = raw_data_all_2.groupby("name").apply(choose_top, col="i"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'),
                         picked.assign(result='picked'), init.assign(result='init')
#                         , worst.assign(result='worst')
                        ], sort=False)

fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', Plot_Metric).add_legend(fontsize=20)
fg.set(ylim=(0, 1))


Out[221]:
<seaborn.axisgrid.FacetGrid at 0x1a2d7cfd30>

In [170]:
# Build a compact comparison table: picked RMSD vs. initial RMSD per target.
init_rmsd = init["RMSD"].values
picked["init_RMSD"] = init_rmsd
picked["diff_RMSD"] = init_rmsd - picked["RMSD"].values
out = picked[["name", "RMSD", "init_RMSD", "diff_RMSD", "folder"]].reset_index(drop=True)

In [206]:
# Scatter Qw against RMSD, coloured by target, to see how the two metrics
# relate across all frames.
fg = sns.FacetGrid(data=filtered, hue='name', height=8, aspect=1.63)
fg.map(plt.scatter, 'Qw', 'RMSD').add_legend(fontsize=20)


Out[206]:
<seaborn.axisgrid.FacetGrid at 0x1a2c9cb780>

In [35]:
filtered.plot.scatter("prob", "RMSD")


Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a27569080>

In [25]:
out


Out[25]:
name RMSD init_RMSD diff_RMSD folder
0 tr894 1.36968 2.24606 0.87638 980

In [13]:
raw_data_all_2.plot("RMSD", "Rw")


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a23707f60>

In [14]:
raw_data_all_2.plot("RMSD", "pc")


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a23783828>

In [15]:
out


Out[15]:
name RMSD init_RMSD diff_RMSD folder
0 tr894 1.36968 2.24606 0.87638 980

In [16]:
out


Out[16]:
name RMSD init_RMSD diff_RMSD folder
0 tr894 1.36968 2.24606 0.87638 980

In [70]:
all_results


Out[70]:
folder i RMSD Rw biasQ bias Step Chain Shake Chi ... SSB VTotal pc pc2 name chosen result pc_center inrange prob
980 980 47900.0 1.36968 -6834.949362 0.645772 25.095468 0 3.685220 0.0 0.588609 ... 0.0 -656.184416 0.944122 -0.804111 tr894 True best NaN NaN NaN
981 981 48000.0 1.57229 -6850.297788 0.673878 21.271087 0 4.750551 0.0 0.512601 ... 0.0 -659.909501 0.762314 -0.531746 tr894 True picked 2.03 True 0.523816
0 0 0.0 2.24606 -6810.073249 0.556208 39.390241 0 5.546380 0.0 1.160888 ... 0.0 -668.558313 -0.065067 -0.283718 tr894 True init NaN NaN NaN

3 rows × 31 columns


In [13]:
# out.to_csv("/Users/weilu/Desktop/picked_3.csv")

In [26]:
clf.coef_


Out[26]:
array([[ 0.31865771, -0.24574338, -0.00429271, -0.15621297,  0.12086065,
         0.03529636,  0.05114406,  0.06779384,  0.23049113, -0.0941187 ]])

In [14]:
clf.coef_


Out[14]:
array([[ 0.20157408, -0.69485223,  0.04456798]])

In [15]:
# Overlay RMSD per target for each selection strategy, clipped to 0-10 A.
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
fg.set(ylim=(0, 10))


Out[15]:
<seaborn.axisgrid.FacetGrid at 0x1a22576f60>

In [16]:
filtered["name"].unique().shape


Out[16]:
(17,)

In [17]:
picked[["RMSD", "name"]]


Out[17]:
RMSD name
0 5.14331 tr594
1136 4.31091 tr862
1890 3.14792 tr866
2586 1.61058 tr868
3123 8.69691 tr870
3576 2.81351 tr872
4796 3.07719 tr877
5446 2.25775 tr882
6372 4.06919 tr884
6989 2.35937 tr885
7922 1.60893 tr891
9265 1.18208 tr894
10424 4.67184 tr895
10781 8.22359 tr896
14297 3.25991 tr921
14558 3.84193 tr922
16140 5.40097 tr948

In [18]:
# picked.to_csv("/Users/weilu/Desktop/picked_2.csv")

In [19]:
# Load all per-frame metrics for one target and merge them into one frame.
name = "tr894"
# Column order of awsem.log (AWSEM energy terms).
name_list = ["Step", "Chain", "Shake", "Chi", "Rama", "Excluded", "DSSP",
             "P_AP", "Water", "Burial", "Helix", "AMH_Go", "Frag_Mem",
             "Vec_FM", "Membrane", "SSB", "VTotal"]

# You probably want to change the location below.
# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR at the top of the notebook.
#     location = f"/Users/weilu/Research/server/sep_2018/03_week/02_week/{name}/"
location = f"/Users/weilu/Research/server/nov_2018/structure_selection/{name}/"
# Use raw strings for the regex separator: "\s+" in a plain string is an
# invalid escape sequence and warns (eventually errors) on newer Pythons.
RMSD = pd.read_table(location + "rmsd-angstrom.xvg", names=["i", "RMSD"], sep=r"\s+")
bias = pd.read_table(location + "bias.log", names=["i", "biasQ", "bias"], sep=r"\s+").drop("i", axis=1)
awsem = pd.read_table(location + "awsem.log", names=name_list)
rw = pd.read_table(location + "rwplusScore.txt", names=["i", "Rw"], sep=r"\s+").drop("i", axis=1)
# pc location
#     location = f"/Users/weilu/Research/server/sep_2018/03_week/{name}/"
#     location = f"/Users/weilu/Research/server/oct_2018/01_week/{name}/"
pc = pd.read_table(location + "pcarmsd_scaled.txt", names=["i", "pc", "pc2"], sep=r"\s+", comment="#").drop("i", axis=1)
raw_data = pd.concat([RMSD, rw, bias, awsem, pc], axis=1)
# Display with a "folder" column derived from the row index
# (the displayed frame is not stored back into raw_data).
raw_data.assign(name=name).reset_index().rename(columns={"index": "folder"})


Out[19]:
folder i RMSD Rw biasQ bias Step Chain Shake Chi ... Helix AMH_Go Frag_Mem Vec_FM Membrane SSB VTotal pc pc2 name
0 0 0.0 2.24606 -6810.073249 0.556208 39.390241 0 5.546380 0.0 1.160888 ... -11.494550 0.0 -330.532789 0.0 0.0 0.0 -668.558313 -0.065067 -0.283718 tr894
1 1 100.0 3.23173 -6286.940968 0.551209 40.282725 0 2.470745 0.0 0.544233 ... -12.663186 0.0 -332.365779 0.0 0.0 0.0 -661.611254 -2.030413 -1.872569 tr894
2 2 200.0 3.21661 -6289.430903 0.497013 50.599184 0 4.764399 0.0 0.562483 ... -13.188086 0.0 -332.765589 0.0 0.0 0.0 -658.149297 -2.028462 -1.644785 tr894
3 3 300.0 3.15813 -6190.501191 0.528574 44.448412 0 2.255989 0.0 0.468350 ... -13.341891 0.0 -333.111932 0.0 0.0 0.0 -662.850984 -1.765059 -1.880311 tr894
4 4 400.0 3.48106 -6070.736394 0.507673 48.477271 0 4.563115 0.0 0.358890 ... -13.054297 0.0 -332.096669 0.0 0.0 0.0 -655.505067 -2.050421 -0.862438 tr894
5 5 500.0 3.21603 -6207.393138 0.507665 48.478663 0 3.079867 0.0 0.562905 ... -12.983445 0.0 -334.065069 0.0 0.0 0.0 -654.514531 -2.024328 -0.505672 tr894
6 6 600.0 3.26742 -6270.166177 0.515849 46.880384 0 3.598324 0.0 0.689476 ... -12.473662 0.0 -329.046856 0.0 0.0 0.0 -661.071468 -2.225011 -1.368251 tr894
7 7 700.0 3.45545 -6275.141716 0.521223 45.845485 0 3.142972 0.0 0.471721 ... -12.879093 0.0 -330.526926 0.0 0.0 0.0 -660.649964 -1.917079 -0.934712 tr894
8 8 800.0 2.84899 -6353.542762 0.521744 45.745735 0 4.776124 0.0 0.537145 ... -12.874610 0.0 -330.107068 0.0 0.0 0.0 -666.482035 -1.836351 -1.092572 tr894
9 9 900.0 3.43413 -6114.807297 0.567139 37.473775 0 3.616298 0.0 0.553106 ... -13.392355 0.0 -332.670193 0.0 0.0 0.0 -664.643019 -1.984028 -1.763320 tr894
10 10 1000.0 3.39099 -6010.453477 0.536807 42.909507 0 3.596885 0.0 0.391673 ... -13.353049 0.0 -332.451981 0.0 0.0 0.0 -659.813810 -2.215176 0.442220 tr894
11 11 1100.0 3.48153 -6114.438719 0.581163 35.084917 0 2.720963 0.0 0.310693 ... -13.200931 0.0 -331.501808 0.0 0.0 0.0 -664.442638 -2.008016 0.629552 tr894
12 12 1200.0 3.23210 -6101.638205 0.519535 46.169300 0 5.605657 0.0 0.721162 ... -10.424754 0.0 -332.022908 0.0 0.0 0.0 -656.524265 -1.985955 -0.727112 tr894
13 13 1300.0 2.92439 -6059.757243 0.529431 44.287121 0 2.323098 0.0 0.547973 ... -13.361106 0.0 -334.500030 0.0 0.0 0.0 -660.984355 -2.111932 -1.343508 tr894
14 14 1400.0 2.90091 -6306.977750 0.545174 41.373291 0 3.194121 0.0 0.466709 ... -13.974103 0.0 -333.122664 0.0 0.0 0.0 -664.931939 -1.862049 -0.307235 tr894
15 15 1500.0 3.44498 -6054.804323 0.481283 53.813425 0 4.393513 0.0 0.413912 ... -10.416986 0.0 -329.519729 0.0 0.0 0.0 -640.413959 -1.964237 -0.545008 tr894
16 16 1600.0 3.72309 -5924.173405 0.517176 46.623769 0 4.403761 0.0 0.530923 ... -12.445575 0.0 -332.027049 0.0 0.0 0.0 -652.482812 -2.177049 -0.822812 tr894
17 17 1700.0 2.94234 -6170.545926 0.605790 31.080307 0 3.403110 0.0 0.394039 ... -13.070255 0.0 -334.764821 0.0 0.0 0.0 -662.858390 -1.977594 -0.605940 tr894
18 18 1800.0 3.14126 -6104.911588 0.565975 37.675492 0 2.148955 0.0 0.416200 ... -14.408324 0.0 -332.427927 0.0 0.0 0.0 -657.525813 -2.012379 -0.666930 tr894
19 19 1900.0 3.03640 -6248.712283 0.549830 40.530526 0 3.192358 0.0 0.373018 ... -12.509658 0.0 -331.034803 0.0 0.0 0.0 -652.148108 -1.860132 -0.951803 tr894
20 20 2000.0 2.80779 -6441.963711 0.556436 39.349743 0 2.222043 0.0 0.423124 ... -12.922997 0.0 -334.503676 0.0 0.0 0.0 -661.244525 -1.888069 -0.825117 tr894
21 21 2100.0 3.31259 -6410.346460 0.513748 47.288295 0 6.191025 0.0 0.604926 ... -12.253130 0.0 -332.203648 0.0 0.0 0.0 -651.160377 -2.001888 -0.592104 tr894
22 22 2200.0 2.91335 -6308.221083 0.593916 32.980882 0 3.234118 0.0 0.664409 ... -13.207077 0.0 -332.990401 0.0 0.0 0.0 -658.201126 -2.152202 -0.717611 tr894
23 23 2300.0 3.19933 -6374.911948 0.539413 42.428153 0 2.538853 0.0 0.341001 ... -13.408200 0.0 -331.572188 0.0 0.0 0.0 -662.264580 -2.190752 -0.847559 tr894
24 24 2400.0 2.94005 -6354.892279 0.603693 31.411886 0 1.888008 0.0 0.590554 ... -13.759548 0.0 -334.756167 0.0 0.0 0.0 -673.536860 -2.020588 -0.800902 tr894
25 25 2500.0 3.24435 -6448.101987 0.570527 36.889392 0 2.534727 0.0 0.389001 ... -12.327469 0.0 -332.104002 0.0 0.0 0.0 -664.744860 -1.970641 -1.751093 tr894
26 26 2600.0 3.21349 -6356.016826 0.552069 40.128420 0 3.240573 0.0 0.340624 ... -14.062783 0.0 -331.951360 0.0 0.0 0.0 -669.486651 -2.017166 -1.114400 tr894
27 27 2700.0 3.08817 -6015.780813 0.543925 41.600811 0 2.849521 0.0 0.563807 ... -11.707489 0.0 -333.438918 0.0 0.0 0.0 -656.923779 -2.079045 -1.713251 tr894
28 28 2800.0 2.91854 -6266.414740 0.570673 36.864257 0 2.305444 0.0 0.542672 ... -13.504228 0.0 -336.875255 0.0 0.0 0.0 -661.990310 -1.959546 -1.258906 tr894
29 29 2900.0 3.51806 -6095.122990 0.575797 35.989700 0 2.400738 0.0 0.481813 ... -12.959278 0.0 -333.359501 0.0 0.0 0.0 -662.430279 -2.148657 -0.842376 tr894
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2475 2475 47100.0 3.04801 -6584.548221 0.528828 44.400564 0 3.443490 0.0 0.689302 ... -8.527353 0.0 -328.333283 0.0 0.0 0.0 -656.799558 1.489679 -1.073833 tr894
2476 2476 47200.0 3.06231 -6524.537951 0.501176 49.765102 0 4.955148 0.0 0.473575 ... -7.623936 0.0 -324.707280 0.0 0.0 0.0 -648.310152 1.433943 -0.830940 tr894
2477 2477 47300.0 3.28581 -6530.410132 0.574789 36.160841 0 4.166641 0.0 0.671689 ... -9.461912 0.0 -327.996908 0.0 0.0 0.0 -661.264516 1.390436 -1.208435 tr894
2478 2478 47400.0 3.23796 -6505.011426 0.455993 59.188681 0 7.585024 0.0 0.630827 ... -6.166685 0.0 -318.313982 0.0 0.0 0.0 -633.543536 1.532387 -1.189597 tr894
2479 2479 47500.0 3.47343 -6412.675255 0.509082 48.200074 0 4.337123 0.0 0.533898 ... -8.175169 0.0 -321.367115 0.0 0.0 0.0 -641.175122 1.170308 -0.850157 tr894
2480 2480 47600.0 3.22195 -6600.284916 0.546451 41.141258 0 4.499017 0.0 0.531768 ... -8.919736 0.0 -328.276578 0.0 0.0 0.0 -656.319864 1.024558 -1.633348 tr894
2481 2481 47700.0 3.29628 -6594.057433 0.450815 60.320799 0 5.516787 0.0 0.532782 ... -6.014002 0.0 -324.793170 0.0 0.0 0.0 -643.313596 1.289813 -1.317772 tr894
2482 2482 47800.0 3.22039 -6538.294794 0.492895 51.431169 0 4.771708 0.0 0.643649 ... -10.536290 0.0 -327.516119 0.0 0.0 0.0 -654.529729 0.546995 -1.410352 tr894
2483 2483 47900.0 3.71419 -6429.439945 0.486252 52.787339 0 4.245413 0.0 0.423871 ... -4.541977 0.0 -327.570965 0.0 0.0 0.0 -650.587467 0.814162 -1.430980 tr894
2484 2484 48000.0 3.19131 -6477.936224 0.522025 45.692109 0 4.294322 0.0 0.558186 ... -7.184502 0.0 -326.483925 0.0 0.0 0.0 -651.265310 1.539578 -1.387833 tr894
2485 2485 48100.0 3.07537 -6498.204684 0.521062 45.876390 0 4.032237 0.0 0.661476 ... -9.748129 0.0 -325.586983 0.0 0.0 0.0 -651.190938 1.369766 -0.825550 tr894
2486 2486 48200.0 3.37094 -6374.215012 0.539856 42.346474 0 3.503117 0.0 0.476007 ... -6.907826 0.0 -326.781573 0.0 0.0 0.0 -650.287189 1.779587 -0.777044 tr894
2487 2487 48300.0 3.28947 -6462.479014 0.529538 44.266976 0 4.450656 0.0 0.778137 ... -9.720084 0.0 -327.426579 0.0 0.0 0.0 -655.430363 1.632520 -1.162894 tr894
2488 2488 48400.0 3.32840 -6504.263804 0.487773 52.475263 0 3.823074 0.0 0.541481 ... -9.621493 0.0 -328.796522 0.0 0.0 0.0 -648.395792 0.591862 -1.299183 tr894
2489 2489 48500.0 3.43751 -6447.190797 0.507673 48.477255 0 5.101223 0.0 0.543194 ... -8.555008 0.0 -326.452097 0.0 0.0 0.0 -648.549022 0.661395 -1.650177 tr894
2490 2490 48600.0 3.82345 -6365.246694 0.495035 50.997870 0 3.198458 0.0 0.629422 ... -9.256821 0.0 -327.410325 0.0 0.0 0.0 -647.431837 -0.195077 -2.383967 tr894
2491 2491 48700.0 3.62716 -6430.212535 0.492129 51.586661 0 4.407783 0.0 0.580651 ... -8.285745 0.0 -323.436894 0.0 0.0 0.0 -646.712158 0.334757 -1.955750 tr894
2492 2492 48800.0 3.25856 -6733.348524 0.528495 44.463321 0 4.332744 0.0 0.787561 ... -10.086368 0.0 -326.230732 0.0 0.0 0.0 -653.059823 0.900004 -1.994071 tr894
2493 2493 48900.0 3.31050 -6597.765990 0.501219 49.756485 0 4.567093 0.0 0.523651 ... -9.982728 0.0 -325.830854 0.0 0.0 0.0 -652.938934 1.253588 -1.430779 tr894
2494 2494 49000.0 3.41650 -6464.407265 0.496882 50.625585 0 4.832824 0.0 0.524553 ... -8.364190 0.0 -325.062802 0.0 0.0 0.0 -650.974420 1.921487 -1.013259 tr894
2495 2495 49100.0 3.10904 -6387.091755 0.548666 40.740404 0 3.316020 0.0 0.581348 ... -10.967092 0.0 -328.988742 0.0 0.0 0.0 -659.262743 1.190546 -1.329242 tr894
2496 2496 49200.0 3.08787 -6544.665697 0.542510 41.859331 0 5.020698 0.0 0.864652 ... -9.712455 0.0 -326.961086 0.0 0.0 0.0 -653.968884 1.103267 -1.117054 tr894
2497 2497 49300.0 3.08903 -6565.835977 0.519321 46.210480 0 4.056949 0.0 0.782764 ... -10.167152 0.0 -325.731478 0.0 0.0 0.0 -649.584897 1.347216 -1.148235 tr894
2498 2498 49400.0 2.86923 -6705.263150 0.478287 54.436844 0 5.751120 0.0 0.550526 ... -8.520623 0.0 -326.361710 0.0 0.0 0.0 -647.484556 0.969805 -1.272144 tr894
2499 2499 49500.0 3.75897 -6463.716890 0.445395 61.517373 0 6.437783 0.0 0.522887 ... -8.618340 0.0 -320.399867 0.0 0.0 0.0 -635.290484 1.147018 -1.649743 tr894
2500 2500 49600.0 3.39286 -6501.433057 0.485665 52.908148 0 4.294195 0.0 0.505823 ... -10.501468 0.0 -323.573172 0.0 0.0 0.0 -647.184033 1.221193 -1.348389 tr894
2501 2501 49700.0 3.32162 -6391.616071 0.532840 43.647727 0 4.949311 0.0 0.640128 ... -9.125481 0.0 -329.320986 0.0 0.0 0.0 -656.508771 0.993919 -0.986276 tr894
2502 2502 49800.0 3.28345 -6563.961634 0.519311 46.212290 0 5.139785 0.0 0.632014 ... -9.679081 0.0 -325.602523 0.0 0.0 0.0 -647.098837 0.616816 -0.979727 tr894
2503 2503 49900.0 3.04609 -6472.654667 0.524059 45.303913 0 5.039938 0.0 0.796512 ... -9.205125 0.0 -324.496863 0.0 0.0 0.0 -647.414385 1.401103 -0.722207 tr894
2504 2504 50000.0 3.99691 -6555.354241 0.470289 56.118701 0 3.803515 0.0 0.692461 ... -8.788567 0.0 -329.318347 0.0 0.0 0.0 -652.710108 0.573119 -1.284276 tr894

2505 rows × 26 columns


In [ ]: