In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime
import seaborn as sns
%matplotlib inline

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (10, 6.18)    # golden ratio
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    os.makedirs(os.path.dirname(path), exist_ok=True)  # create the target folder if it doesn't exist
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
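
A minimal usage sketch (the figure name here is hypothetical): any active figure gets written under ./images/classification/ at 300 dpi.

plt.plot([0, 1], [0, 1])
save_fig("example_fig")   # writes ./images/classification/example_fig.png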

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
class RemoveFirstFrame(BaseEstimator, TransformerMixin):
    def __init__(self, frame):
        self.frame = frame
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # drop every row whose Step is the first frame of each block of `frame` steps
        return X.query(f"Step % {self.frame} != 1")
def choose_top_rw(data, n=5):
    # flag the n lowest-Rw rows (rank is ascending by default; lower energy is better)
    return data.assign(chosen=data.Rw.rank(method='first') <= n)
def choose_top_vtotal(data, n=5):
    return data.assign(chosen=data.VTotal.rank(method='first') <= n)
def choose_top(data, col="Qw", n=5, ascending=False):
    # ascending=False flags the n largest values; method='first' breaks ties by row order
    return data.assign(chosen=data[col].rank(ascending=ascending, method='first') <= n)
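
A toy illustration of choose_top (the values are made up): with ascending=False it flags the n largest entries of col.

toy = pd.DataFrame({"Qw": [0.1, 0.9, 0.5, 0.7]})
choose_top(toy, col="Qw", n=2)
#     Qw  chosen
# 0  0.1   False
# 1  0.9    True
# 2  0.5   False
# 3  0.7    True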

In [6]:
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/complete_data_mar27.csv", index_col=0)

In [8]:
raw_test_data["Name"].unique()


Out[8]:
array(['1MBA', 'T0792', 'T0815', 'T0766', 'T0784', 'T0803', 'T0833',
       'T0251'], dtype=object)

In [9]:
FEATURES = ['Rw',
#      'VTotal',
     'QGO',
     'VwithoutGo',
#      'Burial',
#      'Water',
#      'Rama',
#      'DSSP',
#      'P_AP',
#      'Helix',
#      'Frag_Mem'
               ]
n = 5
def my_transform(data, label, degree, FEATURES=FEATURES):

    # LABEL = "Qw"
    LABEL = label
    PolynomialDegree = degree

    num_attribs = FEATURES
    cat_attribs = [LABEL]
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
        ])
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs))
        ])

    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    return full_pipeline.fit_transform(data)

def my_transform_predict(data, degree, FEATURES=FEATURES):

    # LABEL = "Qw"
    PolynomialDegree = degree

    num_attribs = FEATURES
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
        ])
    return num_pipeline.fit_transform(data)
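
A quick sanity check on the feature expansion (a sketch, not part of the original run): with the three features above, PolynomialFeatures with include_bias=False produces C(3+d, d) - 1 columns, i.e. 3 columns at degree 1 and 9 at degree 2 (the 3 originals plus 6 quadratic terms).

X = my_transform_predict(raw_test_data, degree=2)
X.shape[1]   # expected: 9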

In [10]:
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "Rw", "Qw")



In [11]:
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "VwithoutGo", "Qw")



In [12]:
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "QGO", "Qw")



In [13]:
raw_data_T0784 = raw_test_data.groupby("Name").get_group("T0784")
# raw_data_T0792 = raw_test_data.groupby("Name").get_group("T0792")
# raw_data = pd.concat([raw_data_T0784, raw_data_T0792])
# raw_data = raw_data_T0792
# raw_data = raw_test_data.groupby("Name").get_group("1mba")
raw_data = raw_data_T0784

In [14]:
# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
# FEATURES = list(raw_test_data.columns[2:-3])
def train_and_test(raw_data, label="Qw", degree=1, p=0.1):
    # my_full_pipeline = Pipeline([
    # #         ('removeFirstFrame', RemoveFirstFrame(frame)),
    #         ('featureSelection', full_pipeline)
    # ])

    from sklearn.model_selection import StratifiedShuffleSplit

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=142)
    for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
        strat_train_set = raw_data.iloc[train_index]
        strat_test_set = raw_data.iloc[test_index]
    # strat_test_set[LABEL].value_counts() / len(strat_test_set)
    X_train = my_transform(strat_train_set, label, degree)
    X_test = my_transform(strat_test_set, label, degree)
    train_y = X_train[:,-1]
    train_set = X_train[:,:-1]
    test_y = X_test[:,-1]
    test_set = X_test[:,:-1]
    return (train_set, train_y, test_set, test_y)
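
A hedged sanity check (not in the original run): the stratified 80/20 split should keep the isGood fraction nearly equal in both halves, with the label returned as the last column of each transformed array.

tr_X, tr_y, te_X, te_y = train_and_test(raw_data, label="isGood")
print(tr_X.shape, te_X.shape)    # ~80/20 row split, identical feature counts
print(tr_y.mean(), te_y.mean())  # isGood fractions should closely match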

In [15]:
label = "isGood"
degree = 1
p = 0.1
train_set, train_y, test_set, test_y = train_and_test(raw_data, label=label, degree=degree)
log_clf = LogisticRegression(random_state=140, penalty='l2')

# log_clf = LogisticRegression(random_state=14, class_weight={0:p, 1:(1-p)}, penalty='l1')
log_clf.fit(train_set, train_y)
y_pred = log_clf.predict(train_set)
# n = 100
prediction_list = []
for name, data in raw_test_data.groupby("Name"):
    print(name)
#     X = full_pipeline.fit_transform(data)
    X = my_transform(data, label, degree)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    proba = log_clf.predict_proba(eval_set)[:, 1]  # P(isGood) for each decoy
    one = data.assign(prediction=proba)
    prediction_list.append(one)
#     prediction_list.append(pd.Series(proba))
t = pd.concat(prediction_list)
# t = raw_test_data.assign(prediction=prediction.values)
best_by_prediction = t.groupby("Name").apply(choose_top, n=n, col="prediction").query("chosen==True")


1MBA
T0251
T0766
T0784
T0792
T0803
T0815
T0833
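
The metrics imported in In [2] (cross_val_score, cross_val_predict, confusion_matrix) are never applied above; a minimal sketch of how they could gauge the classifier on the training set:

scores = cross_val_score(log_clf, train_set, train_y, cv=3, scoring="accuracy")
y_train_pred = cross_val_predict(log_clf, train_set, train_y, cv=3)
print(scores.mean(), confusion_matrix(train_y, y_train_pred))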

In [68]:
a = (best_by_prediction
     .reset_index(drop=True)[["Name", "Qw", "GDT", "Rmsd", "prediction", "Step"]]
     .groupby("Name")
     .apply(lambda x: x.sort_values("prediction", ascending=False)))

In [69]:
a


Out[69]:
Name Qw GDT Rmsd prediction Step
Name
1MBA 0 1MBA 0.720428 74.8275 2.46068 0.306414 110
2 1MBA 0.769650 78.4275 2.17425 0.295674 186
3 1MBA 0.764916 76.2000 2.32998 0.259163 189
4 1MBA 0.765128 77.9100 2.13531 0.258174 200
1 1MBA 0.774211 78.5975 2.05417 0.251528 154
T0251 5 T0251 0.613264 66.4375 3.36632 0.490454 606
7 T0251 0.598274 65.0475 4.07155 0.392429 1086
8 T0251 0.589351 62.5000 3.84180 0.383658 1214
9 T0251 0.598703 66.6700 3.66330 0.368319 1643
6 T0251 0.625360 68.2875 3.41842 0.362572 807
T0766 10 T0766 0.563178 65.7425 11.67540 0.554039 203
14 T0766 0.567218 63.4250 11.26950 0.511339 1813
11 T0766 0.558608 61.8075 10.93520 0.473008 1534
12 T0766 0.559387 59.7200 11.08980 0.469498 1538
13 T0766 0.555598 62.2700 11.06980 0.461466 1542
T0784 17 T0784 0.882307 92.2000 1.62316 0.627947 857
15 T0784 0.880859 89.6000 1.75722 0.506263 831
16 T0784 0.872920 89.4000 1.77279 0.446581 837
18 T0784 0.867933 90.2000 1.60258 0.438664 865
19 T0784 0.876679 89.8000 1.58467 0.404382 1376
T0792 21 T0792 0.760305 81.2500 2.56529 0.235421 178
22 T0792 0.764793 82.1875 2.62657 0.216476 179
24 T0792 0.721099 75.6250 2.60695 0.203726 201
23 T0792 0.724748 78.4375 2.72955 0.202526 184
20 T0792 0.734884 77.8125 2.71745 0.187593 167
T0803 26 T0803 0.652454 72.9500 5.56915 0.118500 1610
27 T0803 0.651672 72.5750 5.43667 0.116946 1643
29 T0803 0.669025 74.8150 5.16925 0.116922 1866
28 T0803 0.667617 72.0150 5.74664 0.107859 1776
25 T0803 0.668172 76.1200 5.37375 0.107811 1466
T0815 31 T0815 0.707513 75.9425 2.79595 0.505570 412
34 T0815 0.688493 75.0000 3.02543 0.426500 1017
32 T0815 0.696955 74.7625 2.91935 0.418012 413
30 T0815 0.670086 72.6425 3.05629 0.393732 405
33 T0815 0.687528 73.5875 2.75682 0.375505 414
T0833 35 T0833 0.481112 46.0650 7.85891 0.384751 1086
39 T0833 0.500173 45.1425 7.93678 0.352314 1598
38 T0833 0.491663 46.9950 7.94893 0.350031 1566
36 T0833 0.489800 44.4475 7.90148 0.327418 1093
37 T0833 0.464509 43.7500 7.65779 0.311368 1099

In [65]:
best_by_prediction


Out[65]:
Burial Chain Chi DSSP Frag_Mem GDT Helix Name P_AP QGO ... Rama Rmsd Rw Step VTotal VwithoutGo Water isGood prediction chosen
Name
1MBA 108 -124.242780 158.855236 30.323929 -0.000002 -302.743939 74.8275 -51.621281 1MBA -4.705667 15.175482 ... -616.821418 2.46068 -24868.848781 110 -947.119845 -962.295327 -51.339404 False 0.306414 True
152 -126.243786 176.912295 29.712160 -0.000000 -322.480824 78.5975 -50.214701 1MBA -5.810129 19.575829 ... -639.491294 2.05417 -26003.769885 154 -972.763539 -992.339368 -54.723089 True 0.251528 True
184 -124.634118 169.489180 30.350727 -0.000000 -329.377988 78.4275 -56.175641 1MBA -7.274706 18.903284 ... -622.567543 2.17425 -26272.750244 186 -974.308818 -993.212102 -53.022013 True 0.295674 True
187 -125.781771 181.579885 36.539224 -0.000000 -316.739048 76.2000 -54.489866 1MBA -7.759328 19.712599 ... -622.806880 2.32998 -26815.348036 189 -949.618461 -969.331060 -59.873277 True 0.259163 True
198 -121.494717 170.322420 47.135640 -0.000000 -336.985619 77.9100 -53.332677 1MBA -7.588980 19.545050 ... -620.922821 2.13531 -26478.631081 200 -958.046925 -977.591975 -54.725220 True 0.258174 True
T0251 14601 -95.329584 0.000000 0.000000 -18.948112 -168.256806 66.4375 -13.515449 T0251 -11.006397 43.924477 ... -425.283723 3.36632 -18874.136963 606 -744.839610 -788.764087 -56.424015 False 0.490454 True
14801 -95.303688 0.000000 0.000000 -19.236927 -163.179403 68.2875 -11.442570 T0251 -13.478508 43.812953 ... -415.428130 3.41842 -18505.596897 807 -723.416547 -767.229500 -49.160273 False 0.362572 True
15079 -94.987749 0.000000 0.000000 -19.560253 -177.489653 65.0475 -10.094889 T0251 -13.630907 45.275842 ... -429.128066 4.07155 -19323.597928 1086 -745.840879 -791.116721 -46.225204 False 0.392429 True
15206 -94.390840 0.000000 0.000000 -16.066872 -180.670345 62.5000 -13.459546 T0251 -13.017631 44.848086 ... -419.724258 3.84180 -19051.675497 1214 -740.400004 -785.248090 -47.918598 False 0.383658 True
15633 -96.000242 0.000000 0.000000 -16.076913 -177.203630 66.6700 -11.997696 T0251 -12.017146 44.820247 ... -424.179169 3.66330 -19104.008048 1643 -734.024162 -778.844409 -41.369614 False 0.368319 True
T0766 6200 -112.624465 188.325481 30.895513 -49.008378 -469.802983 65.7425 -8.579221 T0766 -21.102260 35.470681 ... -444.980011 11.67540 -19523.037618 203 -897.533832 -933.004513 -46.128189 True 0.554039 True
7525 -111.417309 166.218859 32.328037 -46.302119 -469.499556 61.8075 -14.270034 T0766 -19.926398 38.522025 ... -438.578385 10.93520 -20420.634446 1534 -906.391872 -944.913897 -43.466993 True 0.473008 True
7529 -112.539366 187.068071 35.553058 -45.208302 -469.903807 59.7200 -12.430592 T0766 -18.786025 37.799677 ... -442.783509 11.08980 -20235.005905 1538 -887.732418 -925.532095 -46.501623 False 0.469498 True
7533 -110.545408 167.216823 30.508533 -41.168442 -445.476486 62.2700 -9.591130 T0766 -20.257103 36.304830 ... -422.801688 11.06980 -19676.410888 1542 -861.548533 -897.853363 -45.738462 True 0.461466 True
7802 -110.327038 188.886410 44.613638 -46.060935 -482.736497 63.4250 -11.642554 T0766 -19.903036 35.384966 ... -429.532882 11.26950 -19416.420771 1813 -874.386891 -909.771857 -43.068964 True 0.511339 True
T0784 8825 -130.839383 214.863201 47.986716 -62.634052 -473.757952 89.6000 -4.736043 T0784 -44.321961 2.485141 ... -570.755010 1.75722 -21825.835799 831 -1072.480075 -1074.965216 -50.770732 False 0.506263 True
8831 -131.490523 242.672681 42.504205 -66.122428 -479.780696 89.4000 -3.825758 T0784 -44.900159 2.431961 ... -544.966156 1.77279 -21751.030745 837 -1033.896788 -1036.328749 -50.419915 False 0.446581 True
8851 -132.585918 199.884006 36.975969 -65.359975 -462.829356 92.2000 -8.641026 T0784 -43.565895 2.199847 ... -548.792737 1.62316 -21983.612570 857 -1075.329840 -1077.529687 -52.614755 True 0.627947 True
8859 -130.816541 225.360574 39.524157 -66.920459 -483.529636 90.2000 -3.035999 T0784 -43.447347 2.584148 ... -564.238094 1.60258 -21538.139413 865 -1072.853783 -1075.437931 -48.334586 False 0.438664 True
9368 -130.716504 182.594835 44.706046 -63.925670 -430.386684 89.8000 -10.410342 T0784 -46.620002 3.308807 ... -561.202607 1.58467 -23541.680644 1376 -1066.242275 -1069.551082 -53.590154 False 0.404382 True
T0792 2165 -68.845464 0.000000 0.000000 -11.679158 -138.922046 77.8125 -11.083435 T0792 -3.990471 9.304358 ... -267.919304 2.71745 -10877.956343 167 -515.813986 -934.599066 -22.678466 False 0.187593 True
2176 -68.383185 0.000000 0.000000 -12.333798 -151.635991 81.2500 -13.722188 T0792 -4.185675 7.679520 ... -279.600241 2.56529 -11039.573834 178 -542.915823 -901.475310 -20.734264 True 0.235421 True
2177 -66.829633 0.000000 0.000000 -12.324533 -136.244804 82.1875 -12.164707 T0792 -4.249656 9.491226 ... -260.967613 2.62657 -10965.036997 179 -505.122731 -956.995437 -21.833011 True 0.216476 True
2182 -66.764212 0.000000 0.000000 -11.379303 -142.648547 78.4375 -14.317628 T0792 -4.803523 8.325339 ... -262.103465 2.72955 -10903.672217 184 -516.587494 -911.005829 -22.896154 False 0.202526 True
2199 -67.291334 0.000000 0.000000 -11.763105 -138.461736 75.6250 -12.433650 T0792 -4.478029 9.436115 ... -284.137655 2.60695 -10798.128151 201 -526.337232 -960.968029 -17.207837 False 0.203726 True
T0803 11457 -115.849649 0.000000 0.000000 -16.823276 -460.244209 76.1200 -24.776161 T0803 -11.188183 2.712733 ... -442.586158 5.37375 -20614.083085 1466 -1116.080545 -1118.793278 -47.325640 True 0.107811 True
11600 -117.789174 0.000000 0.000000 -16.990047 -518.257762 72.9500 -23.860971 T0803 -15.088307 3.395407 ... -431.241950 5.56915 -20380.226380 1610 -1171.927068 -1175.322475 -52.094264 False 0.118500 True
11633 -115.697321 0.000000 0.000000 -13.745015 -502.367349 72.5750 -24.074691 T0803 -11.366696 2.999411 ... -445.463617 5.43667 -20418.716456 1643 -1150.480631 -1153.480042 -40.765352 False 0.116946 True
11766 -113.242257 0.000000 0.000000 -17.548841 -455.850190 72.0150 -22.066432 T0803 -12.917178 2.447048 ... -435.346074 5.74664 -20756.668193 1776 -1095.727323 -1098.174371 -41.203398 False 0.107859 True
11855 -115.883113 0.000000 0.000000 -17.259622 -449.272054 74.8150 -24.824724 T0803 -11.281770 1.780859 ... -426.200479 5.16925 -20617.347422 1866 -1084.539550 -1086.320409 -41.598647 True 0.116922 True
T0815 4401 -91.132568 0.000000 0.000000 -47.497515 -132.351116 72.6425 -9.710465 T0815 -22.947333 23.548417 ... -464.444885 3.05629 -15896.746267 405 -778.499432 -1019.032909 -33.963967 False 0.393732 True
4408 -92.193956 0.000000 0.000000 -46.772515 -130.432221 75.9425 -17.296504 T0815 -22.788201 22.156532 ... -449.350543 2.79595 -15694.707056 412 -774.160761 -1051.866790 -37.483354 True 0.505570 True
4409 -91.029138 0.000000 0.000000 -46.831771 -126.794659 74.7625 -13.757458 T0815 -22.832997 24.392107 ... -438.357700 2.91935 -15689.947420 413 -752.497680 -1075.648430 -37.286064 True 0.418012 True
4410 -91.217226 0.000000 0.000000 -47.890374 -132.376990 73.5875 -14.400768 T0815 -22.872008 22.819734 ... -443.503743 2.75682 -15672.827864 414 -765.121476 -1000.854640 -35.680102 True 0.375505 True
5010 -90.381384 0.000000 0.000000 -42.235888 -136.192054 75.0000 -17.210996 T0815 -20.053185 23.475082 ... -444.367231 3.02543 -15649.781575 1017 -757.843937 -1053.575959 -30.878280 True 0.426500 True
T0833 13079 -114.669302 0.000000 0.000000 -73.822987 -171.109036 46.0650 -14.143682 T0833 -32.527783 30.361546 ... -496.971945 7.85891 -21843.460599 1086 -901.763649 -932.125195 -28.880460 True 0.384751 True
13086 -116.972607 0.000000 0.000000 -70.184939 -163.257791 44.4475 -14.049117 T0833 -32.334230 31.232910 ... -514.278482 7.90148 -21672.128904 1093 -907.954669 -939.187579 -28.110413 False 0.327418 True
13092 -116.011856 0.000000 0.000000 -77.184370 -164.526959 43.7500 -13.571145 T0833 -32.929213 32.036537 ... -517.471697 7.65779 -21634.319793 1099 -920.012557 -952.049094 -30.353856 False 0.311368 True
13557 -118.435360 0.000000 0.000000 -64.133283 -182.506228 46.9950 -14.285155 T0833 -30.840561 31.439482 ... -502.075205 7.94893 -21664.088411 1566 -918.590987 -950.030469 -37.754676 True 0.350031 True
13589 -117.131893 0.000000 0.000000 -68.895468 -181.859193 45.1425 -13.989743 T0833 -31.813881 32.279940 ... -497.222893 7.93678 -22289.410769 1598 -916.864662 -949.144602 -38.231531 False 0.352314 True

40 rows × 21 columns


In [16]:
print(*(zip(FEATURES, log_clf.coef_[0])))


('Rw', -0.19232275270600122) ('QGO', -1.6223111058603839) ('VwithoutGo', -0.28966535406693028)
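
All three fitted coefficients are negative, so lower (standardized) Rw, QGO, and VwithoutGo each raise the predicted probability of isGood. The probabilities can be reproduced by hand from the linear decision function (a sketch using only objects defined above):

z = train_set.astype(float) @ log_clf.coef_[0] + log_clf.intercept_[0]
manual = 1 / (1 + np.exp(-z))          # sigmoid of the decision function
np.allclose(manual, log_clf.predict_proba(train_set)[:, 1])   # expected: True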

In [17]:
n = 5
chosen_by_rw = raw_test_data.groupby("Name").apply(choose_top_rw, n)
chosen_by_vtotal = raw_test_data.groupby("Name").apply(choose_top_vtotal, n)
chosen_by_qgo = raw_test_data.groupby("Name").apply(choose_top, n=n, col="QGO", ascending=True)
top_rw = chosen_by_rw.query("chosen==True")
top_vtotal = chosen_by_vtotal.query("chosen==True")
top_qgo = chosen_by_qgo.query("chosen==True")

In [22]:
# Compare selection strategies (Best / Rw / Awsem / Selected) on GDT across all targets
label = "GDT"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T0251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
plt.savefig("/Users/weilu/Desktop/fig6_GDT.png", dpi=300)
# plt.ylim([0.4,1])
final_gdt = final3
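
The next two cells repeat this block for Qw and Rmsd; the same logic could be factored into one helper (a sketch; compare_selection is a hypothetical name, not in the original):

def compare_selection(label, ascending=False):
    # rank each target's decoys by `label` and line up the four selection strategies
    best = raw_test_data.groupby("Name").apply(
        choose_top, n=n, col=label, ascending=ascending).query("chosen==True")
    a2 = best.reset_index(drop=True)[["Name", label]].rename(columns={label: "Best"})
    b2 = top_rw.reset_index(drop=True)[[label]].rename(columns={label: "Rw"})
    c2 = top_vtotal.reset_index(drop=True)[[label]].rename(columns={label: "Awsem"})
    d2 = best_by_prediction.reset_index(drop=True)[[label]].rename(columns={label: "Selected"})
    return pd.concat([a2, b2, c2, d2], axis=1).melt(
        id_vars="Name", value_name=label, var_name=" ")

# e.g. compare_selection("Qw") reproduces final3 below; pass ascending=True for Rmsd.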



In [23]:
# Same comparison on Qw
label = "Qw"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T0251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
plt.savefig("/Users/weilu/Desktop/fig6_Qw.png", dpi=300)
# plt.ylim([0.4,1])
final_Qw = final3



In [24]:
# Same comparison on Rmsd (lower is better, hence ascending=True)
label = "Rmsd"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label, ascending=True).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T0251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
plt.savefig("/Users/weilu/Desktop/fig6_Rmsd.png", dpi=300)
# plt.ylim([0.4,1])
final_Rmsd = final3



In [32]:
final = pd.concat([final_gdt, final_Qw["Qw"], final_Rmsd["Rmsd"]], axis=1)

In [64]:
f, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, figsize=(15,15))
sns.pointplot("Name", "GDT", data=final, hue=" ", errwidth=0, order=order, ax=ax1)
ax1.legend_.remove()
ax1.get_xaxis().set_visible(False)
sns.pointplot("Name", "Qw", data=final, hue=" ", errwidth=0, order=order, ax=ax2)
ax2.legend_.remove()
ax2.get_xaxis().set_visible(False)
sns.pointplot("Name", "Rmsd", data=final, hue=" ", errwidth=0, order=order, ax=ax3)
plt.legend(loc=9, bbox_to_anchor=(0.9, 2.8), ncol=1, fontsize='x-large')
plt.savefig("/Users/weilu/Desktop/fig6.png", dpi=300)


