In [158]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime
import seaborn as sns
%matplotlib inline

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (10,6.180)    #golden ratio
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG under
    images/<CHAPTER_ID>/<fig_id>.png.

    Parameters:
        fig_id (str): file stem for the saved figure.
        tight_layout (bool): apply plt.tight_layout() before saving.
    """
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    # Bug fix: create the target directory first — plt.savefig raises
    # FileNotFoundError if images/<CHAPTER_ID>/ does not already exist.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [159]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts a fixed list of columns from a DataFrame
    and hands them on as a plain numpy array (scikit-learn estimators work
    on arrays, not DataFrames)."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless selector: nothing to learn.
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        return selected.values
class RemoveFirstFrame(BaseEstimator, TransformerMixin):
    """Pipeline step that drops rows whose Step falls on the first frame of
    each dump period, i.e. rows where Step % frame == 1."""

    def __init__(self, frame):
        # frame: dump period (steps between saved frames).
        self.frame = frame

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Bug fix: the original interpolated the undefined global `frame`
        # into the query string instead of the instance attribute, raising
        # NameError whenever this transformer was actually used.
        return X.query(f"Step % {self.frame} != 1")
def choose_top_rw(data, n=5):
    """Return `data` with a boolean 'chosen' column flagging the n rows with
    the smallest Rw (ascending rank <= n; ties broken by order of appearance)."""
    ranks = data.Rw.rank(method='first')
    return data.assign(chosen=ranks <= n)
def choose_top_vtotal(data, n=5):
    """Return `data` with a boolean 'chosen' column flagging the n rows with
    the smallest VTotal (ascending rank <= n; ties broken by appearance order)."""
    ranks = data.VTotal.rank(method='first')
    return data.assign(chosen=ranks <= n)
def choose_top(data, col="Qw", n=5, ascending=False):
    """Return `data` with a boolean 'chosen' column flagging the top-n rows
    by column `col`.

    By default the largest values win (ascending=False); pass ascending=True
    to pick the smallest. Ties are broken by order of appearance
    (method='first').
    """
    ranked = data[col].rank(ascending=ascending, method='first')
    return data.assign(chosen=ranked <= n)

In [160]:
# protein_list = ["T0766", "1MBA", "T0784", "T0792", "T0803", "T0815", "T0833", "T0251"]
# Target proteins included in the mar21 structure-selection run.
protein_list = ["T0766", "T0784", "T0803", "T0833", "T0251"]
# Column names of the AWSEM energy log: one column per energy term.
name_list = ["Step" , "Chain" , "Shake" , "Chi" , "Rama", "Excluded", "DSSP", "P_AP", "Water" ,"Burial", "Helix", "AMH_Go", "Frag_Mem", "Vec_FM", "Membrane", "SSB" , "Electro.", "QGO" ,"VTotal"]
all_data_list = []

# For each protein, load the AWSEM energies plus the Rw / RMSD / Qw / GDT
# score tables and merge them column-wise into one frame.
# NOTE(review): paths are hardcoded absolute local paths; consider a DATA_DIR constant.
for protein in protein_list:
    awsem = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar21/{protein}_awsem.log", names=name_list)
    rw = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar21/{protein}_rw.txt", names=["i", "Rw"], sep="\s+")
    rmsd = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar21/{protein}_rmsd.txt", names=["i2", "Rmsd"], sep="\s+")
    qw = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar21/{protein}_qw.txt", names=["i3", "Qw"], sep="\s+")
    gdt = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar21/{protein}_GDT.txt", names=["gdt"+str(i) for i in range(1,21)], sep="\s+")
    # Reset indices so the column-wise concat below aligns row-by-row.
    rw = rw.reset_index(drop=True)
    awsem = awsem.reset_index(drop=True)
    rmsd = rmsd.reset_index(drop=True)
    qw = qw.reset_index(drop=True)
    # Combine four GDT sub-scores into a single score; ×25 averages the four
    # terms and scales to percent. NOTE(review): assumes gdt2/4/8/16 are
    # fractions for the standard 1/2/4/8 Å cutoffs — confirm against the
    # GDT file format.
    gdt["GDT"] = (gdt["gdt2"] + gdt["gdt4"] + gdt["gdt8"] + gdt["gdt16"])*25
    gdt = gdt.reset_index(drop=True)
    data = pd.concat([rw, qw, rmsd, gdt["GDT"], awsem], axis=1)
#     print(data)
    # Drop bookkeeping columns and energy terms that are zero/unused here.
    remove_columns = ['i', 'i2', 'i3', 'Step', "Shake", "Excluded", "AMH_Go", "Membrane", "Vec_FM", "SSB", "Electro."]
#     if protein == "T0251":
#         p = "T251"
#     elif protein == "1mba":
#         p = "1MBA"
#     else:
#         p = protein
    p = protein
    data = data.drop(remove_columns, axis=1).reset_index().assign(Name=p)
    all_data_list.append(data)
all_data = pd.concat(all_data_list).reset_index(drop=True)

In [215]:
print(all_data.shape)
all_data.dropna().shape


(10050, 17)
Out[215]:
(10050, 17)

In [162]:
# all_data.to_csv("/Users/weilu/Research/data/test_data/test_data_mar21_2.csv")

In [203]:
# Load the older evaluation set (test_data_4) and label structures.
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_4.csv")
# Remove exact duplicate structures (same Qw, Rw and VTotal).
raw_test_data_2 = raw_test_data.drop_duplicates(subset=['Qw', 'Rw', "VTotal"])
# isGood: the 50 best structures per protein, ranked by GDT (descending).
raw_test_data_2 = raw_test_data_2.assign(isGood=raw_test_data_2.groupby("Name")["GDT"].rank(ascending=False, method='first') < 51)
# VwithoutGo = total AWSEM energy minus the Go (QGO) term.
# NOTE(review): this reads from the pre-dedup frame `raw_test_data`; .assign
# aligns on index so the values should match the deduplicated rows — verify.
raw_test_data_old = raw_test_data_2.assign(VwithoutGo = raw_test_data.VTotal - raw_test_data.QGO)

In [204]:
# Load the mar21 evaluation set, label it the same way as the old set,
# then merge in the old set's proteins that are not in the mar21 run.
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_mar21_2.csv")
raw_test_data_2 = raw_test_data.drop_duplicates(subset=['Qw', 'Rw', "VTotal"])
# isGood: top-50 structures per protein by GDT.
raw_test_data_2 = raw_test_data_2.assign(isGood=raw_test_data_2.groupby("Name")["GDT"].rank(ascending=False, method='first') < 51)
raw_test_data = raw_test_data_2.assign(VwithoutGo = raw_test_data.VTotal - raw_test_data.QGO)
# Reconstruct a 1-based Step number from the saved per-protein row index.
raw_test_data["Step"] = raw_test_data_2["index"]+1
# raw_test_data = raw_test_data_2
# Keep old-set proteins that the mar21 run does not cover (uses the
# `protein_list` global via the @-reference in .query).
a = raw_test_data_old.query("Name not in @protein_list").reset_index(drop=True)
raw_test_data = pd.concat([a, raw_test_data.drop('Unnamed: 0', axis=1)])
# Drop the old-naming duplicate of T0251 and the initial (Step 1) frames.
raw_test_data = raw_test_data.query("Name != 'T251'").reset_index(drop=True)
raw_test_data = raw_test_data.query("Step != 1").reset_index(drop=True)

In [205]:
raw_test_data.shape


Out[205]:
(16008, 21)

In [166]:
raw_test_data["Name"].unique()


Out[166]:
array(['1MBA', 'T0792', 'T0815', 'T0766', 'T0784', 'T0803', 'T0833',
       'T0251'], dtype=object)

In [207]:
# raw_test_data[["Name", "Step", "Qw", "GDT"]].to_csv("/Users/weilu/Desktop/steps_qw_gdt.csv")

In [4]:
# all_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_mar03.csv")
# all_data = all_data.assign(VwithoutGo = all_data.VTotal - all_data.QGO)
# raw_test_data_2 = all_data
# raw_test_data_2 = all_data.assign(isGood=raw_test_data_2.groupby("Name")["Qw"].rank(ascending=False, method='first') < 6)
# raw_test_data = raw_test_data_2.assign(VwithoutGo = raw_test_data_2.VTotal - raw_test_data_2.QGO)

In [184]:
FEATURES = ['Rw',
#      'VTotal',
     'QGO',
     'VwithoutGo',
#      'Burial',
#      'Water',
#      'Rama',
#      'DSSP',
#      'P_AP',
#      'Helix',
#      'Frag_Mem'
               ]
n = 5
def my_transform(data, label, degree, FEATURES=FEATURES):
    """Build the training matrix for one frame of data.

    Standard-scales the FEATURES columns, expands them with polynomial terms
    of the given degree (no bias column), and appends the `label` column
    unchanged as the LAST column of the returned numpy array.
    """
    feature_pipeline = Pipeline([
        ('selector', DataFrameSelector(FEATURES)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
    ])
    # The label goes through untouched so it ends up as the final column.
    label_pipeline = Pipeline([
        ('selector', DataFrameSelector([label])),
    ])
    combined = FeatureUnion(transformer_list=[
        ("num_pipeline", feature_pipeline),
        ("cat_pipeline", label_pipeline),
    ])
    return combined.fit_transform(data)

def my_transform_predict(data, degree, FEATURES=FEATURES):
    """Prediction-time counterpart of the training transform: standard-scale
    the FEATURES columns and expand with polynomial terms of the given
    degree (no bias column). No label column is appended."""
    steps = [
        ('selector', DataFrameSelector(FEATURES)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
    ]
    return Pipeline(steps).fit_transform(data)

In [185]:
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "Rw", "Qw")



In [186]:
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "VwithoutGo", "Qw")



In [187]:
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "QGO", "Qw")



In [188]:
raw_data_T0784 = raw_test_data.groupby("Name").get_group("T0784")
# raw_data_T0792 = raw_test_data.groupby("Name").get_group("T0792")
# raw_data = pd.concat([raw_data_T0784, raw_data_T0792])
# raw_data = raw_data_T0792
# raw_data = raw_test_data.groupby("Name").get_group("1mba")
raw_data = raw_data_T0784

In [189]:
# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
# FEATURES = list(raw_test_data.columns[2:-3])
def train_and_test(raw_data, label="Qw", degree=1, p=0.1):
    """Stratified 80/20 train/test split on the 'isGood' column, with both
    halves run through my_transform.

    Returns (train_X, train_y, test_X, test_y); my_transform puts the label
    in the last column, which is split off here. `p` is accepted for
    interface compatibility but is unused in this function.
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=142)
    train_idx, test_idx = next(splitter.split(raw_data, raw_data["isGood"]))
    strat_train_set = raw_data.iloc[train_idx]
    strat_test_set = raw_data.iloc[test_idx]

    transformed_train = my_transform(strat_train_set, label, degree)
    transformed_test = my_transform(strat_test_set, label, degree)
    # Last column is the label; everything before it is the feature matrix.
    return (transformed_train[:, :-1], transformed_train[:, -1],
            transformed_test[:, :-1], transformed_test[:, -1])

In [190]:
# Train a logistic-regression structure selector on one protein (raw_data),
# then score every structure of every protein and keep the top-n by
# predicted probability.
label = "isGood"
degree = 1
p = 0.1
train_set, train_y, test_set, test_y = train_and_test(raw_data, label=label, degree=degree)
log_clf = LogisticRegression(random_state=140, penalty='l2')

# log_clf = LogisticRegression(random_state=14, class_weight={0:p, 1:(1-p)}, penalty='l1')
log_clf.fit(train_set, train_y)
y_pred = log_clf.predict(train_set)
# n = 100
prediction_list = []
# Score each protein's structures with the fitted classifier.
for name, data in raw_test_data.groupby("Name"):
    print(name)
#     X = full_pipeline.fit_transform(data)
    X = my_transform(data, label, degree)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    # Probability of the positive ("isGood") class.
    test= log_clf.predict_proba(eval_set)[:,1]
    one = data.assign(prediction=test)
    prediction_list.append(one)
#     prediction_list.append(pd.Series(test))
t = pd.concat(prediction_list)
# t = raw_test_data.assign(prediction=prediction.values)
# Per protein, keep the n structures with the highest predicted probability.
best_by_prediction = t.groupby("Name").apply(choose_top, n=n, col="prediction").query("chosen==True")


1MBA
T0251
T0766
T0784
T0792
T0803
T0815
T0833

In [191]:
print(*(zip(FEATURES, log_clf.coef_[0])))


('Rw', -0.1923227527060043) ('QGO', -1.6223111058603821) ('VwithoutGo', -0.28966535406692889)

In [175]:
print(*(zip(FEATURES, log_clf.coef_[0])))


('Rw', -0.22592705892120249) ('QGO', -1.6229938406423545) ('VwithoutGo', -0.20267792269246535)

In [192]:
# Baseline selections to compare against the classifier: the top-n
# structures per protein by Rw, by total AWSEM energy, and by QGO
# (smallest is best for QGO, hence ascending=True).
n = 5
chosen_by_rw = raw_test_data.groupby("Name").apply(choose_top_rw, n)
chosen_by_vtotal = raw_test_data.groupby("Name").apply(choose_top_vtotal, n)
chosen_by_qgo = raw_test_data.groupby("Name").apply(choose_top, n=n, col="QGO", ascending=True)
top_rw = chosen_by_rw.query("chosen==True")
top_vtotal = chosen_by_vtotal.query("chosen==True")
top_qgo = chosen_by_qgo.query("chosen==True")

In [193]:
top_qgo_old = top_qgo

In [208]:
# T0784
# Compare selection strategies on one metric: for each protein, take the
# top-n structures chosen by each method and plot their mean metric value.
# NOTE(review): this cell is copy-pasted several times below with only
# `label` changed — extract a plot_comparison(label) function.
label = "GDT"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
# One column per selection strategy, renamed for the legend.
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# Long format for seaborn: one row per (protein, strategy, value).
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T0251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
# plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])


Out[208]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a22f5ccc0>

In [209]:
# T0784
label = "Qw"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T0251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
# plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])


Out[209]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a22f53588>

In [211]:
# T0784
label = "Rmsd"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T0251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
# plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])


Out[211]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a18149ba8>

In [178]:
# T0784
label = "GDT"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T0251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])



In [195]:
a = best_by_prediction.reset_index(drop=True)[["Name", "Qw", "GDT", "prediction", "Step"]].groupby("Name").apply(lambda x: x.sort_values("prediction", ascending=False))

In [196]:
import itertools

In [198]:
a.reset_index(drop=True).to_csv("/Users/weilu/Desktop/selected.csv")

In [197]:
a.reset_index(drop=True).set_index("Name")


Out[197]:
Qw GDT prediction Step
Name
1MBA 0.720428 74.8275 0.306414 110
1MBA 0.769650 78.4275 0.295674 186
1MBA 0.764916 76.2000 0.259163 189
1MBA 0.765128 77.9100 0.258174 200
1MBA 0.774211 78.5975 0.251528 154
T0251 0.613264 66.4375 0.490454 606
T0251 0.598274 65.0475 0.392429 1086
T0251 0.589351 62.5000 0.383658 1214
T0251 0.598703 66.6700 0.368319 1643
T0251 0.625360 68.2875 0.362572 807
T0766 0.563178 65.7425 0.554039 203
T0766 0.567218 63.4250 0.511339 1813
T0766 0.558608 61.8075 0.473008 1534
T0766 0.559387 59.7200 0.469498 1538
T0766 0.555598 62.2700 0.461466 1542
T0784 0.882307 92.2000 0.627947 857
T0784 0.880859 89.6000 0.506263 831
T0784 0.872920 89.4000 0.446581 837
T0784 0.867933 90.2000 0.438664 865
T0784 0.876679 89.8000 0.404382 1376
T0792 0.760305 81.2500 0.295416 178
T0792 0.739581 79.0625 0.207753 181
T0792 0.730408 77.8125 0.194552 183
T0792 0.729135 79.0625 0.181682 1605
T0792 0.724748 78.4375 0.179628 184
T0803 0.652454 72.9500 0.118500 1610
T0803 0.651672 72.5750 0.116946 1643
T0803 0.669025 74.8150 0.116922 1866
T0803 0.667617 72.0150 0.107859 1776
T0803 0.668172 76.1200 0.107811 1466
T0815 0.707513 75.9425 0.508497 412
T0815 0.670086 72.6425 0.474252 405
T0815 0.687528 73.5875 0.437411 414
T0815 0.688493 75.0000 0.374946 1017
T0815 0.639013 72.1700 0.344641 806
T0833 0.481112 46.0650 0.384751 1086
T0833 0.500173 45.1425 0.352314 1598
T0833 0.491663 46.9950 0.350031 1566
T0833 0.489800 44.4475 0.327418 1093
T0833 0.464509 43.7500 0.311368 1099

In [153]:


In [154]:
raw_test_data.groupby("Name").apply(choose_top, n=1, col="Step", ascending=True).query("chosen==True")[["Name", "GDT", "Qw", "Step"]]


Out[154]:
Name GDT Qw Step
Name
1MBA 0 1MBA 65.2400 0.638744 1
T0251 14007 T0251 66.2050 0.655897 0
T0766 6003 T0766 67.8250 0.617307 0
T0784 8004 T0784 86.0000 0.838527 0
T0792 2001 T0792 74.0625 0.675483 1
T0803 10005 T0803 74.0675 0.671503 0
T0815 4002 T0815 74.7625 0.672388 1
T0833 12006 T0833 43.2875 0.487200 0

In [80]:
best_by_prediction.reset_index(drop=True)[["Name", "GDT", "Qw"]].groupby("Name").mean()


Out[80]:
GDT Qw
Name
1MBA 74.5210 0.731773
T0251 65.3720 0.611098
T0766 63.4720 0.574620
T0784 90.4000 0.880746
T0792 78.1250 0.726105
T0803 74.1795 0.667197
T0815 74.3870 0.685202
T0833 45.1875 0.489990

In [69]:
raw_test_data.groupby("Name").apply(choose_top, n=1, col=label).query("chosen==True")[["GDT"]]


Out[69]:
GDT
Name
1MBA 181 80.1375
T0251 15448 74.7675
T0766 6003 67.8250
T0784 9592 92.6000
T0792 3586 84.3750
T0803 10113 78.1700
T0815 4434 77.3550
T0833 13188 48.3800

In [70]:
raw_test_data.groupby("Name").apply(choose_top, n=1, col="Qw").query("chosen==True")[["Qw"]]


Out[70]:
Qw
Name
1MBA 198 0.782962
T0251 15448 0.691465
T0766 6003 0.617307
T0784 9385 0.896806
T0792 2187 0.772617
T0803 10113 0.700119
T0815 4434 0.710136
T0833 12215 0.502726

In [22]:
# T0784
label = "GDT"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
# plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0f429c88>

In [24]:
# T0784
label = "Qw"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
plt.savefig("/Users/weilu/Desktop/figure6_qw.png", dpi=300)
# plt.ylim([0.4,1])



In [16]:
initial = raw_test_data.groupby("Name").apply(choose_top, n=1, col='Step', ascending=True).query("chosen==True")
f2 = initial.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Initial"})

In [18]:
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-e01b078f59da> in <module>()
----> 1 a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})

NameError: name 'best' is not defined

In [19]:
f2.melt(id_vars="Name", value_name=label, var_name=" ")


Out[19]:
Name isGood
0 1MBA Initial False
1 T0766 Initial True
2 T0784 Initial False
3 T0792 Initial False
4 T0803 Initial True
5 T0815 Initial True
6 T0833 Initial False
7 T251 Initial False

In [20]:
# T0784
label = "GDT"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
# plt.clf()
t = f2.melt(id_vars="Name", value_name=label, var_name=" ")
sns.stripplot("Name", label, data=t, order=order, color="black")
# plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0f4325c0>

In [55]:
best_old = best

In [146]:
# best_by_prediction.to_csv("/Users/weilu/Research/data/structure_selector_mar03/old_best_by_prediction.csv")

In [162]:
# Earlier (mar03) data-loading variant: all eight targets, first 2000
# frames only, with on-the-fly renaming of 'T0251'->'T251' and '1mba'->'1MBA'.
# NOTE(review): near-duplicate of the mar21 loader above — consider one
# parameterized loader function.
protein_list = ["T0766", "1mba", "T0784", "T0792", "T0803", "T0815", "T0833", "T0251"]
# Column names of the AWSEM energy log (one column per energy term).
name_list = ["Step" , "Chain" , "Shake" , "Chi" , "Rama", "Excluded", "DSSP", "P_AP", "Water" ,"Burial", "Helix", "AMH_Go", "Frag_Mem", "Vec_FM", "Membrane", "SSB" , "Electro.", "QGO" ,"VTotal"]
all_data_list = []
for protein in protein_list:
    awsem = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar03/{protein}_awsem.log", names=name_list)
    rw = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar03/{protein}_rw.txt", names=["i", "Rw"], sep="\s+")
    rmsd = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar03/{protein}_rmsd.txt", names=["i2", "Rmsd"], sep="\s+")
    qw = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar03/{protein}_qw.txt", names=["i3", "Qw"], sep="\s+")
    # Cap at the first 2000 frames and realign indices for column-wise concat.
    rw = rw[:2000].reset_index(drop=True)
    awsem = awsem[:2000].reset_index(drop=True)
    rmsd = rmsd[:2000].reset_index(drop=True)
    qw = qw[:2000].reset_index(drop=True)
    data = pd.concat([rw, awsem, rmsd, qw], axis=1)
    # Drop bookkeeping columns and energy terms that are zero/unused here.
    remove_columns = ['i', 'i2', 'i3', 'Step', "Shake", "Excluded", "AMH_Go", "Membrane", "Vec_FM", "SSB", "Electro."]
    if protein == "T0251":
        p = "T251"
    elif protein == "1mba":
        p = "1MBA"
    else:
        p = protein
    data = data.drop(remove_columns, axis=1).reset_index().assign(Name=p)
    all_data_list.append(data)
all_data = pd.concat(all_data_list).reset_index(drop=True)

In [16]:
a = raw_test_data.query("Name=='1MBA'").assign(T='old')

In [17]:
b = all_data.query("Name=='1MBA'").assign(T='new')

In [18]:
c = pd.concat([a,b])

In [ ]:
b

In [19]:
g = sns.FacetGrid(c, col="Name", hue="T", col_wrap=1)
g = g.map(plt.scatter, "Rw", "Qw", alpha=0.1)



In [23]:
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "QGO", "Qw")



In [22]:
g = sns.FacetGrid(all_data, col="Name", col_wrap=4)
g = g.map(plt.scatter, "QGO", "Qw")



In [163]:
# all_data.to_csv("/Users/weilu/Research/data/test_data/test_data_mar05.csv")

In [79]:
all_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_mar05.csv")
all_data = all_data.assign(VwithoutGo = all_data.VTotal - all_data.QGO)

In [80]:
# Score every structure in the mar05 set with the already-fitted classifier
# (uses the label-free transform, since this data is unlabeled at
# prediction time) and keep the top-n per protein.
prediction_list = []
for name, data in all_data.groupby("Name"):
    print(name)
#     X = full_pipeline.fit_transform(data)
    X = my_transform_predict(data, degree=1)
    eval_set = X
    # Probability of the positive ("isGood") class.
    test= log_clf.predict_proba(eval_set)[:,1]
    one = data.assign(prediction=test)
    prediction_list.append(one)
#     prediction_list.append(pd.Series(test))
t = pd.concat(prediction_list)
# t = all_data.assign(prediction=prediction.values)
best_by_prediction_new = t.groupby("Name").apply(choose_top, n=n, col="prediction").query("chosen==True").drop('Unnamed: 0',axis=1)


1MBA
T0766
T0784
T0792
T0803
T0815
T0833
T251

In [34]:
best_by_prediction_new


Out[34]:
index Rw Chain Chi Rama DSSP P_AP Water Burial Helix Frag_Mem QGO VTotal Rmsd Qw Name VwithoutGo prediction chosen
Name
1MBA 2000 0 -26473.201583 167.564260 30.265355 -649.046369 -0.000234 -7.145992 -62.636467 -124.256792 -58.545028 -369.651386 9.083653 -1064.368999 2.35034 0.671072 1MBA -1073.452652 0.949798 True
2402 402 -25524.115997 162.588117 29.871546 -637.486851 -0.000000 -6.117750 -59.092704 -122.056701 -53.978581 -374.526317 11.180865 -1049.618378 2.87010 0.598428 1MBA -1060.799243 0.805544 True
3005 1005 -25586.216265 166.137087 34.751798 -624.929558 -0.000000 -3.735631 -57.344730 -124.972776 -55.668879 -351.836631 11.091300 -1006.508022 2.46273 0.661575 1MBA -1017.599322 0.798692 True
3206 1206 -26641.596220 143.172577 33.238930 -643.767739 -0.000000 -6.162572 -66.052242 -122.966350 -60.798873 -387.787094 9.568414 -1101.554949 2.62836 0.646248 1MBA -1111.123363 0.951620 True
3608 1608 -25708.255549 150.437122 38.305480 -617.734785 -0.000000 -5.084803 -58.348555 -124.568879 -57.987746 -385.741798 9.961194 -1050.762769 2.75558 0.647979 1MBA -1060.723963 0.883126 True
T0766 311 311 -17704.811371 123.990918 22.735052 -380.398660 -55.805090 -26.587830 -53.205623 -94.569191 -11.074971 -617.663486 0.456534 -1092.122348 1.54003 0.914866 T0766 -1092.578882 0.177008 True
690 690 -17639.457992 148.283830 30.621936 -378.756833 -53.457257 -27.462251 -48.533265 -95.326469 -11.294877 -531.754000 0.311335 -967.367851 1.47015 0.895680 T0766 -967.679186 0.162040 True
694 694 -17452.458817 147.666186 21.177155 -370.783486 -52.938989 -28.206054 -52.597858 -91.350741 -10.367621 -535.264614 0.346713 -972.319310 1.40461 0.906777 T0766 -972.666023 0.124396 True
762 762 -17563.616919 149.511441 28.270570 -389.741613 -52.871474 -28.166277 -51.905238 -93.320169 -11.444771 -607.752169 0.566944 -1056.852756 1.28903 0.914586 T0766 -1057.419700 0.120021 True
1597 1597 -17647.474507 140.051354 25.654790 -370.026178 -53.598729 -27.379476 -52.964184 -93.993429 -10.296802 -604.672759 0.561102 -1046.664311 1.33343 0.913885 T0766 -1047.225413 0.131527 True
T0784 4425 425 -18531.462212 185.730963 34.946650 -450.223942 -64.001365 -43.157142 -53.390888 -109.582820 -0.010833 -461.257439 3.184231 -957.762586 1.69954 0.861257 T0784 -960.946817 0.589055 True
4426 426 -18340.014479 179.491019 30.393790 -438.113931 -66.980525 -44.054194 -50.487615 -108.206247 -0.005735 -383.966277 2.918712 -879.011003 1.79444 0.856198 T0784 -881.929715 0.566919 True
4436 436 -18549.235385 140.277430 24.474562 -430.603214 -68.431445 -46.133949 -53.663111 -108.832825 -0.009780 -360.186453 3.069609 -900.039177 1.92239 0.862397 T0784 -903.108786 0.595796 True
4437 437 -18300.385795 186.116508 38.522451 -419.742656 -66.553980 -45.814804 -49.971007 -108.718759 -0.030533 -412.046502 2.888911 -875.350371 1.99202 0.863625 T0784 -878.239282 0.561915 True
4464 464 -18215.560827 173.797530 34.959398 -433.576657 -69.404207 -45.582299 -50.801103 -109.121912 -0.001761 -447.600269 2.697612 -944.633670 1.73140 0.859810 T0784 -947.331282 0.627926 True
T0792 6000 0 -10722.090867 0.000000 0.000000 -322.603244 -12.292156 -3.793797 -29.870054 -67.271757 -10.340208 -161.125299 8.501405 -598.795110 3.78606 0.652529 T0792 -607.296515 0.216025 True
6201 201 -10802.793172 0.000000 0.000000 -311.411630 -12.679359 -4.738974 -30.576894 -69.909098 -14.183376 -152.699472 9.043662 -587.155142 5.47543 0.653273 T0792 -596.198804 0.204651 True
7005 1005 -10933.898349 0.000000 0.000000 -318.108106 -9.348588 -3.597018 -30.422177 -70.071184 -10.798472 -165.003199 6.594762 -600.753983 4.61486 0.642170 T0792 -607.348745 0.326640 True
7608 1608 -10483.371159 0.000000 0.000000 -310.550875 -12.222728 -4.014591 -28.343068 -69.086903 -12.808956 -153.112519 7.534226 -582.605414 5.69853 0.663933 T0792 -590.139640 0.187509 True
7809 1809 -11238.544141 0.000000 0.000000 -310.800775 -11.963667 -5.539519 -34.182560 -69.515103 -10.704194 -153.382452 11.314676 -584.773594 5.05561 0.615843 T0792 -596.088270 0.215628 True
T0803 8151 151 -20703.421673 148.579686 33.980616 -427.281804 -14.676221 -13.162436 -47.610050 -112.017937 -23.709906 -501.902491 1.324196 -956.476346 6.39523 0.729766 T0803 -957.800542 0.336168 True
8164 164 -20766.225860 165.328464 42.379268 -429.637537 -14.629371 -12.229928 -47.715274 -111.185710 -24.094283 -430.275259 1.329070 -860.730561 6.10241 0.725550 T0803 -862.059631 0.293669 True
8180 180 -20933.535882 193.697896 34.969874 -416.429394 -15.701267 -12.505622 -44.169422 -109.641645 -24.621304 -477.095735 1.187569 -870.309050 6.03585 0.738008 T0803 -871.496619 0.340815 True
8190 190 -20844.962110 179.110734 31.733673 -424.041076 -14.977842 -11.352433 -47.281069 -109.524348 -25.302347 -474.433750 1.459723 -894.608736 6.15701 0.734347 T0803 -896.068459 0.305125 True
8193 193 -20906.030632 173.850768 28.992856 -423.359886 -15.054973 -11.762510 -42.826267 -111.559722 -26.147188 -464.418699 1.455861 -890.829760 5.70414 0.732467 T0803 -892.285621 0.312087 True
T0815 10000 0 -15635.174335 0.000000 0.000000 -494.048924 -48.611577 -20.980908 -44.869819 -92.643768 -18.156326 -134.028207 18.225475 -835.114054 3.42805 0.607091 T0815 -853.339529 0.937762 True
10603 603 -15826.858040 0.000000 0.000000 -478.167582 -40.519938 -22.216100 -40.884234 -92.897649 -17.414522 -140.766600 18.737975 -814.128651 3.10481 0.657762 T0815 -832.866626 0.931107 True
11005 1005 -16052.966800 0.000000 0.000000 -496.182245 -46.135790 -22.240168 -48.690365 -92.911658 -14.106110 -149.210595 18.850429 -850.626502 2.73917 0.670541 T0815 -869.476931 0.949586 True
11206 1206 -15878.678416 0.000000 0.000000 -490.126164 -47.006207 -22.083533 -44.333210 -93.701689 -6.601143 -150.378775 19.254297 -834.976424 3.10791 0.647490 T0815 -854.230721 0.929100 True
11608 1608 -16093.073170 0.000000 0.000000 -493.028360 -44.874711 -23.333941 -43.717355 -93.031297 -15.421469 -139.902506 16.422745 -836.886894 3.41557 0.605696 T0815 -853.309639 0.976557 True
T0833 12000 0 -14347.027331 125.898222 23.989866 -439.957297 -73.720114 -37.943400 -33.171725 -92.846135 -0.224366 -124.238990 25.645643 -626.568294 5.49476 0.501330 T0833 -652.213937 0.956918 True
12402 402 -14743.328182 158.058162 24.461995 -444.935460 -78.334591 -37.783785 -30.407831 -94.725561 0.319723 -130.712649 30.173411 -603.886586 8.42128 0.476841 T0833 -634.059997 0.705643 True
12804 804 -13823.640680 159.231026 23.597154 -450.461446 -80.361652 -36.281754 -19.493895 -90.543384 -0.186547 -122.303341 27.890135 -588.913702 9.37339 0.445332 T0833 -616.803837 0.797835 True
13005 1005 -15024.095932 149.112160 24.551428 -446.849817 -74.604115 -36.845177 -30.533836 -94.630107 0.370975 -123.415370 27.948081 -604.895778 8.32255 0.499559 T0833 -632.843859 0.907696 True
13809 1809 -13749.486003 136.957995 25.944084 -450.714461 -82.582081 -39.772933 -26.135391 -94.109941 0.293154 -128.107811 27.006862 -631.220523 8.13774 0.471890 T0833 -658.227385 0.874346 True
T251 15050 1050 -18772.633497 0.000000 0.000000 -415.567625 -10.847416 -11.161604 -49.211497 -94.729339 -10.269035 -166.307919 45.807160 -712.287274 3.37350 0.640714 T251 -758.094434 0.798224 True
15063 1063 -18801.931114 0.000000 0.000000 -413.303745 -12.894267 -11.619890 -46.775530 -93.961634 -14.050400 -174.168870 45.335953 -721.438384 3.91536 0.635235 T251 -766.774337 0.833380 True
15078 1078 -18822.844179 0.000000 0.000000 -424.379341 -11.521283 -11.750080 -48.864631 -93.257680 -13.195063 -172.934384 46.224212 -729.678250 3.88014 0.640889 T251 -775.902462 0.792404 True
15113 1113 -18815.917771 0.000000 0.000000 -403.550604 -11.650286 -11.121332 -44.827072 -91.826783 -12.397254 -173.331452 45.103440 -703.601342 4.06113 0.589503 T251 -748.704782 0.835229 True
15147 1147 -18800.856827 0.000000 0.000000 -393.652214 -10.124169 -11.561996 -47.286612 -94.766359 -12.870370 -169.229040 45.322943 -694.167817 3.90440 0.587603 T251 -739.490760 0.815967 True

In [37]:
top_qgo


Out[37]:
Unnamed: 0 index Rw Chain Chi Rama DSSP P_AP Water Burial Helix Frag_Mem QGO VTotal Rmsd Qw Name VwithoutGo chosen
Name
1MBA 2000 2000 0 -26473.201583 167.564260 30.265355 -649.046369 -0.000234 -7.145992 -62.636467 -124.256792 -58.545028 -369.651386 9.083653 -1064.368999 2.35034 0.671072 1MBA -1073.452652 True
2010 2010 10 -25268.085249 175.498256 46.713784 -603.109631 -0.000002 -5.496606 -52.694837 -125.160438 -46.670344 -297.265618 10.418770 -897.766665 2.38582 0.715542 1MBA -908.185435 True
3005 3005 1005 -25586.216265 166.137087 34.751798 -624.929558 -0.000000 -3.735631 -57.344730 -124.972776 -55.668879 -351.836631 11.091300 -1006.508022 2.46273 0.661575 1MBA -1017.599322 True
3206 3206 1206 -26641.596220 143.172577 33.238930 -643.767739 -0.000000 -6.162572 -66.052242 -122.966350 -60.798873 -387.787094 9.568414 -1101.554949 2.62836 0.646248 1MBA -1111.123363 True
3608 3608 1608 -25708.255549 150.437122 38.305480 -617.734785 -0.000000 -5.084803 -58.348555 -124.568879 -57.987746 -385.741798 9.961194 -1050.762769 2.75558 0.647979 1MBA -1060.723963 True
T0766 311 311 311 -17704.811371 123.990918 22.735052 -380.398660 -55.805090 -26.587830 -53.205623 -94.569191 -11.074971 -617.663486 0.456534 -1092.122348 1.54003 0.914866 T0766 -1092.578882 True
690 690 690 -17639.457992 148.283830 30.621936 -378.756833 -53.457257 -27.462251 -48.533265 -95.326469 -11.294877 -531.754000 0.311335 -967.367851 1.47015 0.895680 T0766 -967.679186 True
694 694 694 -17452.458817 147.666186 21.177155 -370.783486 -52.938989 -28.206054 -52.597858 -91.350741 -10.367621 -535.264614 0.346713 -972.319310 1.40461 0.906777 T0766 -972.666023 True
698 698 698 -17490.516533 138.794636 26.518857 -374.228818 -56.553755 -27.631732 -48.891709 -92.588012 -12.504663 -565.484310 0.465184 -1012.104322 1.40915 0.905063 T0766 -1012.569506 True
771 771 771 -17537.167695 132.780644 24.938543 -376.958192 -53.481557 -27.646844 -51.700067 -93.309125 -12.719321 -443.325854 0.517850 -900.903924 1.46685 0.908553 T0766 -901.421774 True
T0784 4418 4418 418 -18188.998927 181.910453 23.631139 -443.361765 -66.420464 -45.611108 -48.179328 -108.106543 -0.027142 -384.594658 3.031566 -887.727850 1.75655 0.861675 T0784 -890.759416 True
4426 4426 426 -18340.014479 179.491019 30.393790 -438.113931 -66.980525 -44.054194 -50.487615 -108.206247 -0.005735 -383.966277 2.918712 -879.011003 1.79444 0.856198 T0784 -881.929715 True
4437 4437 437 -18300.385795 186.116508 38.522451 -419.742656 -66.553980 -45.814804 -49.971007 -108.718759 -0.030533 -412.046502 2.888911 -875.350371 1.99202 0.863625 T0784 -878.239282 True
4464 4464 464 -18215.560827 173.797530 34.959398 -433.576657 -69.404207 -45.582299 -50.801103 -109.121912 -0.001761 -447.600269 2.697612 -944.633670 1.73140 0.859810 T0784 -947.331282 True
5206 5206 1206 -17229.795749 156.635971 26.468856 -470.507923 -62.197314 -43.717577 -48.638125 -106.679502 -0.036323 -530.816104 2.795494 -1076.692547 1.56155 0.839119 T0784 -1079.488041 True
T0792 6000 6000 0 -10722.090867 0.000000 0.000000 -322.603244 -12.292156 -3.793797 -29.870054 -67.271757 -10.340208 -161.125299 8.501405 -598.795110 3.78606 0.652529 T0792 -607.296515 True
6603 6603 603 -10494.787655 0.000000 0.000000 -310.209869 -12.013000 -3.855909 -26.874200 -68.964983 -12.171717 -149.516320 8.526598 -575.079401 6.52985 0.652774 T0792 -583.605999 True
6807 6807 807 -10526.486998 0.000000 0.000000 -285.674866 -12.583541 -4.684706 -26.682697 -67.927567 -7.072391 -125.071908 8.650184 -521.047491 3.77344 0.684037 T0792 -529.697675 True
7005 7005 1005 -10933.898349 0.000000 0.000000 -318.108106 -9.348588 -3.597018 -30.422177 -70.071184 -10.798472 -165.003199 6.594762 -600.753983 4.61486 0.642170 T0792 -607.348745 True
7608 7608 1608 -10483.371159 0.000000 0.000000 -310.550875 -12.222728 -4.014591 -28.343068 -69.086903 -12.808956 -153.112519 7.534226 -582.605414 5.69853 0.663933 T0792 -590.139640 True
T0803 8011 8011 11 -20150.122364 196.969052 39.688194 -400.608082 -14.475727 -12.716851 -45.240343 -111.741272 -20.313633 -480.947867 1.214709 -848.171820 6.92117 0.703184 T0803 -849.386529 True
8150 8150 150 -20755.376084 178.736440 40.743517 -416.377362 -16.817838 -13.509271 -50.665654 -111.756872 -25.282698 -454.709144 1.425733 -868.213150 6.26578 0.733336 T0803 -869.638883 True
8151 8151 151 -20703.421673 148.579686 33.980616 -427.281804 -14.676221 -13.162436 -47.610050 -112.017937 -23.709906 -501.902491 1.324196 -956.476346 6.39523 0.729766 T0803 -957.800542 True
8164 8164 164 -20766.225860 165.328464 42.379268 -429.637537 -14.629371 -12.229928 -47.715274 -111.185710 -24.094283 -430.275259 1.329070 -860.730561 6.10241 0.725550 T0803 -862.059631 True
8180 8180 180 -20933.535882 193.697896 34.969874 -416.429394 -15.701267 -12.505622 -44.169422 -109.641645 -24.621304 -477.095735 1.187569 -870.309050 6.03585 0.738008 T0803 -871.496619 True
T0815 10000 10000 0 -15635.174335 0.000000 0.000000 -494.048924 -48.611577 -20.980908 -44.869819 -92.643768 -18.156326 -134.028207 18.225475 -835.114054 3.42805 0.607091 T0815 -853.339529 True
10201 10201 201 -15628.174659 0.000000 0.000000 -495.483838 -46.669807 -20.935487 -41.287419 -92.897774 -14.951556 -142.035574 18.799878 -835.461577 3.42158 0.639353 T0815 -854.261455 True
10603 10603 603 -15826.858040 0.000000 0.000000 -478.167582 -40.519938 -22.216100 -40.884234 -92.897649 -17.414522 -140.766600 18.737975 -814.128651 3.10481 0.657762 T0815 -832.866626 True
11005 11005 1005 -16052.966800 0.000000 0.000000 -496.182245 -46.135790 -22.240168 -48.690365 -92.911658 -14.106110 -149.210595 18.850429 -850.626502 2.73917 0.670541 T0815 -869.476931 True
11608 11608 1608 -16093.073170 0.000000 0.000000 -493.028360 -44.874711 -23.333941 -43.717355 -93.031297 -15.421469 -139.902506 16.422745 -836.886894 3.41557 0.605696 T0815 -853.309639 True
T0833 12000 12000 0 -14347.027331 125.898222 23.989866 -439.957297 -73.720114 -37.943400 -33.171725 -92.846135 -0.224366 -124.238990 25.645643 -626.568294 5.49476 0.501330 T0833 -652.213937 True
12804 12804 804 -13823.640680 159.231026 23.597154 -450.461446 -80.361652 -36.281754 -19.493895 -90.543384 -0.186547 -122.303341 27.890135 -588.913702 9.37339 0.445332 T0833 -616.803837 True
13005 13005 1005 -15024.095932 149.112160 24.551428 -446.849817 -74.604115 -36.845177 -30.533836 -94.630107 0.370975 -123.415370 27.948081 -604.895778 8.32255 0.499559 T0833 -632.843859 True
13690 13690 1690 -14635.795571 182.370145 27.715998 -387.021542 -75.815393 -35.868208 -11.176493 -91.676046 0.453895 -109.820435 29.691564 -471.146516 7.80125 0.488574 T0833 -500.838080 True
13809 13809 1809 -13749.486003 136.957995 25.944084 -450.714461 -82.582081 -39.772933 -26.135391 -94.109941 0.293154 -128.107811 27.006862 -631.220523 8.13774 0.471890 T0833 -658.227385 True
T251 15050 15050 1050 -18772.633497 0.000000 0.000000 -415.567625 -10.847416 -11.161604 -49.211497 -94.729339 -10.269035 -166.307919 45.807160 -712.287274 3.37350 0.640714 T251 -758.094434 True
15057 15057 1057 -18550.424560 0.000000 0.000000 -411.455186 -12.128955 -11.307285 -46.455922 -93.045272 -15.166220 -168.421130 45.877736 -712.102234 3.38279 0.625698 T251 -757.979970 True
15063 15063 1063 -18801.931114 0.000000 0.000000 -413.303745 -12.894267 -11.619890 -46.775530 -93.961634 -14.050400 -174.168870 45.335953 -721.438384 3.91536 0.635235 T251 -766.774337 True
15113 15113 1113 -18815.917771 0.000000 0.000000 -403.550604 -11.650286 -11.121332 -44.827072 -91.826783 -12.397254 -173.331452 45.103440 -703.601342 4.06113 0.589503 T251 -748.704782 True
15147 15147 1147 -18800.856827 0.000000 0.000000 -393.652214 -10.124169 -11.561996 -47.286612 -94.766359 -12.870370 -169.229040 45.322943 -694.167817 3.90440 0.587603 T251 -739.490760 True

In [63]:
# Inspect the selected rows for target 1MBA from the "old" best-selection
# frame. NOTE(review): `best_old` is defined in an earlier (unseen) cell.
best_old.query("Name == '1MBA'")


Out[63]:
Step Qw Rw VTotal QGO Burial Water Rama Chain Chi DSSP P_AP Helix Frag_Mem GDT Name Good isGood VwithoutGo chosen
Name
1MBA 87 88 0.752795 -25692.055976 -864.423582 25.042131 -124.594161 -52.906979 -592.605726 188.990816 34.977292 -0.000000 -6.610606 -48.899861 -287.816488 78.7675 1MBA 1 True -889.465713 True
177 178 0.750668 -25813.483182 -946.915090 21.671959 -125.739936 -56.475336 -632.644194 182.904382 41.451079 -0.000016 -6.830135 -49.949306 -321.303585 78.9400 1MBA 1 True -968.587049 True
181 182 0.779434 -25916.345599 -882.478750 20.135328 -123.754336 -51.047253 -607.056528 202.963107 38.336534 -0.000000 -6.093471 -55.170120 -300.792013 80.1375 1MBA 1 True -902.614078 True
191 192 0.776781 -26311.813555 -949.413109 20.736743 -125.154787 -53.136842 -627.009584 171.621313 35.537802 -0.000000 -6.592100 -51.940474 -313.475180 78.7700 1MBA 1 True -970.149852 True
198 199 0.782962 -26440.114210 -919.799652 22.403743 -126.716933 -55.753767 -618.107374 181.551899 44.976356 -0.000000 -7.731975 -53.638502 -306.783099 79.6250 1MBA 1 True -942.203395 True

In [62]:
# Inspect the selected rows for target 1MBA from `best` (top-n by the
# current `label` column, computed in an earlier cell).
best.query("Name == '1MBA'")


Out[62]:
Unnamed: 0 index Rw Chain Chi Rama DSSP P_AP Water Burial Helix Frag_Mem QGO VTotal Rmsd Qw Name VwithoutGo chosen
Name
1MBA 3823 3823 1823 -26232.335774 189.596152 35.460661 -618.748898 -0.000503 -5.815691 -63.689887 -124.425383 -57.144515 -312.985211 15.114680 -942.638594 1.85442 0.789202 1MBA -957.753274 True
3841 3841 1841 -25826.216986 190.824793 42.227748 -609.542339 -0.001961 -5.439514 -59.998549 -123.361484 -56.063272 -312.955824 20.436977 -913.873423 1.88108 0.792109 1MBA -934.310400 True
3842 3842 1842 -26118.417369 192.759801 43.769817 -631.906384 -0.000833 -4.979912 -55.774995 -124.321893 -49.977464 -324.209426 17.400915 -937.240373 1.88540 0.784810 1MBA -954.641288 True
3848 3848 1848 -25219.863446 181.894497 42.536336 -605.696763 -0.000000 -4.787451 -55.609406 -124.642595 -51.204588 -299.959334 16.311303 -901.158001 1.89601 0.795368 1MBA -917.469304 True
3852 3852 1852 -25742.081124 188.596295 38.201283 -609.710042 -0.000004 -6.442622 -57.853857 -124.720334 -53.236209 -310.864647 19.522207 -916.507930 1.92628 0.785317 1MBA -936.030137 True

In [82]:
# Inspect the QGO-ranked selection for target 1MBA from the "old" run.
# NOTE(review): `top_qgo_old` comes from an earlier (unseen) cell.
top_qgo_old.query("Name == '1MBA'")


Out[82]:
Step Qw Rw VTotal QGO Burial Water Rama Chain Chi DSSP P_AP Helix Frag_Mem GDT Name Good isGood VwithoutGo chosen
Name
1MBA 0 1 0.638744 -25808.704023 -1020.093843 11.154204 -122.945417 -65.130893 -615.658035 166.869218 38.730319 -0.000000 -5.988391 -58.381041 -368.743807 65.2400 1MBA 0 False -1031.248047 True
107 108 0.725040 -24913.262398 -819.639401 14.983033 -125.982809 -56.240570 -589.917850 222.916933 55.422419 -0.000000 -5.200737 -47.471042 -288.148779 76.1975 1MBA 0 True -834.622434 True
108 109 0.744714 -25298.265363 -885.076730 15.608216 -124.227303 -57.805159 -606.186362 202.728519 36.668636 -0.000000 -7.191742 -47.646001 -297.025534 76.7125 1MBA 1 True -900.684946 True
109 110 0.720428 -24868.848781 -947.119845 15.175482 -124.242780 -51.339404 -616.821418 158.855236 30.323929 -0.000002 -4.705667 -51.621281 -302.743939 74.8275 1MBA 0 False -962.295327 True
110 111 0.732315 -25033.617103 -861.470382 16.484788 -125.418972 -55.773822 -580.602912 184.613350 45.787692 -0.000000 -4.225123 -46.424569 -295.910815 75.6875 1MBA 0 True -877.955170 True

In [83]:
# Inspect the QGO-ranked selection for target 1MBA (current run), for
# side-by-side comparison with the `top_qgo_old` cell above.
top_qgo.query("Name == '1MBA'")


Out[83]:
Unnamed: 0 index Rw Chain Chi Rama DSSP P_AP Water Burial Helix Frag_Mem QGO VTotal Rmsd Qw Name VwithoutGo chosen
Name
1MBA 2000 2000 0 -26473.201583 167.564260 30.265355 -649.046369 -0.000234 -7.145992 -62.636467 -124.256792 -58.545028 -369.651386 9.083653 -1064.368999 2.35034 0.671072 1MBA -1073.452652 True
2010 2010 10 -25268.085249 175.498256 46.713784 -603.109631 -0.000002 -5.496606 -52.694837 -125.160438 -46.670344 -297.265618 10.418770 -897.766665 2.38582 0.715542 1MBA -908.185435 True
3005 3005 1005 -25586.216265 166.137087 34.751798 -624.929558 -0.000000 -3.735631 -57.344730 -124.972776 -55.668879 -351.836631 11.091300 -1006.508022 2.46273 0.661575 1MBA -1017.599322 True
3206 3206 1206 -26641.596220 143.172577 33.238930 -643.767739 -0.000000 -6.162572 -66.052242 -122.966350 -60.798873 -387.787094 9.568414 -1101.554949 2.62836 0.646248 1MBA -1111.123363 True
3608 3608 1608 -25708.255549 150.437122 38.305480 -617.734785 -0.000000 -5.084803 -58.348555 -124.568879 -57.987746 -385.741798 9.961194 -1050.762769 2.75558 0.647979 1MBA -1060.723963 True

In [81]:
# Compare selection strategies per target: pick the top-n structures by Rw,
# by VTotal, by QGO, and by the learned prediction, then plot the Qw of the
# picked structures against the best achievable Qw ("best" = upper bound).
# NOTE(review): `all_data`, `n`, `choose_top`, `choose_top_rw`,
# `choose_top_vtotal`, and `best_by_prediction_new` are defined in earlier
# cells — run those first.
raw_test_data = all_data
chosen_by_rw = raw_test_data.groupby("Name").apply(choose_top_rw, n)
chosen_by_vtotal = raw_test_data.groupby("Name").apply(choose_top_vtotal, n)
chosen_by_qgo = raw_test_data.groupby("Name").apply(choose_top, n=n, col="QGO", ascending=True)
top_rw = chosen_by_rw.query("chosen==True")
top_vtotal = chosen_by_vtotal.query("chosen==True")
top_qgo = chosen_by_qgo.query("chosen==True")
label = "Qw"
# Upper bound: the n structures with the genuinely best Qw per target.
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
# One column per strategy, renamed so the melted "variable" column is legible.
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Awsem"})
d2 = best_by_prediction_new.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "prediction"})
# Kept for the optional QGo comparison (not plotted below).
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "QGo"})
# Long form for seaborn: one row per (Name, strategy, Qw value).
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
order = ["T251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
# Keyword x=/y=: positional data arguments were deprecated in seaborn 0.12
# and removed in later releases. errwidth=0 hides error bars (deprecated in
# seaborn 0.13 in favor of err_kws — left as-is for compatibility here).
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0, order=order)
plt.ylim([0.4, 1])


Out[81]:
(0.4, 1)

In [27]:
# Same comparison as the Qw cell, but scored by Rmsd (lower is better, hence
# ascending=True when picking the "best" upper bound).
# NOTE(review): `all_data`, `n`, `choose_top`, `choose_top_rw`,
# `choose_top_vtotal`, and `best_by_prediction_new` are defined in earlier
# cells — run those first.
raw_test_data = all_data
chosen_by_rw = raw_test_data.groupby("Name").apply(choose_top_rw, n)
chosen_by_vtotal = raw_test_data.groupby("Name").apply(choose_top_vtotal, n)
chosen_by_qgo = raw_test_data.groupby("Name").apply(choose_top, n=n, col="QGO", ascending=True)
top_rw = chosen_by_rw.query("chosen==True")
top_vtotal = chosen_by_vtotal.query("chosen==True")
top_qgo = chosen_by_qgo.query("chosen==True")
label = "Rmsd"
# Upper bound: the n structures with the smallest Rmsd per target.
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label, ascending=True).query("chosen==True")
# One column per strategy, renamed so the melted "variable" column is legible.
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Awsem"})
d2 = best_by_prediction_new.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "prediction"})
# Kept for the optional QGo comparison (not plotted below).
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "QGo"})
# Long form for seaborn: one row per (Name, strategy, Rmsd value).
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# Keyword x=/y=: positional data arguments were deprecated in seaborn 0.12
# and removed in later releases.
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a174e0828>

In [30]:
# Rank rows of `t` per target by the current `label` column (ascending) and
# keep only the chosen top-n. NOTE(review): `t`, `choose_top`, `n`, and
# `label` are defined in earlier (partly unseen) cells — confirm they are
# set before running this.
t.groupby("Name").apply(choose_top, n=n, col=label, ascending=True).query("chosen==True")


Out[30]:
Unnamed: 0 index Rw Chain Chi Rama DSSP P_AP Water Burial Helix Frag_Mem QGO VTotal Rmsd Qw Name VwithoutGo prediction chosen
Name
1MBA 3823 3823 1823 -26232.335774 189.596152 35.460661 -618.748898 -0.000503 -5.815691 -63.689887 -124.425383 -57.144515 -312.985211 15.114680 -942.638594 1.854420 0.789202 1MBA -957.753274 0.529091 True
3841 3841 1841 -25826.216986 190.824793 42.227748 -609.542339 -0.001961 -5.439514 -59.998549 -123.361484 -56.063272 -312.955824 20.436977 -913.873423 1.881080 0.792109 1MBA -934.310400 0.091867 True
3842 3842 1842 -26118.417369 192.759801 43.769817 -631.906384 -0.000833 -4.979912 -55.774995 -124.321893 -49.977464 -324.209426 17.400915 -937.240373 1.885400 0.784810 1MBA -954.641288 0.298374 True
3844 3844 1844 -25939.834499 168.108895 44.270209 -601.552514 -0.001759 -4.541572 -57.346743 -122.843221 -50.206250 -299.234478 21.880694 -901.466740 1.885060 0.774916 1MBA -923.347434 0.057426 True
3846 3846 1846 -25260.390589 174.441583 35.037820 -625.700863 -0.000000 -5.423509 -54.621542 -124.184832 -58.302731 -314.301074 23.288588 -949.766559 1.871290 0.779929 1MBA -973.055147 0.024514 True
T0766 11 11 11 -17610.786968 140.064496 31.921209 -388.034988 -52.074436 -26.167400 -50.865315 -94.782419 -9.608674 -523.713150 1.957866 -971.302810 0.873581 0.917747 T0766 -973.260676 0.011526 True
14 14 14 -17488.004578 134.156068 26.159178 -403.966887 -52.329765 -27.346378 -49.714602 -93.741849 -10.792107 -517.792581 2.342559 -993.026365 0.882986 0.913634 T0766 -995.368924 0.005352 True
145 145 145 -17799.913072 133.944871 29.038961 -380.237626 -53.011393 -28.277909 -52.516315 -94.192181 -10.891758 -502.943693 1.421061 -957.665981 0.872617 0.929375 T0766 -959.087042 0.035137 True
159 159 159 -17739.398137 157.518499 31.513654 -379.868497 -52.717965 -27.255639 -53.654383 -94.340106 -7.708489 -515.043846 1.778728 -939.778044 0.900032 0.920592 T0766 -941.556772 0.017342 True
170 170 170 -17518.655339 142.194580 28.061999 -377.495609 -49.859304 -27.342028 -51.352910 -93.176664 -9.360351 -527.690157 1.324962 -964.695481 0.884048 0.932704 T0766 -966.020443 0.028595 True
T0784 4656 4656 656 -18182.869756 159.902743 28.274408 -435.774413 -61.456801 -42.808967 -53.577431 -108.479900 -0.003781 -408.138693 4.260346 -917.802490 1.330070 0.884502 T0784 -922.062836 0.198515 True
4707 4707 707 -17889.838844 199.084705 30.293010 -431.198824 -65.887055 -40.098639 -54.846292 -108.428142 -0.203810 -349.954283 3.475569 -817.763761 1.283850 0.888834 T0784 -821.239330 0.259500 True
4709 4709 709 -18215.133112 167.827729 34.423890 -446.218362 -68.346664 -41.320087 -50.291123 -108.766787 -0.063701 -359.065750 4.238563 -867.582293 1.317810 0.883401 T0784 -871.820856 0.191205 True
4760 4760 760 -18205.063463 180.301390 23.506033 -419.195266 -63.561306 -43.487552 -54.634217 -107.951332 -0.281736 -368.217820 4.135191 -849.386616 1.338360 0.888948 T0784 -853.521807 0.202032 True
5800 5800 1800 -18496.515093 183.992209 38.487248 -412.417640 -65.228818 -45.683183 -51.128706 -108.928950 -0.020457 -368.391901 4.703346 -824.616851 1.274170 0.877852 T0784 -829.320197 0.148782 True
T0792 7273 7273 1273 -10946.237972 0.000000 0.000000 -277.440305 -12.510977 -5.159923 -25.957151 -67.165503 -12.702319 -139.292047 20.138238 -520.089987 2.733420 0.758389 T0792 -540.228225 0.030295 True
7275 7275 1275 -11039.342805 0.000000 0.000000 -281.348401 -12.184764 -5.260553 -25.364080 -68.276093 -8.055099 -138.974722 23.267440 -516.196272 2.993710 0.736405 T0792 -539.463712 0.019383 True
7294 7294 1294 -10655.995027 0.000000 0.000000 -281.779524 -10.868642 -5.763252 -27.179828 -66.660392 -10.118015 -140.506231 22.025493 -520.850392 2.915740 0.686550 T0792 -542.875885 0.016333 True
7402 7402 1402 -10793.834310 0.000000 0.000000 -289.005218 -11.707311 -5.861895 -25.073242 -68.344248 -10.161492 -137.847092 21.304998 -526.695500 2.978420 0.713520 T0792 -548.000498 0.021991 True
7404 7404 1404 -10731.612651 0.000000 0.000000 -279.489699 -12.243505 -4.783175 -27.329584 -67.795904 -8.848844 -134.020067 26.725313 -507.785465 2.926710 0.700301 T0792 -534.510778 0.007441 True
T0803 9488 9488 1488 -20248.138423 181.261899 32.738907 -411.161959 -18.968860 -9.882192 -43.677599 -110.518593 -22.960870 -480.671215 8.398721 -875.441759 2.903060 0.664417 T0803 -883.840480 0.004157 True
9491 9491 1491 -20173.824994 201.334146 39.663152 -424.863157 -17.683878 -11.397712 -43.538713 -111.920144 -21.973098 -443.625204 8.684388 -825.320220 2.843680 0.656174 T0803 -834.004608 0.002948 True
9505 9505 1505 -20313.107327 176.066419 34.247895 -416.110590 -19.892007 -11.142623 -44.579321 -110.612788 -24.181408 -424.601214 8.367115 -832.438521 2.866150 0.658986 T0803 -840.805636 0.003978 True
9506 9506 1506 -20193.141783 178.508596 38.742196 -406.458412 -19.731280 -10.303253 -43.320244 -111.332851 -21.133881 -412.887854 9.845216 -798.071767 2.913120 0.638475 T0803 -807.916983 0.001380 True
9602 9602 1602 -20073.057582 188.855923 36.821268 -418.910509 -16.911055 -11.132169 -39.894653 -110.047392 -20.865849 -418.753957 10.572217 -800.266176 2.918500 0.657242 T0803 -810.838393 0.000825 True
T0815 10380 10380 380 -15523.793293 0.000000 0.000000 -460.136182 -43.891758 -21.090699 -32.177199 -89.041626 -10.286187 -136.231108 29.742542 -763.112217 2.553950 0.709999 T0815 -792.854759 0.162347 True
10381 10381 381 -15428.884504 0.000000 0.000000 -447.825172 -39.710757 -20.148375 -32.568996 -91.233431 -9.769684 -135.741141 31.940381 -745.057175 2.647630 0.684078 T0815 -776.997556 0.072026 True
10382 10382 382 -15635.030044 0.000000 0.000000 -454.975916 -39.797028 -21.783476 -33.062519 -89.482069 -7.272423 -133.090801 29.843062 -749.621171 2.603220 0.702091 T0815 -779.464233 0.162012 True
10383 10383 383 -15386.517419 0.000000 0.000000 -465.022554 -43.591322 -19.648439 -30.000310 -90.259875 -12.923773 -132.784217 32.280300 -761.950191 2.708160 0.680084 T0815 -794.230491 0.067402 True
10384 10384 384 -15753.780147 0.000000 0.000000 -458.910038 -41.535669 -20.813055 -35.455198 -90.557475 -17.873540 -140.235768 28.706548 -776.674194 2.686100 0.722019 T0815 -805.380742 0.264030 True
T0833 12020 12020 20 -13979.647839 206.370839 36.745797 -388.401978 -70.439159 -33.294736 -14.692104 -88.417033 -0.568794 -111.114497 42.476839 -421.334825 5.140790 0.434854 T0833 -463.811664 0.000953 True
12029 12029 29 -14331.635646 170.128126 24.135122 -382.236260 -70.269009 -33.421297 -17.265699 -88.823521 0.387292 -113.103167 34.674839 -475.793573 5.135260 0.463896 T0833 -510.468412 0.091176 True
12030 12030 30 -14313.763858 157.004610 29.241317 -368.956732 -65.416930 -34.357066 -20.291633 -90.729168 -0.421964 -116.766939 33.854237 -476.840268 5.133230 0.458242 T0833 -510.694505 0.133951 True
12031 12031 31 -14143.341482 169.052100 32.689558 -372.406516 -68.912139 -31.037305 -21.026325 -89.769686 -0.272196 -110.525022 31.110441 -461.097090 5.049850 0.472168 T0833 -492.207531 0.361867 True
12032 12032 32 -13993.598308 179.180204 38.456738 -350.999532 -67.100023 -33.301093 -21.929949 -89.388754 0.487346 -116.547145 38.861173 -422.281037 5.156090 0.455082 T0833 -461.142210 0.006749 True
T251 15154 15154 1154 -18638.364805 0.000000 0.000000 -411.615522 -11.942189 -11.551939 -48.240281 -95.256909 -17.117084 -176.820363 49.634038 -722.910249 3.267290 0.600023 T251 -772.544287 0.483419 True
15157 15157 1157 -18591.106567 0.000000 0.000000 -417.711402 -11.239553 -10.102814 -45.744572 -94.843513 -14.497067 -177.833370 52.047598 -719.924694 3.197870 0.588059 T251 -771.972292 0.270425 True
15188 15188 1188 -18498.250299 0.000000 0.000000 -409.840950 -9.460469 -11.168136 -43.551496 -94.168460 -12.297477 -164.335790 50.091197 -694.731580 3.323480 0.610946 T251 -744.822777 0.386181 True
15194 15194 1194 -18678.389619 0.000000 0.000000 -405.999714 -9.416280 -9.824018 -42.832506 -94.795493 -14.462193 -169.902484 51.054782 -696.177907 3.280660 0.615689 T251 -747.232689 0.337143 True
15198 15198 1198 -18602.923360 0.000000 0.000000 -411.058168 -9.676173 -9.383589 -43.478690 -93.709073 -17.326168 -180.273499 49.894692 -715.010668 3.236590 0.622755 T251 -764.905360 0.444465 True

In [28]:
# Display the prediction-ranked selection computed in an earlier cell.
best_by_prediction


Out[28]:
index Rw Chain Chi Rama DSSP P_AP Water Burial Helix Frag_Mem QGO VTotal Rmsd Qw Name VwithoutGo prediction chosen
Name
1MBA 2000 0 -26473.201583 167.564260 30.265355 -649.046369 -0.000234 -7.145992 -62.636467 -124.256792 -58.545028 -369.651386 9.083653 -1064.368999 2.35034 0.671072 1MBA -1073.452652 0.949798 True
2402 402 -25524.115997 162.588117 29.871546 -637.486851 -0.000000 -6.117750 -59.092704 -122.056701 -53.978581 -374.526317 11.180865 -1049.618378 2.87010 0.598428 1MBA -1060.799243 0.805544 True
3005 1005 -25586.216265 166.137087 34.751798 -624.929558 -0.000000 -3.735631 -57.344730 -124.972776 -55.668879 -351.836631 11.091300 -1006.508022 2.46273 0.661575 1MBA -1017.599322 0.798692 True
3206 1206 -26641.596220 143.172577 33.238930 -643.767739 -0.000000 -6.162572 -66.052242 -122.966350 -60.798873 -387.787094 9.568414 -1101.554949 2.62836 0.646248 1MBA -1111.123363 0.951620 True
3608 1608 -25708.255549 150.437122 38.305480 -617.734785 -0.000000 -5.084803 -58.348555 -124.568879 -57.987746 -385.741798 9.961194 -1050.762769 2.75558 0.647979 1MBA -1060.723963 0.883126 True
T0766 311 311 -17704.811371 123.990918 22.735052 -380.398660 -55.805090 -26.587830 -53.205623 -94.569191 -11.074971 -617.663486 0.456534 -1092.122348 1.54003 0.914866 T0766 -1092.578882 0.177008 True
690 690 -17639.457992 148.283830 30.621936 -378.756833 -53.457257 -27.462251 -48.533265 -95.326469 -11.294877 -531.754000 0.311335 -967.367851 1.47015 0.895680 T0766 -967.679186 0.162040 True
694 694 -17452.458817 147.666186 21.177155 -370.783486 -52.938989 -28.206054 -52.597858 -91.350741 -10.367621 -535.264614 0.346713 -972.319310 1.40461 0.906777 T0766 -972.666023 0.124396 True
762 762 -17563.616919 149.511441 28.270570 -389.741613 -52.871474 -28.166277 -51.905238 -93.320169 -11.444771 -607.752169 0.566944 -1056.852756 1.28903 0.914586 T0766 -1057.419700 0.120021 True
1597 1597 -17647.474507 140.051354 25.654790 -370.026178 -53.598729 -27.379476 -52.964184 -93.993429 -10.296802 -604.672759 0.561102 -1046.664311 1.33343 0.913885 T0766 -1047.225413 0.131527 True
T0784 4425 425 -18531.462212 185.730963 34.946650 -450.223942 -64.001365 -43.157142 -53.390888 -109.582820 -0.010833 -461.257439 3.184231 -957.762586 1.69954 0.861257 T0784 -960.946817 0.589055 True
4426 426 -18340.014479 179.491019 30.393790 -438.113931 -66.980525 -44.054194 -50.487615 -108.206247 -0.005735 -383.966277 2.918712 -879.011003 1.79444 0.856198 T0784 -881.929715 0.566919 True
4436 436 -18549.235385 140.277430 24.474562 -430.603214 -68.431445 -46.133949 -53.663111 -108.832825 -0.009780 -360.186453 3.069609 -900.039177 1.92239 0.862397 T0784 -903.108786 0.595796 True
4437 437 -18300.385795 186.116508 38.522451 -419.742656 -66.553980 -45.814804 -49.971007 -108.718759 -0.030533 -412.046502 2.888911 -875.350371 1.99202 0.863625 T0784 -878.239282 0.561915 True
4464 464 -18215.560827 173.797530 34.959398 -433.576657 -69.404207 -45.582299 -50.801103 -109.121912 -0.001761 -447.600269 2.697612 -944.633670 1.73140 0.859810 T0784 -947.331282 0.627926 True
T0792 6000 0 -10722.090867 0.000000 0.000000 -322.603244 -12.292156 -3.793797 -29.870054 -67.271757 -10.340208 -161.125299 8.501405 -598.795110 3.78606 0.652529 T0792 -607.296515 0.216025 True
6201 201 -10802.793172 0.000000 0.000000 -311.411630 -12.679359 -4.738974 -30.576894 -69.909098 -14.183376 -152.699472 9.043662 -587.155142 5.47543 0.653273 T0792 -596.198804 0.204651 True
7005 1005 -10933.898349 0.000000 0.000000 -318.108106 -9.348588 -3.597018 -30.422177 -70.071184 -10.798472 -165.003199 6.594762 -600.753983 4.61486 0.642170 T0792 -607.348745 0.326640 True
7608 1608 -10483.371159 0.000000 0.000000 -310.550875 -12.222728 -4.014591 -28.343068 -69.086903 -12.808956 -153.112519 7.534226 -582.605414 5.69853 0.663933 T0792 -590.139640 0.187509 True
7809 1809 -11238.544141 0.000000 0.000000 -310.800775 -11.963667 -5.539519 -34.182560 -69.515103 -10.704194 -153.382452 11.314676 -584.773594 5.05561 0.615843 T0792 -596.088270 0.215628 True
T0803 8151 151 -20703.421673 148.579686 33.980616 -427.281804 -14.676221 -13.162436 -47.610050 -112.017937 -23.709906 -501.902491 1.324196 -956.476346 6.39523 0.729766 T0803 -957.800542 0.336168 True
8164 164 -20766.225860 165.328464 42.379268 -429.637537 -14.629371 -12.229928 -47.715274 -111.185710 -24.094283 -430.275259 1.329070 -860.730561 6.10241 0.725550 T0803 -862.059631 0.293669 True
8180 180 -20933.535882 193.697896 34.969874 -416.429394 -15.701267 -12.505622 -44.169422 -109.641645 -24.621304 -477.095735 1.187569 -870.309050 6.03585 0.738008 T0803 -871.496619 0.340815 True
8190 190 -20844.962110 179.110734 31.733673 -424.041076 -14.977842 -11.352433 -47.281069 -109.524348 -25.302347 -474.433750 1.459723 -894.608736 6.15701 0.734347 T0803 -896.068459 0.305125 True
8193 193 -20906.030632 173.850768 28.992856 -423.359886 -15.054973 -11.762510 -42.826267 -111.559722 -26.147188 -464.418699 1.455861 -890.829760 5.70414 0.732467 T0803 -892.285621 0.312087 True
T0815 10000 0 -15635.174335 0.000000 0.000000 -494.048924 -48.611577 -20.980908 -44.869819 -92.643768 -18.156326 -134.028207 18.225475 -835.114054 3.42805 0.607091 T0815 -853.339529 0.937762 True
10603 603 -15826.858040 0.000000 0.000000 -478.167582 -40.519938 -22.216100 -40.884234 -92.897649 -17.414522 -140.766600 18.737975 -814.128651 3.10481 0.657762 T0815 -832.866626 0.931107 True
11005 1005 -16052.966800 0.000000 0.000000 -496.182245 -46.135790 -22.240168 -48.690365 -92.911658 -14.106110 -149.210595 18.850429 -850.626502 2.73917 0.670541 T0815 -869.476931 0.949586 True
11206 1206 -15878.678416 0.000000 0.000000 -490.126164 -47.006207 -22.083533 -44.333210 -93.701689 -6.601143 -150.378775 19.254297 -834.976424 3.10791 0.647490 T0815 -854.230721 0.929100 True
11608 1608 -16093.073170 0.000000 0.000000 -493.028360 -44.874711 -23.333941 -43.717355 -93.031297 -15.421469 -139.902506 16.422745 -836.886894 3.41557 0.605696 T0815 -853.309639 0.976557 True
T0833 12000 0 -14347.027331 125.898222 23.989866 -439.957297 -73.720114 -37.943400 -33.171725 -92.846135 -0.224366 -124.238990 25.645643 -626.568294 5.49476 0.501330 T0833 -652.213937 0.956918 True
12402 402 -14743.328182 158.058162 24.461995 -444.935460 -78.334591 -37.783785 -30.407831 -94.725561 0.319723 -130.712649 30.173411 -603.886586 8.42128 0.476841 T0833 -634.059997 0.705643 True
12804 804 -13823.640680 159.231026 23.597154 -450.461446 -80.361652 -36.281754 -19.493895 -90.543384 -0.186547 -122.303341 27.890135 -588.913702 9.37339 0.445332 T0833 -616.803837 0.797835 True
13005 1005 -15024.095932 149.112160 24.551428 -446.849817 -74.604115 -36.845177 -30.533836 -94.630107 0.370975 -123.415370 27.948081 -604.895778 8.32255 0.499559 T0833 -632.843859 0.907696 True
13809 1809 -13749.486003 136.957995 25.944084 -450.714461 -82.582081 -39.772933 -26.135391 -94.109941 0.293154 -128.107811 27.006862 -631.220523 8.13774 0.471890 T0833 -658.227385 0.874346 True
T251 15050 1050 -18772.633497 0.000000 0.000000 -415.567625 -10.847416 -11.161604 -49.211497 -94.729339 -10.269035 -166.307919 45.807160 -712.287274 3.37350 0.640714 T251 -758.094434 0.798224 True
15063 1063 -18801.931114 0.000000 0.000000 -413.303745 -12.894267 -11.619890 -46.775530 -93.961634 -14.050400 -174.168870 45.335953 -721.438384 3.91536 0.635235 T251 -766.774337 0.833380 True
15078 1078 -18822.844179 0.000000 0.000000 -424.379341 -11.521283 -11.750080 -48.864631 -93.257680 -13.195063 -172.934384 46.224212 -729.678250 3.88014 0.640889 T251 -775.902462 0.792404 True
15113 1113 -18815.917771 0.000000 0.000000 -403.550604 -11.650286 -11.121332 -44.827072 -91.826783 -12.397254 -173.331452 45.103440 -703.601342 4.06113 0.589503 T251 -748.704782 0.835229 True
15147 1147 -18800.856827 0.000000 0.000000 -393.652214 -10.124169 -11.561996 -47.286612 -94.766359 -12.870370 -169.229040 45.322943 -694.167817 3.90440 0.587603 T251 -739.490760 0.815967 True

In [126]:
# Persist the classifier-selected structures for later comparison.
# NOTE(review): hardcoded absolute path — a configurable DATA_DIR would be safer.
best_by_prediction.to_csv("/Users/weilu/Research/data/structure_selector_mar03/best_by_prediction.csv")

In [171]:
# Load an earlier run's selections for comparison (rebinds the global `a`).
a = pd.read_csv("/Users/weilu/Research/data/structure_selector_mar03/old_best_by_prediction.csv")

In [168]:
# Load the current run's selections (rebinds `a`; only one of the two
# read_csv cells should be run before the inspection loop).
a = pd.read_csv("/Users/weilu/Research/data/structure_selector_mar03/best_by_prediction.csv")

In [165]:
# For each target protein, print the row indices the selector picked.
for name, group in a.groupby("Name"):
    print(name)
    print(*group["index"], sep="\n")


1MBA
0
108
109
185
188
T0766
2010
2274
2276
2277
2627
T0784
4677
4741
5482
5485
5622
T0792
6030
6207
6210
6212
6213
T0803
8040
8178
8201
8560
8636
T0815
10050
10454
10461
10463
11066
T0833
12060
12475
12478
13068
13071
T251
14042
15059
15102
15112
15870

In [ ]:


In [138]:
# T0784
# NOTE(review): this cell recurs throughout the notebook with only the metric
# and plot type varying — a parameterized helper would remove the duplication.
# Metric being compared across selection strategies.
label = "GDT"
# One column per strategy, renamed so melt() yields readable hue labels.
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# Long form: Name | variable (strategy) | value (metric), aligned by row position.
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
# errwidth=0 suppresses the error-bar lines.
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[138]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1828f7f0>

In [122]:
# T0784
# NOTE(review): duplicate of the previous GDT point-plot cell — candidate for removal.
label = "GDT"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1787b6d8>

In [114]:
# T0784
# Same comparison frame as the point-plot cells, shown as a box plot instead.
label = "GDT"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.boxplot("Name","value", data=final2, hue="variable")


Out[114]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0a5b15f8>

In [99]:
# T0784
# Qw variant of the repeated strategy-comparison plot cell.
label = "Qw"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a17653d68>

In [97]:
# T0784
# Older variant with "GDT" hardcoded instead of the `label` variable.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a16c612b0>

In [12]:
# T0784
# NOTE(review): duplicate of the hardcoded-GDT comparison cell above.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0b8457b8>

In [15]:
# T0784
# NOTE(review): another duplicate of the hardcoded-GDT comparison cell.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0b845240>

In [17]:
# T0784
# Qw variant of the repeated comparison cell (see NOTE on the first instance).
label = "Qw"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0b44f390>

In [ ]:
# T0784
# Unexecuted duplicate of the hardcoded-GDT comparison cell.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)

In [ ]:
# 1mba
# Unexecuted duplicate; only the header comment differs from the cell above.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)

In [ ]:
# t0792
# Unexecuted duplicate; only the header comment differs from the cells above.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)

In [ ]:
# Unexecuted duplicate of the hardcoded-GDT comparison cell (no target header).
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)

In [ ]:
# Box plot of whichever cell most recently defined `final2`.
sns.boxplot("Name","value", data=final2, hue="variable")

In [60]:
# Swarm plot of the comparison frame with enlarged markers for readability.
sns.swarmplot(x='Name', y='value', data=final2, hue="variable", size=10)


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0a2ba2e8>

In [51]:
# Same swarm plot with default marker size.
sns.swarmplot(x='Name', y='value', data=final2, hue="variable")


Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a143e9f98>

In [52]:
# Strip plot; jitter reduces point overlap within each category.
sns.stripplot("Name", "value", data=final2, hue="variable", jitter=True)


Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x103716eb8>

In [ ]:
# Horizontal variant of the strip plot (value on x, target name on y).
sns.stripplot("value", "Name", data=final2, hue="variable", jitter=True)

In [ ]:
# Merge-based variant of the comparison frame.
# NOTE(review): merging on the non-unique "Name" key yields a cross product of
# rows per target, unlike the positional concat used elsewhere — confirm intended.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
sns.pointplot("Name","value", data=final2, hue="variable")
# sns.stripplot("value", "Name", data=final2, hue="variable")

In [ ]:


In [ ]:
# NOTE(review): duplicate of the merge-based comparison cell above; the merge
# on the non-unique "Name" key produces a per-target cross product.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
sns.pointplot("Name","value", data=final2, hue="variable")
# sns.stripplot("value", "Name", data=final2, hue="variable")

In [ ]:
# Count training-set mispredictions (relies on `y_pred`/`train_y` from earlier
# cells; NOTE(review): may be stale on a fresh kernel).
np.sum(y_pred != train_y)

In [ ]:


In [ ]:
# Inspect the shape of the prediction array.
prediction.shape

In [ ]:
# Inspect the shape of the raw evaluation DataFrame.
raw_test_data.shape

In [ ]:
# Inspect the shape of the predicted-label array.
y_pred.shape

In [ ]:
# Indices of the n largest eval_y values, in descending order.
eval_y.argsort()[-n:][::-1]

In [ ]:
# Evaluate a regression model (`regr`, trained elsewhere) per target: flag its
# top-n predictions as positives and compare against the top-50 true frames.
n = 10
for name, data in raw_test_data.groupby("Name"):
    print(name)
    # NOTE(review): fit_transform refits the scaler/poly on each group; a
    # pipeline fitted on training data should use transform() here.
    X = full_pipeline.fit_transform(data)
    # Last column is the label appended by the pipeline's cat branch.
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= regr.predict(eval_set)
    # Mark the n highest-scoring frames as predicted positives.
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1

    # Ground truth: the 50 frames with the largest label values.
    position_of_top_n = eval_y.argsort()[-50:][::-1]
    threshold = eval_y[position_of_top_n][-1]
    test_y = np.zeros(len(eval_y),)
    test_y[position_of_top_n] = 1
    plt.figure()
    plt.scatter(test, eval_y)
    plt.show()

#     predict_y = (test > threshold)
#     print(threshold)
    print(confusion_matrix(test_y, predict_y))

In [ ]:
# Predicted vs. actual values on the training set.
plt.scatter(y_pred, train_y)

In [ ]:


In [ ]:
# Train three class-weighted classifiers plus a soft-voting ensemble, sanity
# check them on the training set, then evaluate the logistic model per target
# and dump its probabilities to CSV.
# NOTE(review): these imports duplicate the notebook's top import cell.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
# Class weights: p on the negative class, 1-p on the positive (p from config cell).
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})

# Soft voting averages predicted probabilities (needs probability=True on SVC).
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
log_clf.fit(train_set, train_y)
rnd_clf.fit(train_set, train_y)
svm_clf.fit(train_set, train_y)
voting_clf.fit(train_set, train_y)


# check on training set
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
#     y_pred = clf.predict(train_set)
    # Positive-class probability; the n most confident frames become positives.
    prob= clf.predict_proba(train_set)[:,1]
    position_of_top_n = prob.argsort()[-n:][::-1]
    threshold = prob[position_of_top_n][-1]
    predict_y = np.zeros(len(train_y),)
    predict_y[position_of_top_n] = 1
#     predict_y = (test > threshold)
#     print(threshold)
    cm = confusion_matrix(train_y, predict_y)
#     print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
    print(clf.__class__.__name__, "\n", cm)

time_stamp = f"{datetime.today().strftime('%d_%h_%H%M%S')}"
for name, data in raw_test_data.groupby("Name"):
    print(name)
    # NOTE(review): fit_transform refits the scaler on each evaluation group;
    # transform() with the train-fitted pipeline would avoid that.
    X = full_pipeline.fit_transform(data)
    # Last column is the isGood label appended by the pipeline's cat branch.
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= log_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1

    # NOTE(review): hardcoded absolute output path.
    with open(f"/Users/weilu/Research/data/structure_selector/p{p}_poly{PolynomialDegree}_{name}.csv", "w") as f:
        f.write("Result\n")
        for i in test:
            f.write(str(i) + "\n")
#     with open(f"/Users/weilu/Research/data/structure_selector/{name}_results_{time_stamp}.csv", "w") as f:
#         f.write("Result\n")
#         for i in test:
#             f.write(str(i) + "\n")

#     predict_y = (test > threshold)
#     print(threshold)
    print(confusion_matrix(eval_y, predict_y))
print(f"p{p}_poly{PolynomialDegree}")

In [ ]:
# Load evaluation data, drop duplicate frames, and label the top frames per
# target (ranked by GDT) as positives.
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_4.csv")
# Frames with identical (Qw, Rw, VTotal) are treated as duplicates.
raw_test_data_2 = raw_test_data.drop_duplicates(subset=['Qw', 'Rw', "VTotal"])
# isGood: rank frames by GDT (descending) within each target.
# NOTE(review): `< 50` keeps ranks 1-49, i.e. 49 frames per target, not 50.
raw_test_data_2 = raw_test_data_2.assign(isGood=raw_test_data_2.groupby("Name")["GDT"].rank(ascending=False, method='first') < 50)
raw_test_data = raw_test_data_2
raw_data_T0784 = raw_test_data.groupby("Name").get_group("T0784")
raw_data_T0792 = raw_test_data.groupby("Name").get_group("T0792")
# raw_data = pd.concat([raw_data_T0784, raw_data_T0792])
# Training pool: a single target's frames.
raw_data = raw_data_T0792

In [ ]:
# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
# FEATURES = list(raw_test_data.columns[2:-3])
# Energy-term features fed to the classifier.
FEATURES = ['Rw',
 'VTotal',
 'QGO',
 'Burial',
 'Water',
 'Rama',
 'DSSP',
 'P_AP',
 'Helix',
 'Frag_Mem']
# LABEL = "Qw"
LABEL = "isGood"
# Degree of the polynomial feature expansion.
PolynomialDegree = 2
# Weight of the negative class (positives get 1-p).
p = 0.1


num_attribs = FEATURES
cat_attribs = [LABEL]
# Frames per cycle for the (currently disabled) RemoveFirstFrame step.
frame = 201
# Numeric branch: select -> standardize -> polynomial expansion.
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
    ])
# Label branch: passes the label through so it rides along as the last column.
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
my_full_pipeline = Pipeline([
#         ('removeFirstFrame', RemoveFirstFrame(frame)),
        ('featureSelection', full_pipeline)
])

from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split preserving the isGood class balance.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
    strat_train_set = raw_data.iloc[train_index]
    strat_test_set = raw_data.iloc[test_index]
# strat_test_set[LABEL].value_counts() / len(strat_test_set)
# NOTE(review): fit_transform on the test split refits the scaler on test
# statistics; transform() with the train-fitted pipeline would avoid leakage.
X_train = my_full_pipeline.fit_transform(strat_train_set)
X_test = my_full_pipeline.fit_transform(strat_test_set)
# Last column is the label; the rest are features.
train_y = X_train[:,-1]
train_set = X_train[:,:-1]
test_y = X_test[:,-1]
test_set = X_test[:,:-1]



# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})

log_clf.fit(train_set, train_y)


# check on training set
n = 5

clf = log_clf
#     y_pred = clf.predict(train_set)
# Positive-class probability; top-n most confident frames become positives.
prob= clf.predict_proba(train_set)[:,1]
position_of_top_n = prob.argsort()[-n:][::-1]
threshold = prob[position_of_top_n][-1]
predict_y = np.zeros(len(train_y),)
predict_y[position_of_top_n] = 1
#     predict_y = (test > threshold)
#     print(threshold)
cm = confusion_matrix(train_y, predict_y)
#     print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
print(clf.__class__.__name__, "\n", cm)



# Per-target evaluation of the logistic model on the full test data.
for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= log_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1
    print(confusion_matrix(eval_y, predict_y))
print(f"p{p}_poly{PolynomialDegree}")

In [ ]:
# Attach the training probabilities to the stratified training frame.
# NOTE(review): strat_train_set is a slice of raw_data — this raises
# SettingWithCopyWarning; .copy() or .assign() would be cleaner.
strat_train_set["a"] = prob

In [ ]:
# Predicted probability ("a") vs. true GDT for the training frames.
strat_train_set.plot("a", "GDT", kind="scatter")

In [ ]:
# Inspect the raw probability array.
prob

In [ ]:
# NOTE(review): strat_train_set keeps raw_data's original index while the new
# Series is indexed 0..n-1, so this concat aligns on mismatched indices and
# produces NaN-padded rows — probably not what was intended.
pd.concat([strat_train_set, pd.Series(prob)], axis=1)

In [ ]:
def compute_with_my_score_function(p=0.9, PolynomialDegree=3):
    """Train a class-weighted logistic model and score its top-n precision.

    Builds the feature pipeline, fits on a stratified split of the global
    ``raw_data``, then accumulates the top-n precision measured on the
    ``raw_data_T0784`` validation frames.

    Args:
        p: Weight of the negative class (positives get 1-p).
        PolynomialDegree: Degree of the polynomial feature expansion.

    Returns:
        Tuple of (classifier_name, p, PolynomialDegree, product-of-precisions).

    Notes:
        Reads ``raw_data``, ``raw_data_T0784`` and ``raw_test_data`` from the
        enclosing notebook scope.
    """
    FEATURES = ['Rw',
     'VTotal',
     'QGO',
     'Burial',
     'Water',
     'Rama',
     'DSSP',
     'P_AP',
     'Helix',
     'Frag_Mem']
    # LABEL = "Qw"
    LABEL = "isGood"

    num_attribs = FEATURES
    cat_attribs = [LABEL]
    frame = 201
    # Numeric branch: select -> standardize -> polynomial expansion.
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
        ])
    # Label branch: passes the label through as the last column.
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs))
        ])

    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    my_full_pipeline = Pipeline([
    #         ('removeFirstFrame', RemoveFirstFrame(frame)),
            ('featureSelection', full_pipeline)
    ])

    # Stratified 80/20 split preserving the isGood class balance.
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
        strat_train_set = raw_data.iloc[train_index]
        strat_test_set = raw_data.iloc[test_index]
    # strat_test_set[LABEL].value_counts() / len(strat_test_set)
    X_train = my_full_pipeline.fit_transform(strat_train_set)
    X_test = my_full_pipeline.fit_transform(strat_test_set)
    # Last column is the label; the rest are features.
    train_y = X_train[:,-1]
    train_set = X_train[:,:-1]
    test_y = X_test[:,-1]
    test_set = X_test[:,:-1]

    # log_clf = LogisticRegression(random_state=142)
    # rnd_clf = RandomForestClassifier(random_state=432)
    # svm_clf = SVC(probability=True, random_state=412)
    log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
    log_clf.fit(train_set, train_y)

#     voting_clf.fit(train_set, train_y)
    n = 10
    cl_name = "lr"
    clf = log_clf
#     for cl_name, clf in ("voting", voting_clf):
    # Product and sum of per-group precisions.
    my_evaluation = 1.0
    another_evaluation = 0.0
    for name, data in raw_test_data.groupby("Name"):
#             print(name)
#         X = full_pipeline.fit_transform(data)
#         validation_data, test_data = train_test_split(X, test_size=0.6, random_state=124)
        # NOTE(review): `data` is never used — every iteration validates on
        # raw_data_T0784, so each group contributes the same precision value;
        # confirm this is intended.
        validation_data = my_full_pipeline.fit_transform(raw_data_T0784)
        validation_y = validation_data[:,-1]
        validation_set = validation_data[:,:-1]
        clf.fit(train_set, train_y)
        test= clf.predict_proba(validation_set)[:,1]
        # Mark the n most confident frames as predicted positives.
        position_of_top_n = test.argsort()[-n:][::-1]
        threshold = test[position_of_top_n][-1]
        predict_y = np.zeros(len(validation_y),)
        predict_y[position_of_top_n] = 1
    #     predict_y = (test > threshold)
#         print(threshold)
        cm = confusion_matrix(validation_y, predict_y)
#             print(cm)
        # precision = TP / (TP + FP) over the top-n picks.
        precision = cm[1][1] / (cm[1][1] + cm[0][1])
#             print(name,  " precision", precision,end = " ")
        if name != "T0766" and name != "T0833":
            my_evaluation *= precision
            another_evaluation += precision
#         print("")
    print("classifier:", cl_name, ", p:",p, ", degree", PolynomialDegree, ", score", my_evaluation, ", another score", another_evaluation)
    return (cl_name, p, PolynomialDegree, my_evaluation)

In [ ]:
def myGridSerach():
    """Grid search over class weight `p` and polynomial feature degree.

    Calls compute_with_my_score_function for every (p, degree) combination.
    (The misspelled name is kept for compatibility with existing call sites.)

    Returns:
        list: one (classifier_name, p, degree, score) tuple per combination.
    """
    p_list = [0.9, 0.8, 0.7, 0.5, 0.1]
    degree_list = [3, 2, 1]
    result = []
    for p in p_list:
        for degree in degree_list:
            result.append(compute_with_my_score_function(p, degree))
    # Bug fix: the original collected `result` but never returned it, so the
    # grid-search scores were silently discarded.
    return result

In [ ]:
# Run the full grid search (prints one score line per combination).
myGridSerach()

In [ ]:
# Single evaluation at p=0.1, polynomial degree 1.
compute_with_my_score_function(0.1, 1)

In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a fixed list of DataFrame columns and expose them as a ndarray."""

    def __init__(self, attribute_names):
        # Column names to keep, in output order.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn, returns self."""
        return self

    def transform(self, X):
        """Return the selected columns of X as a plain numpy array."""
        selected = X[self.attribute_names]
        return selected.values
class RemoveFirstFrame(BaseEstimator, TransformerMixin):
    """Drop rows where ``Step % frame == 1`` (the first frame of each cycle)."""

    def __init__(self, frame):
        # Cycle length used in the modulo filter.
        self.frame = frame

    def fit(self, X, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, X):
        # Bug fix: the original f-string interpolated the *global* name
        # `frame`, ignoring the configured self.frame (and raising NameError
        # when no such global existed).
        return X.query(f"Step % {self.frame} != 1")

In [ ]:
# Assemble the feature pipeline: scale + polynomial-expand the numeric
# features, then append the label column so X[:, -1] carries the target.
# NOTE(review): duplicates the pipeline built in the earlier combined cell.
num_attribs = FEATURES
cat_attribs = [LABEL]
# Frames per cycle for the (currently disabled) RemoveFirstFrame step.
frame = 201
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
    ])
# Label branch: passes the label through unchanged as the last column.
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
my_full_pipeline = Pipeline([
#         ('removeFirstFrame', RemoveFirstFrame(frame)),
        ('featureSelection', full_pipeline)
])

In [ ]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split preserving the isGood class balance.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
    strat_train_set = raw_data.iloc[train_index]
    strat_test_set = raw_data.iloc[test_index]
# strat_test_set[LABEL].value_counts() / len(strat_test_set)
# Last column of the transformed matrix is the label; the rest are features.
# NOTE(review): fit_transform on the test split refits the scaler on test
# statistics — transform() with the train-fitted pipeline would avoid leakage.
X_train = my_full_pipeline.fit_transform(strat_train_set)
X_test = my_full_pipeline.fit_transform(strat_test_set)
train_y = X_train[:,-1]
train_set = X_train[:,:-1]
test_y = X_test[:,-1]
test_set = X_test[:,:-1]

In [ ]:
# NOTE(review): these imports duplicate the notebook's top import cell.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
# Class-weighted variants: p on the negative class, 1-p on the positive.
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})

# Soft voting averages predicted probabilities (needs probability=True on SVC).
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
log_clf.fit(train_set, train_y)
rnd_clf.fit(train_set, train_y)
svm_clf.fit(train_set, train_y)
voting_clf.fit(train_set, train_y)

In [ ]:
# Sanity check on the training set: for each classifier, flag the n samples
# with the highest positive-class probability as positives and inspect the
# resulting confusion matrix.
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    scores = clf.predict_proba(train_set)[:, 1]
    top_n_idx = scores.argsort()[-n:][::-1]
    cutoff = scores[top_n_idx][-1]  # lowest score among the chosen top n
    predicted = np.zeros(len(train_y),)
    predicted[top_n_idx] = 1
    matrix = confusion_matrix(train_y, predicted)
    print(clf.__class__.__name__, "\n", matrix)

In [ ]:
time_stamp = f"{datetime.today().strftime('%d_%h_%H%M%S')}"

# Evaluate the logistic-regression model per structure ("Name" group) and
# export the raw positive-class scores for each group.
for name, data in raw_test_data.groupby("Name"):
    print(name)
    # BUG FIX: apply the *already fitted* pipeline. The original called
    # fit_transform here, which re-fit the StandardScaler on each evaluation
    # group — data leakage, and features scaled inconsistently with what the
    # classifiers were trained on.
    X = full_pipeline.transform(data)
    eval_y = X[:, -1]
    eval_set = X[:, :-1]

    # Positive-class probabilities; mark the top-n scoring frames as positives.
    test = log_clf.predict_proba(eval_set)[:, 1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]  # lowest score among the top n
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1

    # NOTE(review): hardcoded absolute path — consider a configurable
    # DATA_DIR in the setup cell so the notebook runs on other machines.
    with open(f"/Users/weilu/Research/data/structure_selector/p{p}_poly{PolynomialDegree}_{name}.csv", "w") as f:
        f.write("Result\n")
        for i in test:
            f.write(str(i) + "\n")

    print(confusion_matrix(eval_y, predict_y))

In [ ]:
time_stamp = f"{datetime.today().strftime('%d_%h_%H%M%S')}"

# Per-group evaluation of the logistic-regression model (no file export).
for name, data in raw_test_data.groupby("Name"):
    print(name)
    # BUG FIX: use transform with the pipeline fitted on the training data.
    # The original fit_transform re-fit the StandardScaler on each group,
    # leaking evaluation statistics and scaling features differently from
    # the training set.
    X = my_full_pipeline.transform(data)
    eval_y = X[:, -1]
    eval_set = X[:, :-1]

    # Flag the top-n frames by positive-class probability as positives.
    test = log_clf.predict_proba(eval_set)[:, 1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]  # lowest score among the top n
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1

    print(confusion_matrix(eval_y, predict_y))

In [ ]: