In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime
import seaborn as sns
%matplotlib inline

# Global matplotlib defaults used by every figure in this notebook.
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (10,6.180)    #golden ratio
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    Parameters
    ----------
    fig_id : str
        Base filename (without extension) for the saved image.
    tight_layout : bool, default True
        If True, call ``plt.tight_layout()`` before saving.
    """
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    # Create the target directory on first use so savefig does not fail
    # with FileNotFoundError when images/classification does not exist yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls a fixed list of columns out of a
    pandas DataFrame and hands them on as a plain NumPy array.

    Exists because Scikit-Learn pipelines (at the time of writing)
    do not consume DataFrames directly.
    """

    def __init__(self, attribute_names):
        # Column names to extract, in the order they should appear.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X):
        # Select the configured columns and strip the pandas wrapper.
        return X[self.attribute_names].values
class RemoveFirstFrame(BaseEstimator, TransformerMixin):
    """Drop rows whose ``Step`` value satisfies ``Step % frame == 1``
    (i.e. remove the first frame of each block of `frame` steps).
    """
    def __init__(self, frame):
        # Block length used in the Step-modulus filter.
        self.frame = frame
    def fit(self, X, y=None):
        # Stateless: nothing to fit.
        return self
    def transform(self, X):
        # BUG FIX: the original f-string referenced the bare name `frame`,
        # which is undefined at call time (NameError); it must read the
        # instance attribute set in __init__.
        return X.query(f"Step % {self.frame} != 1")
def choose_top_rw(data, n=5):
    """Return `data` with a boolean `chosen` column marking the n rows
    with the lowest Rw (ascending rank <= n)."""
    # Idiom fix: call .rank() on the Series directly instead of invoking
    # the unbound pd.DataFrame.rank with a Series argument.
    return data.assign(chosen=data.Rw.rank() <= n)
def choose_top_vtotal(data, n=5):
    """Return `data` with a boolean `chosen` column marking the n rows
    with the lowest VTotal (ascending rank <= n)."""
    # Idiom fix: Series.rank() instead of the unbound pd.DataFrame.rank.
    return data.assign(chosen=data.VTotal.rank() <= n)
def choose_top(data, col="Qw", n=5, ascending=False):
    """Return `data` with a boolean `chosen` column marking the top-n
    rows by column `col`.

    With ascending=False (default) the n largest values are chosen;
    with ascending=True the n smallest.
    """
    # Idiom fix: Series.rank() instead of the unbound pd.DataFrame.rank.
    return data.assign(chosen=data[col].rank(ascending=ascending) <= n)

In [3]:
# Load the scored decoy data (hardcoded absolute path — will only run on
# the original author's machine).
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_4.csv")
# Drop rows that duplicate the (Qw, Rw, VTotal) triple exactly.
raw_test_data_2 = raw_test_data.drop_duplicates(subset=['Qw', 'Rw', "VTotal"])
# Label the top-50 decoys per protein (ranked by Qw, descending) as good.
raw_test_data_2 = raw_test_data_2.assign(isGood=raw_test_data_2.groupby("Name")["Qw"].rank(ascending=False, method='first') < 51)
# VTotal with the QGO term removed.
# NOTE(review): the RHS uses the pre-dedup frame; pandas aligns on index so
# the result matches raw_test_data_2's rows — confirm this is intentional.
raw_test_data = raw_test_data_2.assign(VwithoutGo = raw_test_data.VTotal - raw_test_data.QGO)
# raw_test_data = raw_test_data_2

In [19]:
# Reload the combined dataset written out by the assembly cell (In[162]
# below) and rebuild the labels on it.
all_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_mar05.csv")
all_data = all_data.assign(VwithoutGo = all_data.VTotal - all_data.QGO)
raw_test_data_2 = all_data
# Here only the top-5 decoys per protein count as good (stricter than the
# top-50 labelling used in the cell above).
raw_test_data_2 = all_data.assign(isGood=raw_test_data_2.groupby("Name")["Qw"].rank(ascending=False, method='first') < 6)
raw_test_data = raw_test_data_2.assign(VwithoutGo = raw_test_data_2.VTotal - raw_test_data_2.QGO)

In [21]:
# Per-protein scatter of Qw vs Rw, colored by the isGood label.
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "Rw", "Qw")



In [22]:
# Per-protein scatter of Qw vs VwithoutGo, colored by the isGood label.
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "VwithoutGo", "Qw")



In [23]:
# Per-protein scatter of Qw vs QGO, colored by the isGood label.
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "QGO", "Qw")



In [24]:
# Feature columns fed to the classifier; the commented-out entries were
# tried and dropped.
FEATURES = ['Rw',
#      'VTotal',
     'QGO',
     'VwithoutGo',
#      'Burial',
#      'Water',
#      'Rama',
#      'DSSP',
#      'P_AP',
#      'Helix',
#      'Frag_Mem'
               ]
# Number of structures to select per protein throughout the notebook.
n = 5
def my_transform(data, label, degree, FEATURES=FEATURES):
    """Build the combined feature/label matrix for `data`.

    The leading columns are the FEATURES, standardized and expanded into
    polynomial terms of the given degree (no bias column); the final
    column is the `label` column, passed through untouched.
    """
    feature_pipeline = Pipeline([
        ('selector', DataFrameSelector(FEATURES)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
    ])
    # The label travels through its own single-step pipeline so that the
    # FeatureUnion appends it as the last column.
    label_pipeline = Pipeline([
        ('selector', DataFrameSelector([label])),
    ])

    combined = FeatureUnion(transformer_list=[
        ("num_pipeline", feature_pipeline),
        ("cat_pipeline", label_pipeline),
    ])
    return combined.fit_transform(data)

def my_transform_predict(data, degree, FEATURES=FEATURES):
    """Build the feature matrix only (no label column): standardized
    FEATURES expanded into polynomial terms of the given degree."""
    pipeline = Pipeline([
        ('selector', DataFrameSelector(FEATURES)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
    ])
    return pipeline.fit_transform(data)

In [25]:
# Pick the decoys of a single protein (T0784) as the training set;
# alternative choices are left commented out.
raw_data_T0784 = raw_test_data.groupby("Name").get_group("T0784")
raw_data_T0792 = raw_test_data.groupby("Name").get_group("T0792")
# raw_data = pd.concat([raw_data_T0784, raw_data_T0792])
# raw_data = raw_data_T0792
# raw_data = raw_test_data.groupby("Name").get_group("1mba")
raw_data = raw_data_T0784

In [26]:
# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
# FEATURES = list(raw_test_data.columns[2:-3])
def train_and_test(raw_data, label="Qw", degree=1, p=0.1):
    """Stratified train/test split followed by feature transformation.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Decoy data containing the FEATURES columns, `label`, and "isGood".
    label : str, default "Qw"
        Column used as the target (becomes the last column of the
        transformed array, then split off).
    degree : int, default 1
        Polynomial degree passed through to my_transform.
    p : float, default 0.1
        Fraction of rows held out as the test set.

    Returns
    -------
    tuple
        (train_set, train_y, test_set, test_y) as NumPy arrays.
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    # BUG FIX: `p` was accepted but ignored (test_size was hardcoded to
    # 0.1). The default value keeps the original behavior.
    split = StratifiedShuffleSplit(n_splits=1, test_size=p, random_state=142)
    # n_splits=1, so the loop runs exactly once; stratify on the label
    # so good/bad decoys appear in both subsets.
    for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
        strat_train_set = raw_data.iloc[train_index]
        strat_test_set = raw_data.iloc[test_index]
    # NOTE(review): my_transform re-fits StandardScaler on each subset, so
    # the test set is scaled with its own statistics rather than the
    # training statistics — confirm this is intended.
    X_train = my_transform(strat_train_set, label, degree)
    X_test = my_transform(strat_test_set, label, degree)
    # Last column of the transformed matrix is the label.
    train_y = X_train[:, -1]
    train_set = X_train[:, :-1]
    test_y = X_test[:, -1]
    test_set = X_test[:, :-1]
    return (train_set, train_y, test_set, test_y)

In [27]:
# Train a logistic-regression classifier on the isGood label, then score
# every decoy of every protein and keep the top-n by predicted probability.
label = "isGood"
degree = 1
p = 0.1
train_set, train_y, test_set, test_y = train_and_test(raw_data, label=label, degree=degree)
log_clf = LogisticRegression(random_state=140, penalty='l2')

# log_clf = LogisticRegression(random_state=14, class_weight={0:p, 1:(1-p)}, penalty='l1')
log_clf.fit(train_set, train_y)
y_pred = log_clf.predict(train_set)
# n = 100
prediction_list = []
for name, data in raw_test_data.groupby("Name"):
    print(name)
#     X = full_pipeline.fit_transform(data)
    # Transform each protein's decoys and score P(isGood).
    X = my_transform(data, label, degree)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= log_clf.predict_proba(eval_set)[:,1]
    one = data.assign(prediction=test)
    prediction_list.append(one)
#     prediction_list.append(pd.Series(test))
t = pd.concat(prediction_list)
# t = raw_test_data.assign(prediction=prediction.values)
# Top-n decoys per protein by classifier probability.
best_by_prediction = t.groupby("Name").apply(choose_top, n=n, col="prediction").query("chosen==True")


1MBA
T0766
T0784
T0792
T0803
T0815
T0833
T251

In [28]:
print(*(zip(FEATURES, log_clf.coef_[0])))


('Rw', -0.26172648454510883) ('QGO', -0.57871478923113406) ('VwithoutGo', -0.081520116588173139)

In [29]:
# Baseline selections to compare the classifier against: top-n per protein
# by Rw, by VTotal, and by QGO (lowest values chosen in each case).
chosen_by_rw = raw_test_data.groupby("Name").apply(choose_top_rw, n)
chosen_by_vtotal = raw_test_data.groupby("Name").apply(choose_top_vtotal, n)
chosen_by_qgo = raw_test_data.groupby("Name").apply(choose_top, n=n, col="QGO", ascending=True)
top_rw = chosen_by_rw.query("chosen==True")
top_vtotal = chosen_by_vtotal.query("chosen==True")
top_qgo = chosen_by_qgo.query("chosen==True")

In [31]:
# T0784
# T0784
# Compare the Rmsd of selections: true best (lowest Rmsd) vs the Rw,
# VTotal ("Awsem"), and classifier ("prediction") selections.
label = "Rmsd"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label, ascending=True).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# NOTE(review): concat relies on all frames having identical row order.
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0e22a710>

In [14]:
# T0784
# T0784
# Same comparison as the cell above but on Qw (higher is better, so the
# best selection uses the default descending rank).
label = "Qw"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a188b48d0>

In [15]:
# T0784
# T0784
# Duplicate of the previous Qw comparison, but including the QGo
# selection in the plotted comparison.
label = "Qw"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x107192860>

In [149]:
# T0784
# T0784
# Compare selections on GDT.
# NOTE(review): `best` is reused from a previous cell where it was ranked
# on a different column — it is NOT recomputed for GDT here. Also requires
# the frame to contain a "GDT" column, which test_data_mar05 does not
# appear to provide — confirm which dataset this cell was run against.
label = "GDT"
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[149]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a15ec9ac8>

In [146]:
best_by_prediction.to_csv("/Users/weilu/Research/data/structure_selector_mar03/old_best_by_prediction.csv")

In [11]:


In [162]:
# Assemble the combined dataset: for each protein, join the AWSEM energy
# log with the Rw, Rmsd and Qw score files, truncate to the first 2000
# frames, drop unused energy terms, and tag rows with the protein name.
protein_list = ["T0766", "1mba", "T0784", "T0792", "T0803", "T0815", "T0833", "T0251"]
name_list = ["Step" , "Chain" , "Shake" , "Chi" , "Rama", "Excluded", "DSSP", "P_AP", "Water" ,"Burial", "Helix", "AMH_Go", "Frag_Mem", "Vec_FM", "Membrane", "SSB" , "Electro.", "QGO" ,"VTotal"]
all_data_list = []
for protein in protein_list:
    # NOTE(review): sep="\s+" should be a raw string (r"\s+") to avoid an
    # invalid-escape warning on newer Python versions.
    awsem = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar03/{protein}_awsem.log", names=name_list)
    rw = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar03/{protein}_rw.txt", names=["i", "Rw"], sep="\s+")
    rmsd = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar03/{protein}_rmsd.txt", names=["i2", "Rmsd"], sep="\s+")
    qw = pd.read_table(f"/Users/weilu/Research/davinci/structure_selector_mar03/{protein}_qw.txt", names=["i3", "Qw"], sep="\s+")
    # Truncate all four tables to the same 2000 frames and realign indices
    # so the column-wise concat below lines up row-for-row.
    rw = rw[:2000].reset_index(drop=True)
    awsem = awsem[:2000].reset_index(drop=True)
    rmsd = rmsd[:2000].reset_index(drop=True)
    qw = qw[:2000].reset_index(drop=True)
    data = pd.concat([rw, awsem, rmsd, qw], axis=1)
    remove_columns = ['i', 'i2', 'i3', 'Step', "Shake", "Excluded", "AMH_Go", "Membrane", "Vec_FM", "SSB", "Electro."]
    # Remap names to match the labels used elsewhere in the notebook.
    if protein == "T0251":
        p = "T251"
    elif protein == "1mba":
        p = "1MBA"
    else:
        p = protein
    data = data.drop(remove_columns, axis=1).reset_index().assign(Name=p)
    all_data_list.append(data)
all_data = pd.concat(all_data_list).reset_index(drop=True)

In [ ]:


In [163]:
all_data.to_csv("/Users/weilu/Research/data/test_data/test_data_mar05.csv")

In [164]:
# Reload the assembled dataset and recompute the Go-free total energy.
all_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_mar05.csv")
all_data = all_data.assign(VwithoutGo = all_data.VTotal - all_data.QGO)

In [165]:
# Score the new dataset with the classifier trained in In[27] (no label
# column available here, so the features-only transform is used).
prediction_list = []
for name, data in all_data.groupby("Name"):
    print(name)
#     X = full_pipeline.fit_transform(data)
    X = my_transform_predict(data, degree=1)
    eval_set = X
    test= log_clf.predict_proba(eval_set)[:,1]
    one = data.assign(prediction=test)
    prediction_list.append(one)
#     prediction_list.append(pd.Series(test))
t = pd.concat(prediction_list)
# t = all_data.assign(prediction=prediction.values)
# Top-n per protein by predicted probability; drop the CSV index column.
best_by_prediction = t.groupby("Name").apply(choose_top, n=n, col="prediction").query("chosen==True").drop('Unnamed: 0',axis=1)


1MBA
T0766
T0784
T0792
T0803
T0815
T0833
T251

In [166]:
# Rebuild the baseline selections on the new dataset and redo the Qw
# comparison plot (duplicates the logic of In[29]/In[14] above).
raw_test_data = all_data
chosen_by_rw = raw_test_data.groupby("Name").apply(choose_top_rw, n)
chosen_by_vtotal = raw_test_data.groupby("Name").apply(choose_top_vtotal, n)
chosen_by_qgo = raw_test_data.groupby("Name").apply(choose_top, n=n, col="QGO", ascending=True)
top_rw = chosen_by_rw.query("chosen==True")
top_vtotal = chosen_by_vtotal.query("chosen==True")
top_qgo = chosen_by_qgo.query("chosen==True")
# T0784
label = "Qw"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[166]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a1169b0>

In [19]:
best_by_prediction


Out[19]:
index Rw Chain Chi Rama DSSP P_AP Water Burial Helix Frag_Mem QGO VTotal Rmsd Qw Name VwithoutGo prediction chosen
Name
1mba 3050 1050 -25668.934863 188.362784 47.568387 -623.269227 -0.002060 -6.122339 -51.366841 -123.306045 -55.697016 -329.652933 34.653521 -918.831767 4.41750 0.517421 1mba -953.485288 0.798224 True
3063 1063 -25642.538441 193.180635 40.801919 -603.733959 -0.002218 -6.071182 -51.821636 -123.479884 -43.882664 -319.205247 31.221287 -882.992950 4.49287 0.532833 1mba -914.214237 0.833380 True
3078 1078 -25818.058079 203.088673 39.071277 -616.459828 -0.005223 -6.639706 -54.863242 -123.371367 -49.869419 -318.902394 32.990088 -894.961143 4.36205 0.517205 1mba -927.951231 0.792404 True
3113 1113 -26194.903471 174.845709 51.534425 -620.731595 -0.000000 -5.303780 -53.916686 -123.361001 -58.463907 -341.886124 30.007701 -947.275257 4.33429 0.528589 1mba -977.282958 0.835229 True
3147 1147 -26212.316297 199.462767 36.102053 -614.154999 -0.000004 -5.832983 -59.233419 -123.518396 -53.213548 -331.911306 23.791139 -928.508696 3.53488 0.544209 1mba -952.299835 0.815967 True
T0251 14000 0 -19077.704744 0.000000 0.000000 -441.605335 -15.255526 -13.474660 -62.605609 -96.658133 -20.929080 -188.140148 50.247258 -788.421233 3.39870 0.552992 T0251 -838.668491 0.956918 True
14402 402 -18013.842801 0.000000 0.000000 -447.097493 -16.364849 -15.235617 -59.233454 -97.671568 -19.803157 -188.318154 57.156918 -786.567374 4.96755 0.487310 T0251 -843.724292 0.705643 True
14804 804 -18245.591771 0.000000 0.000000 -453.122289 -15.905990 -12.362693 -61.275689 -97.947330 -21.894612 -185.475895 61.281099 -786.703399 5.07265 0.472612 T0251 -847.984498 0.797835 True
15005 1005 -18000.827772 0.000000 0.000000 -427.470394 -7.582649 -13.119749 -61.779600 -95.737925 -15.172857 -185.459790 50.552695 -755.770268 3.85729 0.599933 T0251 -806.322963 0.907696 True
15809 1809 -18638.719085 0.000000 0.000000 -448.519637 -17.481876 -15.671105 -58.833573 -96.589674 -15.501984 -185.347017 63.856166 -774.088700 4.30655 0.443934 T0251 -837.944866 0.874346 True
T0766 0 0 -17213.185246 115.628799 21.795659 -401.238938 -50.514721 -26.249246 -47.507644 -94.343514 -11.386663 -650.792467 1.867385 -1142.741350 1.21386 0.893880 T0766 -1144.608735 0.949798 True
402 402 -17460.586267 142.255390 19.363798 -404.168852 -48.622003 -27.139873 -50.741840 -95.833920 -11.055376 -620.009443 0.817069 -1095.135050 1.42096 0.904559 T0766 -1095.952119 0.805544 True
1005 1005 -17513.935527 160.807747 18.046265 -409.940394 -50.866838 -25.586106 -48.405243 -94.998609 -12.884592 -656.539459 1.560692 -1118.806537 1.47072 0.884528 T0766 -1120.367229 0.798692 True
1206 1206 -17437.085465 129.149716 22.957345 -400.999639 -49.327841 -25.916715 -51.290571 -93.599708 -9.825087 -676.046765 1.428747 -1153.470519 1.03706 0.916326 T0766 -1154.899266 0.951620 True
1608 1608 -17193.556092 149.790571 24.967125 -404.096143 -53.767317 -26.944336 -49.565923 -93.966839 -12.442834 -628.651534 1.785025 -1092.892204 1.24034 0.894871 T0766 -1094.677229 0.883126 True
T0784 4311 311 -17790.388773 172.006238 42.301708 -423.169882 -53.688516 -42.866065 -52.207805 -108.338088 -0.016669 -357.302010 6.675683 -816.605406 2.45253 0.791893 T0784 -823.281089 0.177008 True
4690 690 -17779.884202 160.077993 34.527754 -429.454151 -69.401195 -41.405390 -53.996417 -108.620388 -0.007854 -375.487237 5.999863 -877.767022 2.02618 0.849591 T0784 -883.766885 0.162040 True
4694 694 -18115.408634 163.993974 27.565366 -429.295744 -62.495072 -41.869436 -54.215824 -109.287047 -0.136691 -383.106827 4.189229 -884.658071 1.47136 0.883480 T0784 -888.847300 0.124396 True
4762 762 -18261.349908 167.276551 29.235053 -433.333557 -63.107493 -43.180998 -53.032621 -106.291134 -0.168822 -360.562862 4.036132 -859.129751 1.41793 0.887978 T0784 -863.165883 0.120021 True
5597 1597 -17650.941892 181.175401 25.769981 -429.333138 -55.967482 -39.977174 -55.087960 -107.362945 -0.017062 -324.953519 10.309939 -795.443960 2.32883 0.778717 T0784 -805.753899 0.131527 True
T0792 6425 425 -11114.901279 0.000000 0.000000 -300.134036 -12.470584 -4.831806 -22.726316 -68.246065 -4.484484 -153.134180 19.288209 -546.739263 7.01416 0.673084 T0792 -566.027472 0.589055 True
6426 426 -10917.058304 0.000000 0.000000 -276.838360 -11.131518 -4.609209 -20.036211 -68.027265 -6.663339 -136.018791 17.503348 -505.821346 7.02225 0.668682 T0792 -523.324694 0.566919 True
6436 436 -10781.052824 0.000000 0.000000 -281.789608 -11.297005 -3.666596 -24.536640 -67.794657 -5.864976 -144.374105 21.283305 -518.040283 5.47535 0.669834 T0792 -539.323588 0.595796 True
6437 437 -10864.652686 0.000000 0.000000 -282.437955 -13.257966 -4.601048 -25.859225 -67.660238 -9.546479 -142.927229 14.146699 -532.143441 5.67942 0.660779 T0792 -546.290140 0.561915 True
6464 464 -10961.791460 0.000000 0.000000 -261.186635 -11.493735 -4.362560 -22.956799 -66.466707 -8.541653 -143.393317 19.436948 -498.964458 5.85229 0.659078 T0792 -518.401406 0.627926 True
T0803 8000 0 -19861.129034 160.995808 37.520467 -442.112608 -19.569684 -14.137854 -54.157202 -114.150717 -21.671180 -656.848769 1.824290 -1122.307448 6.68803 0.670458 T0803 -1124.131738 0.216025 True
8201 201 -19469.650769 173.224426 39.791078 -440.649099 -14.457358 -16.407512 -56.170295 -116.170425 -22.653950 -586.198437 4.400490 -1035.291082 7.32675 0.644108 T0803 -1039.691572 0.204651 True
9005 1005 -19226.886197 170.780230 30.153493 -438.148101 -16.147199 -11.494153 -48.835171 -115.693482 -21.173096 -574.051793 2.741175 -1021.868096 6.83928 0.609516 T0803 -1024.609271 0.326640 True
9608 1608 -19432.885521 161.641151 32.377136 -441.148114 -17.631531 -17.494679 -51.660735 -113.658802 -22.108639 -599.783424 1.830782 -1067.636856 5.89592 0.647433 T0803 -1069.467638 0.187509 True
9809 1809 -19464.480000 167.911743 42.869046 -444.462776 -15.556580 -13.950089 -49.674335 -114.616031 -20.559497 -566.708800 3.072288 -1011.675029 7.09471 0.618167 T0803 -1014.747317 0.215628 True
T0815 10151 151 -15474.568755 0.000000 0.000000 -430.626865 -48.826497 -20.809696 -29.483155 -89.869895 -11.834988 -140.410006 35.310886 -736.550216 3.23743 0.631509 T0815 -771.861102 0.336168 True
10164 164 -15329.162447 0.000000 0.000000 -457.809116 -36.914114 -18.747275 -28.221452 -85.754506 -13.415093 -145.545002 39.588280 -746.818279 4.00120 0.569437 T0815 -786.406559 0.293669 True
10180 180 -15410.540183 0.000000 0.000000 -438.085216 -42.225680 -20.638568 -32.508001 -91.201691 -11.749104 -132.841664 33.387503 -735.862423 3.46268 0.632121 T0815 -769.249926 0.340815 True
10190 190 -15614.262927 0.000000 0.000000 -463.517624 -43.762311 -19.316092 -34.191246 -89.616186 -9.052628 -139.576523 33.067140 -765.965471 3.36234 0.635324 T0815 -799.032611 0.305125 True
10193 193 -15276.982093 0.000000 0.000000 -453.499118 -44.294816 -17.527847 -29.721187 -91.683871 -7.954028 -138.872867 40.456669 -743.097066 3.40225 0.601252 T0815 -783.553735 0.312087 True
T0833 12000 0 -14347.027331 125.898222 23.989866 -439.957297 -73.720114 -37.943400 -33.171725 -92.846135 -0.224366 -124.238990 25.645643 -626.568294 5.49476 0.501330 T0833 -652.213937 0.937762 True
12603 603 -14701.547826 138.917304 32.995803 -443.299762 -79.511003 -38.170243 -27.914074 -93.765783 0.124468 -125.626753 30.788565 -605.461478 7.74010 0.476920 T0833 -636.250043 0.931107 True
13005 1005 -15024.095932 149.112160 24.551428 -446.849817 -74.604115 -36.845177 -30.533836 -94.630107 0.370975 -123.415370 27.948081 -604.895778 8.32255 0.499559 T0833 -632.843859 0.949586 True
13206 1206 -13642.710159 143.728681 30.403366 -457.602172 -75.679105 -39.949399 -21.676088 -94.171657 0.436941 -129.780119 30.649766 -613.639785 10.94590 0.439767 T0833 -644.289551 0.929100 True
13608 1608 -15234.470919 161.067720 25.613174 -453.118256 -80.961181 -39.700876 -33.697349 -94.848560 -0.639133 -130.593643 34.276312 -612.601792 7.98275 0.463533 T0833 -646.878104 0.976557 True

In [32]:
best_by_prediction.to_csv("/Users/weilu/Research/data/structure_selector_mar03/best_by_prediction_based_on_new.csv")

In [171]:
a = pd.read_csv("/Users/weilu/Research/data/structure_selector_mar03/old_best_by_prediction.csv")

In [168]:
a = pd.read_csv("/Users/weilu/Research/data/structure_selector_mar03/best_by_prediction.csv")

In [165]:
# Print the selected frame indices grouped by protein (for downstream
# extraction of the corresponding structures).
for name, data in a.groupby("Name"):
    print(name)
#     print(data["index"])
    for i in data["index"]:
        print(i)


1MBA
0
108
109
185
188
T0766
2010
2274
2276
2277
2627
T0784
4677
4741
5482
5485
5622
T0792
6030
6207
6210
6212
6213
T0803
8040
8178
8201
8560
8636
T0815
10050
10454
10461
10463
11066
T0833
12060
12475
12478
13068
13071
T251
14042
15059
15102
15112
15870

In [ ]:


In [138]:
# T0784
# GDT comparison plot.
# NOTE(review): `best_by_GDT` is not defined anywhere in this notebook —
# presumably left over from an earlier kernel session; this cell will fail
# on a fresh Restart & Run All.
label = "GDT"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[138]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1828f7f0>

In [122]:
# T0784
# Exact duplicate of the GDT comparison cell above (relies on the
# undefined `best_by_GDT`; fails on a fresh kernel).
label = "GDT"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1787b6d8>

In [114]:
# T0784
# Same GDT comparison data, rendered as a boxplot instead of a pointplot
# (relies on the undefined `best_by_GDT`; fails on a fresh kernel).
label = "GDT"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.boxplot("Name","value", data=final2, hue="variable")


Out[114]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0a5b15f8>

In [99]:
# T0784
# Qw variant of the comparison cell (still anchored on the undefined
# `best_by_GDT` for the "best" column; fails on a fresh kernel).
label = "Qw"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a17653d68>

In [97]:
# T0784
# GDT comparison with the column name written literally instead of via
# `label` (duplicate logic; relies on the undefined `best_by_GDT`).
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a16c612b0>

In [12]:
# T0784
# Exact duplicate of the literal-"GDT" comparison cell above.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0b8457b8>

In [15]:
# T0784
# Exact duplicate of the literal-"GDT" comparison cell above.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0b845240>

In [17]:
# T0784
# Duplicate of the Qw comparison cell (In[99]); relies on the undefined
# `best_by_GDT`; fails on a fresh kernel.
label = "Qw"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0b44f390>

In [ ]:
# T0784
# Per-target GDT comparison: best structure vs. top picks by Rw energy,
# AWSEM total energy, and the classifier prediction.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# Fix: keyword x/y (positional deprecated in seaborn).
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)

In [ ]:
# 1mba
# Per-target GDT comparison: best structure vs. top picks by Rw energy,
# AWSEM total energy, and the classifier prediction.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# Fix: keyword x/y (positional deprecated in seaborn).
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)

In [ ]:
# t0792
# Per-target GDT comparison: best structure vs. top picks by Rw energy,
# AWSEM total energy, and the classifier prediction.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# Fix: keyword x/y (positional deprecated in seaborn).
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)

In [ ]:
# Per-target GDT comparison: best structure vs. top picks by Rw energy,
# AWSEM total energy, and the classifier prediction.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# Fix: keyword x/y (positional deprecated in seaborn).
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)

In [ ]:
# Distribution of scores per target, grouped by pick strategy.
# Fix: keyword x/y (positional deprecated in seaborn).
sns.boxplot(x="Name", y="value", data=final2, hue="variable")

In [60]:
# Swarm plot of the same comparison; size=10 enlarges the markers.
sns.swarmplot(x='Name', y='value', data=final2, hue="variable", size=10)


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a0a2ba2e8>

In [51]:
# Same swarm plot with default marker size.
sns.swarmplot(x='Name', y='value', data=final2, hue="variable")


Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a143e9f98>

In [52]:
# Jittered strip plot of score per target, colored by pick strategy.
# Fix: keyword x/y (positional deprecated in seaborn).
sns.stripplot(x="Name", y="value", data=final2, hue="variable", jitter=True)


Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x103716eb8>

In [ ]:
# Horizontal variant: score on x, target name on y.
# Fix: keyword x/y (positional deprecated in seaborn).
sns.stripplot(x="value", y="Name", data=final2, hue="variable", jitter=True)

In [ ]:
# Merge-based variant of the GDT comparison (joins on Name instead of
# positional concat).
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# Fix: keyword x/y (positional deprecated in seaborn).
sns.pointplot(x="Name", y="value", data=final2, hue="variable")

In [ ]:


In [ ]:
# Merge-based variant of the GDT comparison (duplicate of the cell above).
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# Fix: keyword x/y (positional deprecated in seaborn).
sns.pointplot(x="Name", y="value", data=final2, hue="variable")

In [ ]:
# Number of training-set structures the model mislabels.
(y_pred != train_y).sum()

In [ ]:


In [ ]:
# Inspect the prediction array's dimensions.
prediction.shape

In [ ]:
# Size of the evaluation frame (rows, columns).
raw_test_data.shape

In [ ]:
# Sanity-check the predicted-label array length.
y_pred.shape

In [ ]:
# Indices of the n largest eval_y values, best first.
eval_y.argsort()[-n:][::-1]

In [ ]:
# Per-target evaluation of the regressor `regr` (fitted elsewhere):
# treat its n highest-scoring picks as positives and compare against the
# top-50 structures by true score.
n = 10
for name, data in raw_test_data.groupby("Name"):
    print(name)
    # NOTE(review): fit_transform re-fits the pipeline (scaler included)
    # on each evaluation group — confirm transform() with the training
    # fit was not intended.
    X = full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= regr.predict(eval_set)
    # Predicted positives: the n highest regression scores.
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1

    # Ground truth: the 50 highest true scores (reuses the same local
    # names for a different purpose).
    position_of_top_n = eval_y.argsort()[-50:][::-1]
    threshold = eval_y[position_of_top_n][-1]
    test_y = np.zeros(len(eval_y),)
    test_y[position_of_top_n] = 1
    plt.figure()
    plt.scatter(test, eval_y)
    plt.show()

#     predict_y = (test > threshold)
#     print(threshold)
    print(confusion_matrix(test_y, predict_y))

In [ ]:
# Predicted vs. actual values on the training set.
plt.scatter(y_pred, train_y)

In [ ]:


In [ ]:
# Imports for the ensemble experiment (duplicated from the top of the
# notebook; kept so the cell can be re-run standalone).
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
# Class weights: p down-weights the negative (majority) class; p and the
# training arrays come from the config/split cells elsewhere in the notebook.
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})

# Soft voting averages the predicted probabilities of the three models.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
log_clf.fit(train_set, train_y)
rnd_clf.fit(train_set, train_y)
svm_clf.fit(train_set, train_y)
voting_clf.fit(train_set, train_y)


# check on training set
# For each classifier, label its n most probable structures as positives
# and print the training-set confusion matrix.
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
#     y_pred = clf.predict(train_set)
    prob= clf.predict_proba(train_set)[:,1]
    position_of_top_n = prob.argsort()[-n:][::-1]
    threshold = prob[position_of_top_n][-1]
    predict_y = np.zeros(len(train_y),)
    predict_y[position_of_top_n] = 1
#     predict_y = (test > threshold)
#     print(threshold)
    cm = confusion_matrix(train_y, predict_y)
#     print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
    print(clf.__class__.__name__, "\n", cm)

# Per-target evaluation: dump the logistic-regression probabilities to CSV
# and print the confusion matrix of its top-n picks.
time_stamp = f"{datetime.today().strftime('%d_%h_%H%M%S')}"
for name, data in raw_test_data.groupby("Name"):
    print(name)
    # NOTE(review): fit_transform re-fits the scaler/poly expansion on each
    # evaluation group instead of reusing the training fit — looks like
    # train/test leakage; confirm whether transform() was intended.
    X = full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= log_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1

    # Hardcoded absolute path — breaks on any other machine; consider a
    # configurable data directory.
    with open(f"/Users/weilu/Research/data/structure_selector/p{p}_poly{PolynomialDegree}_{name}.csv", "w") as f:
        f.write("Result\n")
        for i in test:
            f.write(str(i) + "\n")
#     with open(f"/Users/weilu/Research/data/structure_selector/{name}_results_{time_stamp}.csv", "w") as f:
#         f.write("Result\n")
#         for i in test:
#             f.write(str(i) + "\n")

#     predict_y = (test > threshold)
#     print(threshold)
    print(confusion_matrix(eval_y, predict_y))
print(f"p{p}_poly{PolynomialDegree}")

In [ ]:
# Load the evaluation data and derive the binary label.
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_4.csv")  # hardcoded absolute path
# Drop frames that duplicate the same Qw/Rw/VTotal triple.
raw_test_data_2 = raw_test_data.drop_duplicates(subset=['Qw', 'Rw', "VTotal"])
# isGood = True for structures ranked in the top by GDT within each target.
# NOTE(review): rank < 50 with method='first' keeps ranks 1..49, i.e. 49
# structures — confirm whether "top 50" was intended.
raw_test_data_2 = raw_test_data_2.assign(isGood=raw_test_data_2.groupby("Name")["GDT"].rank(ascending=False, method='first') < 50)
raw_test_data = raw_test_data_2
raw_data_T0784 = raw_test_data.groupby("Name").get_group("T0784")
raw_data_T0792 = raw_test_data.groupby("Name").get_group("T0792")
# raw_data = pd.concat([raw_data_T0784, raw_data_T0792])
# Train on the T0792 target only.
raw_data = raw_data_T0792

In [ ]:
# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
# FEATURES = list(raw_test_data.columns[2:-3])
# Energy-term columns used as model inputs.
FEATURES = ['Rw',
 'VTotal',
 'QGO',
 'Burial',
 'Water',
 'Rama',
 'DSSP',
 'P_AP',
 'Helix',
 'Frag_Mem']
# LABEL = "Qw"
LABEL = "isGood"
PolynomialDegree = 2  # degree of the polynomial feature expansion
p = 0.1  # class weight of the negative class (positives get 1 - p)


num_attribs = FEATURES
cat_attribs = [LABEL]
frame = 201  # trajectory window length (RemoveFirstFrame step disabled below)
# Numeric branch: select features, standardize, polynomial-expand.
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
    ])
# Label branch: pass the label through so it rides along as the last column.
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
my_full_pipeline = Pipeline([
#         ('removeFirstFrame', RemoveFirstFrame(frame)),
        ('featureSelection', full_pipeline)
])

from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split on the label.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
    strat_train_set = raw_data.iloc[train_index]
    strat_test_set = raw_data.iloc[test_index]
# strat_test_set[LABEL].value_counts() / len(strat_test_set)
X_train = my_full_pipeline.fit_transform(strat_train_set)
# NOTE(review): fit_transform re-fits the scaler on the test split —
# probably should reuse the training fit via transform(); confirm.
X_test = my_full_pipeline.fit_transform(strat_test_set)
# Last column is the label; everything before it is the feature matrix.
train_y = X_train[:,-1]
train_set = X_train[:,:-1]
test_y = X_test[:,-1]
test_set = X_test[:,:-1]



# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})

log_clf.fit(train_set, train_y)


# check on training set
# Mark the n most probable structures as positives and inspect the
# training-set confusion matrix.
n = 5

clf = log_clf
#     y_pred = clf.predict(train_set)
prob= clf.predict_proba(train_set)[:,1]
position_of_top_n = prob.argsort()[-n:][::-1]
threshold = prob[position_of_top_n][-1]
predict_y = np.zeros(len(train_y),)
predict_y[position_of_top_n] = 1
#     predict_y = (test > threshold)
#     print(threshold)
cm = confusion_matrix(train_y, predict_y)
#     print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
print(clf.__class__.__name__, "\n", cm)



# Per-target evaluation with the fitted classifier.
for name, data in raw_test_data.groupby("Name"):
    print(name)
    # NOTE(review): fit_transform re-fits per group — confirm transform()
    # with the training fit was not intended.
    X = full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= log_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1
    print(confusion_matrix(eval_y, predict_y))
print(f"p{p}_poly{PolynomialDegree}")

In [ ]:
# Attach the training-set probabilities as column "a".
# Fix: .assign returns a new frame instead of writing through a slice of
# raw_data (the original triggered pandas' SettingWithCopyWarning).
strat_train_set = strat_train_set.assign(a=prob)

In [ ]:
# Predicted probability ("a") vs. GDT for the training structures.
strat_train_set.plot("a", "GDT", kind="scatter")

In [ ]:
# Peek at the raw probability array.
prob

In [ ]:
# View the training frame side-by-side with the predicted probabilities.
# Fix: give the Series the same index as strat_train_set — the original
# used the default 0..n-1 RangeIndex, so concat(axis=1) aligned on
# mismatched labels and padded both sides with NaN rows.
pd.concat([strat_train_set, pd.Series(prob, index=strat_train_set.index, name="prob")], axis=1)

In [ ]:
def compute_with_my_score_function(p=0.9, PolynomialDegree=3):
    """Train a class-weighted logistic regression and score it by precision.

    Builds the scale + polynomial-expansion pipeline, fits on a stratified
    80/20 split of the notebook-global `raw_data`, then scores the model by
    the product (and sum) of per-target precisions of its top-n picks.

    Parameters: p -- class weight of the negative class (positives get
    1 - p); PolynomialDegree -- degree of the polynomial expansion.
    Returns a tuple (classifier_name, p, PolynomialDegree, score), where
    score is the product of per-target precisions.
    Depends on notebook globals: raw_data, raw_test_data, raw_data_T0784,
    DataFrameSelector and the sklearn imports.
    """
    FEATURES = ['Rw',
     'VTotal',
     'QGO',
     'Burial',
     'Water',
     'Rama',
     'DSSP',
     'P_AP',
     'Helix',
     'Frag_Mem']
    # LABEL = "Qw"
    LABEL = "isGood"

    num_attribs = FEATURES
    cat_attribs = [LABEL]
    frame = 201  # unused here (RemoveFirstFrame step is disabled below)
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
        ])
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs))
        ])

    # Transformed matrix layout: [scaled polynomial features..., label].
    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    my_full_pipeline = Pipeline([
    #         ('removeFirstFrame', RemoveFirstFrame(frame)),
            ('featureSelection', full_pipeline)
    ])

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
        strat_train_set = raw_data.iloc[train_index]
        strat_test_set = raw_data.iloc[test_index]
    # strat_test_set[LABEL].value_counts() / len(strat_test_set)
    X_train = my_full_pipeline.fit_transform(strat_train_set)
    # NOTE(review): fit_transform re-fits the scaler on the test split.
    X_test = my_full_pipeline.fit_transform(strat_test_set)
    train_y = X_train[:,-1]
    train_set = X_train[:,:-1]
    test_y = X_test[:,-1]
    test_set = X_test[:,:-1]

    # log_clf = LogisticRegression(random_state=142)
    # rnd_clf = RandomForestClassifier(random_state=432)
    # svm_clf = SVC(probability=True, random_state=412)
    log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
    log_clf.fit(train_set, train_y)

#     voting_clf.fit(train_set, train_y)
    n = 10
    cl_name = "lr"
    clf = log_clf
#     for cl_name, clf in ("voting", voting_clf):
    my_evaluation = 1.0
    another_evaluation = 0.0
    for name, data in raw_test_data.groupby("Name"):
#             print(name)
#         X = full_pipeline.fit_transform(data)
#         validation_data, test_data = train_test_split(X, test_size=0.6, random_state=124)
        # NOTE(review): the loop iterates over every target but always
        # validates on raw_data_T0784 — `data` is unused. Confirm intent.
        validation_data = my_full_pipeline.fit_transform(raw_data_T0784)
        validation_y = validation_data[:,-1]
        validation_set = validation_data[:,:-1]
        clf.fit(train_set, train_y)
        test= clf.predict_proba(validation_set)[:,1]
        position_of_top_n = test.argsort()[-n:][::-1]
        threshold = test[position_of_top_n][-1]
        predict_y = np.zeros(len(validation_y),)
        predict_y[position_of_top_n] = 1
    #     predict_y = (test > threshold)
#         print(threshold)
        cm = confusion_matrix(validation_y, predict_y)
#             print(cm)
        # precision = TP / (TP + FP); the denominator is always n because
        # exactly n structures are predicted positive.
        precision = cm[1][1] / (cm[1][1] + cm[0][1])
#             print(name,  " precision", precision,end = " ")
        if name != "T0766" and name != "T0833":
            my_evaluation *= precision
            another_evaluation += precision
#         print("")
    print("classifier:", cl_name, ", p:",p, ", degree", PolynomialDegree, ", score", my_evaluation, ", another score", another_evaluation)
    return (cl_name, p, PolynomialDegree, my_evaluation)

In [ ]:
def myGridSerach():
    """Grid-search over the class weight p and the polynomial degree.

    Keeps the original (misspelled) name because other cells call
    myGridSerach(). Returns the list of
    (classifier_name, p, degree, score) tuples produced by
    compute_with_my_score_function.
    """
    p_list = [0.9, 0.8, 0.7, 0.5, 0.1]
    degree_list = [3, 2, 1]
    result = []
    for p in p_list:
        for degree in degree_list:
            result += [compute_with_my_score_function(p, degree)]
    # Bug fix: the original built `result` but never returned it, so every
    # score was computed and then silently discarded.
    return result

In [ ]:
# Run the small grid search (prints one line per (p, degree) combination).
myGridSerach()

In [ ]:
# Single evaluation at p=0.1, degree=1.
compute_with_my_score_function(0.1, 1)

In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a fixed list of DataFrame columns and return them as a
    plain NumPy array, so downstream sklearn steps see arrays."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless: nothing to learn from the data.
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        return selected.values
class RemoveFirstFrame(BaseEstimator, TransformerMixin):
    """Drop rows whose Step is the first frame of each window, i.e. rows
    where Step % frame == 1.

    Parameters: frame -- window length used in the modulo test.
    """
    def __init__(self, frame):
        self.frame = frame
    def fit(self, X, y=None):
        # Stateless transformer; nothing to fit.
        return self
    def transform(self, X):
        # Bug fix: the original interpolated the *global* name `frame`
        # into the query string instead of the instance attribute, so the
        # transformer silently depended on module-level state.
        return X.query(f"Step % {self.frame} != 1")

In [ ]:
# Build the preprocessing pipeline: the numeric branch scales the energy
# features and expands them with polynomial terms; the categorical branch
# appends the label so the transformed matrix is [features..., label].
num_attribs = FEATURES
cat_attribs = [LABEL]
frame = 201  # trajectory window length (RemoveFirstFrame step disabled below)
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
my_full_pipeline = Pipeline([
#         ('removeFirstFrame', RemoveFirstFrame(frame)),
        ('featureSelection', full_pipeline)
])

In [ ]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split on the isGood label.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
    strat_train_set = raw_data.iloc[train_index]
    strat_test_set = raw_data.iloc[test_index]
# strat_test_set[LABEL].value_counts() / len(strat_test_set)
X_train = my_full_pipeline.fit_transform(strat_train_set)
# NOTE(review): fit_transform re-fits the scaler on the test split —
# likely should reuse the training fit via transform(); confirm intent.
X_test = my_full_pipeline.fit_transform(strat_test_set)
# Last column is the label; everything before it is the feature matrix.
train_y = X_train[:,-1]
train_set = X_train[:,:-1]
test_y = X_test[:,-1]
test_set = X_test[:,:-1]

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
# Class weights: p down-weights the negative class; p is a notebook global.
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})

# Soft voting averages the three models' predicted probabilities.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
log_clf.fit(train_set, train_y)
rnd_clf.fit(train_set, train_y)
svm_clf.fit(train_set, train_y)
voting_clf.fit(train_set, train_y)

In [ ]:
# check on training set
# For each fitted classifier, label its n most probable structures as
# positives and print the training-set confusion matrix.
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
#     y_pred = clf.predict(train_set)
    prob= clf.predict_proba(train_set)[:,1]
    position_of_top_n = prob.argsort()[-n:][::-1]
    threshold = prob[position_of_top_n][-1]
    predict_y = np.zeros(len(train_y),)
    predict_y[position_of_top_n] = 1
#     predict_y = (test > threshold)
#     print(threshold)
    cm = confusion_matrix(train_y, predict_y)
#     print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
    print(clf.__class__.__name__, "\n", cm)

In [ ]:
# Per-target evaluation: dump the classifier's probabilities to CSV and
# print the confusion matrix of its top-n picks.
time_stamp = f"{datetime.today().strftime('%d_%h_%H%M%S')}"
for name, data in raw_test_data.groupby("Name"):
    print(name)
    # NOTE(review): fit_transform re-fits the pipeline on each group —
    # confirm transform() with the training fit was not intended.
    X = full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= log_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1

    # Hardcoded absolute path — breaks on other machines.
    with open(f"/Users/weilu/Research/data/structure_selector/p{p}_poly{PolynomialDegree}_{name}.csv", "w") as f:
        f.write("Result\n")
        for i in test:
            f.write(str(i) + "\n")
#     with open(f"/Users/weilu/Research/data/structure_selector/{name}_results_{time_stamp}.csv", "w") as f:
#         f.write("Result\n")
#         for i in test:
#             f.write(str(i) + "\n")

#     predict_y = (test > threshold)
#     print(threshold)
    print(confusion_matrix(eval_y, predict_y))

In [ ]:
# Same per-target evaluation but through my_full_pipeline (which would
# also drop first frames if that step were enabled).
time_stamp = f"{datetime.today().strftime('%d_%h_%H%M%S')}"
for name, data in raw_test_data.groupby("Name"):
    print(name)
    # NOTE(review): fit_transform re-fits the pipeline per group —
    # confirm transform() with the training fit was not intended.
    X = my_full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= log_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1


#     predict_y = (test > threshold)
#     print(threshold)
    print(confusion_matrix(eval_y, predict_y))

In [ ]: