In [10]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime

%matplotlib inline

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
class RemoveFirstFrame(BaseEstimator, TransformerMixin):
    def __init__(self, frame):
        self.frame = frame
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.query(f"Step % {frame} != 1")

In [23]:
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_4.csv")
raw_test_data_2 = raw_test_data.drop_duplicates(subset=['Qw', 'Rw', "VTotal"])
raw_test_data_2 = raw_test_data_2.assign(isGood=raw_test_data_2.groupby("Name")["GDT"].rank(ascending=False, method='first') < 50)
raw_test_data = raw_test_data_2
raw_data_T0784 = raw_test_data.groupby("Name").get_group("T0784")
raw_data_T0792 = raw_test_data.groupby("Name").get_group("T0792")
# raw_data = pd.concat([raw_data_T0784, raw_data_T0792])
raw_data = raw_data_T0792


# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
# FEATURES = list(raw_test_data.columns[2:-3])
FEATURES = ['Rw',
 'VTotal',
 'QGO',
 'Burial',
 'Water',
 'Rama',
 'DSSP',
 'P_AP',
 'Helix',
 'Frag_Mem']
# LABEL = "Qw"
LABEL = "isGood"
PolynomialDegree = 1
p = 0.1

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
class RemoveFirstFrame(BaseEstimator, TransformerMixin):
    def __init__(self, frame):
        self.frame = frame
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.query(f"Step % {frame} != 1")
    
num_attribs = FEATURES
cat_attribs = [LABEL]
frame = 201
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
my_full_pipeline = Pipeline([
#         ('removeFirstFrame', RemoveFirstFrame(frame)),
        ('featureSelection', full_pipeline)
])
    
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
    strat_train_set = raw_data.iloc[train_index]
    strat_test_set = raw_data.iloc[test_index]
# strat_test_set[LABEL].value_counts() / len(strat_test_set)
X_train = my_full_pipeline.fit_transform(strat_train_set)
X_test = my_full_pipeline.fit_transform(strat_test_set)
train_y = X_train[:,-1]
train_set = X_train[:,:-1]
test_y = X_test[:,-1]
test_set = X_test[:,:-1]


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
log_clf.fit(train_set, train_y)
rnd_clf.fit(train_set, train_y)
svm_clf.fit(train_set, train_y)
voting_clf.fit(train_set, train_y)


# check on training set
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
#     y_pred = clf.predict(train_set)
    prob= clf.predict_proba(train_set)[:,1]
    position_of_top_n = prob.argsort()[-n:][::-1]
    threshold = prob[position_of_top_n][-1]
    predict_y = np.zeros(len(train_y),)
    predict_y[position_of_top_n] = 1
#     predict_y = (test > threshold)
#     print(threshold)
    cm = confusion_matrix(train_y, predict_y)
#     print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
    print(clf.__class__.__name__, "\n", cm)
    
time_stamp = f"{datetime.today().strftime('%d_%h_%H%M%S')}"
for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= log_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1

    with open(f"/Users/weilu/Research/data/structure_selector/p{p}_poly{PolynomialDegree}_{name}.csv", "w") as f:
        f.write("Result\n")
        for i in test:
            f.write(str(i) + "\n")
#     with open(f"/Users/weilu/Research/data/structure_selector/{name}_results_{time_stamp}.csv", "w") as f:
#         f.write("Result\n")
#         for i in test:
#             f.write(str(i) + "\n")

#     predict_y = (test > threshold)
#     print(threshold)
    print(confusion_matrix(eval_y, predict_y))
print(f"p{p}_poly{PolynomialDegree}")


LogisticRegression 
 [[1555    6]
 [  35    4]]
RandomForestClassifier 
 [[1561    0]
 [  29   10]]
SVC 
 [[1557    4]
 [  33    6]]
VotingClassifier 
 [[1561    0]
 [  29   10]]
1MBA
[[1944    8]
 [  47    2]]
T0766
[[1946    6]
 [  45    4]]
T0784
[[1942   10]
 [  49    0]]
T0792
[[1948    4]
 [  43    6]]
T0803
[[1944    8]
 [  47    2]]
T0815
[[1946    6]
 [  45    4]]
T0833
[[1915    9]
 [  48    1]]
T251
[[1943    9]
 [  48    1]]
p0.1_poly1

In [12]:
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_4.csv")
raw_test_data_2 = raw_test_data.drop_duplicates(subset=['Qw', 'Rw', "VTotal"])
raw_test_data_2 = raw_test_data_2.assign(isGood=raw_test_data_2.groupby("Name")["GDT"].rank(ascending=False, method='first') < 50)
raw_test_data = raw_test_data_2
raw_data_T0784 = raw_test_data.groupby("Name").get_group("T0784")
raw_data_T0792 = raw_test_data.groupby("Name").get_group("T0792")
# raw_data = pd.concat([raw_data_T0784, raw_data_T0792])
raw_data = raw_data_T0792

In [26]:
# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
# FEATURES = list(raw_test_data.columns[2:-3])
FEATURES = ['Rw',
 'VTotal',
 'QGO',
 'Burial',
 'Water',
 'Rama',
 'DSSP',
 'P_AP',
 'Helix',
 'Frag_Mem']
# LABEL = "Qw"
LABEL = "isGood"
PolynomialDegree = 2
p = 0.1


num_attribs = FEATURES
cat_attribs = [LABEL]
frame = 201
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
my_full_pipeline = Pipeline([
#         ('removeFirstFrame', RemoveFirstFrame(frame)),
        ('featureSelection', full_pipeline)
])
    
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
    strat_train_set = raw_data.iloc[train_index]
    strat_test_set = raw_data.iloc[test_index]
# strat_test_set[LABEL].value_counts() / len(strat_test_set)
X_train = my_full_pipeline.fit_transform(strat_train_set)
X_test = my_full_pipeline.fit_transform(strat_test_set)
train_y = X_train[:,-1]
train_set = X_train[:,:-1]
test_y = X_test[:,-1]
test_set = X_test[:,:-1]



# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})

log_clf.fit(train_set, train_y)


# check on training set
n = 5

clf = log_clf
#     y_pred = clf.predict(train_set)
prob= clf.predict_proba(train_set)[:,1]
position_of_top_n = prob.argsort()[-n:][::-1]
threshold = prob[position_of_top_n][-1]
predict_y = np.zeros(len(train_y),)
predict_y[position_of_top_n] = 1
#     predict_y = (test > threshold)
#     print(threshold)
cm = confusion_matrix(train_y, predict_y)
#     print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
print(clf.__class__.__name__, "\n", cm)



for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= log_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1
    print(confusion_matrix(eval_y, predict_y))
print(f"p{p}_poly{PolynomialDegree}")


LogisticRegression 
 [[1559    2]
 [  36    3]]
1MBA
[[1947    5]
 [  49    0]]
T0766
[[1947    5]
 [  49    0]]
T0784
[[1947    5]
 [  49    0]]
T0792
[[1949    3]
 [  47    2]]
T0803
[[1947    5]
 [  49    0]]
T0815
[[1949    3]
 [  47    2]]
T0833
[[1919    5]
 [  49    0]]
T251
[[1947    5]
 [  49    0]]
p0.1_poly2

In [46]:
strat_train_set["a"] = prob


/Users/weilu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

In [49]:
strat_train_set.plot("a", "GDT", kind="scatter")


Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x107a0e978>

In [45]:
prob


Out[45]:
array([  1.16491821e-11,   3.72178125e-02,   1.06612780e-01, ...,
         4.89392979e-03,   2.67241498e-05,   2.70575860e-03])

In [43]:
pd.concat([strat_train_set, pd.Series(prob)], axis=1)


Out[43]:
Step Qw Rw VTotal QGO Burial Water Rama Chain Chi DSSP P_AP Helix Frag_Mem GDT Name Good isGood 0
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.164918e-11
1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.721781e-02
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.066128e-01
3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.563204e-04
4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.127779e-03
5 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.576430e-02
6 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.272314e-03
7 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.276116e-02
8 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.763054e-03
9 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.304295e-03
10 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.630703e-01
11 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.785011e-06
12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.076388e-03
13 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.518153e-02
14 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.209818e-02
15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.666894e-02
16 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.515937e-04
17 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.976814e-10
18 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.768104e-03
19 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.341261e-05
20 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.703344e-02
21 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.648333e-03
22 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.073082e-01
23 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.661211e-01
24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.989559e-02
25 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.778049e-04
26 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.146910e-07
27 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.057560e-03
28 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.648288e-02
29 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.149428e-05
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8006 1977.0 0.673644 -10593.311770 -512.049764 20.236916 -67.812683 -22.184711 -278.062233 0.0 0.0 -13.095452 -5.979070 -8.054282 -137.098250 75.0000 T0792 0.0 False NaN
8008 1979.0 0.658984 -10910.168623 -516.043438 20.959818 -67.384450 -27.034083 -283.032187 0.0 0.0 -10.337310 -5.914448 -14.333326 -128.967451 71.8750 T0792 0.0 False NaN
8009 1980.0 0.669392 -10808.838737 -510.685731 20.174154 -69.985929 -26.759350 -273.270986 0.0 0.0 -11.131982 -5.751354 -12.168164 -131.792120 70.0000 T0792 0.0 False NaN
8011 1982.0 0.650550 -10710.248358 -503.013859 18.053729 -66.983437 -25.137358 -270.321328 0.0 0.0 -12.017909 -5.530218 -12.006409 -129.070929 66.2500 T0792 0.0 False NaN
8012 1983.0 0.685214 -10691.855763 -522.016160 20.236039 -68.275471 -28.011384 -286.631050 0.0 0.0 -12.535751 -6.087053 -14.097049 -126.614441 73.4375 T0792 0.0 False NaN
8013 1984.0 0.665166 -10574.798646 -502.934013 18.294475 -66.177617 -31.415967 -269.281665 0.0 0.0 -11.208359 -5.915740 -11.716979 -125.512160 70.6250 T0792 0.0 False NaN
8014 1985.0 0.648859 -10533.233444 -498.100380 19.709472 -67.679027 -26.093886 -271.551445 0.0 0.0 -11.095048 -5.723215 -9.105124 -126.562106 69.0625 T0792 0.0 False NaN
8015 1986.0 0.651357 -10524.377624 -510.714068 20.912523 -66.966413 -29.927654 -276.947931 0.0 0.0 -11.065816 -6.259854 -10.832408 -129.626516 69.6875 T0792 0.0 False NaN
8016 1987.0 0.667974 -10613.356188 -494.239741 24.187234 -67.990841 -29.886000 -264.840312 0.0 0.0 -9.332347 -5.533874 -9.704129 -131.139472 70.6250 T0792 0.0 False NaN
8017 1988.0 0.647765 -10474.683872 -509.541456 21.639569 -69.090910 -29.046156 -289.634132 0.0 0.0 -12.229456 -5.435881 -7.054904 -118.689587 69.3750 T0792 0.0 False NaN
8018 1989.0 0.681786 -10831.538327 -518.332556 19.926551 -68.044240 -26.796200 -285.541167 0.0 0.0 -12.556425 -5.233047 -13.082628 -127.005400 72.1875 T0792 0.0 False NaN
8019 1990.0 0.679430 -10576.888976 -522.583097 19.914406 -67.297044 -28.390955 -289.612025 0.0 0.0 -13.107322 -4.728231 -8.109065 -131.252860 71.8750 T0792 0.0 False NaN
8020 1991.0 0.655303 -10564.326296 -500.781869 20.554229 -68.479685 -24.964039 -278.836109 0.0 0.0 -12.044167 -5.667629 -5.691850 -125.652619 67.5000 T0792 0.0 False NaN
8021 1992.0 0.631370 -10769.714400 -505.466108 23.076395 -68.322989 -27.142742 -275.575743 0.0 0.0 -11.578795 -5.285562 -8.554865 -132.081807 65.9375 T0792 0.0 False NaN
8023 1994.0 0.684918 -10713.725818 -530.500090 19.317672 -68.405937 -25.224987 -290.255622 0.0 0.0 -11.814930 -5.638768 -14.371399 -134.106119 71.8750 T0792 0.0 False NaN
8024 1995.0 0.661095 -10625.157107 -511.080552 18.092228 -67.910231 -26.201063 -289.611421 0.0 0.0 -10.894607 -6.046963 -9.221339 -119.287156 69.0625 T0792 0.0 False NaN
8025 1996.0 0.638725 -10497.850915 -511.121695 20.572087 -68.852224 -23.291928 -279.488099 0.0 0.0 -12.650985 -5.664810 -12.633876 -129.111861 65.6250 T0792 0.0 False NaN
8026 1997.0 0.653354 -10777.222800 -506.663878 20.733806 -67.652987 -28.592670 -274.635791 0.0 0.0 -12.564561 -6.075437 -11.513044 -126.363193 67.5000 T0792 0.0 False NaN
8027 1998.0 0.657097 -10392.082129 -520.223713 19.196379 -68.784738 -29.643841 -290.777248 0.0 0.0 -13.088264 -5.129537 -9.064074 -122.932390 68.1250 T0792 0.0 False NaN
8028 1999.0 0.649594 -10693.705663 -529.215899 22.072205 -66.900385 -28.680765 -293.429028 0.0 0.0 -11.401104 -5.521903 -12.282255 -133.072663 69.0625 T0792 0.0 False NaN
8029 2000.0 0.677115 -10707.594672 -533.335040 16.337619 -66.771902 -26.052065 -290.927237 0.0 0.0 -12.369208 -5.456457 -14.734835 -133.360956 72.1875 T0792 0.0 False NaN
8030 2001.0 0.679864 -10785.792271 -530.228951 18.697403 -67.118936 -26.184378 -295.350130 0.0 0.0 -12.742940 -5.287526 -7.008405 -135.234039 71.2500 T0792 0.0 False NaN
8031 2002.0 0.653806 -10736.275317 -523.428394 18.933004 -67.458620 -32.270373 -289.782726 0.0 0.0 -12.574242 -5.526367 -8.710672 -126.038398 68.7500 T0792 0.0 False NaN
8032 2003.0 0.670273 -10874.279415 -547.724272 18.656697 -69.098619 -25.463678 -302.654968 0.0 0.0 -11.101013 -5.431765 -14.888201 -137.742725 72.1875 T0792 0.0 False NaN
8033 2004.0 0.669436 -10892.368654 -510.008237 22.834987 -68.154433 -25.764409 -273.413226 0.0 0.0 -12.492297 -5.112278 -12.832885 -135.073695 72.8125 T0792 0.0 False NaN
8034 2005.0 0.656888 -10703.910853 -543.171503 17.905962 -66.398787 -29.120641 -298.888010 0.0 0.0 -13.132435 -5.836789 -10.355362 -137.345441 68.7500 T0792 0.0 False NaN
8035 2006.0 0.692382 -10828.222331 -534.712330 19.098412 -68.456227 -23.709948 -294.261432 0.0 0.0 -11.859785 -5.409702 -12.117053 -137.996595 75.3125 T0792 0.0 False NaN
8037 2008.0 0.660913 -10577.606729 -505.014066 18.977488 -67.414641 -25.320651 -282.519473 0.0 0.0 -12.224664 -4.776406 -9.789953 -121.945767 72.5000 T0792 0.0 False NaN
8038 2009.0 0.652170 -10611.947525 -526.856227 13.633329 -67.234170 -22.807722 -291.414826 0.0 0.0 -11.001837 -4.846785 -13.016733 -130.167483 70.9375 T0792 0.0 False NaN
8039 2010.0 0.649245 -10571.133114 -532.716634 17.491296 -68.178990 -22.686104 -295.800491 0.0 0.0 -12.015912 -5.429175 -12.869567 -133.227691 71.5625 T0792 0.0 False NaN

3200 rows × 19 columns


In [32]:
def compute_with_my_score_function(p=0.9, PolynomialDegree=3):
    FEATURES = ['Rw',
     'VTotal',
     'QGO',
     'Burial',
     'Water',
     'Rama',
     'DSSP',
     'P_AP',
     'Helix',
     'Frag_Mem']
    # LABEL = "Qw"
    LABEL = "isGood"

    num_attribs = FEATURES
    cat_attribs = [LABEL]
    frame = 201
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
        ])
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs))
        ])

    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    my_full_pipeline = Pipeline([
    #         ('removeFirstFrame', RemoveFirstFrame(frame)),
            ('featureSelection', full_pipeline)
    ])

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
        strat_train_set = raw_data.iloc[train_index]
        strat_test_set = raw_data.iloc[test_index]
    # strat_test_set[LABEL].value_counts() / len(strat_test_set)
    X_train = my_full_pipeline.fit_transform(strat_train_set)
    X_test = my_full_pipeline.fit_transform(strat_test_set)
    train_y = X_train[:,-1]
    train_set = X_train[:,:-1]
    test_y = X_test[:,-1]
    test_set = X_test[:,:-1]

    # log_clf = LogisticRegression(random_state=142)
    # rnd_clf = RandomForestClassifier(random_state=432)
    # svm_clf = SVC(probability=True, random_state=412)
    log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
    log_clf.fit(train_set, train_y)

#     voting_clf.fit(train_set, train_y)
    n = 10
    cl_name = "lr"
    clf = log_clf
#     for cl_name, clf in ("voting", voting_clf):
    my_evaluation = 1.0
    another_evaluation = 0.0
    for name, data in raw_test_data.groupby("Name"):
#             print(name)
#         X = full_pipeline.fit_transform(data)
#         validation_data, test_data = train_test_split(X, test_size=0.6, random_state=124)
        validation_data = my_full_pipeline.fit_transform(raw_data_T0784)
        validation_y = validation_data[:,-1]
        validation_set = validation_data[:,:-1]
        clf.fit(train_set, train_y)
        test= clf.predict_proba(validation_set)[:,1]
        position_of_top_n = test.argsort()[-n:][::-1]
        threshold = test[position_of_top_n][-1]
        predict_y = np.zeros(len(validation_y),)
        predict_y[position_of_top_n] = 1
    #     predict_y = (test > threshold)
#         print(threshold)
        cm = confusion_matrix(validation_y, predict_y)
#             print(cm)
        precision = cm[1][1] / (cm[1][1] + cm[0][1])
#             print(name,  " precision", precision,end = " ")
        if name != "T0766" and name != "T0833":
            my_evaluation *= precision
            another_evaluation += precision
#         print("")
    print("classifier:", cl_name, ", p:",p, ", degree", PolynomialDegree, ", score", my_evaluation, ", another score", another_evaluation)
    return (cl_name, p, PolynomialDegree, my_evaluation)

In [34]:
def myGridSerach():
    p_list = [0.9, 0.8, 0.7, 0.5, 0.1]
    degree_list = [3, 2, 1]
#     p_list = [0.1, 0.8, 0.9, 0.95]
#     degree_list = [1, 2, 3]
#     p_list = [0.1, 0.15, 0.2, 0.5, 0.7, 0.8, 0.85, 0.9, 0.95]
#     degree_list = [1, 2, 3, 4]
    result = []
    for p in p_list:
        for degree in degree_list:
            result += [compute_with_my_score_function(p, degree)]

In [35]:
myGridSerach()


classifier: lr , p: 0.9 , degree 3 , score 1e-06 , another score 0.6
classifier: lr , p: 0.9 , degree 2 , score 0.0 , another score 0.0
classifier: lr , p: 0.9 , degree 1 , score 0.0 , another score 0.0
classifier: lr , p: 0.8 , degree 3 , score 1e-06 , another score 0.6
classifier: lr , p: 0.8 , degree 2 , score 0.0 , another score 0.0
classifier: lr , p: 0.8 , degree 1 , score 0.0 , another score 0.0
classifier: lr , p: 0.7 , degree 3 , score 0.0 , another score 0.0
classifier: lr , p: 0.7 , degree 2 , score 0.0 , another score 0.0
classifier: lr , p: 0.7 , degree 1 , score 0.0 , another score 0.0
classifier: lr , p: 0.5 , degree 3 , score 0.0 , another score 0.0
classifier: lr , p: 0.5 , degree 2 , score 0.0 , another score 0.0
classifier: lr , p: 0.5 , degree 1 , score 0.0 , another score 0.0
classifier: lr , p: 0.1 , degree 3 , score 0.0 , another score 0.0
classifier: lr , p: 0.1 , degree 2 , score 0.0 , another score 0.0
classifier: lr , p: 0.1 , degree 1 , score 0.0 , another score 0.0

In [33]:
compute_with_my_score_function(0.1, 1)


classifier: lr , p: 0.1 , degree 1 , score 0.0 , another score 0.0
Out[33]:
('lr', 0.1, 1, 0.0)

In [185]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
class RemoveFirstFrame(BaseEstimator, TransformerMixin):
    def __init__(self, frame):
        self.frame = frame
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.query(f"Step % {frame} != 1")

In [186]:
num_attribs = FEATURES
cat_attribs = [LABEL]
frame = 201
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
my_full_pipeline = Pipeline([
#         ('removeFirstFrame', RemoveFirstFrame(frame)),
        ('featureSelection', full_pipeline)
])

In [187]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
    strat_train_set = raw_data.iloc[train_index]
    strat_test_set = raw_data.iloc[test_index]
# strat_test_set[LABEL].value_counts() / len(strat_test_set)
X_train = my_full_pipeline.fit_transform(strat_train_set)
X_test = my_full_pipeline.fit_transform(strat_test_set)
train_y = X_train[:,-1]
train_set = X_train[:,:-1]
test_y = X_test[:,-1]
test_set = X_test[:,:-1]

In [188]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
log_clf.fit(train_set, train_y)
rnd_clf.fit(train_set, train_y)
svm_clf.fit(train_set, train_y)
voting_clf.fit(train_set, train_y)


Out[188]:
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight={0: 0.1, 1: 0.9}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=142,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)), ('r...f',
  max_iter=-1, probability=True, random_state=412, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [189]:
# check on training set
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
#     y_pred = clf.predict(train_set)
    prob= clf.predict_proba(train_set)[:,1]
    position_of_top_n = prob.argsort()[-n:][::-1]
    threshold = prob[position_of_top_n][-1]
    predict_y = np.zeros(len(train_y),)
    predict_y[position_of_top_n] = 1
#     predict_y = (test > threshold)
#     print(threshold)
    cm = confusion_matrix(train_y, predict_y)
#     print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
    print(clf.__class__.__name__, "\n", cm)


LogisticRegression 
 [[1558    4]
 [  32    6]]
RandomForestClassifier 
 [[1562    0]
 [  28   10]]
SVC 
 [[1559    3]
 [  31    7]]
VotingClassifier 
 [[1561    1]
 [  29    9]]

In [190]:
time_stamp = f"{datetime.today().strftime('%d_%h_%H%M%S')}"
for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= log_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1

    with open(f"/Users/weilu/Research/data/structure_selector/p{p}_poly{PolynomialDegree}_{name}.csv", "w") as f:
        f.write("Result\n")
        for i in test:
            f.write(str(i) + "\n")
#     with open(f"/Users/weilu/Research/data/structure_selector/{name}_results_{time_stamp}.csv", "w") as f:
#         f.write("Result\n")
#         for i in test:
#             f.write(str(i) + "\n")

#     predict_y = (test > threshold)
#     print(threshold)
    print(confusion_matrix(eval_y, predict_y))


1MBA
[[1942    9]
 [  49    1]]
T0766
[[1943    9]
 [  48    1]]
T0784
[[1939   10]
 [  52    0]]
T0792
[[1950    4]
 [  41    6]]
T0803
[[1943    9]
 [  48    1]]
T0815
[[1945    9]
 [  46    1]]
T0833
[[1914   10]
 [  49    0]]
T251
[[1940   10]
 [  51    0]]

In [180]:
time_stamp = f"{datetime.today().strftime('%d_%h_%H%M%S')}"
for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = my_full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= log_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1


#     predict_y = (test > threshold)
#     print(threshold)
    print(confusion_matrix(eval_y, predict_y))


1MBA
[[1943    8]
 [  48    2]]
T0766
[[1946    6]
 [  45    4]]
T0784
[[1939   10]
 [  52    0]]
T0792
[[1949    5]
 [  42    5]]
T0803
[[1944    8]
 [  47    2]]
T0815
[[1946    8]
 [  45    2]]
T0833
[[1915    9]
 [  48    1]]
T251
[[1941    9]
 [  50    1]]

In [ ]: