In [9]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
%matplotlib inline

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [10]:
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_4.csv")
raw_data = raw_test_data.groupby("Name").get_group("T0792")

In [11]:
raw_test_data.columns


Out[11]:
Index(['Step', 'Qw', 'Rw', 'VTotal', 'QGO', 'Burial', 'Water', 'Rama', 'Chain',
       'Chi', 'DSSP', 'P_AP', 'Helix', 'Frag_Mem', 'GDT', 'Name', 'Good'],
      dtype='object')

In [12]:
list(raw_test_data.columns[2:-3])


Out[12]:
['Rw',
 'VTotal',
 'QGO',
 'Burial',
 'Water',
 'Rama',
 'Chain',
 'Chi',
 'DSSP',
 'P_AP',
 'Helix',
 'Frag_Mem']

In [13]:
# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
FEATURES = list(raw_test_data.columns[2:-3])
LABEL = ["Good"]

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [15]:
# I want to start with the simplest linear regression

In [16]:
num_attribs = FEATURES
cat_attribs = LABEL
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=3, include_bias=False))
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [17]:
from sklearn.model_selection import train_test_split
# raw_data = raw_test_data.groupby("Name").get_group("1MBA")
train_set, test_set = train_test_split(raw_data, test_size=0.2, random_state=42)
X_train = full_pipeline.fit_transform(train_set)
X_test = full_pipeline.fit_transform(test_set)
train_y = X_train[:,-1]
train_set = X_train[:,:-1]
test_y = X_test[:,-1]
test_set = X_test[:,:-1]

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
p = 0.9
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
log_clf.fit(train_set, train_y)
rnd_clf.fit(train_set, train_y)
svm_clf.fit(train_set, train_y)
voting_clf.fit(train_set, train_y)


Out[18]:
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight={0: 0.9, 1: 0.09999999999999998},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
          random_state=142, solver='liblinear', tol=0.0001, verbose=0,
       ...rbf', max_iter=-1, probability=True, random_state=412,
  shrinking=True, tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [19]:
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
#     y_pred = clf.predict(train_set)
    prob= clf.predict_proba(train_set)[:,1]
    position_of_top_n = prob.argsort()[-n:][::-1]
    threshold = prob[position_of_top_n][-1]
    predict_y = np.zeros(len(train_y),)
    predict_y[position_of_top_n] = 1
#     predict_y = (test > threshold)
#     print(threshold)
    cm = confusion_matrix(train_y, predict_y)
#     print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
    print(clf.__class__.__name__, "\n", cm)


LogisticRegression 
 [[1567    0]
 [  31   10]]
RandomForestClassifier 
 [[1567    0]
 [  31   10]]
SVC 
 [[1567    0]
 [  31   10]]
VotingClassifier 
 [[1567    0]
 [  31   10]]

In [20]:
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
#     y_pred = clf.predict(train_set)
    prob= clf.predict_proba(test_set)[:,1]
    position_of_top_n = prob.argsort()[-n:][::-1]
    threshold = prob[position_of_top_n][-1]
    predict_y = np.zeros(len(test_y),)
    predict_y[position_of_top_n] = 1
#     predict_y = (test > threshold)
#     print(threshold)
    cm = confusion_matrix(test_y, predict_y)
#     print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
    print(clf.__class__.__name__, "\n", cm)


LogisticRegression 
 [[391   3]
 [  1   7]]
RandomForestClassifier 
 [[389   5]
 [  3   5]]
SVC 
 [[390   4]
 [  2   6]]
VotingClassifier 
 [[389   5]
 [  3   5]]

In [21]:
def compute_with_my_score_function(p=0.9, degree=3):
    num_attribs = FEATURES
    cat_attribs = LABEL
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=degree, include_bias=False))
        ])
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs))
        ])

    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    train_set, test_set = train_test_split(raw_data, test_size=0.2, random_state=42)
    X_train = full_pipeline.fit_transform(train_set)
    X_test = full_pipeline.fit_transform(test_set)
#     X_train = full_pipeline.fit_transform(raw_data)
    train_y = X_train[:,-1]
    train_set = X_train[:,:-1]
    log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
    rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
    svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})
    voting_clf = VotingClassifier(
        estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
        voting='soft')

#     voting_clf.fit(train_set, train_y)
    n = 10
    for cl_name, clf in zip(("lr", "rf", "svc", "voting"), (log_clf, rnd_clf, svm_clf, voting_clf)):
#     for cl_name, clf in ("voting", voting_clf):
        my_evaluation = 1.0
        another_evaluation = 0.0
        for name, data in raw_test_data.groupby("Name"):
#             print(name)
            X = full_pipeline.fit_transform(data)
            validation_data, test_data = train_test_split(X, test_size=0.6, random_state=124)

            validation_y = validation_data[:,-1]
            validation_set = validation_data[:,:-1]
            clf.fit(train_set, train_y)
            test= clf.predict_proba(validation_set)[:,1]
            position_of_top_n = test.argsort()[-n:][::-1]
            threshold = test[position_of_top_n][-1]
            predict_y = np.zeros(len(validation_y),)
            predict_y[position_of_top_n] = 1
        #     predict_y = (test > threshold)
    #         print(threshold)
            cm = confusion_matrix(validation_y, predict_y)
#             print(cm)
            precision = cm[1][1] / (cm[1][1] + cm[0][1])
#             print(name,  " precision", precision,end = " ")
            if name != "T0766" and name != "T0833":
                my_evaluation *= precision
                another_evaluation += precision
#         print("")
        print("classifier:", cl_name, ", p:",p, ", degree", degree, ", score", my_evaluation, ", another score", another_evaluation)
    return (cl_name, p, degree, my_evaluation)

In [22]:
def myGridSerach():
    p_list = [0.9, 0.8, 0.7, 0.5, 0.1]
    degree_list = [3, 2, 1]
#     p_list = [0.1, 0.8, 0.9, 0.95]
#     degree_list = [1, 2, 3]
#     p_list = [0.1, 0.15, 0.2, 0.5, 0.7, 0.8, 0.85, 0.9, 0.95]
#     degree_list = [1, 2, 3, 4]
    result = []
    for p in p_list:
        for degree in degree_list:
            result += [compute_with_my_score_function(p, degree)]

In [23]:
myGridSerach()


classifier: lr , p: 0.9 , degree 3 , score 0.0 , another score 1.6
classifier: rf , p: 0.9 , degree 3 , score 0.00432 , another score 2.8
classifier: svc , p: 0.9 , degree 3 , score 0.0 , another score 1.6
classifier: voting , p: 0.9 , degree 3 , score 0.00504 , another score 2.9
classifier: lr , p: 0.9 , degree 2 , score 0.0 , another score 0.7
classifier: rf , p: 0.9 , degree 2 , score 0.0 , another score 2.1
classifier: svc , p: 0.9 , degree 2 , score 0.0 , another score 1.9
classifier: voting , p: 0.9 , degree 2 , score 0.0 , another score 2.0
classifier: lr , p: 0.9 , degree 1 , score 0.0 , another score 1.1
classifier: rf , p: 0.9 , degree 1 , score 0.00252 , another score 2.8
classifier: svc , p: 0.9 , degree 1 , score 0.0 , another score 1.9
classifier: voting , p: 0.9 , degree 1 , score 0.00224 , another score 2.8
classifier: lr , p: 0.8 , degree 3 , score 0.0 , another score 1.7
classifier: rf , p: 0.8 , degree 3 , score 0.0048 , another score 3.0
classifier: svc , p: 0.8 , degree 3 , score 0.0 , another score 1.6
classifier: voting , p: 0.8 , degree 3 , score 0.00288 , another score 2.9
classifier: lr , p: 0.8 , degree 2 , score 0.0 , another score 0.8
classifier: rf , p: 0.8 , degree 2 , score 0.00054 , another score 2.4
classifier: svc , p: 0.8 , degree 2 , score 0.0 , another score 1.9
classifier: voting , p: 0.8 , degree 2 , score 0.000972 , another score 2.4
classifier: lr , p: 0.8 , degree 1 , score 0.0 , another score 1.2
classifier: rf , p: 0.8 , degree 1 , score 0.0 , another score 2.6
classifier: svc , p: 0.8 , degree 1 , score 0.0 , another score 2.0
classifier: voting , p: 0.8 , degree 1 , score 0.0 , another score 2.8
classifier: lr , p: 0.7 , degree 3 , score 0.0 , another score 1.7
classifier: rf , p: 0.7 , degree 3 , score 0.00672 , another score 3.0
classifier: svc , p: 0.7 , degree 3 , score 0.0 , another score 1.5
classifier: voting , p: 0.7 , degree 3 , score 0.0018 , another score 2.6
classifier: lr , p: 0.7 , degree 2 , score 0.0 , another score 0.8
classifier: rf , p: 0.7 , degree 2 , score 0.001296 , another score 2.6
classifier: svc , p: 0.7 , degree 2 , score 0.0 , another score 1.6
classifier: voting , p: 0.7 , degree 2 , score 0.000972 , another score 2.4
classifier: lr , p: 0.7 , degree 1 , score 0.0 , another score 1.2
classifier: rf , p: 0.7 , degree 1 , score 0.00072 , another score 2.3
classifier: svc , p: 0.7 , degree 1 , score 0.0 , another score 1.5
classifier: voting , p: 0.7 , degree 1 , score 0.000648 , another score 2.2
classifier: lr , p: 0.5 , degree 3 , score 0.0 , another score 1.7
classifier: rf , p: 0.5 , degree 3 , score 0.00048 , another score 2.3
classifier: svc , p: 0.5 , degree 3 , score 9.6e-05 , another score 1.9
classifier: voting , p: 0.5 , degree 3 , score 0.00072 , another score 2.2
classifier: lr , p: 0.5 , degree 2 , score 0.0 , another score 0.7
classifier: rf , p: 0.5 , degree 2 , score 0.0 , another score 2.3
classifier: svc , p: 0.5 , degree 2 , score 0.0 , another score 1.9
classifier: voting , p: 0.5 , degree 2 , score 0.0 , another score 2.2
classifier: lr , p: 0.5 , degree 1 , score 0.0 , another score 1.4
classifier: rf , p: 0.5 , degree 1 , score 0.0 , another score 3.1
classifier: svc , p: 0.5 , degree 1 , score 0.0 , another score 2.0
classifier: voting , p: 0.5 , degree 1 , score 0.0 , another score 2.6
classifier: lr , p: 0.1 , degree 3 , score 0.0 , another score 1.6
classifier: rf , p: 0.1 , degree 3 , score 0.0003 , another score 2.2
classifier: svc , p: 0.1 , degree 3 , score 0.0 , another score 0.9
classifier: voting , p: 0.1 , degree 3 , score 0.00027 , another score 2.1
classifier: lr , p: 0.1 , degree 2 , score 0.0 , another score 1.0
classifier: rf , p: 0.1 , degree 2 , score 0.0 , another score 1.5
classifier: svc , p: 0.1 , degree 2 , score 0.0 , another score 1.1
classifier: voting , p: 0.1 , degree 2 , score 0.0 , another score 1.6
classifier: lr , p: 0.1 , degree 1 , score 0.0 , another score 1.3
classifier: rf , p: 0.1 , degree 1 , score 0.00027 , another score 2.1
classifier: svc , p: 0.1 , degree 1 , score 0.0 , another score 1.3
classifier: voting , p: 0.1 , degree 1 , score 8e-05 , another score 1.8

In [24]:
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
num_attribs = FEATURES
cat_attribs = LABEL
degree = 3
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False))
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

# raw_data = raw_test_data.groupby("Name").get_group("1MBA")
train_set, test_set = train_test_split(raw_data, test_size=0.2, random_state=42)
X_train = full_pipeline.fit_transform(train_set)
X_test = full_pipeline.fit_transform(test_set)
train_y = X_train[:,-1]
train_set = X_train[:,:-1]
# test_y = X_test[:,-1]
# test_set = X_test[:,:-1]

p = 0.9
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(train_set, train_y)
n = 6
for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= voting_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1
    with open("/Users/weilu/Research/data/structure_selector/{}_results.csv".format(name), "w") as f:
        f.write("Result\n")
        for i in test:
            f.write(str(i) + "\n")
#     predict_y = (test > threshold)
#     print(threshold)
    print(confusion_matrix(eval_y, predict_y))


1MBA
[[1955    6]
 [  49    0]]
T0766
[[1955    6]
 [  49    0]]
T0784
[[1956    5]
 [  48    1]]
T0792
[[1961    0]
 [  43    6]]
T0803
[[1956    5]
 [  48    1]]
T0815
[[1954    3]
 [  50    3]]
T0833
[[1927    6]
 [  49    0]]
T251
[[1961    0]
 [  43    6]]

In [25]:
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
num_attribs = FEATURES
cat_attribs = LABEL
degree = 3
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False))
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

# raw_data = raw_test_data.groupby("Name").get_group("1MBA")
train_set, test_set = train_test_split(raw_data, test_size=0.2, random_state=42)
X_train = full_pipeline.fit_transform(train_set)
X_test = full_pipeline.fit_transform(test_set)
train_y = X_train[:,-1]
train_set = X_train[:,:-1]
# test_y = X_test[:,-1]
# test_set = X_test[:,:-1]

p = 0.9
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(train_set, train_y)
n = 6
for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    test= voting_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = test.argsort()[-n:][::-1]
    threshold = test[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1
#     with open("/Users/weilu/Research/data/structure_selector/{}_results.csv".format(name), "w") as f:
#         f.write("Result\n")
#         for i in test:
#             f.write(str(i) + "\n")
#     predict_y = (test > threshold)
#     print(threshold)
    print(confusion_matrix(eval_y, predict_y))


1MBA
[[1955    6]
 [  49    0]]
T0766
[[1955    6]
 [  49    0]]
T0784
[[1956    5]
 [  48    1]]
T0792
[[1961    0]
 [  43    6]]
T0803
[[1956    5]
 [  48    1]]
T0815
[[1954    3]
 [  50    3]]
T0833
[[1927    6]
 [  49    0]]
T251
[[1961    0]
 [  43    6]]

In [ ]:


In [ ]: