In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
%matplotlib inline

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
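
save_fig does not create the output directory, so a first call can fail on a missing path. A minimal usage sketch (the makedirs call and the figure name are illustrative additions, not part of the original helper):

os.makedirs(os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID), exist_ok=True)  # ensure the folder exists
plt.plot([0, 1], [0, 1])      # any example figure
save_fig("example_figure")    # writes ./images/classification/example_figure.png at 300 dpi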

In [2]:
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_4.csv")
raw_data = raw_test_data.groupby("Name").get_group("T0792")  # train on the T0792 frames; the classifiers are evaluated on every target below

In [4]:
raw_test_data.columns


Out[4]:
Index(['Step', 'Qw', 'Rw', 'VTotal', 'QGO', 'Burial', 'Water', 'Rama', 'Chain',
       'Chi', 'DSSP', 'P_AP', 'Helix', 'Frag_Mem', 'GDT', 'Name', 'Good'],
      dtype='object')

In [5]:
list(raw_test_data.columns[2:-3])


Out[5]:
['Rw',
 'VTotal',
 'QGO',
 'Burial',
 'Water',
 'Rama',
 'Chain',
 'Chi',
 'DSSP',
 'P_AP',
 'Helix',
 'Frag_Mem']

In [71]:
# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
# FEATURES = list(raw_test_data.columns[2:-3])
# All energy terms except Chain and Chi:
FEATURES = ['Rw', 'VTotal', 'QGO', 'Burial', 'Water', 'Rama',
            'DSSP', 'P_AP', 'Helix', 'Frag_Mem']
LABEL = ["Good"]

In [72]:
test = raw_data.filter(FEATURES)

In [73]:
a = test - test.mean()   # center each energy term (no scaling yet)

In [74]:
U, s, V = np.linalg.svd(a)   # SVD of the centered but unscaled data; V holds the principal directions

In [75]:
c1 = V.T[:, 0]
c2 = V.T[:, 1]

In [76]:
c1


Out[76]:
array([  9.99676899e-01,   2.30606380e-02,   4.19808837e-03,
         7.66989107e-04,   2.39040414e-03,   2.76195027e-03,
         1.32539971e-03,   7.67124115e-04,   2.14383033e-03,
         8.70685183e-03])
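
The first principal direction of the unscaled data is almost entirely along Rw (loading ≈ 0.9997): the SVD above sees centered but unscaled columns, and Rw's spread is far larger than the other energy terms', so its variance dominates. A quick check (a sketch, using the centered frame `a` from above):

a.std().sort_values(ascending=False)   # per-feature spread; Rw dominates, so it owns the first PC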

In [77]:
np.std(a/np.std(a))   # sanity check: after dividing by the per-column std, every column has unit spread


Out[77]:
Rw          1.0
VTotal      1.0
QGO         1.0
Burial      1.0
Water       1.0
Rama        1.0
DSSP        1.0
P_AP        1.0
Helix       1.0
Frag_Mem    1.0
dtype: float64

In [78]:
b = a/np.std(a)    # standardize: divide each centered column by its standard deviation
U, s, V = np.linalg.svd(b)
c1 = V.T[:, 0]
c2 = V.T[:, 1]

In [79]:
c1


Out[79]:
array([ 0.26534191,  0.55426683,  0.1082651 , -0.01471415,  0.03828705,
        0.41946372,  0.20433614,  0.22997682,  0.39455654,  0.427447  ])
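
With each column scaled to unit standard deviation, the loadings spread across the energy terms instead of collapsing onto Rw. The singular values from this SVD also give the explained-variance ratios directly; a sketch using the `s` computed above:

explained_variance_ratio = s**2 / np.sum(s**2)   # fraction of total variance captured by each PC
np.cumsum(explained_variance_ratio)              # essentially the cumulative curve that PCA reports below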

In [83]:
# Bug: this passes the DataFrame as StandardScaler's `copy` constructor argument instead of scaling it;
# the intended call is StandardScaler().fit_transform(test) (see the corrected sketch after the traceback below).
a = StandardScaler(test)

In [87]:
a


Out[87]:
StandardScaler(copy=                Rw      VTotal        QGO     Burial      Water        Rama  \
6030 -10913.216331 -598.271323   8.105555 -69.408140 -31.126137 -327.905525
6031 -10167.657091 -498.314220  20.261380 -69.158464 -23.624554 -265.301770
6032 -10294.674660 -530.439020  17.701859 -66.558903 -25.7...13.016733 -130.167483
8039 -12.015912 -5.429175 -12.869567 -133.227691

[2010 rows x 10 columns],
        with_mean=True, with_std=True)

In [81]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.95)
pca.fit(StandardScaler(test))
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-81-2d886fe19b82> in <module>()
      1 from sklearn.decomposition import PCA
      2 pca = PCA(n_components = 0.95)
----> 3 pca.fit(StandardScaler(test))
      4 cumsum = np.cumsum(pca.explained_variance_ratio_)
      5 d = np.argmax(cumsum >= 0.95) + 1

~/anaconda3/lib/python3.6/site-packages/sklearn/decomposition/pca.py in fit(self, X, y)
    327             Returns the instance itself.
    328         """
--> 329         self._fit(X)
    330         return self
    331 

~/anaconda3/lib/python3.6/site-packages/sklearn/decomposition/pca.py in _fit(self, X)
    368 
    369         X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True,
--> 370                         copy=self.copy)
    371 
    372         # Handle n_components==None

~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    431                                       force_all_finite)
    432     else:
--> 433         array = np.array(array, dtype=dtype, order=order, copy=copy)
    434 
    435         if ensure_2d:

TypeError: float() argument must be a string or a number, not 'StandardScaler'
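
The TypeError comes from StandardScaler(test): the DataFrame is passed as the scaler's `copy` constructor argument instead of being transformed. A corrected sketch of the intended step (the cells In [67]-[68] below rely on `cumsum` and `d` from an earlier successful run of this logic):

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaled = StandardScaler().fit_transform(test)   # fit the scaler, then transform the 10 energy terms
pca = PCA(n_components=0.95)                    # keep enough components for 95% of the variance
pca.fit(scaled)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1               # number of components needed; Out[67] below shows 8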

In [67]:
d


Out[67]:
8

In [69]:
len(FEATURES)


Out[69]:
10

In [68]:
plt.plot(cumsum)


Out[68]:
[<matplotlib.lines.Line2D at 0x1a26850f98>]

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
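
DataFrameSelector simply pulls the named columns out of a DataFrame and returns them as a NumPy array, so it can sit at the head of a scikit-learn Pipeline. A quick standalone check (a sketch, using the FEATURES list defined earlier):

DataFrameSelector(FEATURES).fit_transform(raw_data)   # -> NumPy array of shape (n_frames, 10) with the energy terms

In newer scikit-learn releases, sklearn.compose.ColumnTransformer covers this use case directly.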

In [ ]:
# I want to start with the simplest linear classifier (logistic regression)

In [20]:
num_attribs = FEATURES
cat_attribs = LABEL
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=3, include_bias=False))
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
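
With 10 numeric features and degree-3 PolynomialFeatures (include_bias=False), the num_pipeline emits all monomials of degree 1-3 in 10 variables, i.e. C(13, 3) - 1 = 285 columns; the cat_pipeline then appends the Good label as one more column, which is why the cells below split off X[:, -1] as y. A sketch to confirm the width (assuming raw_data as loaded above):

X_all = full_pipeline.fit_transform(raw_data)
X_all.shape   # expected (n_frames, 286): 285 polynomial feature columns + 1 label column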

In [21]:
from sklearn.model_selection import train_test_split
# raw_data = raw_test_data.groupby("Name").get_group("1MBA")
train_set, test_set = train_test_split(raw_data, test_size=0.2, random_state=42)
X_train = full_pipeline.fit_transform(train_set)
X_test = full_pipeline.fit_transform(test_set)
train_y = X_train[:,-1]     # last column is the Good label appended by cat_pipeline
train_set = X_train[:,:-1]  # everything else: the scaled, polynomial-expanded energy terms
test_y = X_test[:,-1]
test_set = X_test[:,:-1]
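
Note that fit_transform is called on both splits, so the scaler (and polynomial expansion) statistics are re-estimated on the held-out frames. A leakage-free variant of the two transform calls in the cell above (a sketch; it has to run in place of them, while train_set and test_set are still DataFrames):

X_train = full_pipeline.fit_transform(train_set)   # fit the scaler/poly expansion on the training frames only
X_test = full_pipeline.transform(test_set)         # apply the already-fitted pipeline to the held-out frames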

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
p = 0.9
# class_weight puts most of the weight on the majority (Good == 0) class, which makes false
# positives costlier and pushes the classifiers toward precision on the Good class.
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
log_clf.fit(train_set, train_y)
rnd_clf.fit(train_set, train_y)
svm_clf.fit(train_set, train_y)
voting_clf.fit(train_set, train_y)


Out[22]:
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight={0: 0.9, 1: 0.09999999999999998},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
          random_state=142, solver='liblinear', tol=0.0001, verbose=0,
       ...rbf', max_iter=-1, probability=True, random_state=412,
  shrinking=True, tol=0.001, verbose=False))],
         n_jobs=1, voting='soft', weights=None)

In [23]:
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    # rank training frames by predicted probability of being Good and flag only the top n as positives
    prob = clf.predict_proba(train_set)[:,1]
    position_of_top_n = prob.argsort()[-n:][::-1]
    threshold = prob[position_of_top_n][-1]
    predict_y = np.zeros(len(train_y),)
    predict_y[position_of_top_n] = 1
#     predict_y = (prob > threshold)
#     print(threshold)
    cm = confusion_matrix(train_y, predict_y)
    print(clf.__class__.__name__, "\n", cm)


LogisticRegression 
 [[1567    0]
 [  31   10]]
RandomForestClassifier 
 [[1567    0]
 [  31   10]]
SVC 
 [[1567    0]
 [  31   10]]
VotingClassifier 
 [[1567    0]
 [  31   10]]

In [24]:
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    # same top-n evaluation, now on the held-out test split
    prob = clf.predict_proba(test_set)[:,1]
    position_of_top_n = prob.argsort()[-n:][::-1]
    threshold = prob[position_of_top_n][-1]
    predict_y = np.zeros(len(test_y),)
    predict_y[position_of_top_n] = 1
#     predict_y = (prob > threshold)
#     print(threshold)
    cm = confusion_matrix(test_y, predict_y)
    print(clf.__class__.__name__, "\n", cm)


LogisticRegression 
 [[391   3]
 [  1   7]]
RandomForestClassifier 
 [[389   5]
 [  3   5]]
SVC 
 [[390   4]
 [  2   6]]
VotingClassifier 
 [[389   5]
 [  3   5]]
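
The number that matters in these matrices is the precision among the top-n ranked frames, which is just cm[1][1] / n. A small helper capturing the ranking logic used in the two cells above (a sketch, assuming a fitted classifier with predict_proba):

def precision_at_top_n(clf, X, y, n=10):
    """Precision among the n frames ranked highest by predicted probability of being Good."""
    prob = clf.predict_proba(X)[:, 1]
    top_n = prob.argsort()[-n:]    # indices of the n highest-probability frames
    return y[top_n].mean()         # fraction of true Good frames among them

# e.g. precision_at_top_n(voting_clf, test_set, test_y, n=10)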

In [15]:
def compute_with_my_score_function(p=0.9, degree=3):
    num_attribs = FEATURES
    cat_attribs = LABEL
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=degree, include_bias=False))
        ])
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs))
        ])

    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    train_set, test_set = train_test_split(raw_data, test_size=0.2, random_state=42)
    X_train = full_pipeline.fit_transform(train_set)
    X_test = full_pipeline.fit_transform(test_set)
#     X_train = full_pipeline.fit_transform(raw_data)
    train_y = X_train[:,-1]
    train_set = X_train[:,:-1]
    log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
    rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
    svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})
    voting_clf = VotingClassifier(
        estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
        voting='soft')

#     voting_clf.fit(train_set, train_y)
    n = 10
    for cl_name, clf in zip(("lr", "rf", "svc", "voting"), (log_clf, rnd_clf, svm_clf, voting_clf)):
#     for cl_name, clf in ("voting", voting_clf):
        my_evaluation = 1.0       # product of per-target precisions
        another_evaluation = 0.0  # sum of per-target precisions
        for name, data in raw_test_data.groupby("Name"):
#             print(name)
            X = full_pipeline.fit_transform(data)
            validation_data, test_data = train_test_split(X, test_size=0.6, random_state=124)

            validation_y = validation_data[:,-1]
            validation_set = validation_data[:,:-1]
            clf.fit(train_set, train_y)   # note: this refits on the same T0792 training split at every iteration
            prob = clf.predict_proba(validation_set)[:,1]
            position_of_top_n = prob.argsort()[-n:][::-1]
            threshold = prob[position_of_top_n][-1]
            predict_y = np.zeros(len(validation_y),)
            predict_y[position_of_top_n] = 1
        #     predict_y = (prob > threshold)
    #         print(threshold)
            cm = confusion_matrix(validation_y, predict_y)
#             print(cm)
            precision = cm[1][1] / (cm[1][1] + cm[0][1])   # precision among the top-n predicted frames for this target
#             print(name,  " precision", precision,end = " ")
            if name != "T0766" and name != "T0833":
                my_evaluation *= precision
                another_evaluation += precision
#         print("")
        print("classifier:", cl_name, ", p:",p, ", degree", degree, ", score", my_evaluation, ", another score", another_evaluation)
    return (cl_name, p, degree, my_evaluation)   # note: only the last classifier's (voting) tuple is returned

In [16]:
def myGridSearch():
    p_list = [0.9, 0.8, 0.7, 0.5, 0.1]
    degree_list = [3, 2, 1]
#     p_list = [0.1, 0.8, 0.9, 0.95]
#     degree_list = [1, 2, 3]
#     p_list = [0.1, 0.15, 0.2, 0.5, 0.7, 0.8, 0.85, 0.9, 0.95]
#     degree_list = [1, 2, 3, 4]
    result = []
    for p in p_list:
        for degree in degree_list:
            result += [compute_with_my_score_function(p, degree)]
    return result

In [17]:
myGridSearch()


classifier: lr , p: 0.9 , degree 3 , score 0.01008 , another score 3.0
classifier: rf , p: 0.9 , degree 3 , score 0.00576 , another score 3.1
classifier: svc , p: 0.9 , degree 3 , score 0.0 , another score 1.8
classifier: voting , p: 0.9 , degree 3 , score 0.0 , another score 3.0
classifier: lr , p: 0.9 , degree 2 , score 0.0 , another score 1.6
classifier: rf , p: 0.9 , degree 2 , score 0.00216 , another score 2.6
classifier: svc , p: 0.9 , degree 2 , score 0.00048 , another score 2.3
classifier: voting , p: 0.9 , degree 2 , score 0.0036 , another score 2.7
classifier: lr , p: 0.9 , degree 1 , score 0.0 , another score 2.0
classifier: rf , p: 0.9 , degree 1 , score 0.00448 , another score 3.0
classifier: svc , p: 0.9 , degree 1 , score 0.000216 , another score 2.0
classifier: voting , p: 0.9 , degree 1 , score 0.00336 , another score 2.9
classifier: lr , p: 0.8 , degree 3 , score 0.0075 , another score 2.8
classifier: rf , p: 0.8 , degree 3 , score 0.0 , another score 2.1
classifier: svc , p: 0.8 , degree 3 , score 0.0 , another score 1.8
classifier: voting , p: 0.8 , degree 3 , score 0.0024 , another score 2.7
classifier: lr , p: 0.8 , degree 2 , score 0.0 , another score 2.0
classifier: rf , p: 0.8 , degree 2 , score 0.00192 , another score 2.7
classifier: svc , p: 0.8 , degree 2 , score 0.00048 , another score 2.3
classifier: voting , p: 0.8 , degree 2 , score 0.00576 , another score 3.1
classifier: lr , p: 0.8 , degree 1 , score 0.0 , another score 2.0
classifier: rf , p: 0.8 , degree 1 , score 0.0 , another score 2.6
classifier: svc , p: 0.8 , degree 1 , score 0.000384 , another score 2.0
classifier: voting , p: 0.8 , degree 1 , score 0.0 , another score 2.4
classifier: lr , p: 0.7 , degree 3 , score 0.00225 , another score 2.5
classifier: rf , p: 0.7 , degree 3 , score 0.0 , another score 2.2
classifier: svc , p: 0.7 , degree 3 , score 0.0 , another score 1.8
classifier: voting , p: 0.7 , degree 3 , score 0.0048 , another score 2.8
classifier: lr , p: 0.7 , degree 2 , score 0.0 , another score 2.1
classifier: rf , p: 0.7 , degree 2 , score 0.0084 , another score 3.1
classifier: svc , p: 0.7 , degree 2 , score 0.000324 , another score 2.1
classifier: voting , p: 0.7 , degree 2 , score 0.010368 , another score 3.1
classifier: lr , p: 0.7 , degree 1 , score 0.0 , another score 2.0
classifier: rf , p: 0.7 , degree 1 , score 0.0 , another score 1.9
classifier: svc , p: 0.7 , degree 1 , score 0.0012 , another score 2.4
classifier: voting , p: 0.7 , degree 1 , score 0.001296 , another score 2.5
classifier: lr , p: 0.5 , degree 3 , score 0.0024 , another score 2.5
classifier: rf , p: 0.5 , degree 3 , score 0.000486 , another score 2.1
classifier: svc , p: 0.5 , degree 3 , score 0.0 , another score 2.2
classifier: voting , p: 0.5 , degree 3 , score 0.0081 , another score 2.9
classifier: lr , p: 0.5 , degree 2 , score 0.000336 , another score 2.0
classifier: rf , p: 0.5 , degree 2 , score 0.0 , another score 2.2
classifier: svc , p: 0.5 , degree 2 , score 0.0 , another score 2.3
classifier: voting , p: 0.5 , degree 2 , score 0.0 , another score 2.4
classifier: lr , p: 0.5 , degree 1 , score 0.0 , another score 1.9
classifier: rf , p: 0.5 , degree 1 , score 0.00048 , another score 2.3
classifier: svc , p: 0.5 , degree 1 , score 0.0 , another score 2.6
classifier: voting , p: 0.5 , degree 1 , score 0.0 , another score 2.5
classifier: lr , p: 0.1 , degree 3 , score 0.00384 , another score 2.4
classifier: rf , p: 0.1 , degree 3 , score 0.0012 , another score 2.5
classifier: svc , p: 0.1 , degree 3 , score 4e-05 , another score 1.4
classifier: voting , p: 0.1 , degree 3 , score 0.00324 , another score 2.6
classifier: lr , p: 0.1 , degree 2 , score 0.000576 , another score 2.0
classifier: rf , p: 0.1 , degree 2 , score 0.0 , another score 2.2
classifier: svc , p: 0.1 , degree 2 , score 2.4e-05 , another score 1.2
classifier: voting , p: 0.1 , degree 2 , score 0.0 , another score 1.8
classifier: lr , p: 0.1 , degree 1 , score 0.000525 , another score 2.2
classifier: rf , p: 0.1 , degree 1 , score 0.0 , another score 1.7
classifier: svc , p: 0.1 , degree 1 , score 9.6e-05 , another score 1.8
classifier: voting , p: 0.1 , degree 1 , score 0.0 , another score 1.8
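
In [17] only prints the log above and discards the return value. With the `return result` added to myGridSearch (and keeping in mind that compute_with_my_score_function returns only the voting classifier's tuple per setting), the search could also be ranked programmatically; a sketch:

results = myGridSearch()
best = max(results, key=lambda r: r[3])   # (classifier, p, degree, score) with the largest product of precisions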

In [25]:
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
num_attribs = FEATURES
cat_attribs = LABEL
degree = 3
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False))
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

# raw_data = raw_test_data.groupby("Name").get_group("1MBA")
train_set, test_set = train_test_split(raw_data, test_size=0.2, random_state=42)
X_train = full_pipeline.fit_transform(train_set)
X_test = full_pipeline.fit_transform(test_set)
train_y = X_train[:,-1]
train_set = X_train[:,:-1]
# test_y = X_test[:,-1]
# test_set = X_test[:,:-1]

p = 0.9
log_clf = LogisticRegression(random_state=142, class_weight={0:p, 1:(1-p)})
rnd_clf = RandomForestClassifier(random_state=432, class_weight={0:p, 1:(1-p)})
svm_clf = SVC(probability=True, random_state=412, class_weight={0:p, 1:(1-p)})

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(train_set, train_y)
n = 6
for name, data in raw_test_data.groupby("Name"):
    print(name)
    # note: fit_transform re-fits the scaler on each target's own frames
    X = full_pipeline.fit_transform(data)
    eval_y = X[:,-1]
    eval_set = X[:,:-1]
    prob = voting_clf.predict_proba(eval_set)[:,1]
    position_of_top_n = prob.argsort()[-n:][::-1]
    threshold = prob[position_of_top_n][-1]
    predict_y = np.zeros(len(eval_y),)
    predict_y[position_of_top_n] = 1
    # write the per-frame probabilities for this target to disk
    with open("/Users/weilu/Research/data/structure_selector/{}_results.csv".format(name), "w") as f:
        f.write("Result\n")
        for i in prob:
            f.write(str(i) + "\n")
#     predict_y = (prob > threshold)
#     print(threshold)
    print(confusion_matrix(eval_y, predict_y))


1MBA
[[1955    6]
 [  49    0]]
T0766
[[1955    6]
 [  49    0]]
T0784
[[1956    5]
 [  48    1]]
T0792
[[1961    0]
 [  43    6]]
T0803
[[1956    5]
 [  48    1]]
T0815
[[1954    3]
 [  50    3]]
T0833
[[1927    6]
 [  49    0]]
T251
[[1961    0]
 [  43    6]]
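
Each target's per-frame probabilities were written to {name}_results.csv with a single Result column. A sketch of reading one back for downstream structure selection (using the T0792 file written above):

scores = pd.read_csv("/Users/weilu/Research/data/structure_selector/T0792_results.csv")
scores["Result"].nlargest(6)   # the six frames the voting classifier ranks highest for T0792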

In [ ]: