Initial Data Exploration


In [1]:
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
### Include all quantitative features. In addition, 'std_from_poi' and
### 'std_to_poi' are standardized feature (see details below).

features_list = ['poi','salary',
                 'bonus',
                 'expenses',
                 'exercised_stock_options', 'other',
                 'restricted_stock', 'shared_receipt_with_poi',
                 'std_from_poi','std_to_poi']

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
# Add new features: std_from_poi and std_to_poi by dividing the message
# to/from poi by the total sent or received messages, respectively.
data_dict.pop('TOTAL')
data_dict.pop('THE TRAVEL AGENCY IN THE PARK')
data_dict.pop('LOCKHART EUGENE E')
for key in data_dict:
    if (type(data_dict[key]['from_poi_to_this_person']) == int and
        type(data_dict[key]['from_messages']) == int):
        data_dict[key]['std_from_poi'] = \
        (data_dict[key]['from_poi_to_this_person']/
         data_dict[key]['from_messages'])
    else:
        data_dict[key]['std_from_poi'] = 0
    if (type(data_dict[key]['from_this_person_to_poi']) == int and
        type(data_dict[key]['to_messages']) == int):
        data_dict[key]['std_to_poi'] = \
        (data_dict[key]['from_this_person_to_poi']/
         data_dict[key]['to_messages'])
    else:
        data_dict[key]['std_to_poi'] = 0
my_dataset = data_dict
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# The followings are the major steps in the analysis:
# A. Visualize the data using dimensionality reduction PCA and LDA to gain
#    further insight into the data
# B. Algorithm selection using repeated nested cross validation to choose
#    the algorithm that has highest accuracy
# C. Model selection using repeated cross validation to identify the best
#    hyperparameter values

# The following classification algorithms are used:
# 1. Logistic Regression
# 2. Random Forest Classifier
# 3. KNN Classifier
# 4. Support Vector Classifier
# 5. Neural Network: Multi-layer Perceptron Classifier
from IPython.core.display import display
from __future__ import division
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from __future__ import division
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from time import time

# For simplicity, rename features as X and labels as y
X = features
y = labels
### First, explore the dataset.
### Identify the total number of data points.
print 'Total number of data points:',np.shape(X)[0]
print 'Total number of features:', np.shape(X)[1]

X_std = MinMaxScaler().fit_transform(X)
pca = PCA()
X_pca = pca.fit_transform(X_std)
print 'PCA explained_variance_ratio_', pca.explained_variance_ratio_

#This section is adapted from Udacity Forum 'What are the testing
#features when using SelectKBest?'
K_best = SelectKBest(chi2,k=9)
features_kbest = K_best.fit_transform(X_std,y)
print features_kbest.shape

feature_scores = ['%.3f' %elem for elem in K_best.scores_]
feature_scores_pvalues = ['%.3f' %elem for elem in K_best.pvalues_]
features_selected_tuple = [(features_list[i+1],feature_scores[i],feature_scores_pvalues[i]) 
                   for i in K_best.get_support(indices=True)]
features_selected_tuple = sorted(features_selected_tuple,key=lambda feature: float(feature[1]), reverse=True)
sorted_scores = []
sorted_p_value = []
sorted_feature = []
for feature_tuple in features_selected_tuple:
    sorted_feature.append(feature_tuple[0])
    sorted_scores.append(feature_tuple[1])
    sorted_p_value.append(feature_tuple[2])
    print(feature_tuple)    
df = pd.DataFrame(features_selected_tuple).set_index(0)


Total number of data points: 141
Total number of features: 9
PCA explained_variance_ratio_ [ 0.44757043  0.16723059  0.14853688  0.09639177  0.05159985  0.03633587
  0.0319113   0.0204233   0.        ]
(141, 9)
('exercised_stock_options', '6.682', '0.010')
('bonus', '4.976', '0.026')
('salary', '2.926', '0.087')
('shared_receipt_with_poi', '2.333', '0.127')
('std_from_poi', '1.990', '0.158')
('other', '1.667', '0.197')
('expenses', '1.408', '0.235')
('restricted_stock', '0.578', '0.447')
('std_to_poi', 'nan', 'nan')

Scatterplot Matrix


In [15]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(X_std)
pg = sns.PairGrid(df)
pg.map_diag(plt.hist)
pg.map_offdiag(plt.scatter)
plt.show()


Logistic Regression


In [2]:
clf_labels = \
['Logistic Regression','KNN','Random Forest','SVC','Kernel SVC','MLP']

#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Logistic Regression
scores=[]
clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='f1'))
print 'CV F1 Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)
        
scores=[]
clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='precision'))
print 'CV Precision Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)
        
scores=[]
clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='recall'))
        
print 'CV Recall Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)


/Users/Raga/anaconda/envs/enron/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
CV F1 Score of Logistic Regression: 0.180 +/- 0.234
Complete in 36.4 sec
/Users/Raga/anaconda/envs/enron/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
CV Precision Score of Logistic Regression: 0.347 +/- 0.466
Complete in 37.0 sec
CV Recall Score of Logistic Regression: 0.123 +/- 0.166
Complete in 37.2 sec

In [16]:
clf_labels = \
['Logistic Regression','KNN','Random Forest','SVC','Kernel SVC','MLP']

#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Logistic Regression
scores=[]
clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='f1'))
print 'CV F1 Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)
        
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='precision'))
print 'CV Precision Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)
        
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='recall'))
        
print 'CV Recall Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Logistic Regression: 0.172 +/- 0.222
Complete in 40.9 sec
CV Precision Score of Logistic Regression: 0.251 +/- 0.364
Complete in 43.5 sec
CV Recall Score of Logistic Regression: 0.207 +/- 0.318
Complete in 40.4 sec

Random Forest Classifier


In [3]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Random Forest Classifier
scores=[]
clf_rf = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_rf]])
params_rf = {'clf__n_estimators':np.arange(1,11)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_rf = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_rf]])
params_rf = {'clf__n_estimators':np.arange(1,11)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_rf = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_rf]])
params_rf = {'clf__n_estimators':np.arange(1,11)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Random Forest Classifier: 0.328 +/- 0.222
Complete in 195.9 sec
CV Precision Score of Random Forest Classifier: 0.400 +/- 0.343
Complete in 202.6 sec
CV Recall Score of Random Forest Classifier: 0.337 +/- 0.221
Complete in 222.2 sec

In [17]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Random Forest Classifier
scores=[]
clf_rf = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_rf]])
params_rf = {'clf__n_estimators':np.arange(1,11)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Random Forest Classifier: 0.313 +/- 0.223
Complete in 211.2 sec
CV Precision Score of Random Forest Classifier: 0.337 +/- 0.290
Complete in 422.8 sec
CV Recall Score of Random Forest Classifier: 0.328 +/- 0.267
Complete in 638.0 sec

KNN Classifier


In [9]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#KNN Classifier
scores=[]
clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=9)],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=9)],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=9)],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of KNN Classifier: 0.239 +/- 0.189
Complete in 36.5 sec
CV Precision Score of KNN Classifier: 0.253 +/- 0.295
Complete in 36.5 sec
CV Recall Score of KNN Classifier: 0.253 +/- 0.179
Complete in 35.3 sec

In [18]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#KNN Classifier
scores=[]
clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=9)],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of KNN Classifier: 0.199 +/- 0.181
Complete in 32.2 sec
CV Precision Score of KNN Classifier: 0.214 +/- 0.223
Complete in 32.0 sec
CV Recall Score of KNN Classifier: 0.209 +/- 0.208
Complete in 31.0 sec

Linear SVC


In [4]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Linear SVC
scores=[]
clf_svc = SVC()
pipe_svc = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_svc]])
params_svc = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_svc = SVC()
pipe_svc = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_svc]])
params_svc = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_svc = SVC()
pipe_svc = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_svc]])
params_svc = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Linear SVC: 0.123 +/- 0.207
Complete in 46.5 sec
CV Precision Score of Linear SVC: 0.260 +/- 0.432
Complete in 44.7 sec
CV Recall Score of Linear SVC: 0.086 +/- 0.142
Complete in 42.4 sec

In [19]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Linear SVC
scores=[]
clf_svc = SVC()
pipe_svc = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_svc]])
params_svc = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Linear SVC: 0.097 +/- 0.178
Complete in 46.4 sec
CV Precision Score of Linear SVC: 0.160 +/- 0.324
Complete in 46.7 sec
CV Recall Score of Linear SVC: 0.129 +/- 0.277
Complete in 44.3 sec

Kernel SVC


In [5]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Kernel SVC
scores=[]
clf_ksvc = SVC(kernel='rbf')
pipe_ksvc = Pipeline([['sc',MinMaxScaler()],
                      ['kbest',SelectKBest(chi2,k=2)],
                      ['clf',clf_ksvc]])
params_ksvc = {'clf__C':10.0**np.arange(-4,4),'clf__gamma':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ksvc = SVC(kernel='rbf')
pipe_ksvc = Pipeline([['sc',MinMaxScaler()],
                      ['kbest',SelectKBest(chi2,k=2)],
                      ['clf',clf_ksvc]])
params_ksvc = {'clf__C':10.0**np.arange(-4,4),'clf__gamma':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ksvc = SVC(kernel='rbf')
pipe_ksvc = Pipeline([['sc',MinMaxScaler()],
                      ['kbest',SelectKBest(chi2,k=2)],
                      ['clf',clf_ksvc]])
params_ksvc = {'clf__C':10.0**np.arange(-4,4),'clf__gamma':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Kernel SVC: 0.234 +/- 0.195
Complete in 384.1 sec
CV Precision Score of Kernel SVC: 0.413 +/- 0.433
Complete in 364.3 sec
CV Recall Score of Kernel SVC: 0.185 +/- 0.178
Complete in 320.3 sec

In [20]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Kernel SVC
scores=[]
clf_ksvc = SVC(kernel='rbf')
pipe_ksvc = Pipeline([['sc',MinMaxScaler()],
                      ['kbest',SelectKBest(chi2,k=2)],
                      ['clf',clf_ksvc]])
params_ksvc = {'clf__C':10.0**np.arange(-4,4),'clf__gamma':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Kernel SVC: 0.232 +/- 0.182
Complete in 338.8 sec
CV Precision Score of Kernel SVC: 0.319 +/- 0.340
Complete in 675.1 sec
CV Recall Score of Kernel SVC: 0.279 +/- 0.300
Complete in 1217.8 sec

Naive Bayes


In [6]:
#Set the number of repeats of the cross validation
N_outer = 5

#Naive Bayes
scores=[]
clf_nb = GaussianNB()
pipe_nb = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_nb]])
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='f1'))
print 'CV F1 Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_nb = GaussianNB()
pipe_nb = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_nb]])
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='precision'))
print 'CV Precision Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_nb = GaussianNB()
pipe_nb = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_nb]])
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='recall'))
print 'CV Recall Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Naive Bayes: 0.332 +/- 0.205
Complete in 0.1 sec
CV Precision Score of Naive Bayes: 0.432 +/- 0.297
Complete in 0.1 sec
CV Recall Score of Naive Bayes: 0.293 +/- 0.190
Complete in 0.1 sec

In [21]:
#Set the number of repeats of the cross validation
N_outer = 5

#Naive Bayes
scores=[]
clf_nb = GaussianNB()
pipe_nb = Pipeline([['sc',MinMaxScaler()],
                    ['kbest',SelectKBest(chi2,k=2)],
                    ['clf',clf_nb]])
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='f1'))
print 'CV F1 Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='precision'))
print 'CV Precision Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='recall'))
print 'CV Recall Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Naive Bayes: 0.332 +/- 0.202
Complete in 0.2 sec
CV Precision Score of Naive Bayes: 0.389 +/- 0.270
Complete in 0.2 sec
CV Recall Score of Naive Bayes: 0.357 +/- 0.251
Complete in 0.3 sec

Multi-Layer Perceptron


In [7]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Kernel SVC
scores=[]
clf_mlp = MLPClassifier(solver='lbfgs')
pipe_mlp = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_mlp]])
params_mlp = {'clf__activation':['logistic','relu'],'clf__alpha':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_mlp = MLPClassifier(solver='lbfgs')
pipe_mlp = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_mlp]])
params_mlp = {'clf__activation':['logistic','relu'],'clf__alpha':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_mlp = MLPClassifier(solver='lbfgs')
pipe_mlp = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_mlp]])
params_mlp = {'clf__activation':['logistic','relu'],'clf__alpha':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of MLP: 0.270 +/- 0.239
Complete in 627.9 sec
CV Precision of MLP: 0.430 +/- 0.414
Complete in 622.5 sec
CV Recall Score of MLP: 0.207 +/- 0.200
Complete in 625.6 sec

In [22]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Kernel SVC
scores=[]
clf_mlp = MLPClassifier(solver='lbfgs')
pipe_mlp = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_mlp]])
params_mlp = {'clf__activation':['logistic','relu'],'clf__alpha':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of MLP: 0.272 +/- 0.240
Complete in 757.6 sec
CV Precision of MLP: 0.339 +/- 0.342
Complete in 771.9 sec
CV Recall Score of MLP: 0.310 +/- 0.312
Complete in 773.8 sec

AdaBoost Classifier


In [8]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#AdaBoost
scores=[]
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of AdaBoost: 0.338 +/- 0.202
Complete in 899.6 sec
CV F1 Score of AdaBoost: 0.376 +/- 0.274
Complete in 886.9 sec
CV F1 Score of AdaBoost: 0.338 +/- 0.215
Complete in 874.3 sec

In [23]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#AdaBoost
scores=[]
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of AdaBoost: 0.337 +/- 0.204
Complete in 1060.6 sec
CV F1 Score of AdaBoost: 0.360 +/- 0.252
Complete in 940.3 sec
CV F1 Score of AdaBoost: 0.353 +/- 0.243
Complete in 949.7 sec

Model Selection for KNN based on F1 Score


In [24]:
from IPython.core.display import display
n_reps = 1000
best_params = []

clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}

t0 = time()
for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_knn_cv = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,cv=k_fold,scoring='f1')
    gs_knn_cv = gs_knn_cv.fit(X,y)
    best_param = gs_knn_cv.best_params_
    best_param.update({'Best Score': gs_knn_cv.best_score_})
    best_params.append(best_param)

#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__n_neighbors':'N Neighbors'})
best_params_df = best_params_df.groupby('N Neighbors')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)
print time() - t0


count mean std min 25% 50% 75% max
N Neighbors
3 334.0 0.41 0.06 0.19 0.36 0.41 0.46 0.62
5 167.0 0.41 0.05 0.24 0.38 0.42 0.45 0.55
2 336.0 0.40 0.06 0.14 0.36 0.41 0.45 0.52
4 56.0 0.39 0.06 0.22 0.34 0.39 0.44 0.48
1 107.0 0.34 0.06 0.20 0.31 0.34 0.38 0.45
235.519067049

Model Selection for KNN based on precision


In [25]:
from IPython.core.display import display
n_reps = 1000
best_params = []

clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}

t0 = time()
for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_knn_cv = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,cv=k_fold,scoring='precision')
    gs_knn_cv = gs_knn_cv.fit(X,y)
    best_param = gs_knn_cv.best_params_
    best_param.update({'Best Score': gs_knn_cv.best_score_})
    best_params.append(best_param)

#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__n_neighbors':'N Neighbors'})
best_params_df = best_params_df.groupby('N Neighbors')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)
print time() - t0


count mean std min 25% 50% 75% max
N Neighbors
4 246.0 0.66 0.13 0.29 0.59 0.65 0.74 1.00
5 128.0 0.65 0.14 0.30 0.54 0.64 0.73 1.00
2 508.0 0.64 0.14 0.21 0.54 0.66 0.73 1.00
3 110.0 0.56 0.13 0.23 0.46 0.55 0.63 0.92
1 8.0 0.36 0.11 0.21 0.26 0.36 0.42 0.55
236.258044958

Model Selection for KNN based on recall


In [26]:
from IPython.core.display import display
n_reps = 1000
best_params = []

clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',MinMaxScaler()],
                     ['kbest',SelectKBest(chi2,k=2)],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}

t0 = time()
for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_knn_cv = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,cv=k_fold,scoring='recall')
    gs_knn_cv = gs_knn_cv.fit(X,y)
    best_param = gs_knn_cv.best_params_
    best_param.update({'Best Score': gs_knn_cv.best_score_})
    best_params.append(best_param)

#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__n_neighbors':'N Neighbors'})
best_params_df = best_params_df.groupby('N Neighbors')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)
print time() - t0


count mean std min 25% 50% 75% max
N Neighbors
3 131.0 0.38 0.05 0.27 0.33 0.40 0.42 0.51
1 860.0 0.37 0.05 0.10 0.33 0.38 0.40 0.51
5 9.0 0.36 0.05 0.28 0.33 0.38 0.40 0.41
237.360317945

In [ ]: