Initial Data Exploration



In [1]:

    
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
### Include all quantitative features. In addition, 'std_from_poi' and
### 'std_to_poi' are standardized feature (see details below).
features_list = ['poi','salary',
                 'bonus',
                 'expenses',
                 'exercised_stock_options', 'other',
                 'restricted_stock', 'shared_receipt_with_poi',
                 'std_from_poi','std_to_poi']

additional_removed = ['deferral_payments','long_term_incentive','loan_advances','restricted_stock_deferred','deferred_income',
                      'director_fees']

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
# Add new features: std_from_poi and std_to_poi by dividing the message
# to/from poi by the total sent or received messages, respectively.
data_dict.pop('TOTAL')
data_dict.pop('THE TRAVEL AGENCY IN THE PARK')
data_dict.pop('LOCKHART EUGENE E')
for key in data_dict:
    if (type(data_dict[key]['from_poi_to_this_person']) == int and
        type(data_dict[key]['from_messages']) == int):
        data_dict[key]['std_from_poi'] = \
        (data_dict[key]['from_poi_to_this_person']/
         data_dict[key]['from_messages'])
    else:
        data_dict[key]['std_from_poi'] = 0
    if (type(data_dict[key]['from_this_person_to_poi']) == int and
        type(data_dict[key]['to_messages']) == int):
        data_dict[key]['std_to_poi'] = \
        (data_dict[key]['from_this_person_to_poi']/
         data_dict[key]['to_messages'])
    else:
        data_dict[key]['std_to_poi'] = 0
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# The followings are the major steps in the analysis:
# A. Visualize the data using dimensionality reduction PCA and LDA to gain
#    further insight into the data
# B. Algorithm selection using repeated nested cross validation to choose
#    the algorithm that has highest accuracy
# C. Model selection using repeated cross validation to identify the best
#    hyperparameter values

# The following classification algorithms are used:
# 1. Logistic Regression
# 2. Random Forest Classifier
# 3. KNN Classifier
# 4. Support Vector Classifier
# 5. Neural Network: Multi-layer Perceptron Classifier
from __future__ import division
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from time import time

# For simplicity, rename features as X and labels as y
X = features
y = labels
### First, explore the dataset.
### Identify the total number of data points.
print 'Total number of data points:',np.shape(X)[0]
print 'Total number of features:', np.shape(X)[1]

X_std = StandardScaler().fit_transform(X)
pca = PCA()
X_pca = pca.fit_transform(X_std)
print 'PCA explained_variance_ratio_', pca.explained_variance_ratio_









    



Total number of data points: 141
Total number of features: 9
PCA explained_variance_ratio_ [ 0.44430549  0.16421304  0.12155078  0.10288132  0.05593281  0.0412784
  0.03872116  0.03111701  0.        ]

Scatterplot Matrix



In [7]:

    
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(X_std)
pg = sns.PairGrid(df)
pg.map_diag(plt.hist)
pg.map_offdiag(plt.scatter)
plt.show()

Logistic Regression



In [2]:

    
clf_labels = \
['Logistic Regression','KNN','Random Forest','SVC','Kernel SVC','MLP']

#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Logistic Regression
scores=[]
clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='f1'))
print 'CV F1 Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)
        
scores=[]
clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='precision'))
print 'CV Precision Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)
        
scores=[]
clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='recall'))
        
print 'CV Recall Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)









    



/Users/Raga/anaconda/envs/enron/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)






    



CV F1 Score of Logistic Regression: 0.270 +/- 0.207
Complete in 36.1 sec






    



/Users/Raga/anaconda/envs/enron/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)






    



CV Precision Score of Logistic Regression: 0.331 +/- 0.350
Complete in 36.6 sec
CV Recall Score of Logistic Regression: 0.319 +/- 0.225
Complete in 35.9 sec

Random Forest Classifier



In [3]:

    
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Random Forest Classifier
scores=[]
clf_rf = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_rf]])
params_rf = {'clf__n_estimators':np.arange(1,11)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_rf = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_rf]])
params_rf = {'clf__n_estimators':np.arange(1,11)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_rf = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_rf]])
params_rf = {'clf__n_estimators':np.arange(1,11)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)









    



CV F1 Score of Random Forest Classifier: 0.277 +/- 0.258
Complete in 214.4 sec
CV Precision Score of Random Forest Classifier: 0.321 +/- 0.385
Complete in 216.4 sec
CV Recall Score of Random Forest Classifier: 0.310 +/- 0.282
Complete in 214.5 sec

KNN Classifier



In [4]:

    
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#KNN Classifier
scores=[]
clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)









    



CV F1 Score of KNN Classifier: 0.190 +/- 0.188
Complete in 27.2 sec
CV Precision Score of KNN Classifier: 0.174 +/- 0.256
Complete in 27.8 sec
CV Recall Score of KNN Classifier: 0.204 +/- 0.189
Complete in 29.2 sec

Linear SVC



In [5]:

    
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Linear SVC
scores=[]
clf_svc = SVC()
pipe_svc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_svc]])
params_svc = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_svc = SVC()
pipe_svc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_svc]])
params_svc = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_svc = SVC()
pipe_svc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_svc]])
params_svc = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)









    



CV F1 Score of Linear SVC: 0.148 +/- 0.177
Complete in 40.5 sec
CV Precision Score of Linear SVC: 0.155 +/- 0.221
Complete in 39.6 sec
CV Recall Score of Linear SVC: 0.175 +/- 0.193
Complete in 41.1 sec

Kernel SVC



In [6]:

    
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Kernel SVC
scores=[]
clf_ksvc = SVC(kernel='rbf')
pipe_ksvc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ksvc]])
params_ksvc = {'clf__C':10.0**np.arange(-4,4),'clf__gamma':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ksvc = SVC(kernel='rbf')
pipe_ksvc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ksvc]])
params_ksvc = {'clf__C':10.0**np.arange(-4,4),'clf__gamma':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ksvc = SVC(kernel='rbf')
pipe_ksvc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ksvc]])
params_ksvc = {'clf__C':10.0**np.arange(-4,4),'clf__gamma':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)









    



CV F1 Score of Kernel SVC: 0.154 +/- 0.202
Complete in 344.6 sec
CV Precision Score of Kernel SVC: 0.182 +/- 0.324
Complete in 342.0 sec
CV Recall Score of Kernel SVC: 0.155 +/- 0.198
Complete in 341.8 sec

Naive Bayes



In [7]:

    
#Set the number of repeats of the cross validation
N_outer = 5

#Naive Bayes
scores=[]
clf_nb = GaussianNB()
pipe_nb = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_nb]])
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='f1'))
print 'CV F1 Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_nb = GaussianNB()
pipe_nb = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_nb]])
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='precision'))
print 'CV Precision Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_nb = GaussianNB()
pipe_nb = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_nb]])
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='recall'))
print 'CV Recall Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)









    



CV F1 Score of Naive Bayes: 0.254 +/- 0.216
Complete in 0.1 sec
CV Precision Score of Naive Bayes: 0.309 +/- 0.285
Complete in 0.1 sec
CV Recall Score of Naive Bayes: 0.237 +/- 0.212
Complete in 0.1 sec

Multi-Layer Perceptron



In [8]:

    
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#MLP Classifier
scores=[]
clf_mlp = MLPClassifier(solver='lbfgs')
pipe_mlp = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_mlp]])
params_mlp = {'clf__activation':['logistic','relu'],'clf__alpha':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_mlp = MLPClassifier(solver='lbfgs')
pipe_mlp = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_mlp]])
params_mlp = {'clf__activation':['logistic','relu'],'clf__alpha':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_mlp = MLPClassifier(solver='lbfgs')
pipe_mlp = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_mlp]])
params_mlp = {'clf__activation':['logistic','relu'],'clf__alpha':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)









    



CV F1 Score of MLP: 0.151 +/- 0.183
Complete in 1169.3 sec
CV Precision of MLP: 0.143 +/- 0.243
Complete in 997.8 sec
CV Recall Score of MLP: 0.207 +/- 0.216
Complete in 820.5 sec

AdaBoost Classifier



In [9]:

    
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#AdaBoost
scores=[]
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)









    



CV F1 Score of AdaBoost: 0.326 +/- 0.259
Complete in 860.2 sec
CV F1 Score of AdaBoost: 0.415 +/- 0.353
Complete in 958.8 sec
CV F1 Score of AdaBoost: 0.299 +/- 0.242
Complete in 969.1 sec

Model Selection for Logistic Regression based on F1 Score



In [16]:

    
from IPython.core.display import display
n_reps = 1000
best_params = []

clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}

for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_lr_cv = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,cv=k_fold,scoring='f1')
    gs_lr_cv = gs_lr_cv.fit(X,y)
    best_param = gs_lr_cv.best_params_
    best_param.update({'Best Score': gs_lr_cv.best_score_})
    best_params.append(best_param)

#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__C':'C'})
best_params_df = best_params_df.groupby('C')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)

Model Selection for Logistic Regression based on Precision



In [17]:

    
from IPython.core.display import display
n_reps = 1000
best_params = []

clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}

for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_lr_cv = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,cv=k_fold,scoring='precision')
    gs_lr_cv = gs_lr_cv.fit(X,y)
    best_param = gs_lr_cv.best_params_
    best_param.update({'Best Score': gs_lr_cv.best_score_})
    best_params.append(best_param)

#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__C':'C'})
best_params_df = best_params_df.groupby('C')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)

Model Selection for Logistic Regression based on Recall



In [18]:

    
from IPython.core.display import display
n_reps = 1000
best_params = []

clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}

for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_lr_cv = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,cv=k_fold,scoring='recall')
    gs_lr_cv = gs_lr_cv.fit(X,y)
    best_param = gs_lr_cv.best_params_
    best_param.update({'Best Score': gs_lr_cv.best_score_})
    best_params.append(best_param)

#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__C':'C'})
best_params_df = best_params_df.groupby('C')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)

Model Selection for AdaBoost Classifier based on F1 Score



In [19]:

    
from IPython.core.display import display
from time import time
n_reps = 100
best_params = []

t0 = time()
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_ada_cv = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,cv=k_fold,scoring='f1')
    gs_ada_cv = gs_ada_cv.fit(X,y)
    best_param = gs_ada_cv.best_params_
    best_param.update({'Best Score': gs_ada_cv.best_score_})
    best_params.append(best_param)
print 'Complete in %.1f sec' %(time()-t0)    
#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__n_estimators':'N estimators'})
best_params_df = best_params_df.groupby('N estimators')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)









    



Complete in 660.6 sec






    







  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
    
      N estimators
      
      
      
      
      
      
      
      
    
  
  
    
      100
      6.0
      0.41
      0.04
      0.33
      0.40
      0.42
      0.44
      0.45
    
    
      70
      8.0
      0.40
      0.07
      0.30
      0.35
      0.41
      0.45
      0.48
    
    
      60
      4.0
      0.40
      0.03
      0.37
      0.39
      0.41
      0.42
      0.42
    
    
      10
      22.0
      0.38
      0.08
      0.20
      0.35
      0.37
      0.41
      0.55
    
    
      40
      11.0
      0.38
      0.08
      0.25
      0.34
      0.37
      0.43
      0.55
    
    
      50
      9.0
      0.37
      0.06
      0.28
      0.33
      0.39
      0.42
      0.43
    
    
      90
      14.0
      0.36
      0.06
      0.28
      0.31
      0.35
      0.39
      0.48
    
    
      80
      10.0
      0.36
      0.05
      0.27
      0.34
      0.35
      0.40
      0.44
    
    
      30
      7.0
      0.34
      0.04
      0.27
      0.30
      0.34
      0.37
      0.38
    
    
      20
      9.0
      0.33
      0.08
      0.18
      0.31
      0.35
      0.37
      0.45

Model Selection for AdaBoost Classifier based on precision



In [20]:

    
from IPython.core.display import display
from time import time
n_reps = 100
best_params = []

t0 = time()
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_ada_cv = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,cv=k_fold,scoring='precision')
    gs_ada_cv = gs_ada_cv.fit(X,y)
    best_param = gs_ada_cv.best_params_
    best_param.update({'Best Score': gs_ada_cv.best_score_})
    best_params.append(best_param)
print 'Complete in %.1f sec' %(time()-t0)    
#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__n_estimators':'N estimators'})
best_params_df = best_params_df.groupby('N estimators')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)









    



Complete in 653.6 sec






    







  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
    
      N estimators
      
      
      
      
      
      
      
      
    
  
  
    
      60
      3.0
      0.55
      0.12
      0.42
      0.50
      0.57
      0.62
      0.66
    
    
      70
      6.0
      0.53
      0.04
      0.50
      0.50
      0.53
      0.56
      0.58
    
    
      80
      6.0
      0.52
      0.11
      0.36
      0.47
      0.52
      0.57
      0.68
    
    
      50
      11.0
      0.49
      0.12
      0.30
      0.40
      0.52
      0.55
      0.66
    
    
      10
      31.0
      0.48
      0.10
      0.32
      0.40
      0.48
      0.53
      0.70
    
    
      100
      7.0
      0.48
      0.15
      0.29
      0.38
      0.40
      0.59
      0.69
    
    
      40
      9.0
      0.47
      0.15
      0.25
      0.38
      0.48
      0.51
      0.78
    
    
      90
      10.0
      0.46
      0.10
      0.28
      0.41
      0.49
      0.52
      0.59
    
    
      30
      7.0
      0.45
      0.06
      0.37
      0.40
      0.46
      0.49
      0.52
    
    
      20
      10.0
      0.44
      0.12
      0.20
      0.41
      0.46
      0.51
      0.58

Model Selection for AdaBoost Classifier based on recall



In [21]:

    
from IPython.core.display import display
from time import time
n_reps = 100
best_params = []

t0 = time()
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_ada_cv = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,cv=k_fold,scoring='recall')
    gs_ada_cv = gs_ada_cv.fit(X,y)
    best_param = gs_ada_cv.best_params_
    best_param.update({'Best Score': gs_ada_cv.best_score_})
    best_params.append(best_param)
print 'Complete in %.1f sec' %(time()-t0)    
#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__n_estimators':'N estimators'})
best_params_df = best_params_df.groupby('N estimators')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)









    



Complete in 652.7 sec






    







  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
    
      N estimators
      
      
      
      
      
      
      
      
    
  
  
    
      60
      8.0
      0.39
      0.07
      0.32
      0.33
      0.36
      0.42
      0.50
    
    
      30
      10.0
      0.36
      0.06
      0.26
      0.32
      0.37
      0.41
      0.45
    
    
      50
      9.0
      0.36
      0.05
      0.28
      0.33
      0.35
      0.40
      0.44
    
    
      70
      7.0
      0.36
      0.07
      0.27
      0.30
      0.38
      0.42
      0.45
    
    
      90
      7.0
      0.36
      0.08
      0.23
      0.32
      0.35
      0.42
      0.46
    
    
      80
      6.0
      0.35
      0.04
      0.28
      0.32
      0.36
      0.38
      0.38
    
    
      10
      26.0
      0.34
      0.08
      0.17
      0.29
      0.35
      0.40
      0.50
    
    
      20
      14.0
      0.34
      0.04
      0.27
      0.32
      0.33
      0.34
      0.44
    
    
      40
      12.0
      0.34
      0.05
      0.23
      0.33
      0.35
      0.38
      0.40
    
    
      100
      1.0
      0.28
      NaN
      0.28
      0.28
      0.28
      0.28
      0.28

Dummy Classifier



In [22]:

    
from sklearn.dummy import DummyClassifier

#Set the number of repeats of the cross validation
N_outer = 5

#Dummy Classifier
scores=[]
clf_dm = DummyClassifier(strategy='uniform')

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(clf_dm,X,y,cv=fold_outer,
                                      scoring='f1'))
print 'CV F1 Score of Dummy Classifier: %.3f +/- %.3f' %(np.mean(scores),
                                                         np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(clf_dm,X,y,cv=fold_outer,
                                      scoring='precision'))
print 'CV Precision Score of Dummy Classifier: %.3f +/- %.3f' %(np.mean(scores),
                                                         np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(clf_dm,X,y,cv=fold_outer,
                                      scoring='recall'))
print 'CV Recall Score of Dummy Classifier: %.3f +/- %.3f' %(np.mean(scores),
                                                         np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)









    



CV F1 Score of Dummy Classifier: 0.168 +/- 0.098
Complete in 0.0 sec
CV Precision Score of Dummy Classifier: 0.144 +/- 0.082
Complete in 0.0 sec
CV Recall Score of Dummy Classifier: 0.259 +/- 0.236
Complete in 0.0 sec



In [ ]:

	count	mean	std	min	25%	50%	75%	max
C
1.0000	53.0	0.34	0.04	0.26	0.32	0.35	0.37	0.40
0.0100	161.0	0.33	0.04	0.22	0.30	0.33	0.35	0.43
0.1000	91.0	0.33	0.04	0.20	0.31	0.34	0.36	0.40
10.0000	29.0	0.33	0.04	0.25	0.31	0.34	0.36	0.41
100.0000	3.0	0.33	0.02	0.30	0.32	0.33	0.34	0.35
0.0010	291.0	0.32	0.04	0.21	0.30	0.32	0.34	0.43
0.0001	371.0	0.31	0.04	0.19	0.29	0.32	0.34	0.43
1000.0000	1.0	0.29	NaN	0.29	0.29	0.29	0.29	0.29

	count	mean	std	min	25%	50%	75%	max
C
1.0000	139.0	0.53	0.10	0.27	0.47	0.52	0.60	0.77
10.0000	45.0	0.51	0.11	0.29	0.44	0.50	0.58	0.72
0.1000	473.0	0.49	0.11	0.19	0.40	0.48	0.57	0.84
100.0000	5.0	0.46	0.08	0.37	0.37	0.49	0.50	0.56
0.0100	200.0	0.44	0.10	0.23	0.37	0.45	0.51	0.78
0.0010	67.0	0.40	0.08	0.26	0.34	0.40	0.46	0.58
0.0001	71.0	0.37	0.08	0.20	0.32	0.39	0.43	0.55

	count	mean	std	min	25%	50%	75%	max
C
0.0001	999.0	0.33	0.04	0.21	0.31	0.33	0.35	0.46
1.0000	1.0	0.29	NaN	0.29	0.29	0.29	0.29	0.29

	count	mean	std	min	25%	50%	75%	max
N estimators
100	6.0	0.41	0.04	0.33	0.40	0.42	0.44	0.45
70	8.0	0.40	0.07	0.30	0.35	0.41	0.45	0.48
60	4.0	0.40	0.03	0.37	0.39	0.41	0.42	0.42
10	22.0	0.38	0.08	0.20	0.35	0.37	0.41	0.55
40	11.0	0.38	0.08	0.25	0.34	0.37	0.43	0.55
50	9.0	0.37	0.06	0.28	0.33	0.39	0.42	0.43
90	14.0	0.36	0.06	0.28	0.31	0.35	0.39	0.48
80	10.0	0.36	0.05	0.27	0.34	0.35	0.40	0.44
30	7.0	0.34	0.04	0.27	0.30	0.34	0.37	0.38
20	9.0	0.33	0.08	0.18	0.31	0.35	0.37	0.45

	count	mean	std	min	25%	50%	75%	max
N estimators
60	3.0	0.55	0.12	0.42	0.50	0.57	0.62	0.66
70	6.0	0.53	0.04	0.50	0.50	0.53	0.56	0.58
80	6.0	0.52	0.11	0.36	0.47	0.52	0.57	0.68
50	11.0	0.49	0.12	0.30	0.40	0.52	0.55	0.66
10	31.0	0.48	0.10	0.32	0.40	0.48	0.53	0.70
100	7.0	0.48	0.15	0.29	0.38	0.40	0.59	0.69
40	9.0	0.47	0.15	0.25	0.38	0.48	0.51	0.78
90	10.0	0.46	0.10	0.28	0.41	0.49	0.52	0.59
30	7.0	0.45	0.06	0.37	0.40	0.46	0.49	0.52
20	10.0	0.44	0.12	0.20	0.41	0.46	0.51	0.58