Initial Data Exploration


In [1]:
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
### Include all quantitative features. In addition, 'std_from_poi' and
### 'std_to_poi' are standardized feature (see details below).
features_list = ['poi','salary',
                 'bonus',
                 'expenses',
                 'exercised_stock_options', 'other',
                 'restricted_stock', 'shared_receipt_with_poi',
                 'std_from_poi','std_to_poi']

additional_removed = ['deferral_payments','long_term_incentive','loan_advances','restricted_stock_deferred','deferred_income',
                      'director_fees']

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
# Add new features: std_from_poi and std_to_poi by dividing the message
# to/from poi by the total sent or received messages, respectively.
data_dict.pop('TOTAL')
data_dict.pop('THE TRAVEL AGENCY IN THE PARK')
data_dict.pop('LOCKHART EUGENE E')
for key in data_dict:
    if (type(data_dict[key]['from_poi_to_this_person']) == int and
        type(data_dict[key]['from_messages']) == int):
        data_dict[key]['std_from_poi'] = \
        (data_dict[key]['from_poi_to_this_person']/
         data_dict[key]['from_messages'])
    else:
        data_dict[key]['std_from_poi'] = 0
    if (type(data_dict[key]['from_this_person_to_poi']) == int and
        type(data_dict[key]['to_messages']) == int):
        data_dict[key]['std_to_poi'] = \
        (data_dict[key]['from_this_person_to_poi']/
         data_dict[key]['to_messages'])
    else:
        data_dict[key]['std_to_poi'] = 0
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# The followings are the major steps in the analysis:
# A. Visualize the data using dimensionality reduction PCA and LDA to gain
#    further insight into the data
# B. Algorithm selection using repeated nested cross validation to choose
#    the algorithm that has highest accuracy
# C. Model selection using repeated cross validation to identify the best
#    hyperparameter values

# The following classification algorithms are used:
# 1. Logistic Regression
# 2. Random Forest Classifier
# 3. KNN Classifier
# 4. Support Vector Classifier
# 5. Neural Network: Multi-layer Perceptron Classifier
from __future__ import division
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from time import time

# For simplicity, rename features as X and labels as y
X = features
y = labels
### First, explore the dataset.
### Identify the total number of data points.
print 'Total number of data points:',np.shape(X)[0]
print 'Total number of features:', np.shape(X)[1]

X_std = StandardScaler().fit_transform(X)
pca = PCA()
X_pca = pca.fit_transform(X_std)
print 'PCA explained_variance_ratio_', pca.explained_variance_ratio_


Total number of data points: 141
Total number of features: 9
PCA explained_variance_ratio_ [ 0.44430549  0.16421304  0.12155078  0.10288132  0.05593281  0.0412784
  0.03872116  0.03111701  0.        ]

Scatterplot Matrix


In [7]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(X_std)
pg = sns.PairGrid(df)
pg.map_diag(plt.hist)
pg.map_offdiag(plt.scatter)
plt.show()


Logistic Regression


In [2]:
clf_labels = \
['Logistic Regression','KNN','Random Forest','SVC','Kernel SVC','MLP']

#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Logistic Regression
scores=[]
clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='f1'))
print 'CV F1 Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)
        
scores=[]
clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='precision'))
print 'CV Precision Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)
        
scores=[]
clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    k_fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        k_fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_lr = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,
                             cv=k_fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_lr,X,y,cv=k_fold_outer,
                                      scoring='recall'))
        
print 'CV Recall Score of Logistic Regression: %.3f +/- %.3f' %(np.mean(scores),np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)


/Users/Raga/anaconda/envs/enron/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
CV F1 Score of Logistic Regression: 0.270 +/- 0.207
Complete in 36.1 sec
/Users/Raga/anaconda/envs/enron/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
CV Precision Score of Logistic Regression: 0.331 +/- 0.350
Complete in 36.6 sec
CV Recall Score of Logistic Regression: 0.319 +/- 0.225
Complete in 35.9 sec

Random Forest Classifier


In [3]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Random Forest Classifier
scores=[]
clf_rf = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_rf]])
params_rf = {'clf__n_estimators':np.arange(1,11)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_rf = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_rf]])
params_rf = {'clf__n_estimators':np.arange(1,11)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_rf = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_rf]])
params_rf = {'clf__n_estimators':np.arange(1,11)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_rf = GridSearchCV(estimator=pipe_rf,param_grid=params_rf,
                             cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_rf,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Random Forest Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Random Forest Classifier: 0.277 +/- 0.258
Complete in 214.4 sec
CV Precision Score of Random Forest Classifier: 0.321 +/- 0.385
Complete in 216.4 sec
CV Recall Score of Random Forest Classifier: 0.310 +/- 0.282
Complete in 214.5 sec

KNN Classifier


In [4]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#KNN Classifier
scores=[]
clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_knn = KNeighborsClassifier()
pipe_knn = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_knn]])
params_knn = {'clf__n_neighbors':np.arange(1,6)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_knn = GridSearchCV(estimator=pipe_knn,param_grid=params_knn,
                              cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_knn,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of KNN Classifier: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of KNN Classifier: 0.190 +/- 0.188
Complete in 27.2 sec
CV Precision Score of KNN Classifier: 0.174 +/- 0.256
Complete in 27.8 sec
CV Recall Score of KNN Classifier: 0.204 +/- 0.189
Complete in 29.2 sec

Linear SVC


In [5]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Linear SVC
scores=[]
clf_svc = SVC()
pipe_svc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_svc]])
params_svc = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_svc = SVC()
pipe_svc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_svc]])
params_svc = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_svc = SVC()
pipe_svc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_svc]])
params_svc = {'clf__C':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_svc = GridSearchCV(estimator=pipe_svc,param_grid=params_svc,
                              cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_svc,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Linear SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Linear SVC: 0.148 +/- 0.177
Complete in 40.5 sec
CV Precision Score of Linear SVC: 0.155 +/- 0.221
Complete in 39.6 sec
CV Recall Score of Linear SVC: 0.175 +/- 0.193
Complete in 41.1 sec

Kernel SVC


In [6]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#Kernel SVC
scores=[]
clf_ksvc = SVC(kernel='rbf')
pipe_ksvc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ksvc]])
params_ksvc = {'clf__C':10.0**np.arange(-4,4),'clf__gamma':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ksvc = SVC(kernel='rbf')
pipe_ksvc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ksvc]])
params_ksvc = {'clf__C':10.0**np.arange(-4,4),'clf__gamma':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ksvc = SVC(kernel='rbf')
pipe_ksvc = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ksvc]])
params_ksvc = {'clf__C':10.0**np.arange(-4,4),'clf__gamma':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ksvc = GridSearchCV(estimator=pipe_ksvc,param_grid=params_ksvc,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_ksvc,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of Kernel SVC: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Kernel SVC: 0.154 +/- 0.202
Complete in 344.6 sec
CV Precision Score of Kernel SVC: 0.182 +/- 0.324
Complete in 342.0 sec
CV Recall Score of Kernel SVC: 0.155 +/- 0.198
Complete in 341.8 sec

Naive Bayes


In [7]:
#Set the number of repeats of the cross validation
N_outer = 5

#Naive Bayes
scores=[]
clf_nb = GaussianNB()
pipe_nb = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_nb]])
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='f1'))
print 'CV F1 Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_nb = GaussianNB()
pipe_nb = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_nb]])
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='precision'))
print 'CV Precision Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_nb = GaussianNB()
pipe_nb = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_nb]])
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(pipe_nb,X,y,cv=fold_outer,
                                      scoring='recall'))
print 'CV Recall Score of Naive Bayes: %.3f +/- %.3f' %(np.mean(scores),
                                                               np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Naive Bayes: 0.254 +/- 0.216
Complete in 0.1 sec
CV Precision Score of Naive Bayes: 0.309 +/- 0.285
Complete in 0.1 sec
CV Recall Score of Naive Bayes: 0.237 +/- 0.212
Complete in 0.1 sec

Multi-Layer Perceptron


In [8]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#MLP Classifier
scores=[]
clf_mlp = MLPClassifier(solver='lbfgs')
pipe_mlp = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_mlp]])
params_mlp = {'clf__activation':['logistic','relu'],'clf__alpha':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_mlp = MLPClassifier(solver='lbfgs')
pipe_mlp = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_mlp]])
params_mlp = {'clf__activation':['logistic','relu'],'clf__alpha':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV Precision of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_mlp = MLPClassifier(solver='lbfgs')
pipe_mlp = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_mlp]])
params_mlp = {'clf__activation':['logistic','relu'],'clf__alpha':10.0**np.arange(-4,4)}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_mlp = GridSearchCV(estimator=pipe_mlp,param_grid=params_mlp,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_mlp,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV Recall Score of MLP: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of MLP: 0.151 +/- 0.183
Complete in 1169.3 sec
CV Precision of MLP: 0.143 +/- 0.243
Complete in 997.8 sec
CV Recall Score of MLP: 0.207 +/- 0.216
Complete in 820.5 sec

AdaBoost Classifier


In [9]:
#Set the number of repeats of the cross validation
N_outer = 5
N_inner = 5

#AdaBoost
scores=[]
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='f1')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='f1'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='precision')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='precision'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)

scores=[]
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    for j in range(N_inner):
        fold_inner = StratifiedKFold(n_splits=5,shuffle=True,random_state=j)
        gs_ada = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,
                               cv=fold_inner,scoring='recall')
        scores.append(cross_val_score(gs_ada,X,y,cv=fold_outer,
                                      scoring='recall'))
print ('CV F1 Score of AdaBoost: %.3f +/- %.3f'
       %(np.mean(scores), np.std(scores)))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of AdaBoost: 0.326 +/- 0.259
Complete in 860.2 sec
CV F1 Score of AdaBoost: 0.415 +/- 0.353
Complete in 958.8 sec
CV F1 Score of AdaBoost: 0.299 +/- 0.242
Complete in 969.1 sec

Model Selection for Logistic Regression based on F1 Score


In [16]:
from IPython.core.display import display
n_reps = 1000
best_params = []

clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}

for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_lr_cv = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,cv=k_fold,scoring='f1')
    gs_lr_cv = gs_lr_cv.fit(X,y)
    best_param = gs_lr_cv.best_params_
    best_param.update({'Best Score': gs_lr_cv.best_score_})
    best_params.append(best_param)

#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__C':'C'})
best_params_df = best_params_df.groupby('C')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)


count mean std min 25% 50% 75% max
C
1.0000 53.0 0.34 0.04 0.26 0.32 0.35 0.37 0.40
0.0100 161.0 0.33 0.04 0.22 0.30 0.33 0.35 0.43
0.1000 91.0 0.33 0.04 0.20 0.31 0.34 0.36 0.40
10.0000 29.0 0.33 0.04 0.25 0.31 0.34 0.36 0.41
100.0000 3.0 0.33 0.02 0.30 0.32 0.33 0.34 0.35
0.0010 291.0 0.32 0.04 0.21 0.30 0.32 0.34 0.43
0.0001 371.0 0.31 0.04 0.19 0.29 0.32 0.34 0.43
1000.0000 1.0 0.29 NaN 0.29 0.29 0.29 0.29 0.29

Model Selection for Logistic Regression based on Precision


In [17]:
from IPython.core.display import display
n_reps = 1000
best_params = []

clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}

for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_lr_cv = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,cv=k_fold,scoring='precision')
    gs_lr_cv = gs_lr_cv.fit(X,y)
    best_param = gs_lr_cv.best_params_
    best_param.update({'Best Score': gs_lr_cv.best_score_})
    best_params.append(best_param)

#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__C':'C'})
best_params_df = best_params_df.groupby('C')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)


count mean std min 25% 50% 75% max
C
1.0000 139.0 0.53 0.10 0.27 0.47 0.52 0.60 0.77
10.0000 45.0 0.51 0.11 0.29 0.44 0.50 0.58 0.72
0.1000 473.0 0.49 0.11 0.19 0.40 0.48 0.57 0.84
100.0000 5.0 0.46 0.08 0.37 0.37 0.49 0.50 0.56
0.0100 200.0 0.44 0.10 0.23 0.37 0.45 0.51 0.78
0.0010 67.0 0.40 0.08 0.26 0.34 0.40 0.46 0.58
0.0001 71.0 0.37 0.08 0.20 0.32 0.39 0.43 0.55

Model Selection for Logistic Regression based on Recall


In [18]:
from IPython.core.display import display
n_reps = 1000
best_params = []

clf_lr = LogisticRegression(penalty='l2')
pipe_lr = Pipeline([['sc',StandardScaler()],
                    ['clf',clf_lr]])
params_lr = {'clf__C':10.0**np.arange(-4,4)}

for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_lr_cv = GridSearchCV(estimator=pipe_lr,param_grid=params_lr,cv=k_fold,scoring='recall')
    gs_lr_cv = gs_lr_cv.fit(X,y)
    best_param = gs_lr_cv.best_params_
    best_param.update({'Best Score': gs_lr_cv.best_score_})
    best_params.append(best_param)

#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__C':'C'})
best_params_df = best_params_df.groupby('C')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)


count mean std min 25% 50% 75% max
C
0.0001 999.0 0.33 0.04 0.21 0.31 0.33 0.35 0.46
1.0000 1.0 0.29 NaN 0.29 0.29 0.29 0.29 0.29

Model Selection for AdaBoost Classifier based on F1 Score


In [19]:
from IPython.core.display import display
from time import time
n_reps = 100
best_params = []

t0 = time()
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_ada_cv = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,cv=k_fold,scoring='f1')
    gs_ada_cv = gs_ada_cv.fit(X,y)
    best_param = gs_ada_cv.best_params_
    best_param.update({'Best Score': gs_ada_cv.best_score_})
    best_params.append(best_param)
print 'Complete in %.1f sec' %(time()-t0)    
#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__n_estimators':'N estimators'})
best_params_df = best_params_df.groupby('N estimators')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)


Complete in 660.6 sec
count mean std min 25% 50% 75% max
N estimators
100 6.0 0.41 0.04 0.33 0.40 0.42 0.44 0.45
70 8.0 0.40 0.07 0.30 0.35 0.41 0.45 0.48
60 4.0 0.40 0.03 0.37 0.39 0.41 0.42 0.42
10 22.0 0.38 0.08 0.20 0.35 0.37 0.41 0.55
40 11.0 0.38 0.08 0.25 0.34 0.37 0.43 0.55
50 9.0 0.37 0.06 0.28 0.33 0.39 0.42 0.43
90 14.0 0.36 0.06 0.28 0.31 0.35 0.39 0.48
80 10.0 0.36 0.05 0.27 0.34 0.35 0.40 0.44
30 7.0 0.34 0.04 0.27 0.30 0.34 0.37 0.38
20 9.0 0.33 0.08 0.18 0.31 0.35 0.37 0.45

Model Selection for AdaBoost Classifier based on precision


In [20]:
from IPython.core.display import display
from time import time
n_reps = 100
best_params = []

t0 = time()
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_ada_cv = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,cv=k_fold,scoring='precision')
    gs_ada_cv = gs_ada_cv.fit(X,y)
    best_param = gs_ada_cv.best_params_
    best_param.update({'Best Score': gs_ada_cv.best_score_})
    best_params.append(best_param)
print 'Complete in %.1f sec' %(time()-t0)    
#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__n_estimators':'N estimators'})
best_params_df = best_params_df.groupby('N estimators')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)


Complete in 653.6 sec
count mean std min 25% 50% 75% max
N estimators
60 3.0 0.55 0.12 0.42 0.50 0.57 0.62 0.66
70 6.0 0.53 0.04 0.50 0.50 0.53 0.56 0.58
80 6.0 0.52 0.11 0.36 0.47 0.52 0.57 0.68
50 11.0 0.49 0.12 0.30 0.40 0.52 0.55 0.66
10 31.0 0.48 0.10 0.32 0.40 0.48 0.53 0.70
100 7.0 0.48 0.15 0.29 0.38 0.40 0.59 0.69
40 9.0 0.47 0.15 0.25 0.38 0.48 0.51 0.78
90 10.0 0.46 0.10 0.28 0.41 0.49 0.52 0.59
30 7.0 0.45 0.06 0.37 0.40 0.46 0.49 0.52
20 10.0 0.44 0.12 0.20 0.41 0.46 0.51 0.58

Model Selection for AdaBoost Classifier based on recall


In [21]:
from IPython.core.display import display
from time import time
n_reps = 100
best_params = []

t0 = time()
clf_ada = AdaBoostClassifier(random_state=42)
pipe_ada = Pipeline([['sc',StandardScaler()],
                     ['clf',clf_ada]])
params_ada = {'clf__n_estimators':np.arange(1,11)*10}
for rep in np.arange(n_reps):
    k_fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=rep)
    gs_ada_cv = GridSearchCV(estimator=pipe_ada,param_grid=params_ada,cv=k_fold,scoring='recall')
    gs_ada_cv = gs_ada_cv.fit(X,y)
    best_param = gs_ada_cv.best_params_
    best_param.update({'Best Score': gs_ada_cv.best_score_})
    best_params.append(best_param)
print 'Complete in %.1f sec' %(time()-t0)    
#DataFrame summarizing average of best scores, frequency for each best parameter value
best_params_df = pd.DataFrame(best_params)
best_params_df = best_params_df.rename(columns={'clf__n_estimators':'N estimators'})
best_params_df = best_params_df.groupby('N estimators')['Best Score'].describe()
best_params_df = np.round(best_params_df,decimals=2).sort_values(['mean','count'],axis=0,ascending=[False,False])
display(best_params_df)


Complete in 652.7 sec
count mean std min 25% 50% 75% max
N estimators
60 8.0 0.39 0.07 0.32 0.33 0.36 0.42 0.50
30 10.0 0.36 0.06 0.26 0.32 0.37 0.41 0.45
50 9.0 0.36 0.05 0.28 0.33 0.35 0.40 0.44
70 7.0 0.36 0.07 0.27 0.30 0.38 0.42 0.45
90 7.0 0.36 0.08 0.23 0.32 0.35 0.42 0.46
80 6.0 0.35 0.04 0.28 0.32 0.36 0.38 0.38
10 26.0 0.34 0.08 0.17 0.29 0.35 0.40 0.50
20 14.0 0.34 0.04 0.27 0.32 0.33 0.34 0.44
40 12.0 0.34 0.05 0.23 0.33 0.35 0.38 0.40
100 1.0 0.28 NaN 0.28 0.28 0.28 0.28 0.28

Dummy Classifier


In [22]:
from sklearn.dummy import DummyClassifier

#Set the number of repeats of the cross validation
N_outer = 5

#Dummy Classifier
scores=[]
clf_dm = DummyClassifier(strategy='uniform')

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(clf_dm,X,y,cv=fold_outer,
                                      scoring='f1'))
print 'CV F1 Score of Dummy Classifier: %.3f +/- %.3f' %(np.mean(scores),
                                                         np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(clf_dm,X,y,cv=fold_outer,
                                      scoring='precision'))
print 'CV Precision Score of Dummy Classifier: %.3f +/- %.3f' %(np.mean(scores),
                                                         np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)

t0 = time()
for i in range(N_outer):
    fold_outer = StratifiedKFold(n_splits=5,shuffle=True,random_state=i)
    scores.append(cross_val_score(clf_dm,X,y,cv=fold_outer,
                                      scoring='recall'))
print 'CV Recall Score of Dummy Classifier: %.3f +/- %.3f' %(np.mean(scores),
                                                         np.std(scores))
print 'Complete in %.1f sec' %(time()-t0)


CV F1 Score of Dummy Classifier: 0.168 +/- 0.098
Complete in 0.0 sec
CV Precision Score of Dummy Classifier: 0.144 +/- 0.082
Complete in 0.0 sec
CV Recall Score of Dummy Classifier: 0.259 +/- 0.236
Complete in 0.0 sec

In [ ]: