Enron POI Classifier

general imports and mods


In [27]:
import sys
import os
from time import time
import re
import pickle
sys.path.append("ud120-projects/tools/")
sys.path.append("ud120-projects/final_project/")
#sys.path.append("ud120-projects/maildir/")
import numpy as np
import pandas as pd
#from matplotlib import pyplot as plt
#import seaborn as sns
#%matplotlib inline

sklearn imports


In [28]:
from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_union
from sklearn.pipeline import make_pipeline
from sklearn.grid_search import GridSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn.cluster import KMeans

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

load data


In [29]:
### Load the dictionary containing the dataset
data_dict = pickle.load(open("ud120-projects/final_project/final_project_dataset.pkl", "r") )

original classifier


In [30]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features

### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

clf = GaussianNB()    # Provided to give you a starting point. Try a variety of classifiers.

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

test_classifier(clf, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so 
### anyone can run/check your results.

dump_classifier_and_data(clf, my_dataset, features_list)


GaussianNB()
	Accuracy: 0.25560	Precision: 0.18481	Recall: 0.79800	F1: 0.30011	F2: 0.47968
	Total predictions: 10000	True positives: 1596	False positives: 7040	False negatives:  404	True negatives:  960
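
As the comments above note, the tester scores with stratified shuffle split cross-validation because the dataset is so small. A minimal sketch of that splitter under the old sklearn.cross_validation API imported earlier (the n_iter and test_size values here are illustrative, not necessarily the tester's):

sss = StratifiedShuffleSplit(labels, n_iter=3, test_size=0.2, random_state=42)
for train_idx, test_idx in sss:
    # each fold preserves the POI/non-POI ratio in both halves
    print len(train_idx), len(test_idx)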

data-record snapshot


In [31]:
print my_dataset.keys()[0]
my_dataset.itervalues().next()


METTS MARK
Out[31]:
{'bonus': 600000,
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'mark.metts@enron.com',
 'exercised_stock_options': 'NaN',
 'expenses': 94299,
 'from_messages': 29,
 'from_poi_to_this_person': 38,
 'from_this_person_to_poi': 1,
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 1740,
 'poi': False,
 'restricted_stock': 585062,
 'restricted_stock_deferred': 'NaN',
 'salary': 365788,
 'shared_receipt_with_poi': 702,
 'to_messages': 807,
 'total_payments': 1061827,
 'total_stock_value': 585062}

feature selection


In [32]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

names = np.array(my_dataset.keys())
print "number of names in data: ", names.shape
print "\nfirst five names:\n", names[:5]
features_list = my_dataset.itervalues().next().keys()
features_list.sort()
features_list.remove('poi')
features_list.insert(0, 'poi')
features_list.remove('email_address')
print "\nfeatures:\n", features_list


number of names in data:  (146,)

first five names:
['METTS MARK' 'BAXTER JOHN C' 'ELLIOTT STEVEN' 'CORDES WILLIAM R'
 'HANNON KEVIN P']

features:
['poi', 'bonus', 'deferral_payments', 'deferred_income', 'director_fees', 'exercised_stock_options', 'expenses', 'from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances', 'long_term_incentive', 'other', 'restricted_stock', 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi', 'to_messages', 'total_payments', 'total_stock_value']

data-format conversion


In [33]:
### convert dictionary to pandas dataframe

df = pd.DataFrame([entry for entry in my_dataset.itervalues()])
df = df.drop('email_address', axis=1)
df = df[features_list]
#df.dtypes
#df.describe()
#df.count()
df.poi = df.poi.astype('int')
df = df.convert_objects(convert_numeric=True)

for col in list(df.columns):
    df[col] = df[col].round(decimals=3)
    
print "POI Count:\n", df.poi.value_counts()
df.head()


POI Count:
0    128
1     18
dtype: int64
Out[33]:
poi bonus deferral_payments deferred_income director_fees exercised_stock_options expenses from_messages from_poi_to_this_person from_this_person_to_poi loan_advances long_term_incentive other restricted_stock restricted_stock_deferred salary shared_receipt_with_poi to_messages total_payments total_stock_value
0 0 600000 NaN NaN NaN NaN 94299 29 38 1 NaN NaN 1740 585062 NaN 365788 702 807 1061827 585062
1 0 1200000 1295738 -1386055 NaN 6680544 11200 NaN NaN NaN NaN 1586055 2660303 3942714 NaN 267102 NaN NaN 5634343 10623258
2 0 350000 NaN -400729 NaN 4890344 78552 NaN NaN NaN NaN NaN 12961 1788391 NaN 170941 NaN NaN 211725 6678735
3 0 NaN NaN NaN NaN 651850 NaN 12 10 0 NaN NaN NaN 386335 NaN NaN 58 764 NaN 1038185
4 1 1500000 NaN -3117011 NaN 5538001 34039 32 32 21 NaN 1617011 11350 853064 NaN 243293 1035 1045 288682 6391065

In [34]:
df.describe()


Out[34]:
poi bonus deferral_payments deferred_income director_fees exercised_stock_options expenses from_messages from_poi_to_this_person from_this_person_to_poi loan_advances long_term_incentive other restricted_stock restricted_stock_deferred salary shared_receipt_with_poi to_messages total_payments total_stock_value
count 146.000000 82.000000 39.000000 49.000000 17.000000 1.020000e+02 95.000000 86.000000 86.000000 86.000000 4.0000 66.000000 93.000000 1.100000e+02 18.000000 95.000000 86.000000 86.000000 1.250000e+02 1.260000e+02
mean 0.123288 2374234.609756 1642674.153846 -1140475.142857 166804.882353 5.987054e+06 108728.915789 608.790698 64.895349 41.232558 41962500.0000 1470361.454545 919064.967742 2.321741e+06 166410.555556 562194.294737 1176.465116 2073.860465 5.081526e+06 6.773957e+06
std 0.329899 10713327.969046 5161929.973575 4025406.378506 319891.409747 3.106201e+07 533534.814109 1841.033949 86.979244 100.073111 47083208.7019 5942759.315498 4589252.907638 1.251828e+07 4201494.314703 2716369.154553 1178.317641 2582.700981 2.906172e+07 3.895777e+07
min 0.000000 70000.000000 -102500.000000 -27992891.000000 3285.000000 3.285000e+03 148.000000 12.000000 0.000000 0.000000 400000.0000 69223.000000 2.000000 -2.604490e+06 -7576788.000000 477.000000 2.000000 57.000000 1.480000e+02 -4.409300e+04
25% 0.000000 431250.000000 81573.000000 -694862.000000 98784.000000 5.278862e+05 22614.000000 22.750000 10.000000 1.000000 1600000.0000 281250.000000 1215.000000 2.540180e+05 -389621.750000 211816.000000 249.750000 541.250000 3.944750e+05 4.945102e+05
50% 0.000000 769375.000000 227449.000000 -159792.000000 108579.000000 1.310814e+06 46950.000000 41.000000 35.000000 8.000000 41762500.0000 442035.000000 52382.000000 4.517400e+05 -146975.000000 259996.000000 740.500000 1211.000000 1.101393e+06 1.102872e+06
75% 0.000000 1200000.000000 1002671.500000 -38346.000000 113784.000000 2.547724e+06 79952.500000 145.500000 72.250000 24.750000 82125000.0000 938672.000000 362096.000000 1.002370e+06 -75009.750000 312117.000000 1888.250000 2634.750000 2.093263e+06 2.949847e+06
max 1.000000 97343619.000000 32083396.000000 -833.000000 1398517.000000 3.117640e+08 5235198.000000 14368.000000 528.000000 609.000000 83925000.0000 48521928.000000 42667589.000000 1.303223e+08 15456290.000000 26704229.000000 5521.000000 15149.000000 3.098866e+08 4.345095e+08

separate labels from features


In [35]:
# create labels
y = df.poi.values
print y.shape
print y[:5]


(146,)
[0 0 0 0 1]

In [36]:
# create initial features
X = df.drop('poi', axis=1).values
print X.shape
print X[:3]


(146, 19)
[[  6.00000000e+05              nan              nan              nan
               nan   9.42990000e+04   2.90000000e+01   3.80000000e+01
    1.00000000e+00              nan              nan   1.74000000e+03
    5.85062000e+05              nan   3.65788000e+05   7.02000000e+02
    8.07000000e+02   1.06182700e+06   5.85062000e+05]
 [  1.20000000e+06   1.29573800e+06  -1.38605500e+06              nan
    6.68054400e+06   1.12000000e+04              nan              nan
               nan              nan   1.58605500e+06   2.66030300e+06
    3.94271400e+06              nan   2.67102000e+05              nan
               nan   5.63434300e+06   1.06232580e+07]
 [  3.50000000e+05              nan  -4.00729000e+05              nan
    4.89034400e+06   7.85520000e+04              nan              nan
               nan              nan              nan   1.29610000e+04
    1.78839100e+06              nan   1.70941000e+05              nan
               nan   2.11725000e+05   6.67873500e+06]]

outlier removal


In [37]:
### Task 2: Remove outliers

# hand-tuned to remove ~5% (in this case, 7%)
num_rows = X.shape[0]
num_cols = X.shape[1]
rows_to_remove = set()

for i in xrange(num_cols):
    point_five_percentile = np.percentile(X[:,i], 0.5)
    ninety_nine_point_five_percentile = np.percentile(X[:,i], 99.5)
    
    for j in xrange(num_rows):
        if X[j,i] < point_five_percentile or X[j,i] > ninety_nine_point_five_percentile:
            rows_to_remove.add(j)

print X.shape

print "names associated with outlier-containing rows to remove:"
for i in rows_to_remove:
    # report before deleting so the indices still line up with X, y, and names
    print "\t", names[i], " (poi? {})".format(y[i])

X = np.delete(X, list(rows_to_remove), axis=0)
y = np.delete(y, list(rows_to_remove))
names = np.delete(names, list(rows_to_remove))

print "\nnew X shape: ", X.shape
print "\nnew y shape: ", y.shape

print "\ntotal rows removed: ", len(rows_to_remove), "({})".format(round(len(rows_to_remove)/float(num_rows), 2))


(146, 19)
names associated with outlier-containing rows to remove:
	MENDELSOHN JOHN  (poi? 0)
	PICKERING MARK R  (poi? 0)
	FOY JOE  (poi? 0)
	TOTAL  (poi? 0)
	BANNANTINE JAMES M  (poi? 0)
	WALLS JR ROBERT H  (poi? 0)
	BHATNAGAR SANJAY  (poi? 0)
	BELFER ROBERT  (poi? 0)
	HICKERSON GARY J  (poi? 0)
	DODSON KEITH  (poi? 0)

new X shape:  (136, 19)

new y shape:  (136,)

total rows removed:  10 (0.07)
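
One caveat on the cell above: np.percentile does not skip NaNs, so the cut-offs for sparse columns are somewhat unreliable. A NaN-aware variant would look roughly like this (a sketch only, not re-run here since X has already been filtered; assumes numpy >= 1.9 for np.nanpercentile):

rows_to_remove = set()
for i in xrange(num_cols):
    lo = np.nanpercentile(X[:, i], 0.5)    # NaNs excluded from the percentile
    hi = np.nanpercentile(X[:, i], 99.5)
    for j in xrange(num_rows):
        # comparisons against NaN are False, so NaN entries are never flagged
        if X[j, i] < lo or X[j, i] > hi:
            rows_to_remove.add(j)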

'NaN' imputation


In [38]:
# impute 'NaN' values to column medians
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
imp.fit(X)
X = imp.transform(X)
print X[:3]

imp_values = imp.statistics_


[[  6.00000000e+05   2.43952000e+05  -1.79896000e+05   1.08579000e+05
    1.29704900e+06   9.42990000e+04   2.90000000e+01   3.80000000e+01
    1.00000000e+00   4.17625000e+07   4.22158000e+05   1.74000000e+03
    5.85062000e+05  -1.46975000e+05   3.65788000e+05   7.02000000e+02
    8.07000000e+02   1.06182700e+06   5.85062000e+05]
 [  1.20000000e+06   1.29573800e+06  -1.38605500e+06   1.08579000e+05
    6.68054400e+06   1.12000000e+04   4.40000000e+01   3.70000000e+01
    1.10000000e+01   4.17625000e+07   1.58605500e+06   2.66030300e+06
    3.94271400e+06  -1.46975000e+05   2.67102000e+05   7.72000000e+02
    1.43300000e+03   5.63434300e+06   1.06232580e+07]
 [  3.50000000e+05   2.43952000e+05  -4.00729000e+05   1.08579000e+05
    4.89034400e+06   7.85520000e+04   4.40000000e+01   3.70000000e+01
    1.10000000e+01   4.17625000e+07   4.22158000e+05   1.29610000e+04
    1.78839100e+06  -1.46975000e+05   1.70941000e+05   7.72000000e+02
    1.43300000e+03   2.11725000e+05   6.67873500e+06]]

feature creation


In [39]:
### Task 3: Create new feature(s)

def selectkbest():
    # select K best to explore feature engineering possibilities
    selector = SelectKBest().fit(X, y)
    
    features = features_list[1:]

    high_scores = []

    print "SelectKBest SCORES:"
    selectkbest_scores = np.round(selector.scores_, 2)
    for i in xrange(len(features)):
        print "\t", features[i], ": ", selectkbest_scores[i]
        if selectkbest_scores[i] > 2:  # hand-picked cut-off for "high" scores
            high_scores.append(i)

    print "\nSelectKBest HIGH SCORES:"
    for i in high_scores:
        print "\t", features[i], "[{}]".format(i), ": ", selectkbest_scores[i]

In [40]:
selectkbest()


SelectKBest SCORES:
	bonus :  14.86
	deferral_payments :  0.34
	deferred_income :  9.17
	director_fees :  0.39
	exercised_stock_options :  25.87
	expenses :  0.91
	from_messages :  0.23
	from_poi_to_this_person :  3.54
	from_this_person_to_poi :  1.8
	loan_advances :  4.43
	long_term_incentive :  7.68
	other :  3.59
	restricted_stock :  7.81
	restricted_stock_deferred :  0.27
	salary :  11.39
	shared_receipt_with_poi :  6.35
	to_messages :  0.49
	total_payments :  8.22
	total_stock_value :  22.17

SelectKBest HIGH SCORES:
	bonus [0] :  14.86
	deferred_income [2] :  9.17
	exercised_stock_options [4] :  25.87
	from_poi_to_this_person [7] :  3.54
	loan_advances [9] :  4.43
	long_term_incentive [10] :  7.68
	other [11] :  3.59
	restricted_stock [12] :  7.81
	salary [14] :  11.39
	shared_receipt_with_poi [15] :  6.35
	total_payments [17] :  8.22
	total_stock_value [18] :  22.17

In [41]:
def create_new_feature(X, col1, col2, operation, feature_name):
    # combine two existing columns into a new one and append it to X;
    # note: this also mutates the global features_list
    features_list.append(feature_name)
    
    new_col = []
    if operation == '*':
        new_col = (X[:,col1] * X[:,col2])
    elif operation == '/':
        new_col = np.true_divide(X[:,col1], X[:, col2])
    
    new_col.shape = (new_col.shape[0], 1)
    #print new_col.shape

    X = np.hstack((X, new_col))
    #print X.shape
    
    return X

In [42]:
X = create_new_feature(X, 0, 14, '*', 'selectkbest_product')  # bonus (col 0) * salary (col 14), two of the top SelectKBest scorers
selectkbest()


SelectKBest SCORES:
	bonus :  14.86
	deferral_payments :  0.34
	deferred_income :  9.17
	director_fees :  0.39
	exercised_stock_options :  25.87
	expenses :  0.91
	from_messages :  0.23
	from_poi_to_this_person :  3.54
	from_this_person_to_poi :  1.8
	loan_advances :  4.43
	long_term_incentive :  7.68
	other :  3.59
	restricted_stock :  7.81
	restricted_stock_deferred :  0.27
	salary :  11.39
	shared_receipt_with_poi :  6.35
	to_messages :  0.49
	total_payments :  8.22
	total_stock_value :  22.17
	selectkbest_product :  17.22

SelectKBest HIGH SCORES:
	bonus [0] :  14.86
	deferred_income [2] :  9.17
	exercised_stock_options [4] :  25.87
	from_poi_to_this_person [7] :  3.54
	loan_advances [9] :  4.43
	long_term_incentive [10] :  7.68
	other [11] :  3.59
	restricted_stock [12] :  7.81
	salary [14] :  11.39
	shared_receipt_with_poi [15] :  6.35
	total_payments [17] :  8.22
	total_stock_value [18] :  22.17
	selectkbest_product [19] :  17.22


In [43]:
corr_df = pd.DataFrame(X)
corr_df['label'] = y

In [44]:
corr_df.corr()['label'].values[:-1]


Out[44]:
array([ 0.31599299, -0.05044821, -0.25308716,  0.05375452,  0.40223908,
        0.08222445, -0.04157864,  0.16044261,  0.11522852,  0.17892754,
        0.23280562,  0.16160734,  0.23473738,  0.04504419,  0.2798448 ,
        0.21263471,  0.06060255,  0.24038491,  0.37674133,  0.33744415])

In [45]:
corr = corr_df.corr()['label'].values[:-1]

In [46]:
for i in xrange(len(corr)):
    if abs(corr[i]) > 0.2:
        print i, abs(corr[i])


0 0.315992990708
2 0.253087162815
4 0.402239076277
10 0.232805616467
12 0.234737383429
14 0.27984480025
15 0.212634714349
17 0.240384913661
18 0.376741329429
19 0.337444151372
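
The bare indices above map back to feature names as follows (a quick sketch; features_list still leads with 'poi', and index 19 is the engineered selectkbest_product):

feature_names = features_list[1:]  # drop the 'poi' label column
for i in xrange(len(corr)):
    if abs(corr[i]) > 0.2:
        print feature_names[i], round(abs(corr[i]), 3)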

machine learn!


In [47]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

def grid_searcher(clf, pca_skb, output):
    
    t0 = time()
    
    even_range = range(2,X.shape[1],2)
    random_state = [42]
    t_or_f = [True, False]
    #powers_of_ten = [10**x for x in range(-5,5)]
    logspace = np.logspace(-5, 5, 10)
    #kernels = ['linear', 'poly', 'rbf', 'sigmoid']  # takes too long, unfortunately
    kernels = ['rbf']
    criteria = ['gini', 'entropy']
    splitters = ['best', 'random']
    max_features = ['auto', 'sqrt', 'log2', None]
    
    # modify features, remove features via pipeline
    
    pipeline = []
    params = dict()
    pipeline_clf = ""
    
    if pca_skb == "pca_skb":
        #pipeline = make_pipeline(MinMaxScaler(), make_union(RandomizedPCA(), SelectKBest()), clf)
        pipeline = make_pipeline(StandardScaler(), make_union(RandomizedPCA(), SelectKBest()), clf)

        params = dict(featureunion__randomizedpca__n_components = even_range,
                      featureunion__randomizedpca__iterated_power = [1, 2, 3],
                      featureunion__randomizedpca__whiten = t_or_f,
                      featureunion__randomizedpca__random_state = random_state,
                      featureunion__selectkbest__k = even_range)   
        
    elif pca_skb == "pca":
        #pipeline = make_pipeline(MinMaxScaler(), RandomizedPCA(), clf)
        pipeline = make_pipeline(StandardScaler(), RandomizedPCA(), clf)

        params = dict(randomizedpca__n_components = [4],
                      randomizedpca__iterated_power = [1, 2, 3],
                      randomizedpca__whiten = t_or_f,
                      randomizedpca__random_state = random_state)   
        
    elif pca_skb == "skb":
        #pipeline = make_pipeline(MinMaxScaler(), SelectKBest(), clf)
        pipeline = make_pipeline(StandardScaler(), SelectKBest(), clf)

        params = dict(selectkbest__k = [4])   
    
    pipeline_clf = pipeline.steps[2][0]  # name of the classifier step, e.g. 'gaussiannb'
    
    if pipeline_clf == 'decisiontreeclassifier' or pipeline_clf == 'randomforestclassifier':
        params["{}__criterion".format(pipeline_clf)] = criteria
        #params["{}__splitter".format(pipeline_clf)] = splitters
        params["{}__max_features".format(pipeline_clf)] = max_features
        #params["{}__min_samples_split".format(pipeline_clf)] = even_range
        params["{}__class_weight".format(pipeline_clf)] = ['auto', None]
        params["{}__random_state".format(pipeline_clf)] = random_state
    
    if pipeline_clf == 'svc':
        params['svc__C'] = logspace
        params['svc__kernel'] = kernels
        #params['svc__degree'] = [1,2,3,4,5]  # for use with 'poly'
        params['svc__gamma'] = logspace
        params['svc__random_state'] = random_state
        
    # cross validation    
    cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=random_state[0])
    
    # tune parameters
    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)

    grid_search.fit(X, y)

    if output == True:
        print "*"*15, pipeline_clf.upper(), "*"*15
        print "\nBEST SCORE: ", grid_search.best_score_, "\n"
        print "\nBEST PARAMS: ", grid_search.best_params_, "\n"

    # split into training and testing data for reporting results
    if output == True:
        print "#"*50
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state[0])

    if output == True:
        print "\nBEST ESTIMATOR:"
    clf = grid_search.best_estimator_
    if output == True:
        print clf
    clf.fit(X_train, y_train)
    
    if pca_skb == "skb" or pca_skb == "pca_skb":
    
        if output == True:
            print "\nSelectKBest SCORES:"
        features = features_list[1:]
        
        selectkbest_scores = clf.steps[1][1].scores_ if pca_skb == "skb" else clf.steps[1][1].transformer_list[1][1].scores_
        
        selectkbest_scores = np.round(selectkbest_scores, 2)
        for i in xrange(len(features)):
            if output == True:
                print "\t", features[i], ": ", selectkbest_scores[i]
    
    if pipeline_clf == 'decisiontreeclassifier' or pipeline_clf == 'randomforestclassifier':
        if output == True:
            print "\n{} FEATURE IMPORTANCES:".format(pipeline_clf.upper())
            print clf.steps[2][1].feature_importances_
    
    if output == True:
        print "\n", "#"*50
    
        print "\nPREDICTIONS:"

        print "\nground truth:\n\t", y_test 
    
    y_pred = clf.predict(X_test)
    if output == True:
        print "\npredictions:\n\t", y_pred

        print "\nscore: ", clf.score(X_test, y_test)

        print "\nEVALUATIONS:"
        print "\nconfusion matrix:\n", confusion_matrix(y_test, y_pred)
    
        print "\nclassification report:\n", classification_report(y_test, y_pred, target_names=["non-poi", "poi"])
    
        print "ELAPSED TIME: ", round(time()-t0,3), "s"
    
    return clf

reminder: original classifier results

GaussianNB()
	Accuracy: 0.25560	Precision: 0.18481	Recall: 0.79800	F1: 0.30011	F2: 0.47968
	Total predictions: 10000	True positives: 1596	False positives: 7040	False negatives:  404	True negatives:  960


In [48]:
# prepare for Udacity tester

# remove email-address strings (not usable as numeric features)
for key in my_dataset.keys():
    my_dataset[key].pop('email_address')
    
# remove outliers from original data set
for key in my_dataset.keys():
    if key not in names:
        my_dataset.pop(key)

# replace 'NaN's
for key in my_dataset.keys():
    for sub_key in my_dataset[key].keys():
        if my_dataset[key][sub_key] == 'NaN':
            i = (df.columns.get_loc(sub_key) - 1)  # -1 because imp_values has no 'poi' column
            my_dataset[key][sub_key] = imp_values[i]
            
# add created feature
i = 0
for key in my_dataset.keys():
    my_dataset[key]['selectkbest_product'] = X[i,-1]
    i += 1
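
As a quick sanity check (a sketch, not a cell run above), the cleaned dictionary should now line up with the outlier-filtered feature matrix:

print len(my_dataset)      # expect 136 (146 records minus the 10 outlier rows)
print X.shape              # expect (136, 20) after appending 'selectkbest_product'
print len(features_list)   # expect 21: 'poi' + 19 originals + 1 engineered feature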

In [49]:
def udacity_tester():
    # use Udacity tester
    print "\nUDACITY TESTER RESULTS: "
    test_classifier(clf, my_dataset, features_list)

new classifier results


In [50]:
for classifier in [GaussianNB(), DecisionTreeClassifier()]:
    for transformer in ['pca', 'skb', 'pca_skb']:
        clf = grid_searcher(classifier, transformer, output=True)
        udacity_tester()


*************** GAUSSIANNB ***************

BEST SCORE:  0.817857142857 


BEST PARAMS:  {'randomizedpca__whiten': True, 'randomizedpca__n_components': 4, 'randomizedpca__iterated_power': 1, 'randomizedpca__random_state': 42} 

##################################################

BEST ESTIMATOR:
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomizedpca', RandomizedPCA(copy=True, iterated_power=1, n_components=4, random_state=42,
       whiten=True)), ('gaussiannb', GaussianNB())])

##################################################

PREDICTIONS:

ground truth:
	[0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0]

predictions:
	[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]

score:  0.821428571429

EVALUATIONS:

confusion matrix:
[[22  1]
 [ 4  1]]

classification report:
             precision    recall  f1-score   support

    non-poi       0.85      0.96      0.90        23
        poi       0.50      0.20      0.29         5

avg / total       0.78      0.82      0.79        28

ELAPSED TIME:  0.215 s

UDACITY TESTER RESULTS: 
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomizedpca', RandomizedPCA(copy=True, iterated_power=1, n_components=4, random_state=42,
       whiten=True)), ('gaussiannb', GaussianNB())])
	Accuracy: 0.83150	Precision: 0.37368	Recall: 0.26550	F1: 0.31044	F2: 0.28182
	Total predictions: 14000	True positives:  531	False positives:  890	False negatives: 1469	True negatives: 11110

*************** GAUSSIANNB ***************

BEST SCORE:  0.817857142857 


BEST PARAMS:  {'selectkbest__k': 4} 

##################################################

BEST ESTIMATOR:
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('selectkbest', SelectKBest(k=4, score_func=<function f_classif at 0x1093306e0>)), ('gaussiannb', GaussianNB())])

SelectKBest SCORES:
	bonus :  6.66
	deferral_payments :  0.11
	deferred_income :  4.6
	director_fees :  0.25
	exercised_stock_options :  20.65
	expenses :  3.78
	from_messages :  0.17
	from_poi_to_this_person :  0.19
	from_this_person_to_poi :  2.01
	loan_advances :  0.14
	long_term_incentive :  2.2
	other :  0.01
	restricted_stock :  2.79
	restricted_stock_deferred :  0.26
	salary :  4.33
	shared_receipt_with_poi :  2.87
	to_messages :  0.08
	total_payments :  1.69
	total_stock_value :  19.08
	selectkbest_product :  8.96

##################################################

PREDICTIONS:

ground truth:
	[0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0]

predictions:
	[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]

score:  0.821428571429

EVALUATIONS:

confusion matrix:
[[22  1]
 [ 4  1]]

classification report:
             precision    recall  f1-score   support

    non-poi       0.85      0.96      0.90        23
        poi       0.50      0.20      0.29         5

avg / total       0.78      0.82      0.79        28

ELAPSED TIME:  0.037 s

UDACITY TESTER RESULTS: 
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('selectkbest', SelectKBest(k=4, score_func=<function f_classif at 0x1093306e0>)), ('gaussiannb', GaussianNB())])
	Accuracy: 0.84079	Precision: 0.41636	Recall: 0.28500	F1: 0.33838	F2: 0.30419
	Total predictions: 14000	True positives:  570	False positives:  799	False negatives: 1430	True negatives: 11201

*************** GAUSSIANNB ***************

BEST SCORE:  0.814285714286 


BEST PARAMS:  {'featureunion__randomizedpca__whiten': True, 'featureunion__randomizedpca__random_state': 42, 'featureunion__selectkbest__k': 8, 'featureunion__randomizedpca__n_components': 6, 'featureunion__randomizedpca__iterated_power': 1} 

##################################################

BEST ESTIMATOR:
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('randomizedpca', RandomizedPCA(copy=True, iterated_power=1, n_components=6, random_state=42,
       whiten=True)), ('selectkbest', SelectKBest(k=8, score_func=<function f_classif at 0x1093306e0>))],
       transformer_weights=None)), ('gaussiannb', GaussianNB())])

SelectKBest SCORES:
	bonus :  6.66
	deferral_payments :  0.11
	deferred_income :  4.6
	director_fees :  0.25
	exercised_stock_options :  20.65
	expenses :  3.78
	from_messages :  0.17
	from_poi_to_this_person :  0.19
	from_this_person_to_poi :  2.01
	loan_advances :  0.14
	long_term_incentive :  2.2
	other :  0.01
	restricted_stock :  2.79
	restricted_stock_deferred :  0.26
	salary :  4.33
	shared_receipt_with_poi :  2.87
	to_messages :  0.08
	total_payments :  1.69
	total_stock_value :  19.08
	selectkbest_product :  8.96

##################################################

PREDICTIONS:

ground truth:
	[0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0]

predictions:
	[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0]

score:  0.857142857143

EVALUATIONS:

confusion matrix:
[[22  1]
 [ 3  2]]

classification report:
             precision    recall  f1-score   support

    non-poi       0.88      0.96      0.92        23
        poi       0.67      0.40      0.50         5

avg / total       0.84      0.86      0.84        28

ELAPSED TIME:  23.39 s

UDACITY TESTER RESULTS: 
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('randomizedpca', RandomizedPCA(copy=True, iterated_power=1, n_components=6, random_state=42,
       whiten=True)), ('selectkbest', SelectKBest(k=8, score_func=<function f_classif at 0x1093306e0>))],
       transformer_weights=None)), ('gaussiannb', GaussianNB())])
	Accuracy: 0.83564	Precision: 0.40027	Recall: 0.30200	F1: 0.34426	F2: 0.31759
	Total predictions: 14000	True positives:  604	False positives:  905	False negatives: 1396	True negatives: 11095

*************** DECISIONTREECLASSIFIER ***************

BEST SCORE:  0.814285714286 


BEST PARAMS:  {'randomizedpca__iterated_power': 1, 'randomizedpca__random_state': 42, 'randomizedpca__n_components': 4, 'decisiontreeclassifier__random_state': 42, 'decisiontreeclassifier__criterion': 'gini', 'decisiontreeclassifier__class_weight': 'auto', 'randomizedpca__whiten': True, 'decisiontreeclassifier__max_features': None} 

##################################################

BEST ESTIMATOR:
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomizedpca', RandomizedPCA(copy=True, iterated_power=1, n_components=4, random_state=42,
       whiten=True)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight='auto', criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=42, splitter='best'))])

DECISIONTREECLASSIFIER FEATURE IMPORTANCES:
[ 0.3697774   0.1154305   0.22757212  0.28721998]

##################################################

PREDICTIONS:

ground truth:
	[0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0]

predictions:
	[0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1]

score:  0.75

EVALUATIONS:

confusion matrix:
[[19  4]
 [ 3  2]]

classification report:
             precision    recall  f1-score   support

    non-poi       0.86      0.83      0.84        23
        poi       0.33      0.40      0.36         5

avg / total       0.77      0.75      0.76        28

ELAPSED TIME:  3.892 s

UDACITY TESTER RESULTS: 
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomizedpca', RandomizedPCA(copy=True, iterated_power=1, n_components=4, random_state=42,
       whiten=True)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight='auto', criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=42, splitter='best'))])
	Accuracy: 0.78343	Precision: 0.24277	Recall: 0.24350	F1: 0.24314	F2: 0.24335
	Total predictions: 14000	True positives:  487	False positives: 1519	False negatives: 1513	True negatives: 10481

*************** DECISIONTREECLASSIFIER ***************

BEST SCORE:  0.814285714286 


BEST PARAMS:  {'decisiontreeclassifier__class_weight': 'auto', 'decisiontreeclassifier__random_state': 42, 'selectkbest__k': 4, 'decisiontreeclassifier__criterion': 'gini', 'decisiontreeclassifier__max_features': 'auto'} 

##################################################

BEST ESTIMATOR:
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('selectkbest', SelectKBest(k=4, score_func=<function f_classif at 0x1093306e0>)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight='auto', criterion='gini', max_depth=None,
            max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=42, splitter='best'))])

SelectKBest SCORES:
	bonus :  6.66
	deferral_payments :  0.11
	deferred_income :  4.6
	director_fees :  0.25
	exercised_stock_options :  20.65
	expenses :  3.78
	from_messages :  0.17
	from_poi_to_this_person :  0.19
	from_this_person_to_poi :  2.01
	loan_advances :  0.14
	long_term_incentive :  2.2
	other :  0.01
	restricted_stock :  2.79
	restricted_stock_deferred :  0.26
	salary :  4.33
	shared_receipt_with_poi :  2.87
	to_messages :  0.08
	total_payments :  1.69
	total_stock_value :  19.08
	selectkbest_product :  8.96

DECISIONTREECLASSIFIER FEATURE IMPORTANCES:
[ 0.32138279  0.35849755  0.01851852  0.30160114]

##################################################

PREDICTIONS:

ground truth:
	[0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0]

predictions:
	[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0]

score:  0.821428571429

EVALUATIONS:

confusion matrix:
[[21  2]
 [ 3  2]]

classification report:
             precision    recall  f1-score   support

    non-poi       0.88      0.91      0.89        23
        poi       0.50      0.40      0.44         5

avg / total       0.81      0.82      0.81        28

ELAPSED TIME:  0.536 s

UDACITY TESTER RESULTS: 
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('selectkbest', SelectKBest(k=4, score_func=<function f_classif at 0x1093306e0>)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight='auto', criterion='gini', max_depth=None,
            max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=42, splitter='best'))])
	Accuracy: 0.80421	Precision: 0.27002	Recall: 0.21750	F1: 0.24093	F2: 0.22630
	Total predictions: 14000	True positives:  435	False positives: 1176	False negatives: 1565	True negatives: 10824

*************** DECISIONTREECLASSIFIER ***************

BEST SCORE:  0.853571428571 


BEST PARAMS:  {'featureunion__randomizedpca__n_components': 4, 'decisiontreeclassifier__random_state': 42, 'featureunion__randomizedpca__whiten': True, 'decisiontreeclassifier__criterion': 'entropy', 'featureunion__randomizedpca__iterated_power': 1, 'decisiontreeclassifier__class_weight': None, 'featureunion__randomizedpca__random_state': 42, 'featureunion__selectkbest__k': 4, 'decisiontreeclassifier__max_features': 'log2'} 

##################################################

BEST ESTIMATOR:
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('randomizedpca', RandomizedPCA(copy=True, iterated_power=1, n_components=4, random_state=42,
       whiten=True)), ('selectkbest', SelectKBest(k=4, s...  min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=42, splitter='best'))])

SelectKBest SCORES:
	bonus :  6.66
	deferral_payments :  0.11
	deferred_income :  4.6
	director_fees :  0.25
	exercised_stock_options :  20.65
	expenses :  3.78
	from_messages :  0.17
	from_poi_to_this_person :  0.19
	from_this_person_to_poi :  2.01
	loan_advances :  0.14
	long_term_incentive :  2.2
	other :  0.01
	restricted_stock :  2.79
	restricted_stock_deferred :  0.26
	salary :  4.33
	shared_receipt_with_poi :  2.87
	to_messages :  0.08
	total_payments :  1.69
	total_stock_value :  19.08
	selectkbest_product :  8.96

DECISIONTREECLASSIFIER FEATURE IMPORTANCES:
[ 0.32600091  0.11625127  0.08394309  0.09311544  0.0374992   0.08429519
  0.1467443   0.11215061]

##################################################

PREDICTIONS:

ground truth:
	[0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0]

predictions:
	[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0]

score:  0.785714285714

EVALUATIONS:

confusion matrix:
[[21  2]
 [ 4  1]]

classification report:
             precision    recall  f1-score   support

    non-poi       0.84      0.91      0.87        23
        poi       0.33      0.20      0.25         5

avg / total       0.75      0.79      0.76        28

ELAPSED TIME:  450.664 s

UDACITY TESTER RESULTS: 
Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('randomizedpca', RandomizedPCA(copy=True, iterated_power=1, n_components=4, random_state=42,
       whiten=True)), ('selectkbest', SelectKBest(k=4, s...  min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=42, splitter='best'))])
	Accuracy: 0.79064	Precision: 0.26852	Recall: 0.27000	F1: 0.26926	F2: 0.26970
	Total predictions: 14000	True positives:  540	False positives: 1471	False negatives: 1460	True negatives: 10529

data dump for Udacity


In [2724]:
#dump_classifier_and_data(clf, my_dataset, features_list)

average best-scoring predictions


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clfs = dict()

for classifier in [GaussianNB(), DecisionTreeClassifier()]:

    clfs[str(classifier)] = dict()
    
    for transformer in ['pca', 'skb', 'pca_skb']:
        clf = grid_searcher(classifier, transformer, output=False)
        clf.fit(X_train, y_train)
        
        clfs[str(classifier)][transformer] = dict()
        
        y_pred = clf.predict(X_test)
        clfs[str(classifier)][transformer]['predictions'] = y_pred
        
        clf_report = classification_report(y_test, y_pred, target_names=["non-poi", "poi"])
        clfs[str(classifier)][transformer]['clf_report'] = clf_report

In [146]:
clfs['GaussianNB()']['pca']['predictions']


Out[146]:
array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0])

In [152]:
clfs['GaussianNB()']['pca']['clf_report']


Out[152]:
'             precision    recall  f1-score   support\n\n    non-poi       0.85      0.96      0.90        23\n        poi       0.50      0.20      0.29         5\n\navg / total       0.78      0.82      0.79        28\n'
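
The 'avg / total' row of that report is a support-weighted average, so rather than regex-parsing the string (as the next cell does), the same number could be computed directly with f1_score and average='weighted' (a sketch):

from sklearn.metrics import f1_score

y_pred = clfs['GaussianNB()']['pca']['predictions']
print round(f1_score(y_test, y_pred, average='weighted'), 2)   # 0.79, matching the report above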

In [154]:
best_predictions = dict()
for clsfr in clfs.keys():
    best_predictions[clsfr] = dict()
    #print clsfr
    best_f1_score = 0.0
    for transformer in clfs[clsfr].keys():
        clf_report = clfs[clsfr][transformer]['clf_report']
        # strip letters from the report, leaving numbers; [-2] is the
        # 'avg / total' f1 (the final token is the support count)
        clf_report = [re.sub(r"[a-z]|\n", '', x) for x in clf_report.split(" ")]
        clf_report = filter(None, clf_report)
        #print "\t", transformer
        f1_score = float(clf_report[-2])
        print f1_score
        if f1_score > best_f1_score:
            best_f1_score = f1_score
            best_predictions[clsfr]['transformer'] = transformer
            best_predictions[clsfr]['f1_score'] = f1_score
            best_predictions[clsfr]['predictions'] = clfs[clsfr][transformer]['predictions']


0.76
0.76
0.81
0.79
0.84
0.79

In [155]:
# take the element-wise max of the best 0/1 prediction vectors: effectively a
# logical OR, i.e. a union of each classifier's positive votes
averaged_best_recall_predictions = np.zeros((28,))

for clsfr in best_predictions.keys():
    print clsfr[:10]
    print "\tbest transformer: ", best_predictions[clsfr]['transformer']
    print "\tbest f1 score: ", best_predictions[clsfr]['f1_score']
    print "\tbest predictions: ", best_predictions[clsfr]['predictions']
    
    averaged_best_recall_predictions = np.maximum(averaged_best_recall_predictions, best_predictions[clsfr]['predictions'])
    
print "\naveraged best predictions: {}".format(averaged_best_recall_predictions.astype('int'))

print "\nresulting confusion matrix:\n", confusion_matrix(y_test, averaged_best_recall_predictions)

print "\nresulting classification report:\n", classification_report(y_test, averaged_best_recall_predictions, target_names=["non-poi", "poi"])


GaussianNB
	best transformer:  pca_skb
	best f1 score:  0.84
	best predictions:  [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0]
DecisionTr
	best transformer:  skb
	best f1 score:  0.81
	best predictions:  [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0]

averaged best predictions: [0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0]

resulting confusion matrix:
[[21  2]
 [ 2  3]]

resulting classification report:
             precision    recall  f1-score   support

    non-poi       0.91      0.91      0.91        23
        poi       0.60      0.60      0.60         5

avg / total       0.86      0.86      0.86        28

Epilogue: email text as data

I had hoped to use email-text data alongside the financial data, but there is a clear discrepancy between the individuals represented by the financial data and those represented by the email corpus.


In [2725]:
# compare individuals represented by the financial data and by the email-corpus data

directory_names = []
poi_directory_names = []
true_count = 0
false_count = 0

for key in my_dataset.keys():
    name_parts = key.lower().split(' ')   # e.g. 'METTS MARK' -> ['metts', 'mark']
    dirname = name_parts.pop(0)

    if len(name_parts) > 0:
        # maildir directories follow a lastname-firstinitial convention
        dirname = dirname + "-" + name_parts[0][0]

    exist = os.path.exists('/Users/excalibur/Dropbox/datasets/maildir/{}'.format(dirname))
    
    #print dirname, "\n\temails exist: ", exist, names
    
    directory_names.append(dirname)
    
    if exist:
        true_count += 1
        if my_dataset[key]['poi'] == True:
            poi_directory_names.append(dirname)
    else:
        false_count += 1
        
print "email directories matching individuals represented by financial data:"
print "\texist: ", true_count, "(POIs: {})".format(len(poi_directory_names))
print "\tdon't exist: ", false_count

#print sorted(directory_names)


email directories matching individuals represented by financial data:
	exist:  19 (POIs: 3)
	don't exist:  117

Unfortunately, with only 19 of the 136 individuals matched to email directories (and only 3 of them POIs), it would be difficult to join the two data sets in a meaningful way.

After this project, I may spend some time on text-vectorization of the email corpus and examine things like word frequencies (I have started on some of the initial code for that process below). However, with only 3 known POIs having email directories, it is unclear how useful such an endeavor would be for identifying POIs, though there are surely other interesting insights to be gleaned.


In [2726]:
email_corpus_dir = '/Users/excalibur/Dropbox/datasets/maildir/'

In [2727]:
email_dirs = os.listdir(email_corpus_dir)
print "number of email directories: ", len(email_dirs)
print "\nfirst five email directories:\n", email_dirs[:5]


number of email directories:  150

first five email directories:
['allen-p', 'arnold-j', 'arora-h', 'badeer-r', 'bailey-s']

In [2728]:
email_text = dict()
for email_dir in email_dirs:
    # os.walk already recurses into every subdirectory, so join dirpath
    # with each filename directly
    for dirpath, dirnames, filenames in os.walk(email_corpus_dir + email_dir):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            if os.path.isfile(path):
                with open(path, 'r') as f:
                    read_data = f.readlines()
                    # rough header skip: drop the first 15 lines of each message
                    if email_dir not in email_text:
                        email_text[email_dir] = ''.join(read_data[15:])
                    else:
                        email_text[email_dir] += ''.join(read_data[15:])
                
#print email_text['white-s']
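
With email_text populated, the vectorization step I have in mind would start roughly like this (a sketch only; max_features=1000 is an arbitrary illustrative cap):

from sklearn.feature_extraction.text import TfidfVectorizer

# one document per mailbox directory, with English stop words removed
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
email_matrix = vectorizer.fit_transform(email_text.values())
print email_matrix.shape   # (number of mailboxes, number of retained terms)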