In [27]:
import sys
import os
from time import time
import re
import pickle
sys.path.append("ud120-projects/tools/")
sys.path.append("ud120-projects/final_project/")
#sys.path.append("ud120-projects/maildir/")
import numpy as np
import pandas as pd
#from matplotlib import pyplot as plt
#import seaborn as sns
#%matplotlib inline
In [28]:
from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_union
from sklearn.pipeline import make_pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
In [29]:
### Load the dictionary containing the dataset
# use a context manager so the file handle is closed, and binary mode ("rb")
# so unpickling works on every platform (text mode can corrupt pickle bytes)
with open("ud120-projects/final_project/final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)
In [30]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features
### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
# baseline: untuned Gaussian Naive Bayes on 'salary' alone
clf = GaussianNB() # Provided to give you a starting point. Try a variety of classifiers.
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
test_classifier(clf, my_dataset, features_list)
### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(clf, my_dataset, features_list)
In [31]:
print my_dataset.keys()[0]
my_dataset.itervalues().next()
Out[31]:
In [32]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
names = np.array(my_dataset.keys())
print "number of names in data: ", names.shape
print "\nfirst five names:\n", names[:5]
features_list = my_dataset.itervalues().next().keys()
features_list.sort()
features_list.remove('poi')
features_list.insert(0, 'poi')
features_list.remove('email_address')
print "\nfeatures:\n", features_list
In [33]:
### convert dictionary to pandas dataframe
# one row per person, columns ordered like features_list ('poi' first)
df = pd.DataFrame([entry for entry in my_dataset.itervalues()])
df = df.drop('email_address', axis=1)
df = df[features_list]
#df.dtypes
#df.describe()
#df.count()
# poi comes in as a boolean; cast to 0/1
df.poi = df.poi.astype('int')
# coerce the remaining object columns (which hold 'NaN' strings) to numeric;
# NOTE(review): convert_objects is deprecated in later pandas versions
df = df.convert_objects(convert_numeric=True)
for col in list(df.columns):
    df[col] = df[col].round(decimals=3)
print "POI Count:\n", df.poi.value_counts()
df.head()
Out[33]:
In [34]:
# summary statistics for all numeric columns
df.describe()
Out[34]:
In [35]:
# create labels
y = df.poi.values
print y.shape
print y[:5]
In [36]:
# create initial features
X = df.drop('poi', axis=1).values
print X.shape
print X[:3]
In [37]:
### Task 2: Remove outliers
# hand-tuned to remove ~5% (in this case, 7%)
num_rows = X.shape[0]
num_cols = X.shape[1]
rows_to_remove = set()
for i in xrange(num_cols):
point_five_percentile = np.percentile(X[:,i], 0.5)
ninety_nine_point_five_percentile = np.percentile(X[:,i], 99.5)
for j in xrange(num_rows):
if X[j,i] < point_five_percentile or X[j,i] > ninety_nine_point_five_percentile:
rows_to_remove.add(j)
print X.shape
X = np.delete(X, list(rows_to_remove), axis=0)
y = np.delete(y, list(rows_to_remove))
print "names associated with outlier-containing rows to remove:"
for i in rows_to_remove:
print "\t",names[i], " (poi? {})".format(y[i])
names = np.delete(names, list(rows_to_remove))
print "\nnew X shape: ", X.shape
print "\nnew y shape: ", y.shape
print "\ntotal rows removed: ", len(rows_to_remove), "({})".format(round(len(rows_to_remove)/float(num_rows), 2))
In [38]:
# impute 'NaN' values to column means
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
imp.fit(X)
X = imp.transform(X)
print X[:3]
imp_values = imp.statistics_
In [39]:
### Task 3: Create new feature(s)
def selectkbest():
# select K best to explore feature engineering possibilities
selector = SelectKBest().fit(X, y)
features = features_list[1:]
high_scores = []
print "SelectKBest SCORES:"
selectkbest_scores = np.round(selector.scores_, 2)
for i in xrange(len(features)):
print "\t", features[i], ": ", selectkbest_scores[i]
if selectkbest_scores[i] > 2:
high_scores.append(i)
print "\nSelectKBest HIGH SCORES:"
for i in high_scores:
print "\t", features[i], "[{}]".format(i), ": ", selectkbest_scores[i]
In [40]:
selectkbest()
In [41]:
def create_new_feature(X, col1, col2, operation, feature_name):
    """Append a derived column to X built from columns col1 and col2.

    operation    -- '*' for element-wise product, '/' for element-wise
                    (float) division; anything else raises ValueError.
    feature_name -- name recorded for the new column.
    Side effect: appends feature_name to the module-level features_list.
    Returns a new array with the derived column stacked on the right.
    """
    features_list.append(feature_name)
    if operation == '*':
        new_col = X[:, col1] * X[:, col2]
    elif operation == '/':
        # true_divide always yields floats; zeros in col2 produce inf/nan
        new_col = np.true_divide(X[:, col1], X[:, col2])
    else:
        # the original fell through with new_col = [] and crashed with an
        # opaque AttributeError on a list shape assignment; fail loudly instead
        raise ValueError("unsupported operation: {!r}".format(operation))
    return np.hstack((X, new_col.reshape(-1, 1)))
In [42]:
# engineer a product feature from X columns 0 and 14, then re-score
# NOTE(review): 0 and 14 index X's columns, i.e. features_list[1:];
# presumably the top SelectKBest scorers printed above -- confirm
X = create_new_feature(X, 0, 14, '*', 'selectkbest_product')
selectkbest()
In [43]:
# wrap features plus the label in a DataFrame to compute label correlations
corr_df = pd.DataFrame(X)
corr_df['label'] = y
In [44]:
# correlation of each feature column with the label ([:-1] drops label-vs-label)
corr_df.corr()['label'].values[:-1]
Out[44]:
In [45]:
# keep the per-feature label correlations for thresholding below
corr = corr_df.corr()['label'].values[:-1]
In [46]:
for i in xrange(len(corr)):
if abs(corr[i]) > 0.2:
print i, abs(corr[i])
In [47]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
def grid_searcher(clf, pca_skb, output):
    """Grid-search `clf` inside a scaling + feature-transformation pipeline.

    clf     -- estimator to tune (e.g. GaussianNB(), DecisionTreeClassifier())
    pca_skb -- which transformer(s) to include: "pca", "skb", or "pca_skb"
               (a FeatureUnion of RandomizedPCA and SelectKBest)
    output  -- when True, print best score/params and evaluation reports

    Uses module-level X, y, and features_list. Returns the best pipeline
    found by GridSearchCV, refit on a single 80/20 train split.
    """
    t0 = time()
    # candidate hyperparameter values shared by the grids below
    even_range = range(2,X.shape[1],2)
    random_state = [42]
    t_or_f = [True, False]
    #powers_of_ten = [10**x for x in range(-5,5)]
    logspace = np.logspace(-5, 5, 10)
    #kernels = ['linear', 'poly', 'rbf', 'sigmoid'] # takes too long, unfortunately
    kernels = ['rbf']
    criteria = ['gini', 'entropy']
    splitters = ['best', 'random']
    max_features = ['auto', 'sqrt', 'log2', None]
    # modify features, remove features via pipeline
    pipeline = []
    params = dict()
    pipeline_clf = ""
    if pca_skb == "pca_skb":
        #pipeline = make_pipeline(MinMaxScaler(), make_union(RandomizedPCA(), SelectKBest()), clf)
        pipeline = make_pipeline(StandardScaler(), make_union(RandomizedPCA(), SelectKBest()), clf)
        params = dict(featureunion__randomizedpca__n_components = even_range,
                      featureunion__randomizedpca__iterated_power = [1, 2, 3],
                      featureunion__randomizedpca__whiten = t_or_f,
                      featureunion__randomizedpca__random_state = random_state,
                      featureunion__selectkbest__k = even_range)
    elif pca_skb == "pca":
        #pipeline = make_pipeline(MinMaxScaler(), RandomizedPCA(), clf)
        pipeline = make_pipeline(StandardScaler(), RandomizedPCA(), clf)
        params = dict(randomizedpca__n_components = [4],
                      randomizedpca__iterated_power = [1, 2, 3],
                      randomizedpca__whiten = t_or_f,
                      randomizedpca__random_state = random_state)
    elif pca_skb == "skb":
        #pipeline = make_pipeline(MinMaxScaler(), SelectKBest(), clf)
        pipeline = make_pipeline(StandardScaler(), SelectKBest(), clf)
        params = dict(selectkbest__k = [4])
    # lower-cased name of the final pipeline step (e.g. 'svc'), used to build
    # step-prefixed parameter names below
    pipeline_clf = pipeline.steps[2][0]
    if pipeline_clf == 'decisiontreeclassifier' or pipeline_clf == 'randomforestclassifier':
        params["{}__criterion".format(pipeline_clf)] = criteria
        #params["{}__splitter".format(pipeline_clf)] = splitters
        params["{}__max_features".format(pipeline_clf)] = max_features
        #params["{}__min_samples_split".format(pipeline_clf)] = even_range
        params["{}__class_weight".format(pipeline_clf)] = ['auto', None]
        params["{}__random_state".format(pipeline_clf)] = random_state
    if pipeline_clf == 'svc':
        params['svc__C'] = logspace
        params['svc__kernel'] = kernels
        #params['svc__degree'] = [1,2,3,4,5] # for use with 'poly'
        params['svc__gamma'] = logspace
        params['svc__random_state'] = random_state
    # cross validation
    cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=random_state[0])
    # tune parameters
    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)
    grid_search.fit(X, y)
    if output == True:
        print "*"*15, pipeline_clf.upper(), "*"*15
        print "\nBEST SCORE: ", grid_search.best_score_, "\n"
        print "\nBEST PARAMS: ", grid_search.best_params_, "\n"
    # split into training and testing data for reporting results
    if output == True:
        print "#"*50
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state[0])
    if output == True:
        print "\nBEST ESTIMATOR:"
    clf = grid_search.best_estimator_
    if output == True:
        print clf
    clf.fit(X_train, y_train)
    if pca_skb == "skb" or pca_skb == "pca_skb":
        if output == True:
            print "\nSelectKBest SCORES:"
        features = features_list[1:]
        # for "pca_skb" the SelectKBest step lives inside the FeatureUnion
        selectkbest_scores = clf.steps[1][1].scores_ if pca_skb == "skb" else clf.steps[1][1].transformer_list[1][1].scores_
        selectkbest_scores = np.round(selectkbest_scores, 2)
        for i in xrange(len(features)):
            if output == True:
                print "\t", features[i], ": ", selectkbest_scores[i]
    if pipeline_clf == 'decisiontreeclassifier' or pipeline_clf == 'randomforestclassifier':
        if output == True:
            print "\n{} FEATURE IMPORTANCES:".format(pipeline_clf.upper())
            print clf.steps[2][1].feature_importances_
    if output == True:
        print "\n", "#"*50
        print "\nPREDICTIONS:"
        print "\nground truth:\n\t", y_test
    y_pred = clf.predict(X_test)
    if output == True:
        print "\npredictions:\n\t", y_pred
        print "\nscore: ", clf.score(X_test, y_test)
        print "\nEVALUATIONS:"
        print "\nconfusion matrix:\n", confusion_matrix(y_test, y_pred)
        print "\nclassification report:\n", classification_report(y_test, y_pred, target_names=["non-poi", "poi"])
    print "ELAPSED TIME: ", round(time()-t0,3), "s"
    return clf
In [48]:
# prepare for Udacity tester
# remove emails (free-text field, not a usable feature)
for key in my_dataset.keys():
    my_dataset[key].pop('email_address')
# remove outliers from original data set
# (`names` was filtered down to the surviving rows in the outlier cell)
for key in my_dataset.keys():
    if key not in names:
        my_dataset.pop(key)
# replace 'NaN's with the column medians learned by the Imputer
for key in my_dataset.keys():
    for sub_key in my_dataset[key].keys():
        if my_dataset[key][sub_key] == 'NaN':
            # -1 because df's first column is 'poi', which was dropped from X
            i = (df.columns.get_loc(sub_key) - 1)
            my_dataset[key][sub_key] = imp_values[i]
# add created feature (last column of X)
# NOTE(review): assumes my_dataset.keys() iterates in the same order as the
# rows of X were built -- confirm, since a mismatch silently assigns wrong values
i = 0
for key in my_dataset.keys():
    my_dataset[key]['selectkbest_product'] = X[i,-1]
    i += 1
In [49]:
def udacity_tester():
# use Udacity tester
print "\nUDACITY TESTER RESULTS: "
test_classifier(clf, my_dataset, features_list)
In [50]:
# full grid search + Udacity evaluation for every classifier/transformer combo
for classifier in [GaussianNB(), DecisionTreeClassifier()]:
    for transform_option in ['pca', 'skb', 'pca_skb']:
        clf = grid_searcher(classifier, transform_option, output=True)
        udacity_tester()
In [2724]:
#dump_classifier_and_data(clf, my_dataset, features_list)
In [51]:
# re-run every classifier/transformer combination quietly and collect each
# pipeline's test-set predictions and classification report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clfs = dict()
for classifier in [GaussianNB(), DecisionTreeClassifier()]:
    clfs[str(classifier)] = dict()
    for transformer in ['pca', 'skb', 'pca_skb']:
        clf = grid_searcher(classifier, transformer, output=False)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred, target_names=["non-poi", "poi"])
        clfs[str(classifier)][transformer] = {'predictions': y_pred,
                                              'clf_report': report}
In [146]:
# inspect the raw test-set predictions for one classifier/transformer combo
clfs['GaussianNB()']['pca']['predictions']
Out[146]:
In [152]:
# inspect the stored classification-report string for the same combo
clfs['GaussianNB()']['pca']['clf_report']
Out[152]:
In [154]:
# pick, for each classifier, the transformer whose classification report
# shows the highest f1 score, and keep its predictions
best_predictions = dict()
for clsfr in clfs.keys():
    best_predictions[clsfr] = dict()
    #print clsfr
    best_f1_score = 0.0
    for transformer in clfs[clsfr].keys():
        clf_report = clfs[clsfr][transformer]['clf_report']
        # strip lowercase letters and newlines from each space-separated token,
        # leaving only the numeric fields of the report
        clf_report = [re.sub(r"[a-z]|\n", '', x) for x in clf_report.split(" ")]
        clf_report = filter(None, clf_report)
        #print "\t", transformer
        # [-2]: the report's last row ends "... f1-score support", so the
        # second-to-last number is presumably the avg/total f1 -- confirm
        # against sklearn's report layout; this parsing is version-fragile
        f1_score = float(clf_report[-2])
        #print f1_score
        if f1_score > best_f1_score:
            best_f1_score = f1_score
            best_predictions[clsfr]['transformer'] = transformer
            best_predictions[clsfr]['f1_score'] = f1_score
            best_predictions[clsfr]['predictions'] = clfs[clsfr][transformer]['predictions']
In [155]:
averaged_best_recall_predictions = np.zeros((28,))
for clsfr in best_predictions.keys():
print clsfr[:10]
print "\tbest transformer: ", best_predictions[clsfr]['transformer']
print "\tbest f1 score: ", best_predictions[clsfr]['f1_score']
print "\tbest predictions: ", best_predictions[clsfr]['predictions']
averaged_best_recall_predictions = np.maximum(averaged_best_recall_predictions, best_predictions[clsfr]['predictions'])
print "\naveraged best predictions: {}".format(averaged_best_recall_predictions.astype('int'))
print "\nresulting confusion matrix:\n", confusion_matrix(y_test, averaged_best_recall_predictions)
print "\nresulting classification report:\n", classification_report(y_test, averaged_best_recall_predictions, target_names=["non-poi", "poi"])
I had hoped to use email-text data along with the financial data, but clearly there is a discrepancy between the individuals represented by the financial data and those represented by the email data.
In [2725]:
# compare individuals represented by the financial data and by the email-corpus data
directory_names = []
poi_directory_names = []
true_count = 0
false_count = 0
for key in my_dataset.keys():
names = key.lower().split(' ')
dirname = names.pop(0)
if len(names) > 0:
dirname = dirname + "-" + names[0][0]
exist = os.path.exists('/Users/excalibur/Dropbox/datasets/maildir/{}'.format(dirname))
#print dirname, "\n\temails exist: ", exist, names
directory_names.append(dirname)
if exist:
true_count += 1
if my_dataset[key]['poi'] == True:
poi_directory_names.append(dirname)
else:
false_count += 1
print "email directories matching individuals represented by financial data:"
print "\texist: ", true_count, "(POIs: {})".format(len(poi_directory_names))
print "\tdon't exist: ", false_count
#print sorted(directory_names)
Unfortunately, it seems clear that it would be difficult to join the two data sets in a meaningful way due to their lack of overlap.
After this project, I may spend some time with text-vectorization of the email corpus and examine things like word frequencies (I have started working on some of the initial code for that process below); however, with such an apparently low number of known POI emails available (only 3!?), it is unclear how useful such an endeavor would be for identifying POIs, although there are surely other interesting insights to be gleaned.
In [2726]:
# root of the Enron email corpus
# NOTE(review): hardcoded absolute local path -- not portable
email_corpus_dir = '/Users/excalibur/Dropbox/datasets/maildir/'
In [2727]:
# one directory per person in the corpus
email_dirs = os.listdir(email_corpus_dir)
print "number of email directories: ", len(email_dirs)
print "\nfirst five email directories:\n", email_dirs[:5]
In [2728]:
# concatenate the body text of every email under each person's directory,
# keyed by directory name; the first 15 lines of each file (headers) are dropped
email_text = dict()
for email_dir in email_dirs:
    for dirpath, dirnames, filenames in os.walk(email_corpus_dir + email_dir):
        # os.walk already pairs each dirpath with the files directly inside
        # it; the original joined dirpath with each *subdirectory* name and
        # the parent's filenames, building mostly nonexistent paths (silently
        # skipped by isfile) and missing/duplicating real files
        for filename in filenames:
            path = dirpath + "/" + filename
            if os.path.isfile(path):
                with open(path, 'r') as f:
                    read_data = f.readlines()
                if email_dir not in email_text:
                    email_text[email_dir] = ''.join(read_data[15:])
                else:
                    email_text[email_dir] += ''.join(read_data[15:])
#print email_text['white-s']