Identify Fraud from Enron email

Machine Learning Project


Tools library


In [114]:
#!/usr/bin/python

""" 
    A general tool for converting data from the
    dictionary format to an (n x k) numpy array that's
    ready for training an sklearn algorithm

    n--no. of key-value pairs in dictionary
    k--no. of features being extracted

    dictionary keys are names of persons in dataset
    dictionary values are dictionaries, where each
        key-value pair in the dict is the name
        of a feature, and its value for that person

    In addition to converting a dictionary to a numpy 
    array, you may want to separate the labels from the
    features--this is what targetFeatureSplit is for

    so, if you want to have the poi label as the target,
    and the features you want to use are the person's
    salary and bonus, here's what you would do:

    feature_list = ["poi", "salary", "bonus"] 
    data_array = featureFormat( data_dictionary, feature_list )
    label, features = targetFeatureSplit(data_array)

    the line above (targetFeatureSplit) assumes that the
    label is the _first_ item in feature_list--very important
    that poi is listed first!
"""


import numpy as np
import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
#from feature_format import featureFormat, targetFeatureSplit

def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """ convert dictionary to numpy array of features
        remove_NaN = True will convert "NaN" string to 0.0
        remove_all_zeroes = True will omit any data points for which
            all the features you seek are 0.0
        remove_any_zeroes = True will omit any data points for which
            any of the features you seek are 0.0
        sort_keys = True sorts keys by alphabetical order. Setting the value as
            a string opens the corresponding pickle file with a preset key
            order (this is used for Python 3 compatibility, and sort_keys
            should be left as False for the course mini-projects).
        NOTE: first feature is assumed to be 'poi' and is not checked for
            removal for zero or missing values.
    """


    return_list = []

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        keys = pickle.load(open(sort_keys, "rb"))
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                dictionary[key][feature]
            except KeyError:
                print "error: key ", feature, " not present"
                return
            value = dictionary[key][feature]
            if value=="NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # Logic for deciding whether or not to add the data point.
        append = True
        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = False
            for item in test_list:
                if item != 0 and item != "NaN":
                    append = True
                    break
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes:
            if 0 in test_list or "NaN" in test_list:
                append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append( np.array(tmp_list) )

    return np.array(return_list)


def targetFeatureSplit( data ):
    """ 
        given a numpy array like the one returned from
        featureFormat, separate out the first feature
        and put it into its own list (this should be the 
        quantity you want to predict)

        return targets and features as separate lists

        (sklearn can generally handle both lists and numpy arrays as 
        input formats when training/predicting)
    """

    target = []
    features = []
    for item in data:
        target.append( item[0] )
        features.append( item[1:] )

    return target, features

Tester module


In [115]:
#!/usr/bin/python

""" a basic script for importing student's POI identifier,
    and checking the results that they get from it 
 
    requires that the algorithm, dataset, and features list
    be written to my_classifier.pkl, my_dataset.pkl, and
    my_feature_list.pkl, respectively

    that process should happen at the end of poi_id.py
"""

import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
#from feature_format import featureFormat, targetFeatureSplit

PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
\tFalse negatives: {:4d}\tTrue negatives: {:4d}"

def test_classifier(clf, dataset, feature_list, folds = 1000):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    print labels
    print features
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    #print true_negatives, false_negatives, true_positives, false_positives
    
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predicitons."

CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"

def dump_classifier_and_data(clf, dataset, feature_list):
    with open(CLF_PICKLE_FILENAME, "w") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "w") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "w") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)

def load_classifier_and_data():
    with open(CLF_PICKLE_FILENAME, "r") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "r") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "r") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list

def main():
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    ### Run testing script
    test_classifier(clf, dataset, feature_list)

#if __name__ == '__main__':
#    main()

Data exploration and cleaning


In [116]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

# Load data from pickle files:
with open('final_project_dataset.pkl', 'rb') as f:
    data_dict = pickle.load(f)
  • Basic statistics:

In [117]:
print "Dataset type:", type(data_dict)
print "Number of key-value pairs in dictionary:", len(data_dict)
print "List of keys in dictionary:", data_dict.keys()
print "Number of elements in a key-value pair:", len(data_dict['SHANKMAN JEFFREY A'])
print "Example of contents of a key-value pair:", data_dict['SHANKMAN JEFFREY A']


Dataset type: <type 'dict'>
Number of key-value pairs in dictionary: 146
List of keys in dictionary: ['METTS MARK', 'BAXTER JOHN C', 'ELLIOTT STEVEN', 'CORDES WILLIAM R', 'HANNON KEVIN P', 'MORDAUNT KRISTINA M', 'MEYER ROCKFORD G', 'MCMAHON JEFFREY', 'HORTON STANLEY C', 'PIPER GREGORY F', 'HUMPHREY GENE E', 'UMANOFF ADAM S', 'BLACHMAN JEREMY M', 'SUNDE MARTIN', 'GIBBS DANA R', 'LOWRY CHARLES P', 'COLWELL WESLEY', 'MULLER MARK S', 'JACKSON CHARLENE R', 'WESTFAHL RICHARD K', 'WALTERS GARETH W', 'WALLS JR ROBERT H', 'KITCHEN LOUISE', 'CHAN RONNIE', 'BELFER ROBERT', 'SHANKMAN JEFFREY A', 'WODRASKA JOHN', 'BERGSIEKER RICHARD P', 'URQUHART JOHN A', 'BIBI PHILIPPE A', 'RIEKER PAULA H', 'WHALEY DAVID A', 'BECK SALLY W', 'HAUG DAVID L', 'ECHOLS JOHN B', 'MENDELSOHN JOHN', 'HICKERSON GARY J', 'CLINE KENNETH W', 'LEWIS RICHARD', 'HAYES ROBERT E', 'MCCARTY DANNY J', 'KOPPER MICHAEL J', 'LEFF DANIEL P', 'LAVORATO JOHN J', 'BERBERIAN DAVID', 'DETMERING TIMOTHY J', 'WAKEHAM JOHN', 'POWERS WILLIAM', 'GOLD JOSEPH', 'BANNANTINE JAMES M', 'DUNCAN JOHN H', 'SHAPIRO RICHARD S', 'SHERRIFF JOHN R', 'SHELBY REX', 'LEMAISTRE CHARLES', 'DEFFNER JOSEPH M', 'KISHKILL JOSEPH G', 'WHALLEY LAWRENCE G', 'MCCONNELL MICHAEL S', 'PIRO JIM', 'DELAINEY DAVID W', 'SULLIVAN-SHAKLOVITZ COLLEEN', 'WROBEL BRUCE', 'LINDHOLM TOD A', 'MEYER JEROME J', 'LAY KENNETH L', 'BUTTS ROBERT H', 'OLSON CINDY K', 'MCDONALD REBECCA', 'CUMBERLAND MICHAEL S', 'GAHN ROBERT S', 'MCCLELLAN GEORGE', 'HERMANN ROBERT J', 'SCRIMSHAW MATTHEW', 'GATHMANN WILLIAM D', 'HAEDICKE MARK E', 'BOWEN JR RAYMOND M', 'GILLIS JOHN', 'FITZGERALD JAY L', 'MORAN MICHAEL P', 'REDMOND BRIAN L', 'BAZELIDES PHILIP J', 'BELDEN TIMOTHY N', 'DURAN WILLIAM D', 'THORN TERENCE H', 'FASTOW ANDREW S', 'FOY JOE', 'CALGER CHRISTOPHER F', 'RICE KENNETH D', 'KAMINSKI WINCENTY J', 'LOCKHART EUGENE E', 'COX DAVID', 'OVERDYKE JR JERE C', 'PEREIRA PAULO V. FERRAZ', 'STABLER FRANK', 'SKILLING JEFFREY K', 'BLAKE JR. NORMAN P', 'SHERRICK JEFFREY B', 'PRENTICE JAMES', 'GRAY RODNEY', 'PICKERING MARK R', 'THE TRAVEL AGENCY IN THE PARK', 'NOLES JAMES L', 'KEAN STEVEN J', 'TOTAL', 'FOWLER PEGGY', 'WASAFF GEORGE', 'WHITE JR THOMAS E', 'CHRISTODOULOU DIOMEDES', 'ALLEN PHILLIP K', 'SHARP VICTORIA T', 'JAEDICKE ROBERT', 'WINOKUR JR. HERBERT S', 'BROWN MICHAEL', 'BADUM JAMES P', 'HUGHES JAMES A', 'REYNOLDS LAWRENCE', 'DIMICHELE RICHARD G', 'BHATNAGAR SANJAY', 'CARTER REBECCA C', 'BUCHANAN HAROLD G', 'YEAP SOON', 'MURRAY JULIA H', 'GARLAND C KEVIN', 'DODSON KEITH', 'YEAGER F SCOTT', 'HIRKO JOSEPH', 'DIETRICH JANET R', 'DERRICK JR. JAMES V', 'FREVERT MARK A', 'PAI LOU L', 'BAY FRANKLIN R', 'HAYSLETT RODERICK J', 'FUGH JOHN L', 'FALLON JAMES B', 'KOENIG MARK E', 'SAVAGE FRANK', 'IZZO LAWRENCE L', 'TILNEY ELIZABETH A', 'MARTIN AMANDA K', 'BUY RICHARD B', 'GRAMM WENDY L', 'CAUSEY RICHARD A', 'TAYLOR MITCHELL S', 'DONAHUE JR JEFFREY M', 'GLISAN JR BEN F']
Number of elements in a key-value pair: 21
Example of contents of a key-value pair: {'salary': 304110, 'to_messages': 3221, 'deferral_payments': 'NaN', 'total_payments': 3038702, 'exercised_stock_options': 1441898, 'bonus': 2000000, 'restricted_stock': 630137, 'shared_receipt_with_poi': 1730, 'restricted_stock_deferred': 'NaN', 'total_stock_value': 2072035, 'expenses': 178979, 'loan_advances': 'NaN', 'from_messages': 2681, 'other': 1191, 'from_this_person_to_poi': 83, 'poi': False, 'director_fees': 'NaN', 'deferred_income': 'NaN', 'long_term_incentive': 554422, 'email_address': 'jeffrey.shankman@enron.com', 'from_poi_to_this_person': 94}

This shows that the dataset is stored as a dictionary. Each person in the dataset is represented by a key-value pair. There are 146 such pairs.

Each value is itself a dictionary containing 21 key-value pairs: the financial and email features, plus the email address and the poi label.
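
As part of these basic statistics, here is a minimal sketch (using the data_dict loaded above) counting how many of the 146 people are labelled as persons of interest:

# Count the entries whose 'poi' flag is True in the raw dictionary.
n_poi = sum(1 for person in data_dict.values() if person['poi'])
print "Number of POIs:", n_poi, "out of", len(data_dict), "people"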

  • Feature exploration: We first need to convert the dataset into a pandas DataFrame for convenience.

In [118]:
features_list = ['poi', 'salary', 'bonus', 'long_term_incentive', 'deferred_income', 'deferral_payments', 
                 'loan_advances','other', 'expenses', 'director_fees', 'total_payments', 
                 'exercised_stock_options', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value',
                 'from_messages', 'to_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 
                 'shared_receipt_with_poi']

data_df = pd.DataFrame.from_dict(data_dict, orient = 'index', dtype = float)

Let's have a look at missing values:


In [119]:
data_df.isnull().sum(axis = 0).sort_values(ascending = False)


Out[119]:
loan_advances                142
director_fees                129
restricted_stock_deferred    128
deferral_payments            107
deferred_income               97
long_term_incentive           80
bonus                         64
from_poi_to_this_person       60
shared_receipt_with_poi       60
to_messages                   60
from_this_person_to_poi       60
from_messages                 60
other                         53
expenses                      51
salary                        51
exercised_stock_options       44
restricted_stock              36
total_payments                21
total_stock_value             20
email_address                  0
poi                            0
dtype: int64

The column loan_advances has 142 missing values out of 146 observations. It is unlikely to be useful in the model we are trying to build.


In [120]:
data_df.isnull().sum(axis =1).sort_values(ascending = False)


Out[120]:
LOCKHART EUGENE E                19
GRAMM WENDY L                    17
WROBEL BRUCE                     17
WODRASKA JOHN                    17
THE TRAVEL AGENCY IN THE PARK    17
WHALEY DAVID A                   17
SCRIMSHAW MATTHEW                17
CHRISTODOULOU DIOMEDES           16
CLINE KENNETH W                  16
GILLIS JOHN                      16
SAVAGE FRANK                     16
WAKEHAM JOHN                     16
CHAN RONNIE                      15
MEYER JEROME J                   15
BLAKE JR. NORMAN P               15
PEREIRA PAULO V. FERRAZ          15
FUGH JOHN L                      15
MENDELSOHN JOHN                  15
LOWRY CHARLES P                  15
GATHMANN WILLIAM D               15
YEAP SOON                        15
URQUHART JOHN A                  15
WINOKUR JR. HERBERT S            15
LEMAISTRE CHARLES                14
BADUM JAMES P                    14
NOLES JAMES L                    14
DUNCAN JOHN H                    14
PRENTICE JAMES                   14
GRAY RODNEY                      14
WALTERS GARETH W                 14
                                 ..
MCMAHON JEFFREY                   5
THORN TERENCE H                   5
MCCONNELL MICHAEL S               5
MCCLELLAN GEORGE                  5
TILNEY ELIZABETH A                5
LAVORATO JOHN J                   5
KOENIG MARK E                     5
KEAN STEVEN J                     5
TOTAL                             5
WALLS JR ROBERT H                 5
GLISAN JR BEN F                   5
GARLAND C KEVIN                   5
FITZGERALD JAY L                  5
WHALLEY LAWRENCE G                5
FALLON JAMES B                    5
WASAFF GEORGE                     4
BELDEN TIMOTHY N                  4
SHARP VICTORIA T                  4
MULLER MARK S                     4
RIEKER PAULA H                    4
RICE KENNETH D                    4
OLSON CINDY K                     4
BUY RICHARD B                     4
HANNON KEVIN P                    4
PIPER GREGORY F                   3
DERRICK JR. JAMES V               3
LAY KENNETH L                     2
HAEDICKE MARK E                   2
FREVERT MARK A                    2
ALLEN PHILLIP K                   2
Length: 146, dtype: int64

In [121]:
data_df.loc['LOCKHART EUGENE E', :]


Out[121]:
salary                       NaN
to_messages                  NaN
deferral_payments            NaN
total_payments               NaN
exercised_stock_options      NaN
bonus                        NaN
restricted_stock             NaN
shared_receipt_with_poi      NaN
restricted_stock_deferred    NaN
total_stock_value            NaN
expenses                     NaN
loan_advances                NaN
from_messages                NaN
other                        NaN
from_this_person_to_poi      NaN
poi                            0
director_fees                NaN
deferred_income              NaN
long_term_incentive          NaN
email_address                NaN
from_poi_to_this_person      NaN
Name: LOCKHART EUGENE E, dtype: object

The entry 'Eugene Lockhart' contains only NAs, except for poi, which has a meaningful value. This matches the content of the file enron61702insiderpay.pdf, which shows that all his values are zero. In a sense, this person is an outlier, but we have to decide whether to retain him in the data or not. In other words, do we believe this is a correct observation or an error, and if we think it is correct, is it useful to keep an observation that contains only zeros? My view is that the observation is probably correct (there are a number of other individuals with very few non-zero features) and might be useful to the model, so I will retain it.

In this dataset, missing values effectively mean zero. However, when working with financial data one often wants to take the logarithm of values, and the logarithm is undefined for zeros and negative numbers. We will therefore replace all NAs with a very small positive number (1.e-5).
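
As a side note, this placeholder maps to a finite, well-behaved value under the log transform; a quick sketch:

import numpy as np
# log(1e-5) is about -11.5129, so log-transformed placeholders stay finite.
print np.log(1.e-5)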


In [122]:
data_df.fillna(1.e-5, inplace = True)
data_df = data_df[features_list]
data_df.describe()


Out[122]:
poi salary bonus long_term_incentive deferred_income deferral_payments loan_advances other expenses director_fees total_payments exercised_stock_options restricted_stock restricted_stock_deferred total_stock_value from_messages to_messages from_poi_to_this_person from_this_person_to_poi shared_receipt_with_poi
count 146.000000 1.460000e+02 1.460000e+02 1.460000e+02 1.460000e+02 1.460000e+02 1.460000e+02 1.460000e+02 1.460000e+02 1.460000e+02 1.460000e+02 1.460000e+02 1.460000e+02 1.460000e+02 1.460000e+02 146.000000 146.000000 146.000000 146.000000 146.000000
mean 0.123288 3.658114e+05 1.333474e+06 6.646839e+05 -3.827622e+05 4.387965e+05 1.149658e+06 5.854318e+05 7.074827e+04 1.942249e+04 4.350622e+06 4.182736e+06 1.749257e+06 2.051637e+04 5.846018e+06 358.602744 1221.589045 38.226032 24.287675 692.986305
std 0.329899 2.203575e+06 8.094029e+06 4.046072e+06 2.378250e+06 2.741325e+06 9.649342e+06 3.682345e+06 4.327163e+05 1.190543e+05 2.693448e+07 2.607040e+07 1.089995e+07 1.439661e+06 3.624681e+07 1441.259867 2226.770635 73.901122 79.278205 1072.969489
min 0.000000 1.000000e-05 1.000000e-05 1.000000e-05 -2.799289e+07 -1.025000e+05 1.000000e-05 1.000000e-05 1.000000e-05 1.000000e-05 1.000000e-05 1.000000e-05 -2.604490e+06 -7.576788e+06 -4.409300e+04 0.000010 0.000010 0.000000 0.000000 0.000010
25% 0.000000 1.000000e-05 1.000000e-05 1.000000e-05 -3.792600e+04 1.000000e-05 1.000000e-05 1.000000e-05 1.000000e-05 1.000000e-05 9.394475e+04 1.000000e-05 8.115000e+03 1.000000e-05 2.288695e+05 0.000010 0.000010 0.000010 0.000010 0.000010
50% 0.000000 2.105960e+05 3.000000e+05 1.000000e-05 1.000000e-05 1.000000e-05 1.000000e-05 9.595000e+02 2.018200e+04 1.000000e-05 9.413595e+05 6.082935e+05 3.605280e+05 1.000000e-05 9.659550e+05 16.500000 289.000000 2.500000 0.000010 102.500000
75% 0.000000 2.708505e+05 8.000000e+05 3.750648e+05 1.000000e-05 9.684500e+03 1.000000e-05 1.506065e+05 5.374075e+04 1.000000e-05 1.968287e+06 1.714221e+06 8.145280e+05 1.000000e-05 2.319991e+06 51.250000 1585.750000 40.750000 13.750000 893.500000
max 1.000000 2.670423e+07 9.734362e+07 4.852193e+07 1.000000e-05 3.208340e+07 8.392500e+07 4.266759e+07 5.235198e+06 1.398517e+06 3.098866e+08 3.117640e+08 1.303223e+08 1.545629e+07 4.345095e+08 14368.000000 15149.000000 528.000000 609.000000 5521.000000

According to enron61702insiderpay.pdf, restricted_stock_deferred should only take negative values, yet its maximum here is $15,456,290. Let's investigate:


In [123]:
data_df[data_df['restricted_stock_deferred'] == np.max(data_df['restricted_stock_deferred'])]


Out[123]:
poi salary bonus long_term_incentive deferred_income deferral_payments loan_advances other expenses director_fees total_payments exercised_stock_options restricted_stock restricted_stock_deferred total_stock_value from_messages to_messages from_poi_to_this_person from_this_person_to_poi shared_receipt_with_poi
BHATNAGAR SANJAY 0.0 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 137864.0 0.00001 137864.0 15456290.0 2604490.0 -2604490.0 15456290.0 0.00001 29.0 523.0 0.0 1.0 463.0

When comparing these values with the pdf file, I realize that the data is shifted to the left by one column, hence the errors. There may be other such occurrences, so I now need to go through the data and fix these manually.

  • Data cleaning: An easy way to detect discrepancies such as the one described above is to check that the totals (payments and stock value) are equal to the sum of their components.

In [124]:
data_df[(np.floor(data_df['salary'] + data_df['bonus'] + data_df['long_term_incentive'] + data_df['deferred_income'] + \
       data_df['deferral_payments'] + data_df['loan_advances'] + data_df['other'] + data_df['expenses'] + \
       data_df['director_fees']) != np.floor(data_df['total_payments'])) | \
       (np.floor(data_df['exercised_stock_options'] + data_df['restricted_stock'] + \
        data_df['restricted_stock_deferred']) != np.floor(data_df['total_stock_value']))]


Out[124]:
poi salary bonus long_term_incentive deferred_income deferral_payments loan_advances other expenses director_fees total_payments exercised_stock_options restricted_stock restricted_stock_deferred total_stock_value from_messages to_messages from_poi_to_this_person from_this_person_to_poi shared_receipt_with_poi
BELFER ROBERT 0.0 0.00001 0.00001 0.00001 0.00001 -102500.00000 0.00001 0.00001 0.00001 3285.0 102500.0 3285.0 1.000000e-05 44093.0 -44093.00000 0.00001 0.00001 0.00001 0.00001 0.00001
BHATNAGAR SANJAY 0.0 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 137864.00000 0.00001 137864.0 15456290.0 2604490.0 -2.604490e+06 15456290.0 0.00001 29.00000 523.00000 0.00000 1.00000 463.00000

There are only two problematic observations. Let's correct them manually:


In [125]:
# Robert Belfer:
for j in xrange(1, 14):
    data_df.ix['BELFER ROBERT', j] = data_df.ix['BELFER ROBERT', j + 1]
data_df.ix['BELFER ROBERT', 14] = 1.e-5

# Sanjay Bhatnagar:
for j in xrange(14, 2, -1):
    data_df.ix['BHATNAGAR SANJAY', j] = data_df.ix['BHATNAGAR SANJAY', j - 1]
data_df.ix['BHATNAGAR SANJAY', 1] = 1.e-5
    
data_df.loc[['BELFER ROBERT', 'BHATNAGAR SANJAY']]


/home/lucfrachon/anaconda3/envs/py27/lib/python2.7/site-packages/ipykernel/__main__.py:3: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  app.launch_new_instance()
Out[125]:
poi salary bonus long_term_incentive deferred_income deferral_payments loan_advances other expenses director_fees total_payments exercised_stock_options restricted_stock restricted_stock_deferred total_stock_value from_messages to_messages from_poi_to_this_person from_this_person_to_poi shared_receipt_with_poi
BELFER ROBERT 0.0 0.00001 0.00001 0.00001 -102500.00000 0.00001 0.00001 0.00001 3285.0 102500.00000 3285.0 1.000000e-05 44093.0 -44093.0 1.000000e-05 0.00001 0.00001 0.00001 0.00001 0.00001
BHATNAGAR SANJAY 0.0 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 137864.0 0.00001 137864.0 1.545629e+07 2604490.0 -2604490.0 1.545629e+07 29.00000 523.00000 0.00000 1.00000 463.00000

This confirms we successfully cleaned up the data.

  • Look for outliers:

In our list of DataFrame indexes shown above, we can see a name that is obviously not a real person: 'THE TRAVEL AGENCY IN THE PARK'. Some research shows that this was a travel agency contracted by Enron and related to a family member of one of Enron's executives. There might be a conflict of interest there, but since we are investigating persons and not suppliers, I choose to drop this observation.


In [126]:
data_df = data_df.drop(['THE TRAVEL AGENCY IN THE PARK'])

I will now make a scatter plot of the first two variables:


In [127]:
sp = data_df.plot.scatter(x = 'salary', y = 'deferral_payments', c = 'poi', edgecolors = 'Blue', 
                          s = 50)


There is an observation that immediately stands out. It corresponds to the highest values of both salary and deferral_payments. When checking the numbers against the document named enron61702insiderpay.pdf, we see that these values correspond to the 'TOTAL' line and are therefore an artefact of the data collection process rather than an actual observation.


In [128]:
# Drop the 'TOTAL' row:
data_df = data_df.drop(['TOTAL'])
data_df.describe()


Out[128]:
poi salary bonus long_term_incentive deferred_income deferral_payments loan_advances other expenses director_fees total_payments exercised_stock_options restricted_stock restricted_stock_deferred total_stock_value from_messages to_messages from_poi_to_this_person from_this_person_to_poi shared_receipt_with_poi
count 144.000000 1.440000e+02 1.440000e+02 1.440000e+02 1.440000e+02 1.440000e+02 1.440000e+02 1.440000e+02 144.000000 144.000000 1.440000e+02 1.440000e+02 1.440000e+02 1.440000e+02 1.440000e+02 144.000000 144.000000 144.000000 144.000000 144.000000
mean 0.125000 1.854460e+05 6.759974e+05 3.369578e+05 -1.943951e+05 2.228014e+05 5.828125e+05 2.937881e+05 36355.541670 9711.923620 2.149476e+06 2.165028e+06 9.050160e+05 -5.261658e+04 3.017427e+06 363.583337 1238.555560 38.756948 24.625004 702.611115
std 0.331873 1.970421e+05 1.233155e+06 6.871826e+05 6.058422e+05 7.538411e+05 6.794472e+06 1.131517e+06 45990.417222 30422.534937 8.779364e+06 4.923320e+06 2.000357e+06 2.731835e+05 6.271528e+06 1450.675238 2237.564813 74.276767 79.778264 1077.290734
min 0.000000 1.000000e-05 1.000000e-05 1.000000e-05 -3.504386e+06 1.000000e-05 1.000000e-05 1.000000e-05 0.000010 0.000010 1.000000e-05 1.000000e-05 1.000000e-05 -2.604490e+06 1.000000e-05 0.000010 0.000010 0.000000 0.000000 0.000010
25% 0.000000 1.000000e-05 1.000000e-05 1.000000e-05 -3.907200e+04 1.000000e-05 1.000000e-05 1.000000e-05 0.000010 0.000010 8.747150e+04 1.000000e-05 4.409300e+04 1.000000e-05 2.563765e+05 0.000010 0.000010 0.000010 0.000010 0.000010
50% 0.000000 2.105960e+05 3.000000e+05 1.000000e-05 1.000000e-05 1.000000e-05 1.000000e-05 8.825000e+02 21937.000000 0.000010 9.138250e+05 6.082935e+05 3.619780e+05 1.000000e-05 9.805345e+05 17.500000 347.500000 4.000000 0.000010 114.000000
75% 0.000000 2.696675e+05 8.000000e+05 3.745862e+05 1.000000e-05 8.535500e+03 1.000000e-05 1.485770e+05 54234.500000 0.000010 1.885158e+06 1.683580e+06 8.571030e+05 1.000000e-05 2.372703e+06 53.000000 1623.000000 41.250000 14.000000 933.750000
max 1.000000 1.111258e+06 8.000000e+06 5.145434e+06 1.000000e-05 6.426990e+06 8.152500e+07 1.035973e+07 228763.000000 125034.000000 1.035598e+08 3.434838e+07 1.476169e+07 1.000000e-05 4.911008e+07 14368.000000 15149.000000 528.000000 609.000000 5521.000000

In [129]:
sp = data_df.plot.scatter(x = 'salary', y = 'deferral_payments', c = 'poi', edgecolors = 'Blue', 
                          s = 50)


The data looks a lot more sensible now. There are still two significant outliers, but they correspond to actual staff members (Jeffrey Skilling and Mark Frevert). As is often the case with financial data, we might need to opt for log scales in further exploration, but for now we are trying to identify outliers, so we will stick to linear scales. Let's continue plotting observations:


In [130]:
sp = data_df.plot.scatter(x = 'salary', y = 'bonus', c = 'poi', edgecolors = 'Blue', 
                          s = 50)



In [131]:
sp = data_df.plot.scatter(x = 'salary', y = 'expenses', c = 'poi', edgecolors = 'Blue', 
                          s = 50)



In [132]:
sp = data_df.plot.scatter(x = 'salary', y = 'total_payments', c = 'poi', edgecolors = 'Blue', 
                          s = 50)


Wow, now we have someone whose total payments are an order of magnitude above everyone else's. This is Kenneth Lay; the bulk of his payments comes from loan_advances. We will need to decide whether to keep him in the data or not... Let's see what this plot looks like with a logarithmic y-scale:


In [133]:
sp = data_df.plot.scatter(x = 'salary', y = 'total_payments', c = 'poi', edgecolors = 'Blue', 
                          s = 50)
sp.set_yscale('log')
sp.set_ylim(1.0e4, 1.5e8)


Out[133]:
(10000.0, 150000000.0)

Now there seems to be a correlation between the two variables.


In [134]:
sp = data_df.plot.scatter(x = 'salary', y = 'total_stock_value', c = 'poi', edgecolors = 'Blue', 
                          s = 50)


Again, Kenneth Lay stands out, with a total stock value of over $49 million. This is a real observation, so I decide to keep it for now.


In [135]:
sp = data_df.plot.scatter(x = 'salary', y = 'total_stock_value', c = 'poi', edgecolors = 'Blue', 
                          s = 50)
sp.set_yscale('log')
sp.set_ylim(1.0e4, 1.5e8)


Out[135]:
(10000.0, 150000000.0)

The two variables seem associated but the relationship may not be linear, even taking the log of total_stock_value.
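
A rank correlation gives a quick, scale-free check of such a possibly non-linear association; a minimal sketch (the choice of Spearman's rho is just an illustration):

# Spearman correlation is invariant to monotone transforms such as log,
# so it measures association even when the relationship is not linear.
print data_df[['salary', 'total_stock_value']].corr(method = 'spearman')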

Let's now look at email features:


In [136]:
sp = data_df.plot.scatter(x = 'to_messages', y = 'from_messages', c = 'poi', edgecolors = 'Blue', 
                          s = 50)


One employee stands out as extremely verbose! They sent almost 3 times as many messages as they received. Let's find out who they were:


In [137]:
data_df[data_df['from_messages'] == np.max(data_df['from_messages'])]


Out[137]:
poi salary bonus long_term_incentive deferred_income deferral_payments loan_advances other expenses director_fees total_payments exercised_stock_options restricted_stock restricted_stock_deferred total_stock_value from_messages to_messages from_poi_to_this_person from_this_person_to_poi shared_receipt_with_poi
KAMINSKI WINCENTY J 0.0 275101.0 400000.0 323466.0 0.00001 0.00001 0.00001 4669.0 83585.0 0.00001 1086821.0 850010.0 126027.0 0.00001 976037.0 14368.0 4607.0 41.0 171.0 583.0

The Wikipedia page about Vince Kaminski tells us he was Managing Director for Research and repeatedly voiced objections to Enron's practices, warning that a single event could trigger a cascade of provision clauses in creditor contracts that would quickly lead to the demise of Enron. He was unfortunately proved right... Would this explain the discrepancy between the number of emails sent and received? A detailed analysis of his emails might give us some insight, but that is outside the scope of this project.


In [138]:
sp = data_df.plot.scatter(x = 'from_poi_to_this_person', y = 'from_this_person_to_poi',
                          c = 'poi', edgecolors = 'Blue', 
                          s = 50)


Again, let's look at who the two outliers are:


In [139]:
data_df[data_df['from_this_person_to_poi'] == np.max(data_df['from_this_person_to_poi'])]


Out[139]:
poi salary bonus long_term_incentive deferred_income deferral_payments loan_advances other expenses director_fees total_payments exercised_stock_options restricted_stock restricted_stock_deferred total_stock_value from_messages to_messages from_poi_to_this_person from_this_person_to_poi shared_receipt_with_poi
DELAINEY DAVID W 1.0 365163.0 3000000.0 1294981.0 0.00001 0.00001 0.00001 1661.0 86174.0 0.00001 4747979.0 2291113.0 1323148.0 0.00001 3614261.0 3069.0 3093.0 66.0 609.0 2097.0

In [140]:
data_df[data_df['from_poi_to_this_person'] == np.max(data_df['from_poi_to_this_person'])]


Out[140]:
poi salary bonus long_term_incentive deferred_income deferral_payments loan_advances other expenses director_fees total_payments exercised_stock_options restricted_stock restricted_stock_deferred total_stock_value from_messages to_messages from_poi_to_this_person from_this_person_to_poi shared_receipt_with_poi
LAVORATO JOHN J 0.0 339288.0 8000000.0 2035380.0 0.00001 0.00001 0.00001 1552.0 49537.0 0.00001 10425757.0 4158995.0 1008149.0 0.00001 5167144.0 2585.0 7259.0 528.0 411.0 3962.0

David Delainey is a POI; in fact, he was among the first Enron employees to be convicted. John Lavorato is not a POI and seems to have privately expressed concerns about some of Enron's behaviour.

  • Look at individual variable distributions:

In this section we will plot histograms for each feature variable and apply log transformations where required to make them closer to a normal distribution, which helps many machine learning models.


In [141]:
fig, ax = plt.subplots()
data_df.hist('salary', ax = ax, bins = 50)
ax.set_ylim(0, 20)


Out[141]:
(0, 20)

Many salaries are 0 -- presumably people who were not salaried Enron employees. Excluding those zero values, salaries are distributed in a fairly normal way.
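
To back up the "fairly normal" impression, a minimal sketch looking at the skewness of the non-placeholder salaries (the 1.e-4 threshold simply filters out the 1.e-5 placeholder):

# Keep salaries that are not the fill-in placeholder, then compute skewness;
# values near 0 indicate a roughly symmetric distribution.
real_salaries = data_df.loc[data_df['salary'] > 1.e-4, 'salary']
print "Non-placeholder salaries:", len(real_salaries)
print "Skewness:", real_salaries.skew()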


In [142]:
fig, ax = plt.subplots()
data_df.hist('deferral_payments', ax = ax, bins = np.logspace(np.log10(1.e-5),
                                                              np.log10(2.e6),
                                                              50))
ax.set_xscale('log')
ax.set_xlim(1e3, None)
ax.set_ylim(0, 10)


Out[142]:
(0, 10)

For deferral_payments, I had to use a log transformation to get something resembling a normal distribution. Note that the count of non-zero values is low.


In [143]:
fig, ax = plt.subplots()
data_df.hist('total_payments', ax = ax, bins = np.logspace(np.log10(1.e-5),
                                                              np.log10(2.e8),
                                                              50))
ax.set_xscale('log')
ax.set_xlim(1.e2, None)
ax.set_ylim(0, None)


Out[143]:
(0, 33.600000000000001)

Again, a logarithmic transformation is required to have a roughly normal distribution of the non-zero values.


In [144]:
fig, ax = plt.subplots()
data_df.hist('loan_advances', ax = ax, bins = 50)


Out[144]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7ffa320a9b90>], dtype=object)

The number of non-zero values does not justify using this predictor in the model.
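
To make this kind of argument concrete, a minimal sketch counting, for each column, the values whose magnitude exceeds the 1.e-5 placeholder:

# Columns with very low counts (e.g. loan_advances) carry little usable signal.
print (data_df.abs() > 1.e-4).sum(axis = 0).sort_values()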


In [145]:
fig, ax = plt.subplots()
data_df.hist('bonus', ax = ax, bins = np.logspace(np.log10(1.e-5),
                                                              np.log10(2.e6),
                                                              100))
ax.set_xscale('log')
ax.set_xlim(1e4, None)
ax.set_ylim(0, 20)


Out[145]:
(0, 20)

Here again, I used a log transformation.


In [146]:
fig, ax = plt.subplots()
data_df.hist('restricted_stock_deferred', ax = ax, bins = 50)


Out[146]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7ffa31e5e550>], dtype=object)

Once again, it does not look like this feature will be terribly useful as it has very few non-zero observations.

  • Look for associations between features and labels:

In this section, we will boxplot the poi label against the feature variables to try and find the most relevant features to select in our model.


In [147]:
fig, ax = plt.subplots(2, 2, sharey = False, figsize = (10, 15))
bp = data_df.boxplot(['salary', 'bonus', 'expenses', 'director_fees'], by = 'poi', 
                      ax = ax)


It looks like salary and bonus might be good predictors. expenses seems less significant. Finally, director_fees seems useful because none of the POIs appear to have received any director fees. However, this also applies to many non-POIs, so it is not enough for perfect prediction. Moreover, very few employees received these fees, so the information might not be very significant.


In [148]:
data_df.loc[:, ['director_fees', 'poi']]


Out[148]:
director_fees poi
ALLEN PHILLIP K 0.00001 0.0
BADUM JAMES P 0.00001 0.0
BANNANTINE JAMES M 0.00001 0.0
BAXTER JOHN C 0.00001 0.0
BAY FRANKLIN R 0.00001 0.0
BAZELIDES PHILIP J 0.00001 0.0
BECK SALLY W 0.00001 0.0
BELDEN TIMOTHY N 0.00001 1.0
BELFER ROBERT 102500.00000 0.0
BERBERIAN DAVID 0.00001 0.0
BERGSIEKER RICHARD P 0.00001 0.0
BHATNAGAR SANJAY 0.00001 0.0
BIBI PHILIPPE A 0.00001 0.0
BLACHMAN JEREMY M 0.00001 0.0
BLAKE JR. NORMAN P 113784.00000 0.0
BOWEN JR RAYMOND M 0.00001 1.0
BROWN MICHAEL 0.00001 0.0
BUCHANAN HAROLD G 0.00001 0.0
BUTTS ROBERT H 0.00001 0.0
BUY RICHARD B 0.00001 0.0
CALGER CHRISTOPHER F 0.00001 1.0
CARTER REBECCA C 0.00001 0.0
CAUSEY RICHARD A 0.00001 1.0
CHAN RONNIE 98784.00000 0.0
CHRISTODOULOU DIOMEDES 0.00001 0.0
CLINE KENNETH W 0.00001 0.0
COLWELL WESLEY 0.00001 1.0
CORDES WILLIAM R 0.00001 0.0
COX DAVID 0.00001 0.0
CUMBERLAND MICHAEL S 0.00001 0.0
... ... ...
SAVAGE FRANK 125034.00000 0.0
SCRIMSHAW MATTHEW 0.00001 0.0
SHANKMAN JEFFREY A 0.00001 0.0
SHAPIRO RICHARD S 0.00001 0.0
SHARP VICTORIA T 0.00001 0.0
SHELBY REX 0.00001 1.0
SHERRICK JEFFREY B 0.00001 0.0
SHERRIFF JOHN R 0.00001 0.0
SKILLING JEFFREY K 0.00001 1.0
STABLER FRANK 0.00001 0.0
SULLIVAN-SHAKLOVITZ COLLEEN 0.00001 0.0
SUNDE MARTIN 0.00001 0.0
TAYLOR MITCHELL S 0.00001 0.0
THORN TERENCE H 0.00001 0.0
TILNEY ELIZABETH A 0.00001 0.0
UMANOFF ADAM S 0.00001 0.0
URQUHART JOHN A 36666.00000 0.0
WAKEHAM JOHN 109298.00000 0.0
WALLS JR ROBERT H 0.00001 0.0
WALTERS GARETH W 0.00001 0.0
WASAFF GEORGE 0.00001 0.0
WESTFAHL RICHARD K 0.00001 0.0
WHALEY DAVID A 0.00001 0.0
WHALLEY LAWRENCE G 0.00001 0.0
WHITE JR THOMAS E 0.00001 0.0
WINOKUR JR. HERBERT S 108579.00000 0.0
WODRASKA JOHN 0.00001 0.0
WROBEL BRUCE 0.00001 0.0
YEAGER F SCOTT 0.00001 1.0
YEAP SOON 0.00001 0.0

144 rows × 2 columns

Let's continue our investigation with the next set of predictors. For these, a logarithmic y-scale is more appropriate:


In [149]:
data_df.loc[:, 'deferred_income'] = np.abs(data_df.loc[:, 'deferred_income'])

In [150]:
fig, axes = plt.subplots(2, 3, sharey = False, figsize = (15, 15))
bp = data_df.boxplot(['deferral_payments', 'loan_advances', 'deferred_income', 
                       'long_term_incentive', 'other', 'total_payments'], by = 'poi', 
                      ax = axes)
for i in range(2):
    for j in range(3):
        axes[i][j].set_yscale('log')
        axes[i][j].set_ylim(1000, None)


Some of these variables have very few non-zero values, loan_advances for instance. deferral_payments might not be a very strong predictor, but the other variables all seem significant. However, we need to be wary of duplicating information, so we should probably not include total_payments (which is the sum of all the other pay-related features), or conversely we should keep only the total and not (all of) the elements making it up.
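
To gauge how much of total_payments is duplicated by its components, a minimal sketch of the pairwise correlations (using the column names from the dataset):

# Components that correlate strongly with the total carry largely redundant information.
pay_components = ['salary', 'bonus', 'long_term_incentive', 'deferred_income',
                  'deferral_payments', 'loan_advances', 'other', 'expenses',
                  'director_fees']
print data_df[pay_components + ['total_payments']].corr()['total_payments']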

Let's continue with the stock value features:


In [151]:
data_df.loc[:, 'restricted_stock_deferred'] = np.abs(data_df.loc[:, 'restricted_stock_deferred'])

In [152]:
fig, axes = plt.subplots(2, 2, sharey = False, figsize = (10, 10))
bp = data_df.boxplot(['restricted_stock_deferred', 'exercised_stock_options', 
                      'restricted_stock', 'total_stock_value'], by = 'poi', 
                      ax = axes)
for i in range(2):
    for j in range(2):
        axes[i][j].set_yscale('log')
        axes[i][j].set_ylim(1000, None)


It appears that only non-POIs have non-zero values for restricted_stock_deferred; however, the number of non-zero observations is low. The other predictors all seem useful, but total_stock_value is the sum of them, so we may need to choose between keeping the total and keeping the individual predictors that make it up.

Finally, let's have a look at the email features:


In [153]:
fig, axes = plt.subplots(1, 5, sharey = False, figsize = (20, 10))
bp = data_df.boxplot(['to_messages', 'from_messages', 'from_poi_to_this_person', 
                      'from_this_person_to_poi', 'shared_receipt_with_poi'], by = 'poi', 
                      ax = axes)
for i in range(5):
    axes[i].set_yscale('log')
    axes[i].set_ylim(1, None)


All these predictors seem relevant for predicting the POI status of a member of staff. Note that there is a degree of circularity in these features: to predict whether someone is a POI, we look at whether they sent emails to or received emails from other POIs, which implies that we already know who the POIs are...

Synthetic features

Given the small dataset size, I would like to keep the number of predictors as low as possible. To that end, I will try to aggregate some of the variables in a meaningful way. My first idea is to use the email features to build a sent/received ratio, a total email count, and a count of emails involving POIs.


In [154]:
extended_data = data_df.loc[:, ['poi', 'salary', 'bonus', 'expenses', 'director_fees']]
extended_data.loc[:, 'sent_vs_received'] = data_df.loc[:, 'from_messages'] / data_df.loc[:, 'to_messages']
extended_data.loc[:, 'total_emails']= data_df.loc[:, 'from_messages'] + data_df.loc[:, 'to_messages']
extended_data.loc[:, 'emails_with_poi'] = data_df.loc[:, 'from_this_person_to_poi'] + \
    data_df.loc[:, 'from_poi_to_this_person'] + \
    data_df.loc[:, 'shared_receipt_with_poi']

Let's see if these variables teach us anything:


In [155]:
fig, axes = plt.subplots(1, 3, sharey = False, figsize = (15, 10))
bp = extended_data.boxplot(['sent_vs_received', 'total_emails', 'emails_with_poi'], by = 'poi', 
                           ax = axes)


The boxes for POIs and non-POIs in the first plot overlap quite a lot, meaning the predictor might not be as useful as others, but the median values are quite different. It seems that, on average, POIs tend to send far fewer emails than they receive, which is intuitively consistent with senior executives being cc'd on a lot of conversations.

Feature selection

Based on the analysis above, I will select the features for my first model, bearing in mind the need to keep the feature count as low as possible. Some variables also need to be converted to their logarithms.

Note: By accident, I discovered that keeping both

Selected features:

  • salary
  • log(bonus)
  • expenses
  • director_fees
  • log(deferred_income)
  • log(long_term_incentive)
  • log(other)
  • log(restricted_stock_deferred)
  • log(total_stock_value)
  • sent_vs_received
  • total_emails
  • emails_with_poi

In [156]:
# Create and export final dataset
extended_data.loc[:, 'log_bonus'] =  np.log(data_df.loc[:, 'bonus'])
extended_data.loc[:, 'log_deferred_income'] =  np.log(data_df.loc[:, 'deferred_income'])
extended_data.loc[:, 'log_long_term_incentive'] =  np.log(data_df.loc[:, 'long_term_incentive'])
extended_data.loc[:, 'log_other'] =  np.log(data_df.loc[:, 'other'])
extended_data.loc[:, 'log_restricted_stock_deferred'] =  np.log(data_df.loc[:, 'restricted_stock_deferred'])
extended_data.loc[:, 'log_total_stock_value'] =  np.log(data_df.loc[:, 'total_stock_value'])

# List of features used in the model
features_list = list(extended_data.columns)

# Put dataset into the dict format expected by the test module
my_data_dict = extended_data.to_dict(orient = 'index')

print features_list


['poi', 'salary', 'bonus', 'expenses', 'director_fees', 'sent_vs_received', 'total_emails', 'emails_with_poi', 'log_bonus', 'log_deferred_income', 'log_long_term_incentive', 'log_other', 'log_restricted_stock_deferred', 'log_total_stock_value']
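
With the data back in dictionary form, here is a minimal sketch of how these objects could be handed to the tester module defined earlier (the GaussianNB classifier is only a placeholder assumption, not the final model):

# Placeholder classifier -- any sklearn estimator could be substituted here.
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
dump_classifier_and_data(clf, my_data_dict, features_list)  # writes the three .pkl files
test_classifier(clf, my_data_dict, features_list)           # prints accuracy, precision, recall, F1, F2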

In [157]:
extended_data.describe()


Out[157]:
poi salary bonus expenses director_fees sent_vs_received total_emails emails_with_poi log_bonus log_deferred_income log_long_term_incentive log_other log_restricted_stock_deferred log_total_stock_value
count 144.000000 1.440000e+02 1.440000e+02 144.000000 144.000000 144.000000 144.000000 144.000000 144.000000 144.000000 144.000000 144.000000 144.000000 144.000000
mean 0.125000 1.854460e+05 6.759974e+05 36355.541670 9711.923620 0.510613 1602.138897 765.993068 2.590001 -3.555978 -0.423050 1.870647 -8.711528 10.683963
std 0.331873 1.970421e+05 1.233155e+06 45990.417222 30422.534937 0.498352 3226.505841 1177.218110 12.499789 11.177331 12.285673 10.717991 7.694978 8.786104
min 0.000000 1.000000e-05 1.000000e-05 0.000010 0.000010 0.005657 0.000020 0.000030 -11.512925 -11.512925 -11.512925 -11.512925 -11.512925 -11.512925
25% 0.000000 1.000000e-05 1.000000e-05 0.000010 0.000010 0.043209 0.000020 0.000030 -11.512925 -11.512925 -11.512925 -11.512925 -11.512925 12.454355
50% 0.000000 2.105960e+05 3.000000e+05 21937.000000 0.000010 0.287378 378.500000 126.500000 12.611538 -11.512925 -11.512925 6.782712 -11.512925 13.795843
75% 0.000000 2.696675e+05 8.000000e+05 54234.500000 0.000010 1.000000 1762.250000 1000.250000 13.592367 10.572656 12.833577 11.908832 -11.512925 14.679117
max 1.000000 1.111258e+06 8.000000e+06 228763.000000 125034.000000 3.118732 19513.000000 5857.000000 15.894952 15.069526 15.453620 16.153437 14.772747 17.709575

In [158]:
extended_data.columns


Out[158]:
Index([u'poi', u'salary', u'bonus', u'expenses', u'director_fees',
       u'sent_vs_received', u'total_emails', u'emails_with_poi', u'log_bonus',
       u'log_deferred_income', u'log_long_term_incentive', u'log_other',
       u'log_restricted_stock_deferred', u'log_total_stock_value'],
      dtype='object')