In [114]:
#!/usr/bin/python
"""
A general tool for converting data from the
dictionary format to an (n x k) python list that's
ready for training an sklearn algorithm
n--no. of key-value pairs in dictionary
k--no. of features being extracted
dictionary keys are names of persons in dataset
dictionary values are dictionaries, where each
key-value pair in the dict is the name
of a feature, and its value for that person
In addition to converting a dictionary to a numpy
array, you may want to separate the labels from the
features--this is what targetFeatureSplit is for
so, if you want to have the poi label as the target,
and the features you want to use are the person's
salary and bonus, here's what you would do:
feature_list = ["poi", "salary", "bonus"]
data_array = featureFormat( data_dictionary, feature_list )
label, features = targetFeatureSplit(data_array)
the line above (targetFeatureSplit) assumes that the
label is the _first_ item in feature_list--very important
that poi is listed first!
"""
import numpy as np
import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
#from feature_format import featureFormat, targetFeatureSplit
def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """Convert a dict-of-dicts into a numpy array of feature rows.

    Parameters:
        dictionary: {person_name: {feature_name: value}} data structure.
        features: list of feature names to extract, in output column order.
        remove_NaN: convert the string "NaN" to 0.0 before float conversion.
        remove_all_zeroes: omit data points whose extracted features are all 0.
        remove_any_zeroes: omit data points with ANY extracted feature equal to 0.
        sort_keys: True sorts keys alphabetically; a string is treated as the
            path of a pickle file holding a preset key order (used for
            Python 3 compatibility of the mini-projects).

    Returns a 2-D numpy array (one row per retained data point), or None if a
    requested feature name is missing from the data.

    NOTE: the first feature is assumed to be 'poi' and is never used as a
    zero-removal criterion.

    Fixes vs. the original: the inner re-import of pickle is dropped (pickle
    is already imported at module level), the pickle file handle is closed via
    `with` (was leaked), each feature is looked up once instead of twice, and
    the dead `"NaN"` comparisons against already-floated values are removed.
    """
    return_list = []

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        with open(sort_keys, "rb") as key_file:
            keys = pickle.load(key_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                # Mirrors the original behavior: report and bail out with None.
                print("error: key %s not present" % feature)
                return
            if value == "NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # Logic for deciding whether or not to add the data point.
        append = True
        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = any(item != 0 for item in test_list)
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes:
            if 0 in test_list:
                append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append( np.array(tmp_list) )

    return np.array(return_list)
def targetFeatureSplit( data ):
    """Split feature rows into labels and features.

    Given an iterable of rows like the one returned by featureFormat,
    separate out the first column into its own list (this should be the
    quantity you want to predict) and return (targets, features) as two
    separate lists. sklearn generally accepts both lists and numpy arrays
    as training/prediction input.
    """
    target = [row[0] for row in data]
    features = [row[1:] for row in data]
    return target, features
In [115]:
#!/usr/bin/pickle
""" a basic script for importing student's POI identifier,
and checking the results that they get from it
requires that the algorithm, dataset, and features list
be written to my_classifier.pkl, my_dataset.pkl, and
my_feature_list.pkl, respectively
that process should happen at the end of poi_id.py
"""
import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
#from feature_format import featureFormat, targetFeatureSplit
# Tab-separated report templates for the evaluation summary.
PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
\tFalse negatives: {:4d}\tTrue negatives: {:4d}"

def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Evaluate clf on dataset with stratified shuffle-split cross-validation.

    Aggregates the confusion-matrix counts over `folds` random splits and
    prints accuracy, precision, recall, F1 and F2 (F-beta with beta=2, which
    weights recall higher than precision).

    Fixes vs. the original: the bare `except:` is narrowed to
    ZeroDivisionError (a bare except also hid unrelated failures), and the
    debug prints dumping the full labels/features lists are removed.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # Old sklearn.cross_validation API: the splitter is built from the labels
    # themselves and iterated directly.
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # NOTE(review): this only abandons the current fold's
                # predictions, matching the original behavior.
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out: %s" % clf)
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
# Filenames shared by the dump/load helpers below.
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"

def dump_classifier_and_data(clf, dataset, feature_list):
    """Serialize the classifier, dataset and feature list to pickle files.

    Fix vs. the original: files are opened in binary mode ("wb"), which
    pickle requires — text mode corrupts the stream on Windows under
    Python 2 and fails outright under Python 3.
    """
    with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)
def load_classifier_and_data():
    """Load the pickled classifier, dataset and feature list.

    Returns (clf, dataset, feature_list) read from the module-level
    filenames. Fix vs. the original: files are opened in binary mode
    ("rb"), which pickle requires (text mode fails under Python 3 and
    corrupts on Windows under Python 2).
    """
    with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list
def main():
    """Load the student's classifier, dataset and feature list, then run
    the testing script on them."""
    stored_clf, stored_dataset, stored_features = load_classifier_and_data()
    test_classifier(stored_clf, stored_dataset, stored_features)

# Entry point kept disabled for notebook use, as in the original:
#if __name__ == '__main__':
# main()
In [116]:
# Plotting/analysis imports for the exploratory section below.
import matplotlib.pyplot as plt
import pandas as pd
# IPython magic: render matplotlib figures inline in the notebook.
% matplotlib inline
# Load data from pickle files:
with open('final_project_dataset.pkl', 'rb') as f:
    data_dict = pickle.load(f)
In [117]:
# Quick structural overview of the raw dataset (Python 2 print statements).
print "Dataset type:", type(data_dict)
print "Number of key-value pairs in dictionary:", len(data_dict)
print "List of keys in dictionary:", data_dict.keys()
print "Number of elements in a key-value pair:", len(data_dict['SHANKMAN JEFFREY A'])
print "Example of contents of a key-value pair:", data_dict['SHANKMAN JEFFREY A']
This shows that the dataset is stored as a dictionary. Each person in the dataset is represented by a key-value pair. There are 146 such pairs.
The value is itself a dictionary containing 21 key-value pairs corresponding to the financial and email features.
In [118]:
# Candidate features: the 'poi' label first, then financial features,
# then email features.
features_list = ['poi', 'salary', 'bonus', 'long_term_incentive', 'deferred_income', 'deferral_payments',
                 'loan_advances','other', 'expenses', 'director_fees', 'total_payments',
                 'exercised_stock_options', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value',
                 'from_messages', 'to_messages', 'from_poi_to_this_person', 'from_this_person_to_poi',
                 'shared_receipt_with_poi']
# One row per person; dtype=float turns the "NaN" strings into real NaN values.
data_df = pd.DataFrame.from_dict(data_dict, orient = 'index', dtype = float)
Let's have a look at missing values:
In [119]:
# Missing-value count per feature (column), most-missing first.
data_df.isnull().sum(axis = 0).sort_values(ascending = False)
Out[119]:
The column loan_advances has 142 missing values out of 146 observations. It is unlikely to be useful in the model we are trying to build.
In [120]:
# Missing-value count per person (row), most-missing first.
data_df.isnull().sum(axis =1).sort_values(ascending = False)
Out[120]:
In [121]:
# Inspect the row that appears to be entirely missing.
data_df.loc['LOCKHART EUGENE E', :]
Out[121]:
The entry 'Eugene Lockhart' has only NAs, except for poi which has a meaningful value. This matches the content of the file enron61702insiderpay.pdf, which shows that all his values are zero. In a sense, this person is an outlier; however, we have to decide whether we want to retain him in the data or not. In other words, do we believe that this is a correct observation or an error, and if we think it is correct, is it useful to keep an observation that has only zeros?
My view on this is that the observation is probably correct (there is a number of other individuals with very few non-zero features) and might be useful to the model, so I will retain it.
In this dataset, missing values obviously mean zero. However, when working with financial data, one often has to convert values to their logarithm. With zeros and negative numbers, this leads to undefined values. We will therefore replace all NAs with a very small number (1.e-5).
In [122]:
# Replace NaN with a tiny positive value (1e-5) so later log-transforms stay
# defined, then keep only the selected features, in features_list order.
data_df.fillna(1.e-5, inplace = True)
data_df = data_df[features_list]
data_df.describe()
Out[122]:
restricted_stock_deferred seems to have negative values only according to enron61702insiderpay.pdf, however its maximum value is $15,456,290. Let's investigate:
In [123]:
# Find the row holding the (unexpectedly positive) maximum of restricted_stock_deferred.
data_df[data_df['restricted_stock_deferred'] == np.max(data_df['restricted_stock_deferred'])]
Out[123]:
When comparing these values with the pdf file, I realize that the data is shifted to the left by one column, hence the errors. Presumably, there might be other such occurrences so I now need to go through the data and manually fix these.
In [124]:
# Consistency check: flag rows where the payment components do not sum to
# total_payments, or the stock components do not sum to total_stock_value
# (np.floor absorbs the 1e-5 fill values).
data_df[(np.floor(data_df['salary'] + data_df['bonus'] + data_df['long_term_incentive'] + data_df['deferred_income'] + \
         data_df['deferral_payments'] + data_df['loan_advances'] + data_df['other'] + data_df['expenses'] + \
         data_df['director_fees']) != np.floor(data_df['total_payments'])) | \
        (np.floor(data_df['exercised_stock_options'] + data_df['restricted_stock'] + \
         data_df['restricted_stock_deferred']) != np.floor(data_df['total_stock_value']))]
Out[124]:
There are only two problematic observations. Let's correct them manually:
In [125]:
# Repair the two rows whose values are shifted by one column relative to
# enron61702insiderpay.pdf. Positional indexing is used because the shift is
# defined by column position, not column name.
# Fix vs. the original: pandas `.ix` (deprecated and removed in modern
# pandas) is replaced by Index.get_loc + .iloc, and Python-2-only `xrange`
# by `range`; the assignments performed are identical.
# Robert Belfer: values shifted — pull each column's value from the one to its right.
belfer_row = data_df.index.get_loc('BELFER ROBERT')
for j in range(1, 14):
    data_df.iloc[belfer_row, j] = data_df.iloc[belfer_row, j + 1]
data_df.iloc[belfer_row, 14] = 1.e-5
# Sanjay Bhatnagar: values shifted the other way — pull from the column to the left.
bhatnagar_row = data_df.index.get_loc('BHATNAGAR SANJAY')
for j in range(14, 2, -1):
    data_df.iloc[bhatnagar_row, j] = data_df.iloc[bhatnagar_row, j - 1]
data_df.iloc[bhatnagar_row, 1] = 1.e-5
data_df.loc[['BELFER ROBERT', 'BHATNAGAR SANJAY']]
Out[125]:
This confirms we successfully cleaned up the data.
In our list of DataFrame indexes shown above, we can see a name that is obviously not a real person: 'THE TRAVEL AGENCY IN THE PARK'. Some research show that this is a travel agency that was contracted to Enron while related to the wife of one of Enron's executives. There might be conflict of interest here, but we since we are investigating persons and not suppliers, I chose to drop this observation.
In [126]:
# 'THE TRAVEL AGENCY IN THE PARK' is not a person -- drop it.
data_df = data_df.drop(['THE TRAVEL AGENCY IN THE PARK'])
I will now make a scatter plot of the first two variables:
In [127]:
# Scatter of salary vs deferral_payments, colored by POI status.
sp = data_df.plot.scatter(x = 'salary', y = 'deferral_payments', c = 'poi', edgecolors = 'Blue',
                          s = 50)
There is an observation that immediately stands out. It corresponds to the highest values of both salary and deferral_payments. When checking the numbers against the document named enron61702insiderpay.pdf, we see that these values correspond to the 'TOTAL' line and are therefore an artefact of the data collection process rather than an actual observation.
In [128]:
# Drop the 'TOTAL' row: it is a spreadsheet summation artefact, not a person.
data_df = data_df.drop(['TOTAL'])
data_df.describe()
Out[128]:
In [129]:
# Re-plot salary vs deferral_payments after removing 'TOTAL'.
sp = data_df.plot.scatter(x = 'salary', y = 'deferral_payments', c = 'poi', edgecolors = 'Blue',
                          s = 50)
The data looks a lot more sensible now. There are still two significant outliers but they correspond to actual staff members (Jeffrey Skilling and Mark Frevert). As often with financial data, we might need to opt for log scales in further exploration. But for now, we are trying to identify outliers so we will stick to linear scales. Let's continue to plot observations:
In [130]:
# salary vs bonus.
sp = data_df.plot.scatter(x = 'salary', y = 'bonus', c = 'poi', edgecolors = 'Blue',
                          s = 50)
In [131]:
# salary vs expenses.
sp = data_df.plot.scatter(x = 'salary', y = 'expenses', c = 'poi', edgecolors = 'Blue',
                          s = 50)
In [132]:
# salary vs total_payments.
sp = data_df.plot.scatter(x = 'salary', y = 'total_payments', c = 'poi', edgecolors = 'Blue',
                          s = 50)
Wow, now we have someone whose total payments is one order of magnitude above everyone else's. This is Kenneth Lay; the bulk of the payments come from Loan Advances. We will need to make a decision as to whether we want to keep him in the data or not... Let's see what this plot looks like with a logarithmic y-scale:
In [133]:
# Same plot with a logarithmic y-scale to compress the huge payment range.
sp = data_df.plot.scatter(x = 'salary', y = 'total_payments', c = 'poi', edgecolors = 'Blue',
                          s = 50)
sp.set_yscale('log')
sp.set_ylim(1.0e4, 1.5e8)
Out[133]:
Now there seems to be a correlation between the two variables.
In [134]:
# salary vs total_stock_value.
sp = data_df.plot.scatter(x = 'salary', y = 'total_stock_value', c = 'poi', edgecolors = 'Blue',
                          s = 50)
Again, Kenneth Lay stands out with a total stock value of over $49mil. This is a real observation, so I decide to keep it for now.
In [135]:
# Same plot with a logarithmic y-scale.
sp = data_df.plot.scatter(x = 'salary', y = 'total_stock_value', c = 'poi', edgecolors = 'Blue',
                          s = 50)
sp.set_yscale('log')
sp.set_ylim(1.0e4, 1.5e8)
Out[135]:
The two variables seem associated but the relationship may not be linear, even taking the log of total_stock_value.
Let's now look at email features:
In [136]:
# Email volume: messages received vs messages sent.
sp = data_df.plot.scatter(x = 'to_messages', y = 'from_messages', c = 'poi', edgecolors = 'Blue',
                          s = 50)
One employee stands out as extremely verbose! They sent almost 3 times as many messages as they received. Let's find out who they were:
In [137]:
# Identify the person who sent the most emails.
data_df[data_df['from_messages'] == np.max(data_df['from_messages'])]
Out[137]:
The Wikipedia page about Vince Kaminski tells us he was Managing Director for Research and repeatedly voiced objections to Enron's practices, warning that a single event could trigger a cascade of provision clauses in creditor contracts that would quickly lead to the demise of Enron. He was unfortunately proved right... Would this explain the discrepancy between the number of emails sent and received? A detailed analysis of his emails might give us some insight into this but this is outside the scope of this project.
In [138]:
# Messages received from POIs vs messages sent to POIs.
sp = data_df.plot.scatter(x = 'from_poi_to_this_person', y = 'from_this_person_to_poi',
                          c = 'poi', edgecolors = 'Blue',
                          s = 50)
Again, let's look at who the two outliers are:
In [139]:
# Who sent the most emails to POIs?
data_df[data_df['from_this_person_to_poi'] == np.max(data_df['from_this_person_to_poi'])]
Out[139]:
In [140]:
# Who received the most emails from POIs?
data_df[data_df['from_poi_to_this_person'] == np.max(data_df['from_poi_to_this_person'])]
Out[140]:
David Delainey is a POI; in fact he was amongst the first convicted employees of Enron. John Lavorato is not a POI, and seems to have privately expressed concerns about some of Enron's behaviour.
In this section we will plot histograms for each feature variable and apply log transformations where required to make them closer to a normal distribution, which helps many machine learning models.
In [141]:
# Salary histogram; y-limit clipped so the large zero-bin does not dominate.
fig, ax = plt.subplots()
data_df.hist('salary', ax = ax, bins = 50)
ax.set_ylim(0, 20)
Out[141]:
Many salaries are 0 -- presumably, not current Enron employees. Salaries are distributed in a fairly normal way, if we exclude the 0 values.
In [142]:
# deferral_payments histogram on log-spaced bins (log x-scale).
fig, ax = plt.subplots()
data_df.hist('deferral_payments', ax = ax, bins = np.logspace(np.log10(1.e-5),
                                                              np.log10(2.e6),
                                                              50))
ax.set_xscale('log')
ax.set_xlim(1e3, None)
ax.set_ylim(0, 10)
Out[142]:
For deferral_payments, I had to use a log transformation to get something resembling a normal distribution. Note that the count of non-zero values is low.
In [143]:
# total_payments histogram on log-spaced bins (log x-scale).
fig, ax = plt.subplots()
data_df.hist('total_payments', ax = ax, bins = np.logspace(np.log10(1.e-5),
                                                           np.log10(2.e8),
                                                           50))
ax.set_xscale('log')
ax.set_xlim(1.e2, None)
ax.set_ylim(0, None)
Out[143]:
Again, a logarithmic transformation is required to have a roughly normal distribution of the non-zero values.
In [144]:
# loan_advances histogram (almost all values are the 1e-5 fill).
fig, ax = plt.subplots()
data_df.hist('loan_advances', ax = ax, bins = 50)
Out[144]:
The number of non-zero values does not justify using this predictor in the model.
In [145]:
# bonus histogram on log-spaced bins (log x-scale).
fig, ax = plt.subplots()
data_df.hist('bonus', ax = ax, bins = np.logspace(np.log10(1.e-5),
                                                  np.log10(2.e6),
                                                  100))
ax.set_xscale('log')
ax.set_xlim(1e4, None)
ax.set_ylim(0, 20)
Out[145]:
Here again, I used a log transformation.
In [146]:
# restricted_stock_deferred histogram.
fig, ax = plt.subplots()
data_df.hist('restricted_stock_deferred', ax = ax, bins = 50)
Out[146]:
In this section, we will boxplot the poi label against the feature variables to try and find the most relevant features to select in our model.
In [147]:
# Boxplots of four pay features split by the poi label.
fig, ax = plt.subplots(2, 2, sharey = False, figsize = (10, 15))
bp = data_df.boxplot(['salary', 'bonus', 'expenses', 'director_fees'], by = 'poi',
                     ax = ax)
It looks like salary and bonus might be good predictors. expenses seem less significant.
Finally, director_fees seems useful because none of the POI seems to have received any director fees. However, this also applies to many non-POIs so it is not enough for perfect prediction. Moreover, very few employees received these fees so the information might not be very significant.
In [148]:
# Inspect director_fees alongside the poi label.
data_df.loc[:, ['director_fees', 'poi']]
Out[148]:
Let's continue our investigation with the next set of predictors. For these predictors, a logarithmic y scale is more adequate:
In [149]:
# deferred_income is negative in the source data; take the absolute value so
# log-scale plots below are defined.
data_df.loc[:, 'deferred_income'] = np.abs(data_df.loc[:, 'deferred_income'])
In [150]:
# Boxplots of the remaining payment features by poi, on a log y-scale since
# these features span several orders of magnitude.
fig, axes = plt.subplots(2, 3, sharey = False, figsize = (15, 15))
bp = data_df.boxplot(['deferral_payments', 'loan_advances', 'deferred_income',
                      'long_term_incentive', 'other', 'total_payments'], by = 'poi',
                     ax = axes)
for i in range(2):
    for j in range(3):
        axes[i][j].set_yscale('log')
        axes[i][j].set_ylim(1000, None)
Some of these variables have very few non-zero values, such as loan advances for instance. deferral_payment might not be a very strong predictor, but the other variables all seem significant. However, we need to be wary of duplicating information, so we should probably not include total_payments (which is the sum of all other pay-related features), or conversely we should only keep the total but not (all) the elements making it up.
Let's continue with the stock value features:
In [151]:
# restricted_stock_deferred is negative in the source data; take the
# absolute value so log-scale plots below are defined.
data_df.loc[:, 'restricted_stock_deferred'] = np.abs(data_df.loc[:, 'restricted_stock_deferred'])
In [152]:
# Boxplots of the stock features by poi, on a log y-scale.
fig, axes = plt.subplots(2, 2, sharey = False, figsize = (10, 10))
bp = data_df.boxplot(['restricted_stock_deferred', 'exercised_stock_options',
                      'restricted_stock', 'total_stock_value'], by = 'poi',
                     ax = axes)
for i in range(2):
    for j in range(2):
        axes[i][j].set_yscale('log')
        axes[i][j].set_ylim(1000, None)
It appears that only non-POI have non-zero values for restricted_stock_deferred. However the number of non-zero observations is low. The other predictors all seem useful, but total_stock_value is the sum of all of them so we may need to choose whether to keep the total or the individual predictors that make it up.
Finally, let's have a look at the email features:
In [153]:
# Boxplots of the email features by poi, on a log y-scale.
fig, axes = plt.subplots(1, 5, sharey = False, figsize = (20, 10))
bp = data_df.boxplot(['to_messages', 'from_messages', 'from_poi_to_this_person',
                      'from_this_person_to_poi', 'shared_receipt_with_poi'], by = 'poi',
                     ax = axes)
for i in range(5):
    axes[i].set_yscale('log')
    axes[i].set_ylim(1, None)
All these predictors seem relevant to predict the POI status of a member of staff. Note that there seems to be a circular logic in these features: To predict whether or not someone is a POI, we look at whether they sent emails or received emails from other POIs, which implies that we already know if they are POIs or not...
Given the small dataset size, I would like to restrict the number of predictors to as low a number as possible. To that end, I will try to aggregate some of the variables in a meaningful way. My first idea is to use the email features to look at from / to ratios and total number of emails involving POIs.
In [154]:
# Start from a few raw features, then add engineered email features.
extended_data = data_df.loc[:, ['poi', 'salary', 'bonus', 'expenses', 'director_fees']]
# Ratio of messages sent to messages received.
extended_data.loc[:, 'sent_vs_received'] = data_df.loc[:, 'from_messages'] / data_df.loc[:, 'to_messages']
# Total email volume.
extended_data.loc[:, 'total_emails']= data_df.loc[:, 'from_messages'] + data_df.loc[:, 'to_messages']
# All email traffic involving a POI.
extended_data.loc[:, 'emails_with_poi'] = data_df.loc[:, 'from_this_person_to_poi'] + \
    data_df.loc[:, 'from_poi_to_this_person'] + \
    data_df.loc[:, 'shared_receipt_with_poi']
Let's see if these variables teach us anything:
In [155]:
# Boxplots of the engineered email features by poi.
fig, axes = plt.subplots(1, 3, sharey = False, figsize = (15, 10))
bp = extended_data.boxplot(['sent_vs_received', 'total_emails', 'emails_with_poi'], by = 'poi',
                           ax = axes)
The boxes for POI / non-POI in the first plot overlap quite a lot, meaning the predictor might not be as useful as others, but median values are quite different. It seems that on average, POI tend to send far less emails than they receive, which is intuitively consistent with senior executives being cc'd on a lot of conversations.
Based on the analysis above, I will make a feature selection for my first model, bearing in mind the need to keep the feature count as low as possible. There are also some variables that need to be converted to their logarithmic values.
Note: By accident, I discovered that keeping both
Selected features:
salary, bonus (log), expenses, director_fees, deferred_income (log), long_term_incentive (log), other (log), restricted_stock_deferred (log), total_stock_value (log), sent_vs_received, total_emails, emails_with_poi
In [156]:
# Create and export final dataset
extended_data.loc[:, 'log_bonus'] = np.log(data_df.loc[:, 'bonus'])
extended_data.loc[:, 'log_deferred_income'] = np.log(data_df.loc[:, 'deferred_income'])
extended_data.loc[:, 'log_long_term_incentive'] = np.log(data_df.loc[:, 'long_term_incentive'])
extended_data.loc[:, 'log_other'] = np.log(data_df.loc[:, 'other'])
extended_data.loc[:, 'log_restricted_stock_deferred'] = np.log(data_df.loc[:, 'restricted_stock_deferred'])
extended_data.loc[:, 'log_total_stock_value'] = np.log(data_df.loc[:, 'total_stock_value'])
# List of features used in the model
features_list = list(extended_data.columns)
# Put dataset into the dict format expected by the test module
my_data_dict = extended_data.to_dict(orient = 'index')
print features_list
In [157]:
# Summary statistics of the final feature set.
extended_data.describe()
Out[157]:
In [158]:
# Final column list (label + selected / engineered features).
extended_data.columns
Out[158]: