In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from tools import tester
from tools.tester import dump_classifier_and_data
from sklearn import preprocessing
from sklearn.svm import SVC
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
final_project_dataset.pkl
Financial data from the included enron61712insiderpay.pdf and aggregated email statistics have been combined into a dictionary in the included final_project_dataset.pkl file. In the dictionary, each key is a person's name, and the value is another dictionary containing the names of all the features and their values for that person. The features in the data fall into three major types: financial features, email features, and the POI label. A hypothetical entry is sketched after the feature lists below.
financial features:
# (all units are in US dollars)
[
'salary',
'deferral_payments',
'total_payments',
'loan_advances',
'bonus',
'restricted_stock_deferred',
'deferred_income',
'total_stock_value',
'expenses',
'exercised_stock_options',
'other',
'long_term_incentive',
'restricted_stock',
'director_fees'
]
email features:
# (units are generally the number of email messages; the notable exception is 'email_address', which is a text string)
[
'to_messages',
'email_address',
'from_poi_to_this_person',
'from_messages',
'from_this_person_to_poi',
'poi', # POI Label (boolean, represented as integer).
'shared_receipt_with_poi'
]
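To make the structure concrete, a single entry of the dictionary looks roughly like the following sketch. The name and all values here are hypothetical, for illustration only; the real entries carry every feature listed above.

# Hypothetical single entry (illustrative values only); the real dictionary has
# one such entry per person, keyed by name, with all of the features above.
example_entry = {
    'LASTNAME FIRSTNAME': {
        'salary': 250000,                  # financial features, in US dollars
        'bonus': 500000,
        'total_payments': 750000,
        'to_messages': 1200,               # email features, message counts
        'from_messages': 300,
        'from_poi_to_this_person': 40,
        'from_this_person_to_poi': 15,
        'shared_receipt_with_poi': 600,
        'email_address': 'first.last@enron.com',
        'poi': False,                      # POI label
    }
}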
In [2]:
cat ./poi_names.txt
This file contains a list of 35 people who were persons of interest in the Enron scandal. A POI is defined as someone who was indicted, who settled or reached a plea deal with the government, or who testified in exchange for prosecution immunity.
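A quick way to verify that count is sketched below. It assumes each POI occupies one line beginning with "(y)" or "(n)" (a flag for whether that person's emails appear in the corpus); adjust the filter if the file is laid out differently.

# Count the POI names in poi_names.txt.
# Assumption: each POI is on its own line prefixed with "(y)" or "(n)".
with open("./poi_names.txt") as f:
    poi_lines = [line for line in f if line.startswith("(")]
print(len(poi_lines))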
In [3]:
enron_data = pickle.load(open("./final_project_dataset.pkl", "rb"))
In [4]:
enron_data.iteritems().next()
Out[4]:
The dataset contains 146 people, each with 20 features plus a 'poi' label. To further summarize the features, we will convert the dictionary to a pandas DataFrame.
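A quick sanity check of those counts, as a sketch (written in the same Python 2 style as the iteritems/itervalues calls used in this notebook):

# Dataset dimensions: number of people, and keys per person (20 features + 'poi')
print(len(enron_data))                      # expected: 146
print(len(enron_data.itervalues().next()))  # expected: 21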
In [5]:
# Replace "Nan" with NaN
for columns in enron_data.itervalues():
for k,v in columns.iteritems():
if type(v) is str and v.lower() == "nan":
columns[k] = np.nan
In [6]:
enron_df = pd.DataFrame.from_dict(enron_data, orient="index")
enron_df
Out[6]:
There is a row named "TOTAL", which holds the column totals from the spreadsheet rather than a person. This row should be removed.
In [7]:
# Omit the TOTAL index
enron_df.drop('TOTAL', inplace=True)
Financial Features
In [8]:
enron_df.loc[:, ['salary',
'deferral_payments',
'total_payments',
'loan_advances',
'bonus',
'restricted_stock_deferred',
'deferred_income',]].describe()
Out[8]:
In [9]:
enron_df.loc[:, ['total_stock_value',
'expenses',
'exercised_stock_options',
'other',
'long_term_incentive',
'restricted_stock',
'director_fees']].describe()
Out[9]:
Email Features
In [10]:
enron_df.loc[:, ['to_messages',
'email_address',
'from_poi_to_this_person',
'from_messages',
'from_this_person_to_poi',
'shared_receipt_with_poi']].describe()
Out[10]:
Persons of Interest
In [11]:
enron_poi = enron_df[enron_df['poi']==True]
print("Number of POI's: " + str(len(enron_poi)))
enron_poi
Out[11]:
Number of NaNs:
In [12]:
enron_df.isnull().sum()
Out[12]:
In [13]:
sum(enron_df.isnull().sum())
Out[13]:
Since there were so many NaN entries, I went back to the PDF the data was derived from and noticed that entries marked '-' were being read in as NaN. To fix this, I will replace the NaNs with the value zero.
In [14]:
enron_df.fillna(0, inplace=True)
One outlier that was removed earlier was the "TOTAL" index, which represented the column sums rather than a person. Two other entries were invalid data points. One was "THE TRAVEL AGENCY IN THE PARK", which is not a person and certainly not a person of interest. The other was "LOCKHART EUGENE E", which had NaN values for all of its features.
In [15]:
# Drop email_address column
enron_df.drop('email_address', axis=1, inplace=True)
enron_df.drop("THE TRAVEL AGENCY IN THE PARK", inplace=True)
enron_df.drop("LOCKHART EUGENE E", inplace=True)
Indexes Removed
The 'email_address' column was also removed, since a text email address is not usable as a feature for classification.
Do POIs receive more emails from other POIs than non-POIs do?
In [16]:
enron_df['from_poi_ratio'] = enron_df['from_poi_to_this_person'] / enron_df['from_messages']
enron_df.fillna(0, inplace=True)
Do POIs write more emails to other POIs than non-POIs do?
In [17]:
enron_df['to_poi_ratio'] = enron_df['from_this_person_to_poi'] / enron_df['to_messages']
enron_df.fillna(0, inplace=True)
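To actually answer the two questions above, a quick check (a sketch, not part of the original run) is to compare the mean engineered ratios for POIs versus non-POIs:

# Mean email ratios grouped by the POI label
print(enron_df.groupby('poi')[['from_poi_ratio', 'to_poi_ratio']].mean())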
Do POIs have a bigger bonus-to-salary ratio?
In [18]:
enron_df['bonus_ratio'] = enron_df['bonus'] / enron_df['salary']
In [19]:
enron_df[['poi','bonus_ratio']]
Out[19]:
For the rows where bonus_ratio is NaN, the labels are more often POI than not; these values will still be filled with 0, since there appears to be only a weak correlation between POIs and a large bonus_ratio.
In [20]:
enron_df.fillna(0, inplace=True)
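As a rough check of the "weak correlation" claim above, one could compare group means and the linear correlation with the label (a sketch, not part of the original run):

# Mean bonus_ratio for POIs vs. non-POIs, and its correlation with the label
print(enron_df.groupby('poi')['bonus_ratio'].mean())
print(enron_df['bonus_ratio'].corr(enron_df['poi'].astype(int)))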
In [21]:
# Separate labels and features
enron_df_labels = enron_df['poi']
enron_df_features = enron_df[enron_df.columns.difference(['poi'])]
In [22]:
pipeline = Pipeline([
('kbest', SelectKBest()),
('gnb', GaussianNB())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {"kbest__k": [1, 2, 3, 5, 8, 13, 19], "kbest__score_func": [f_classif]}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)
Out[22]:
In [23]:
kbest = clf.best_estimator_.steps[0][1]
kbest.get_support()
features = sorted(zip(enron_df_features.columns, kbest.scores_, kbest.get_support()), key=lambda x: x[1])
my_list = [x[0] for x in features if x[2] == True]
my_list = ['poi'] + my_list
my_list
Out[23]:
In [24]:
data = enron_df[my_list].transpose().to_dict()
In [25]:
dump_classifier_and_data(GaussianNB(), data, my_list)
In [26]:
tester.main()
Using PCA instead of SelectKBest:
In [27]:
pipeline = Pipeline([
("scale", preprocessing.StandardScaler()),
('pca', PCA()),
('gnb', GaussianNB())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {
"pca__n_components": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19],
}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)
Out[27]:
In [28]:
pca = clf.best_estimator_.steps[1][1]
In [29]:
pca = clf.best_estimator_.steps[1][1]
pca.n_components
Out[29]:
In [30]:
pca_nb = Pipeline([
("scale", preprocessing.StandardScaler()),
('pca', PCA(n_components=pca.n_components)),
('gnb', GaussianNB())])
In [31]:
features_list = list(enron_df.columns)
features_list.remove('poi')
features_list = ['poi'] + features_list
In [32]:
dump_classifier_and_data(pca_nb, enron_df.transpose().to_dict(), features_list)
In [33]:
tester.main()
PCA in our case performs poorly compared to SelectKBest. This suggests that the directions of highest variance in the data are not the ones that best separate POIs from non-POIs.
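One way to probe this (a sketch; it reuses the `pca` object extracted from the grid search in In [29], which is still in scope here) is to look at how the variance spreads across the retained components:

# Explained variance of each component from the PCA fitted by the grid search;
# if a few components dominate, PCA is preserving overall variance, which is
# not necessarily the signal that discriminates POIs from non-POIs.
print(pca.explained_variance_ratio_)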
In [34]:
pipeline = Pipeline([
('kbest', SelectKBest()),
('dt', DecisionTreeClassifier())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {"kbest__k": [1, 2, 3, 5, 8, 13, 19], 'dt__max_features': [None, 'auto', 'log2'],
'dt__criterion': ['gini', 'entropy']}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)
Out[34]:
In [35]:
kbest = clf.best_estimator_.steps[0][1]
kbest.get_support()
features = sorted(zip(enron_df_features.columns, kbest.scores_, kbest.get_support()), key=lambda x: x[1])
my_list = [x[0] for x in features if x[2] == True]
my_list = ['poi'] + my_list
my_list
Out[35]:
In [36]:
clf.best_estimator_.steps[1][1]
Out[36]:
In [37]:
data = enron_df[my_list].transpose().to_dict()
dump_classifier_and_data(clf.best_estimator_.steps[1][1], data, my_list)
In [38]:
tester.main()
In [39]:
pipeline = Pipeline([
('dt', DecisionTreeClassifier())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {'dt__max_features': [1, 2, 3, 5, 8, 13, 19],
'dt__criterion': ['gini', 'entropy']}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)
Out[39]:
In [40]:
data = enron_df.transpose().to_dict()
dump_classifier_and_data(clf.best_estimator_.steps[0][1], data, features_list)
In [41]:
tester.main()
In [42]:
pipeline = Pipeline([
("scale", preprocessing.StandardScaler()),
('pca', PCA()),
('dt', DecisionTreeClassifier())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {"pca__n_components": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19],
'dt__max_features': [None, 'auto', 'log2'],
'dt__criterion': ['gini', 'entropy']}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)
Out[42]:
In [43]:
pca = clf.best_estimator_.steps[1][1]
pca.n_components
Out[43]:
In [44]:
pca_dt = Pipeline([
("scale", preprocessing.StandardScaler()),
('pca', PCA(n_components=pca.n_components)),
('dt', clf.best_estimator_.steps[2][1])])
In [45]:
dump_classifier_and_data(pca_dt, enron_df.transpose().to_dict(), features_list)
tester.main()