In [1]:
%load ../ud120-projects/datasets_questions/explore_enron_data.py
In [6]:
#!/usr/bin/python
"""
starter code for exploring the Enron dataset (emails + finances)
loads up the dataset (pickled dict of dicts)
the dataset has the form
enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }
{features_dict} is a dictionary of features associated with that person
you should explore features_dict as part of the mini-project,
but here's an example to get you started:
enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
"""
import pickle

# Use a context manager so the file handle is closed after loading;
# the original passed open() directly to pickle.load and leaked the handle.
# NOTE(review): mode "r" (text) only works under Python 2; Python 3 needs "rb".
with open("../ud120-projects/final_project/final_project_dataset.pkl", "r") as data_file:
    enron_data = pickle.load(data_file)
In [9]:
# Number of people (data points) in the dataset
len(enron_data.keys())
Out[9]:
In [12]:
# Number of features recorded for each person
len(enron_data.values()[0])
Out[12]:
In [15]:
# NOTE(review): Counter is imported but never used below
from collections import Counter
In [39]:
# Count persons flagged as persons of interest (poi == 1/True)
sum([enron_data[i]['poi']==1 for i in enron_data.keys()])
Out[39]:
In [40]:
%load ../ud120-projects/final_project/poi_names.txt
In [ ]:
http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm
(y) Lay, Kenneth
(y) Skilling, Jeffrey
(n) Howard, Kevin
(n) Krautz, Michael
(n) Yeager, Scott
(n) Hirko, Joseph
(n) Shelby, Rex
(n) Bermingham, David
(n) Darby, Giles
(n) Mulgrew, Gary
(n) Bayley, Daniel
(n) Brown, James
(n) Furst, Robert
(n) Fuhs, William
(n) Causey, Richard
(n) Calger, Christopher
(n) DeSpain, Timothy
(n) Hannon, Kevin
(n) Koenig, Mark
(y) Forney, John
(n) Rice, Kenneth
(n) Rieker, Paula
(n) Fastow, Lea
(n) Fastow, Andrew
(y) Delainey, David
(n) Glisan, Ben
(n) Richter, Jeffrey
(n) Lawyer, Larry
(n) Belden, Timothy
(n) Kopper, Michael
(n) Duncan, David
(n) Bowen, Raymond
(n) Colwell, Wesley
(n) Boyle, Dan
(n) Loehr, Christopher
In [62]:
# POI's with data: count the names marked "(y)" in poi_names.txt
with open('../ud120-projects/final_project/poi_names.txt', 'r') as n:
# Count POIs in poi_names.txt (Python 2 print statement)
print sum([i.split(" ")[0] == '(y)' for i in n.readlines()])
In [123]:
# POI's in both poi_names and our data dictionary
with open('../ud120-projects/final_project/poi_names.txt', 'r') as n:
# Skip the first two lines (the source URL and the blank line after it)
n.readline()
n.readline()
# "(y) Last, First" -> "last first" (drop the marker, comma, and newline)
text_names = [i.split(" ", 1)[1].strip('\n').lower().replace(',', '') for i in n.readlines()]
# Pull names out and lower case them for comparison
enron_data_names = [name.lower() for name in enron_data.keys()]
# Grab the first and last names, leaving out the middle initial at the end
enron_data_names = [substring.split(" ")[:2] for substring in enron_data_names]
# Combine the first/last names into one string
# NOTE(review): dataset keys are "LAST FIRST" while poi_names.txt is
# "Last, First" -- the intersection only matches where both orders agree
enron_data_names = [" ".join(i) for i in enron_data_names]
print set(enron_data_names) & set(text_names)
#print set(text_names)
#print ""
#print set(enron_data_names)
In [130]:
#Total POIs listed in poi_names.txt (names marked either "(n)" or "(y)")
with open('../ud120-projects/final_project/poi_names.txt', 'r') as n:
# Count POIs in poi_names.txt (Python 2 print statement)
print sum([i.split(" ")[0] in ['(n)', '(y)'] for i in n.readlines()])
In [41]:
%load ../ud120-projects/final_project/poi_email_addresses.py
In [ ]:
def poiEmails():
    """Return the list of known email addresses of Enron persons of interest.

    Bug fix: the original list was missing the comma after
    "michael.krautz@enron.com", so Python's implicit string-literal
    concatenation silently fused it with "scott.yeager@enron.com" into one
    bogus address -- both real addresses were effectively lost.
    """
    email_list = [
        "kenneth_lay@enron.net",
        "kenneth_lay@enron.com",
        "klay.enron@enron.com",
        "kenneth.lay@enron.com",
        "klay@enron.com",
        "layk@enron.com",
        "chairman.ken@enron.com",
        "jeffreyskilling@yahoo.com",
        "jeff_skilling@enron.com",
        "jskilling@enron.com",
        "effrey.skilling@enron.com",
        "skilling@enron.com",
        "jeffrey.k.skilling@enron.com",
        "jeff.skilling@enron.com",
        "kevin_a_howard.enronxgate.enron@enron.net",
        "kevin.howard@enron.com",
        "kevin.howard@enron.net",
        "kevin.howard@gcm.com",
        "michael.krautz@enron.com",   # comma added: was fused with the next entry
        "scott.yeager@enron.com",
        "syeager@fyi-net.com",
        "scott_yeager@enron.net",
        "syeager@flash.net",
        "joe'.'hirko@enron.com",
        "joe.hirko@enron.com",
        "rex.shelby@enron.com",
        "rex.shelby@enron.nt",
        "rex_shelby@enron.net",
        "jbrown@enron.com",
        "james.brown@enron.com",
        "rick.causey@enron.com",
        "richard.causey@enron.com",
        "rcausey@enron.com",
        "calger@enron.com",
        "chris.calger@enron.com",
        "christopher.calger@enron.com",
        "ccalger@enron.com",
        "tim_despain.enronxgate.enron@enron.net",
        "tim.despain@enron.com",
        "kevin_hannon@enron.com",
        "kevin'.'hannon@enron.com",
        "kevin_hannon@enron.net",
        "kevin.hannon@enron.com",
        "mkoenig@enron.com",
        "mark.koenig@enron.com",
        "m..forney@enron.com",
        "ken'.'rice@enron.com",
        "ken.rice@enron.com",
        "ken_rice@enron.com",
        "ken_rice@enron.net",
        "paula.rieker@enron.com",
        "prieker@enron.com",
        "andrew.fastow@enron.com",
        "lfastow@pdq.net",
        "andrew.s.fastow@enron.com",
        "lfastow@pop.pdq.net",
        "andy.fastow@enron.com",
        "david.w.delainey@enron.com",
        "delainey.dave@enron.com",
        "'delainey@enron.com",
        "david.delainey@enron.com",
        "'david.delainey'@enron.com",
        "dave.delainey@enron.com",
        "delainey'.'david@enron.com",
        "ben.glisan@enron.com",
        "bglisan@enron.com",
        "ben_f_glisan@enron.com",
        "ben'.'glisan@enron.com",
        "jeff.richter@enron.com",
        "jrichter@nwlink.com",
        "lawrencelawyer@aol.com",
        "lawyer'.'larry@enron.com",
        "larry_lawyer@enron.com",
        "llawyer@enron.com",
        "larry.lawyer@enron.com",
        "lawrence.lawyer@enron.com",
        "tbelden@enron.com",
        "tim.belden@enron.com",
        "tim_belden@pgn.com",
        "tbelden@ect.enron.com",
        "michael.kopper@enron.com",
        "dave.duncan@enron.com",
        "dave.duncan@cipco.org",
        "duncan.dave@enron.com",
        "ray.bowen@enron.com",
        "raymond.bowen@enron.com",
        "'bowen@enron.com",
        "wes.colwell@enron.com",
        "dan.boyle@enron.com",
        "cloehr@enron.com",
        "chris.loehr@enron.com",
    ]
    return email_list
In [135]:
# Total stock value belonging to James Prentice
enron_data['PRENTICE JAMES']['total_stock_value']
Out[135]:
In [138]:
# Emails sent from Wesley Colwell to persons of interest
enron_data['COLWELL WESLEY']['from_this_person_to_poi']
Out[138]:
In [148]:
# Look up Skilling's exact key without typing the middle initial
skilling = [i for i in enron_data.keys() if 'SKILLING' in i ][0]
enron_data[skilling]['exercised_stock_options']
Out[148]:
In [149]:
enron_data[skilling]['total_payments']
Out[149]:
In [153]:
# NOTE(review): [0] takes the first match -- assumes only one FASTOW key exists
fastow = [i for i in enron_data.keys() if 'FASTOW' in i ][0]
enron_data[fastow]['total_payments']
Out[153]:
In [163]:
lay = [i for i in enron_data.keys() if 'LAY KENNETH' in i ][0]
enron_data[lay]['total_payments']
Out[163]:
In [165]:
import pprint
# Dump the full feature dict of one person to see what's available
pprint.pprint(enron_data.values()[0])
In [209]:
def check_num_values(value, dict_):
    """Count how many people in the dataset have a real (non-"NaN") entry
    for a given feature.

    value -- feature key to inspect (e.g. 'salary', 'email_address')
    dict_ -- dataset dict of {person_name: feature_dict}
    Returns the number of people whose feature value is not the string "NaN".
    """
    # .values() works on both Python 2 and 3; the original used the
    # Python-2-only iteritems() and never used the key anyway.
    return sum(v[value] != 'NaN' for v in dict_.values())
In [210]:
# How many people have a known salary / email address (Python 2 prints)
print "Salaries: ", check_num_values('salary', enron_data)
print "Emails: ", check_num_values('email_address', enron_data)
In [189]:
%load ../ud120-projects/tools/feature_format.py
In [ ]:
#!/usr/bin/python
"""
A general tool for converting data from the
dictionary format to an (n x k) python list that's
ready for training an sklearn algorithm
n--no. of key-value pairs in dictonary
k--no. of features being extracted
dictionary keys are names of persons in dataset
dictionary values are dictionaries, where each
key-value pair in the dict is the name
of a feature, and its value for that person
In addition to converting a dictionary to a numpy
array, you may want to separate the labels from the
features--this is what targetFeatureSplit is for
so, if you want to have the poi label as the target,
and the features you want to use are the person's
salary and bonus, here's what you would do:
feature_list = ["poi", "salary", "bonus"]
data_array = featureFormat( data_dictionary, feature_list )
label, features = targetFeatureSplit(data_array)
the line above (targetFeatureSplit) assumes that the
label is the _first_ item in feature_list--very important
that poi is listed first!
"""
import numpy as np
def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False ):
    """ convert dictionary to numpy array of features
        remove_NaN=True will convert "NaN" string to 0.0
        remove_all_zeroes=True will omit any data points for which
            all the features you seek are 0.0
        remove_any_zeroes=True will omit any data points for which
            any of the features you seek are 0.0

        Returns an (n x k) numpy array: one row per person kept,
        one column per requested feature.
    """
    return_list = []
    for key in dictionary.keys():
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                # plain string concatenation so the message prints
                # identically under Python 2 and Python 3
                print("error: key " + str(feature) + " not present")
                return
            if value == "NaN" and remove_NaN:
                value = 0
            tmp_list.append(float(value))

        # Decide whether to keep this data point.
        # Bug fix: the original initialized append=False and only the
        # remove_all_zeroes branch could set it True, so calling with
        # remove_all_zeroes=False always returned an empty array.
        append = True
        if remove_all_zeroes:
            # keep only if at least one feature is non-zero
            # (tmp_list holds floats, so the original "NaN" string test was dead code)
            append = any(item != 0 for item in tmp_list)
        if remove_any_zeroes and 0 in tmp_list:
            append = False
        if append:
            return_list.append(np.array(tmp_list))

    return np.array(return_list)
def targetFeatureSplit( data ):
    """
    given a numpy array like the one returned from
    featureFormat, separate out the first feature
    and put it into its own list (this should be the
    quantity you want to predict)

    return targets and features as separate lists
    (sklearn can generally handle both lists and numpy arrays as
    input formats when training/predicting)
    """
    # column 0 of every row is the label; the remaining columns are features
    target = [row[0] for row in data]
    features = [row[1:] for row in data]
    return target, features
In [214]:
# Fraction of all people whose total_payments entry is missing ("NaN")
tot_ppl = float(len(enron_data.keys()))
print ((tot_ppl - check_num_values('total_payments', enron_data)) / tot_ppl)
In [236]:
def return_pois_values(value, dict_, target_field='total_payments'):
    """Count people flagged True under `value` (e.g. 'poi') whose
    `target_field` entry is missing (the string "NaN").

    value        -- boolean feature key used to select people
    dict_        -- dataset dict of {person_name: feature_dict}
    target_field -- feature whose missingness is counted; defaults to
                    'total_payments', which the original hard-coded.
    """
    # .values() replaces the Python-2-only iteritems(); the key was unused.
    return sum(v[target_field] == 'NaN' for v in dict_.values() if v[value] == True)
# Number of POIs with a missing ("NaN") total_payments entry
num_NaN = return_pois_values('poi', enron_data)
def tot_pois(dict_=None):
    """Count the persons of interest in a dataset.

    dict_ -- dataset dict of {person_name: feature_dict}; defaults to the
             module-level enron_data (the original always read the global).
    """
    if dict_ is None:
        dict_ = enron_data
    # .values() replaces the Python-2-only iteritems(); the key was unused.
    return sum(v['poi'] == True for v in dict_.values())
pois = tot_pois()
# Fraction of POIs lacking total_payments data (Python 2 print statement)
print num_NaN / float(pois)
In [239]:
# Quiz: dataset size if 10 more (email-only) POIs were added
len(enron_data.keys()) + 10
Out[239]:
In [247]:
# People without total_payments data after adding the 10 new entries
# NOTE(review): 156 appears to be the hard-coded result of the cell above -- verify
156 - check_num_values('total_payments', enron_data)
Out[247]:
In [249]:
# POI count including the 10 hypothetical new POIs
pois + 10
Out[249]:
In [251]:
# Fraction of POIs with missing total_payments after the addition
# NOTE(review): 28 presumably equals pois + 10 from the cell above -- verify
10/28.
Out[251]:
In [ ]:
In [ ]: