In [1]:
%load ../ud120-projects/datasets_questions/explore_enron_data.py
In [6]:
#!/usr/bin/python
"""
starter code for exploring the Enron dataset (emails + finances)
loads up the dataset (pickled dict of dicts)
the dataset has the form
enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }
{features_dict} is a dictionary of features associated with that person
you should explore features_dict as part of the mini-project,
but here's an example to get you started:
enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
"""
import pickle

# Use a context manager so the file handle is closed after loading;
# the original passed open() directly to pickle.load and leaked the handle.
# NOTE(review): mode "r" (text) only works under Python 2; Python 3 needs "rb".
with open("../ud120-projects/final_project/final_project_dataset.pkl", "r") as data_file:
    enron_data = pickle.load(data_file)
In [9]:
# Number of people (data points) in the dataset
len(enron_data.keys())
Out[9]:
In [12]:
# Number of features recorded for each person
len(enron_data.values()[0])
Out[12]:
In [15]:
# NOTE(review): Counter is imported but never used below
from collections import Counter
In [39]:
# Count persons flagged as persons of interest (poi == 1/True)
sum([enron_data[i]['poi']==1 for i in enron_data.keys()])
Out[39]:
In [40]:
%load ../ud120-projects/final_project/poi_names.txt
In [ ]:
http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm
(y) Lay, Kenneth
(y) Skilling, Jeffrey
(n) Howard, Kevin
(n) Krautz, Michael
(n) Yeager, Scott
(n) Hirko, Joseph
(n) Shelby, Rex
(n) Bermingham, David
(n) Darby, Giles
(n) Mulgrew, Gary
(n) Bayley, Daniel
(n) Brown, James
(n) Furst, Robert
(n) Fuhs, William
(n) Causey, Richard
(n) Calger, Christopher
(n) DeSpain, Timothy
(n) Hannon, Kevin
(n) Koenig, Mark
(y) Forney, John
(n) Rice, Kenneth
(n) Rieker, Paula
(n) Fastow, Lea
(n) Fastow, Andrew
(y) Delainey, David
(n) Glisan, Ben
(n) Richter, Jeffrey
(n) Lawyer, Larry
(n) Belden, Timothy
(n) Kopper, Michael
(n) Duncan, David
(n) Bowen, Raymond
(n) Colwell, Wesley
(n) Boyle, Dan
(n) Loehr, Christopher
In [62]:
# POI's with data: count the names marked "(y)" in poi_names.txt
with open('../ud120-projects/final_project/poi_names.txt', 'r') as n:
# Count POIs in poi_names.txt (Python 2 print statement)
print sum([i.split(" ")[0] == '(y)' for i in n.readlines()])
In [123]:
# POI's in both poi_names and our data dictionary
with open('../ud120-projects/final_project/poi_names.txt', 'r') as n:
# Skip the first two lines (the source URL and the blank line after it)
n.readline()
n.readline()
# "(y) Last, First" -> "last first" (drop the marker, comma, and newline)
text_names = [i.split(" ", 1)[1].strip('\n').lower().replace(',', '') for i in n.readlines()]
# Pull names out and lower case them for comparison
enron_data_names = [name.lower() for name in enron_data.keys()]
# Grab the first and last names, leaving out the middle initial at the end
enron_data_names = [substring.split(" ")[:2] for substring in enron_data_names]
# Combine the first/last names into one string
# NOTE(review): dataset keys are "LAST FIRST" while poi_names.txt is
# "Last, First" -- the intersection only matches where both orders agree
enron_data_names = [" ".join(i) for i in enron_data_names]
print set(enron_data_names) & set(text_names)
#print set(text_names)
#print ""
#print set(enron_data_names)
In [130]:
#Total POIs listed in poi_names.txt (names marked either "(n)" or "(y)")
with open('../ud120-projects/final_project/poi_names.txt', 'r') as n:
# Count POIs in poi_names.txt (Python 2 print statement)
print sum([i.split(" ")[0] in ['(n)', '(y)'] for i in n.readlines()])
In [41]:
%load ../ud120-projects/final_project/poi_email_addresses.py
In [ ]:
def poiEmails():
    """Return the list of known email addresses of Enron persons of interest.

    Bug fix: the original list was missing the comma after
    "michael.krautz@enron.com", so Python's implicit string-literal
    concatenation silently fused it with "scott.yeager@enron.com" into one
    bogus address -- both real addresses were effectively lost.
    """
    email_list = [
        "kenneth_lay@enron.net",
        "kenneth_lay@enron.com",
        "klay.enron@enron.com",
        "kenneth.lay@enron.com",
        "klay@enron.com",
        "layk@enron.com",
        "chairman.ken@enron.com",
        "jeffreyskilling@yahoo.com",
        "jeff_skilling@enron.com",
        "jskilling@enron.com",
        "effrey.skilling@enron.com",
        "skilling@enron.com",
        "jeffrey.k.skilling@enron.com",
        "jeff.skilling@enron.com",
        "kevin_a_howard.enronxgate.enron@enron.net",
        "kevin.howard@enron.com",
        "kevin.howard@enron.net",
        "kevin.howard@gcm.com",
        "michael.krautz@enron.com",   # comma added: was fused with the next entry
        "scott.yeager@enron.com",
        "syeager@fyi-net.com",
        "scott_yeager@enron.net",
        "syeager@flash.net",
        "joe'.'hirko@enron.com",
        "joe.hirko@enron.com",
        "rex.shelby@enron.com",
        "rex.shelby@enron.nt",
        "rex_shelby@enron.net",
        "jbrown@enron.com",
        "james.brown@enron.com",
        "rick.causey@enron.com",
        "richard.causey@enron.com",
        "rcausey@enron.com",
        "calger@enron.com",
        "chris.calger@enron.com",
        "christopher.calger@enron.com",
        "ccalger@enron.com",
        "tim_despain.enronxgate.enron@enron.net",
        "tim.despain@enron.com",
        "kevin_hannon@enron.com",
        "kevin'.'hannon@enron.com",
        "kevin_hannon@enron.net",
        "kevin.hannon@enron.com",
        "mkoenig@enron.com",
        "mark.koenig@enron.com",
        "m..forney@enron.com",
        "ken'.'rice@enron.com",
        "ken.rice@enron.com",
        "ken_rice@enron.com",
        "ken_rice@enron.net",
        "paula.rieker@enron.com",
        "prieker@enron.com",
        "andrew.fastow@enron.com",
        "lfastow@pdq.net",
        "andrew.s.fastow@enron.com",
        "lfastow@pop.pdq.net",
        "andy.fastow@enron.com",
        "david.w.delainey@enron.com",
        "delainey.dave@enron.com",
        "'delainey@enron.com",
        "david.delainey@enron.com",
        "'david.delainey'@enron.com",
        "dave.delainey@enron.com",
        "delainey'.'david@enron.com",
        "ben.glisan@enron.com",
        "bglisan@enron.com",
        "ben_f_glisan@enron.com",
        "ben'.'glisan@enron.com",
        "jeff.richter@enron.com",
        "jrichter@nwlink.com",
        "lawrencelawyer@aol.com",
        "lawyer'.'larry@enron.com",
        "larry_lawyer@enron.com",
        "llawyer@enron.com",
        "larry.lawyer@enron.com",
        "lawrence.lawyer@enron.com",
        "tbelden@enron.com",
        "tim.belden@enron.com",
        "tim_belden@pgn.com",
        "tbelden@ect.enron.com",
        "michael.kopper@enron.com",
        "dave.duncan@enron.com",
        "dave.duncan@cipco.org",
        "duncan.dave@enron.com",
        "ray.bowen@enron.com",
        "raymond.bowen@enron.com",
        "'bowen@enron.com",
        "wes.colwell@enron.com",
        "dan.boyle@enron.com",
        "cloehr@enron.com",
        "chris.loehr@enron.com",
    ]
    return email_list
In [135]:
# Total stock value belonging to James Prentice
enron_data['PRENTICE JAMES']['total_stock_value']
Out[135]:
In [138]:
# Emails sent from Wesley Colwell to persons of interest
enron_data['COLWELL WESLEY']['from_this_person_to_poi']
Out[138]:
In [148]:
# Look up Skilling's exact key without typing the middle initial
skilling = [i for i in enron_data.keys() if 'SKILLING' in i ][0]
enron_data[skilling]['exercised_stock_options']
Out[148]:
In [149]:
enron_data[skilling]['total_payments']
Out[149]:
In [153]:
# NOTE(review): [0] takes the first match -- assumes only one FASTOW key exists
fastow = [i for i in enron_data.keys() if 'FASTOW' in i ][0]
enron_data[fastow]['total_payments']
Out[153]:
In [163]:
lay = [i for i in enron_data.keys() if 'LAY KENNETH' in i ][0]
enron_data[lay]['total_payments']
Out[163]:
In [165]:
import pprint
# Dump the full feature dict of one person to see what's available
pprint.pprint(enron_data.values()[0])
In [209]:
def check_num_values(value, dict_):
    """Count how many people in the dataset have a real (non-"NaN") entry
    for a given feature.

    value -- feature key to inspect (e.g. 'salary', 'email_address')
    dict_ -- dataset dict of {person_name: feature_dict}
    Returns the number of people whose feature value is not the string "NaN".
    """
    # .values() works on both Python 2 and 3; the original used the
    # Python-2-only iteritems() and never used the key anyway.
    return sum(v[value] != 'NaN' for v in dict_.values())
In [210]:
# How many people have a known salary / email address (Python 2 prints)
print "Salaries: ", check_num_values('salary', enron_data)
print "Emails: ", check_num_values('email_address', enron_data)
In [189]:
%load ../ud120-projects/tools/feature_format.py
In [ ]:
#!/usr/bin/python
"""
A general tool for converting data from the
dictionary format to an (n x k) python list that's
ready for training an sklearn algorithm
n--no. of key-value pairs in dictonary
k--no. of features being extracted
dictionary keys are names of persons in dataset
dictionary values are dictionaries, where each
key-value pair in the dict is the name
of a feature, and its value for that person
In addition to converting a dictionary to a numpy
array, you may want to separate the labels from the
features--this is what targetFeatureSplit is for
so, if you want to have the poi label as the target,
and the features you want to use are the person's
salary and bonus, here's what you would do:
feature_list = ["poi", "salary", "bonus"]
data_array = featureFormat( data_dictionary, feature_list )
label, features = targetFeatureSplit(data_array)
the line above (targetFeatureSplit) assumes that the
label is the _first_ item in feature_list--very important
that poi is listed first!
"""
import numpy as np
def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False ):
    """ convert dictionary to numpy array of features
        remove_NaN=True will convert "NaN" string to 0.0
        remove_all_zeroes=True will omit any data points for which
            all the features you seek are 0.0
        remove_any_zeroes=True will omit any data points for which
            any of the features you seek are 0.0

        Returns an (n x k) numpy array: one row per person kept,
        one column per requested feature.
    """
    return_list = []
    for key in dictionary.keys():
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                # plain string concatenation so the message prints
                # identically under Python 2 and Python 3
                print("error: key " + str(feature) + " not present")
                return
            if value == "NaN" and remove_NaN:
                value = 0
            tmp_list.append(float(value))

        # Decide whether to keep this data point.
        # Bug fix: the original initialized append=False and only the
        # remove_all_zeroes branch could set it True, so calling with
        # remove_all_zeroes=False always returned an empty array.
        append = True
        if remove_all_zeroes:
            # keep only if at least one feature is non-zero
            # (tmp_list holds floats, so the original "NaN" string test was dead code)
            append = any(item != 0 for item in tmp_list)
        if remove_any_zeroes and 0 in tmp_list:
            append = False
        if append:
            return_list.append(np.array(tmp_list))

    return np.array(return_list)
def targetFeatureSplit( data ):
    """
    given a numpy array like the one returned from
    featureFormat, separate out the first feature
    and put it into its own list (this should be the
    quantity you want to predict)

    return targets and features as separate lists
    (sklearn can generally handle both lists and numpy arrays as
    input formats when training/predicting)
    """
    # column 0 of every row is the label; the remaining columns are features
    target = [row[0] for row in data]
    features = [row[1:] for row in data]
    return target, features
In [214]:
# Fraction of all people whose total_payments entry is missing ("NaN")
tot_ppl = float(len(enron_data.keys()))
print ((tot_ppl - check_num_values('total_payments', enron_data)) / tot_ppl)
In [236]:
def return_pois_values(value, dict_, target_field='total_payments'):
    """Count people flagged True under `value` (e.g. 'poi') whose
    `target_field` entry is missing (the string "NaN").

    value        -- boolean feature key used to select people
    dict_        -- dataset dict of {person_name: feature_dict}
    target_field -- feature whose missingness is counted; defaults to
                    'total_payments', which the original hard-coded.
    """
    # .values() replaces the Python-2-only iteritems(); the key was unused.
    return sum(v[target_field] == 'NaN' for v in dict_.values() if v[value] == True)
# Number of POIs with a missing ("NaN") total_payments entry
num_NaN = return_pois_values('poi', enron_data)
def tot_pois(dict_=None):
    """Count the persons of interest in a dataset.

    dict_ -- dataset dict of {person_name: feature_dict}; defaults to the
             module-level enron_data (the original always read the global).
    """
    if dict_ is None:
        dict_ = enron_data
    # .values() replaces the Python-2-only iteritems(); the key was unused.
    return sum(v['poi'] == True for v in dict_.values())
pois = tot_pois()
# Fraction of POIs lacking total_payments data (Python 2 print statement)
print num_NaN / float(pois)
In [239]:
# Quiz: dataset size if 10 more (email-only) POIs were added
len(enron_data.keys()) + 10
Out[239]:
In [247]:
# People without total_payments data after adding the 10 new entries
# NOTE(review): 156 appears to be the hard-coded result of the cell above -- verify
156 - check_num_values('total_payments', enron_data)
Out[247]:
In [249]:
# POI count including the 10 hypothetical new POIs
pois + 10
Out[249]:
In [251]:
# Fraction of POIs with missing total_payments after the addition
# NOTE(review): 28 presumably equals pois + 10 from the cell above -- verify
10/28.
Out[251]:
In [ ]:
In [ ]: