I will start by adding the general setup code that I use everywhere, then load the data and turn it into a data frame. Using this data frame I will explore the data and build an understanding of it.
In [5]:
import sys
projects_home = '/home/aseem/projects/ud120-projects'
final_project_home = projects_home + '/final_project/'
sys.path.append(final_project_home)
sys.path.append(projects_home + '/tools/')
In [6]:
%matplotlib inline
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
with open(final_project_home + "final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
data_df = pd.DataFrame.from_dict(data_dict, orient = 'index')
#del data_dict
I will note down the versions that I am using so as to avoid any confusion later
In [7]:
print sns.__version__
print matplotlib.__version__
print np.__version__
print pd.__version__
In [8]:
data_df.head()
Out[8]:
I remember that there was an outlier (the TOTAL row) that needs to be removed. Also, email_address does not seem to be useful.
In [9]:
imputated_df1 = data_df.drop(['TOTAL'], axis=0).drop(['email_address'], axis = 1)
imputated_df1.head()
Out[9]:
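For context, a quick way one might confirm that TOTAL is the outlier (a small sketch, not part of the original run) is to coerce salary on the raw frame to numeric and look at the largest values:

# Sketch: the 'TOTAL' spreadsheet row dwarfs every real person on salary.
salary = data_df['salary'].replace('NaN', np.nan).astype(float)
print salary.sort_values(ascending=False).head()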
I can see a lot of NaN. I will replace them with 0 for now
In [10]:
imputated_df2 = imputated_df1.replace(to_replace='NaN', value = 0)
imputated_df2.head()
Out[10]:
Now, with some of the obvious things out of the way, I will do some univariate analysis to get a better understanding of the data. Let's start by looking at the summary statistics.
In [11]:
imputated_df2.columns
Out[11]:
In [12]:
imputated_df2.describe().transpose()
Out[12]:
In [13]:
imputated_df2.median()
Out[13]:
In [14]:
imputated_df2.mode()
Out[14]:
We see a lot of zeros. Maybe there were a lot of nulls, more than I was expecting? Let me see how many there were.
In [15]:
imputated_df1.replace(to_replace='NaN', value=np.nan).isnull().sum()
Out[15]:
There are a lot of nulls. How many records were there again?
In [16]:
imputated_df2.shape
Out[16]:
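Before looking at individual columns, one could also rank the features by how sparse they are (a quick sketch reusing the null counts computed above; not part of the original run):

# Sketch: rank features by number of missing values to spot the sparsest ones.
null_counts = imputated_df1.replace(to_replace='NaN', value=np.nan).isnull().sum()
print null_counts.sort_values(ascending=False).head()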
There are some columns with very few non-null values. Let's check them out.
In [17]:
imputated_df2.loan_advances.value_counts()
Out[17]:
In [18]:
imputated_df2[imputated_df2.loan_advances != 0]
Out[18]:
Although there is a POI for which this value is present, I don't think it gives us any discriminating power. Hence I will remove this column.
In [19]:
imputated_df3 = imputated_df2.drop(['loan_advances'], axis = 1)
imputated_df3.head()
Out[19]:
In [20]:
imputated_df3.shape
Out[20]:
In [21]:
imputated_df3.poi.value_counts()
Out[21]:
In [22]:
def get_poi_wise_breakup(df, feature):
    # For a given feature, count how many values are missing vs present,
    # broken down by POI status (zeros are treated as missing here, since
    # the original NaNs were replaced with 0 earlier).
    def my_func(test):
        return {
            'isNull': test[feature].isnull().sum(),
            'isNotNull': test[feature].notnull().sum()
        }
    return df.replace(to_replace=0, value=np.nan)\
             .loc[:, ['poi', feature]]\
             .groupby('poi')\
             .apply(my_func)
In [23]:
get_poi_wise_breakup(imputated_df3, 'deferral_payments')
Out[23]:
In [24]:
get_poi_wise_breakup(imputated_df3, 'restricted_stock_deferred')
Out[24]:
In [25]:
get_poi_wise_breakup(imputated_df3, 'director_fees')
Out[25]:
Looking at these I can see that restricted_stock_deferred as well as director_fees are null for all of the POIs, so we can't really discriminate between POI and non-POI based on them. I looked at some discussions but don't really agree with them, and I don't want to make assumptions, so I am going to remove those two columns. deferral_payments still has some data for both classes, so I am going to keep it for now.
In [26]:
imputated_df4 = imputated_df3.drop(['restricted_stock_deferred', 'director_fees'], axis = 1)
imputated_df4.head()
Out[26]:
To summarize the feature cleanup so far, the following columns were removed:
- email_address
- loan_advances
- restricted_stock_deferred
- director_fees

deferral_payments was kept, though I am doubtful whether it will be useful or not.
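As a convenience, the cleanup above could be collected into a single helper (a sketch; clean_enron_df is a hypothetical name, and it simply repeats the drops and the NaN-to-0 replacement decided on above):

def clean_enron_df(data_dict):
    # Hypothetical helper: rebuild the cleaned frame from the raw dict.
    df = pd.DataFrame.from_dict(data_dict, orient='index')
    # Drop the spreadsheet 'TOTAL' row and the columns discarded above.
    df = df.drop(['TOTAL'], axis=0)
    df = df.drop(['email_address', 'loan_advances',
                  'restricted_stock_deferred', 'director_fees'], axis=1)
    # Treat the 'NaN' strings as zero, as in the imputation step above.
    return df.replace(to_replace='NaN', value=0)

cleaned_df = clean_enron_df(data_dict)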