notebook.community

Edit and run



In [1]:

    
from __future__ import division, print_function
import pickle
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [2]:

    
# change cwd based on OS # ToDO
import os
print(os.getcwd())
path = str(os.getcwd())









    



/Users/stefan/Code/nanodegree/p5/final_project



In [3]:

    
# load data    
if path == "/Users/stefan/Code/nanodegree/p5/final_project":
    f = "./final_project_dataset.pkl"

elif "v093216" in path:
    f = "D:/DATA/v093216/GIT/nanodegree/p5/final_project/final_project_dataset.pkl"

else:
    f = "./final_project/final_project_dataset.pkl"
    
with open(f, "rb") as data_file:
    data_dict = pickle.load(data_file)

# Remove TOTAL
data_dict.pop("TOTAL", 0)

# Remove previously identified outliers
data_dict.pop("LAY KENNETH L", 0)
data_dict.pop("SKILLING JEFFREY K", 0)

# Remove outlier identified during anaylsis
data_dict.pop("LAVORATO JOHN J", 0)
data_dict.pop("BELFER ROBERT", 0)









    Out[3]:





{'bonus': 'NaN',
 'deferral_payments': -102500,
 'deferred_income': 'NaN',
 'director_fees': 3285,
 'email_address': 'NaN',
 'exercised_stock_options': 3285,
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 'NaN',
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 44093,
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 102500,
 'total_stock_value': -44093}



In [4]:

    
# extract features
from collections import defaultdict
features = defaultdict(list)

for k, v in data_dict.iteritems():
    for k2, v2 in v.iteritems():
        if v2 == "NaN":
            features[k2].append(np.nan)
        else:
            features[k2].append(v2)



In [5]:

    
# supress scientific notation
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# load features into df
df = pd.DataFrame(features)
print(df.shape)
print(df.dtypes)









    



(141, 21)
bonus                        float64
deferral_payments            float64
deferred_income              float64
director_fees                float64
email_address                 object
exercised_stock_options      float64
expenses                     float64
from_messages                float64
from_poi_to_this_person      float64
from_this_person_to_poi      float64
loan_advances                float64
long_term_incentive          float64
other                        float64
poi                             bool
restricted_stock             float64
restricted_stock_deferred    float64
salary                       float64
shared_receipt_with_poi      float64
to_messages                  float64
total_payments               float64
total_stock_value            float64
dtype: object



In [6]:

    
# drop email column
df.drop(["email_address"], axis=1, inplace=True)



In [7]:

    
for c in df.columns:
    print(df[c].describe())









    



count        78.00
mean     983892.55
std      907348.31
min       70000.00
25%      406250.00
50%      750000.00
75%     1100000.00
max     5249999.00
Name: bonus, dtype: float64
count        36.00
mean     885569.03
std     1310814.45
min        7961.00
25%       83501.50
50%      243952.00
75%      938989.25
max     6426990.00
Name: deferral_payments, dtype: float64
count         47.00
mean     -587029.60
std       951340.06
min     -3504386.00
25%      -639093.50
50%      -144062.00
75%       -37506.00
max         -833.00
Name: deferred_income, dtype: float64
count       15.00
mean     95592.07
std      35219.47
min      17500.00
25%     100017.00
50%     108579.00
75%     113138.00
max     137864.00
Name: director_fees, dtype: float64
count         97.00
mean     2486132.18
std      4259595.40
min         9803.00
25%       506765.00
50%      1104054.00
75%      2282768.00
max     30766064.00
Name: exercised_stock_options, dtype: float64
count       91.00
mean     54014.77
std      46546.39
min        148.00
25%      21937.00
50%      46145.00
75%      78265.00
max     228763.00
Name: expenses, dtype: float64
count      83.00
mean      597.92
std      1859.73
min        12.00
25%        21.50
50%        41.00
75%       145.00
max     14368.00
Name: from_messages, dtype: float64
count    83.00
mean     58.34
std      71.66
min       0.00
25%      10.00
50%      35.00
75%      65.00
max     305.00
Name: from_poi_to_this_person, dtype: float64
count    83.00
mean     37.22
std      93.21
min       0.00
25%       1.00
50%       7.00
75%      23.50
max     609.00
Name: from_this_person_to_poi, dtype: float64
count         2.00
mean    1200000.00
std     1131370.85
min      400000.00
25%      800000.00
50%     1200000.00
75%     1600000.00
max     2000000.00
Name: loan_advances, dtype: float64
count        62.00
mean     660750.77
std      768428.64
min       69223.00
25%      275000.00
50%      395480.50
75%      751112.00
max     5145434.00
Name: long_term_incentive, dtype: float64
count        89.00
mean     364292.70
std      932560.15
min           2.00
25%        1191.00
50%       52382.00
75%      356071.00
max     7427621.00
Name: other, dtype: float64
count       141
unique        2
top       False
freq        125
Name: poi, dtype: object
count        106.00
mean      966563.31
std      1760292.43
min     -2604490.00
25%       239541.25
50%       441096.00
75%       955263.75
max     13847074.00
Name: restricted_stock, dtype: float64
count         16.00
mean      658005.31
std      3968666.39
min     -1787380.00
25%      -349757.25
50%      -146975.00
75%       -80191.25
max     15456290.00
Name: restricted_stock_deferred, dtype: float64
count        91.00
mean     265729.25
std      132441.84
min         477.00
25%      211240.00
50%      251654.00
75%      296349.50
max     1060932.00
Name: salary, dtype: float64
count     83.00
mean    1117.60
std     1146.03
min        2.00
25%      224.00
50%      728.00
75%     1667.00
max     5521.00
Name: shared_receipt_with_poi, dtype: float64
count      83.00
mean     1966.18
std      2546.84
min        57.00
25%       528.00
50%      1088.00
75%      2523.50
max     15149.00
Name: to_messages, dtype: float64
count        120.00
mean     1687778.83
std      2383100.77
min          148.00
25%       386380.25
50%      1095881.50
75%      2023024.50
max     17252530.00
Name: total_payments, dtype: float64
count        121.00
mean     2799027.50
std      4680926.62
min        28798.00
25%       494136.00
50%      1080988.00
75%      2493616.00
max     30766064.00
Name: total_stock_value, dtype: float64



In [8]:

    
# create new features
df["performance_compensation"] = df["bonus"] + df["exercised_stock_options"] + df["total_stock_value"]
df["poi_communication"] = df["from_poi_to_this_person"] + df["from_this_person_to_poi"] / df["from_messages"] + df["to_messages"] + 1 # hack to circument zero-division error
    
df.shape









    Out[8]:





(141, 22)



In [11]:

    
df.corr()









    Out[11]:







  
    
      
      bonus
      deferral_payments
      deferred_income
      director_fees
      exercised_stock_options
      expenses
      from_messages
      from_poi_to_this_person
      from_this_person_to_poi
      loan_advances
      ...
      poi
      restricted_stock
      restricted_stock_deferred
      salary
      shared_receipt_with_poi
      to_messages
      total_payments
      total_stock_value
      performance_compensation
      poi_communication
    
  
  
    
      bonus
      1.00
      0.54
      -0.66
      nan
      0.11
      0.04
      0.05
      0.41
      0.30
      1.00
      ...
      0.25
      0.03
      0.12
      0.24
      0.59
      0.35
      0.54
      0.12
      0.22
      0.35
    
    
      deferral_payments
      0.54
      1.00
      -0.88
      nan
      0.11
      0.04
      0.31
      0.45
      0.20
      nan
      ...
      -0.08
      0.64
      -0.22
      0.70
      0.40
      0.37
      0.80
      0.24
      0.74
      0.38
    
    
      deferred_income
      -0.66
      -0.88
      1.00
      -0.46
      -0.67
      0.09
      -0.31
      -0.19
      -0.27
      nan
      ...
      -0.29
      -0.48
      0.17
      -0.51
      -0.42
      -0.40
      -0.57
      -0.69
      -0.74
      -0.39
    
    
      director_fees
      nan
      nan
      -0.46
      1.00
      0.97
      -0.50
      1.00
      nan
      1.00
      nan
      ...
      nan
      -0.97
      0.97
      nan
      1.00
      -1.00
      0.36
      0.77
      nan
      -1.00
    
    
      exercised_stock_options
      0.11
      0.11
      -0.67
      0.97
      1.00
      -0.05
      -0.05
      0.15
      -0.04
      1.00
      ...
      0.38
      0.41
      -0.07
      0.21
      0.04
      -0.04
      0.07
      0.94
      0.96
      -0.04
    
    
      expenses
      0.04
      0.04
      0.09
      -0.50
      -0.05
      1.00
      0.10
      -0.07
      0.03
      1.00
      ...
      0.05
      -0.01
      0.04
      0.17
      0.24
      0.16
      0.07
      -0.03
      -0.11
      0.16
    
    
      from_messages
      0.05
      0.31
      -0.31
      1.00
      -0.05
      0.10
      1.00
      0.15
      0.60
      -1.00
      ...
      -0.06
      0.09
      -0.33
      0.05
      0.22
      0.47
      -0.01
      -0.00
      -0.04
      0.47
    
    
      from_poi_to_this_person
      0.41
      0.45
      -0.19
      nan
      0.15
      -0.07
      0.15
      1.00
      0.29
      1.00
      ...
      0.22
      0.17
      -0.46
      0.22
      0.64
      0.50
      0.23
      0.14
      0.17
      0.52
    
    
      from_this_person_to_poi
      0.30
      0.20
      -0.27
      1.00
      -0.04
      0.03
      0.60
      0.29
      1.00
      1.00
      ...
      0.16
      0.17
      -0.40
      0.06
      0.43
      0.55
      0.07
      0.03
      0.03
      0.55
    
    
      loan_advances
      1.00
      nan
      nan
      nan
      1.00
      1.00
      -1.00
      1.00
      1.00
      1.00
      ...
      nan
      nan
      nan
      1.00
      1.00
      1.00
      1.00
      1.00
      1.00
      1.00
    
    
      long_term_incentive
      0.25
      0.11
      -0.60
      nan
      0.23
      -0.17
      -0.07
      0.05
      0.02
      nan
      ...
      0.13
      0.28
      -0.19
      0.30
      0.04
      -0.01
      0.52
      0.22
      0.47
      -0.01
    
    
      other
      0.13
      0.73
      -0.40
      nan
      0.22
      -0.04
      -0.10
      0.17
      -0.12
      nan
      ...
      -0.06
      0.40
      0.01
      0.56
      0.09
      -0.02
      0.67
      0.34
      0.46
      -0.02
    
    
      poi
      0.25
      -0.08
      -0.29
      nan
      0.38
      0.05
      -0.06
      0.22
      0.16
      nan
      ...
      1.00
      0.05
      nan
      0.08
      0.21
      0.03
      0.03
      0.24
      0.18
      0.04
    
    
      restricted_stock
      0.03
      0.64
      -0.48
      -0.97
      0.41
      -0.01
      0.09
      0.17
      0.17
      nan
      ...
      0.05
      1.00
      -0.87
      0.24
      0.25
      0.21
      0.07
      0.74
      0.60
      0.22
    
    
      restricted_stock_deferred
      0.12
      -0.22
      0.17
      0.97
      -0.07
      0.04
      -0.33
      -0.46
      -0.40
      nan
      ...
      nan
      -0.87
      1.00
      -0.59
      -0.35
      -0.36
      0.95
      -0.81
      -0.82
      -0.37
    
    
      salary
      0.24
      0.70
      -0.51
      nan
      0.21
      0.17
      0.05
      0.22
      0.06
      1.00
      ...
      0.08
      0.24
      -0.59
      1.00
      0.29
      0.16
      0.64
      0.25
      0.37
      0.17
    
    
      shared_receipt_with_poi
      0.59
      0.40
      -0.42
      1.00
      0.04
      0.24
      0.22
      0.64
      0.43
      1.00
      ...
      0.21
      0.25
      -0.35
      0.29
      1.00
      0.84
      0.28
      0.10
      0.15
      0.84
    
    
      to_messages
      0.35
      0.37
      -0.40
      -1.00
      -0.04
      0.16
      0.47
      0.50
      0.55
      1.00
      ...
      0.03
      0.21
      -0.36
      0.16
      0.84
      1.00
      0.13
      0.04
      0.03
      1.00
    
    
      total_payments
      0.54
      0.80
      -0.57
      0.36
      0.07
      0.07
      -0.01
      0.23
      0.07
      1.00
      ...
      0.03
      0.07
      0.95
      0.64
      0.28
      0.13
      1.00
      0.18
      0.32
      0.13
    
    
      total_stock_value
      0.12
      0.24
      -0.69
      0.77
      0.94
      -0.03
      -0.00
      0.14
      0.03
      1.00
      ...
      0.24
      0.74
      -0.81
      0.25
      0.10
      0.04
      0.18
      1.00
      0.98
      0.04
    
    
      performance_compensation
      0.22
      0.74
      -0.74
      nan
      0.96
      -0.11
      -0.04
      0.17
      0.03
      1.00
      ...
      0.18
      0.60
      -0.82
      0.37
      0.15
      0.03
      0.32
      0.98
      1.00
      0.03
    
    
      poi_communication
      0.35
      0.38
      -0.39
      -1.00
      -0.04
      0.16
      0.47
      0.52
      0.55
      1.00
      ...
      0.04
      0.22
      -0.37
      0.17
      0.84
      1.00
      0.13
      0.04
      0.03
      1.00
    
  

22 rows × 22 columns



In [15]:

    
# drop colums with low corr
to_drop = ["deferral_payments", "deferred_income", "director_fees", "expenses",
           "loan_advances", "other", "restricted_stock", "restricted_stock_deferred", "salary", 
          "to_messages", "total_payments","from_messages", "from_this_person_to_poi",
           "performance_compensation", "poi_communication"]
df_features = df.drop(to_drop, axis=1)
df_features.shape









    Out[15]:





(141, 7)



In [16]:

    
df_features.describe()









    Out[16]:







  
    
      
      bonus
      exercised_stock_options
      from_poi_to_this_person
      long_term_incentive
      shared_receipt_with_poi
      total_stock_value
    
  
  
    
      count
      78.00
      97.00
      83.00
      62.00
      83.00
      121.00
    
    
      mean
      983892.55
      2486132.18
      58.34
      660750.77
      1117.60
      2799027.50
    
    
      std
      907348.31
      4259595.40
      71.66
      768428.64
      1146.03
      4680926.62
    
    
      min
      70000.00
      9803.00
      0.00
      69223.00
      2.00
      28798.00
    
    
      25%
      406250.00
      506765.00
      10.00
      275000.00
      224.00
      494136.00
    
    
      50%
      750000.00
      1104054.00
      35.00
      395480.50
      728.00
      1080988.00
    
    
      75%
      1100000.00
      2282768.00
      65.00
      751112.00
      1667.00
      2493616.00
    
    
      max
      5249999.00
      30766064.00
      305.00
      5145434.00
      5521.00
      30766064.00



In [35]:

    
# outlier seems to be LAVORATO JOHN J
# negative total_stock_value is associated with BELFER ROBERT
for k,v in data_dict.iteritems():
    if k == "THE TRAVEL AGENCY IN THE PARK" or k == "LOCKHART EUGENE E":
        print(k,v)



In [99]:

    
_ = df_features.hist(bins=10, figsize=(96,96), xlabelsize=45, ylabelsize=45, xrot=40, layout=(4,2))



In [31]:

    
sns.heatmap(df_features.corr(), cmap="YlOrRd", vmax=0.5)









    Out[31]:





<matplotlib.axes._subplots.AxesSubplot at 0x110572b10>



In [100]:

    
from pandas import scatter_matrix
_ = scatter_matrix(df_features, alpha=0.5, figsize=(20, 20))



In [101]:

    
_ = scatter_matrix(df_features[["exercised_stock_options", "bonus", "total_stock_value", "poi"]], alpha = 0.5, figsize=(16,16))

	bonus	deferral_payments	deferred_income	director_fees	exercised_stock_options	expenses	from_messages	from_poi_to_this_person	from_this_person_to_poi	loan_advances	...	poi	restricted_stock	restricted_stock_deferred	salary	shared_receipt_with_poi	to_messages	total_payments	total_stock_value	performance_compensation	poi_communication
bonus	1.00	0.54	-0.66	nan	0.11	0.04	0.05	0.41	0.30	1.00	...	0.25	0.03	0.12	0.24	0.59	0.35	0.54	0.12	0.22	0.35
deferral_payments	0.54	1.00	-0.88	nan	0.11	0.04	0.31	0.45	0.20	nan	...	-0.08	0.64	-0.22	0.70	0.40	0.37	0.80	0.24	0.74	0.38
deferred_income	-0.66	-0.88	1.00	-0.46	-0.67	0.09	-0.31	-0.19	-0.27	nan	...	-0.29	-0.48	0.17	-0.51	-0.42	-0.40	-0.57	-0.69	-0.74	-0.39
director_fees	nan	nan	-0.46	1.00	0.97	-0.50	1.00	nan	1.00	nan	...	nan	-0.97	0.97	nan	1.00	-1.00	0.36	0.77	nan	-1.00
exercised_stock_options	0.11	0.11	-0.67	0.97	1.00	-0.05	-0.05	0.15	-0.04	1.00	...	0.38	0.41	-0.07	0.21	0.04	-0.04	0.07	0.94	0.96	-0.04
expenses	0.04	0.04	0.09	-0.50	-0.05	1.00	0.10	-0.07	0.03	1.00	...	0.05	-0.01	0.04	0.17	0.24	0.16	0.07	-0.03	-0.11	0.16
from_messages	0.05	0.31	-0.31	1.00	-0.05	0.10	1.00	0.15	0.60	-1.00	...	-0.06	0.09	-0.33	0.05	0.22	0.47	-0.01	-0.00	-0.04	0.47
from_poi_to_this_person	0.41	0.45	-0.19	nan	0.15	-0.07	0.15	1.00	0.29	1.00	...	0.22	0.17	-0.46	0.22	0.64	0.50	0.23	0.14	0.17	0.52
from_this_person_to_poi	0.30	0.20	-0.27	1.00	-0.04	0.03	0.60	0.29	1.00	1.00	...	0.16	0.17	-0.40	0.06	0.43	0.55	0.07	0.03	0.03	0.55
loan_advances	1.00	nan	nan	nan	1.00	1.00	-1.00	1.00	1.00	1.00	...	nan	nan	nan	1.00	1.00	1.00	1.00	1.00	1.00	1.00
long_term_incentive	0.25	0.11	-0.60	nan	0.23	-0.17	-0.07	0.05	0.02	nan	...	0.13	0.28	-0.19	0.30	0.04	-0.01	0.52	0.22	0.47	-0.01
other	0.13	0.73	-0.40	nan	0.22	-0.04	-0.10	0.17	-0.12	nan	...	-0.06	0.40	0.01	0.56	0.09	-0.02	0.67	0.34	0.46	-0.02
poi	0.25	-0.08	-0.29	nan	0.38	0.05	-0.06	0.22	0.16	nan	...	1.00	0.05	nan	0.08	0.21	0.03	0.03	0.24	0.18	0.04
restricted_stock	0.03	0.64	-0.48	-0.97	0.41	-0.01	0.09	0.17	0.17	nan	...	0.05	1.00	-0.87	0.24	0.25	0.21	0.07	0.74	0.60	0.22
restricted_stock_deferred	0.12	-0.22	0.17	0.97	-0.07	0.04	-0.33	-0.46	-0.40	nan	...	nan	-0.87	1.00	-0.59	-0.35	-0.36	0.95	-0.81	-0.82	-0.37
salary	0.24	0.70	-0.51	nan	0.21	0.17	0.05	0.22	0.06	1.00	...	0.08	0.24	-0.59	1.00	0.29	0.16	0.64	0.25	0.37	0.17
shared_receipt_with_poi	0.59	0.40	-0.42	1.00	0.04	0.24	0.22	0.64	0.43	1.00	...	0.21	0.25	-0.35	0.29	1.00	0.84	0.28	0.10	0.15	0.84
to_messages	0.35	0.37	-0.40	-1.00	-0.04	0.16	0.47	0.50	0.55	1.00	...	0.03	0.21	-0.36	0.16	0.84	1.00	0.13	0.04	0.03	1.00
total_payments	0.54	0.80	-0.57	0.36	0.07	0.07	-0.01	0.23	0.07	1.00	...	0.03	0.07	0.95	0.64	0.28	0.13	1.00	0.18	0.32	0.13
total_stock_value	0.12	0.24	-0.69	0.77	0.94	-0.03	-0.00	0.14	0.03	1.00	...	0.24	0.74	-0.81	0.25	0.10	0.04	0.18	1.00	0.98	0.04
performance_compensation	0.22	0.74	-0.74	nan	0.96	-0.11	-0.04	0.17	0.03	1.00	...	0.18	0.60	-0.82	0.37	0.15	0.03	0.32	0.98	1.00	0.03
poi_communication	0.35	0.38	-0.39	-1.00	-0.04	0.16	0.47	0.52	0.55	1.00	...	0.04	0.22	-0.37	0.17	0.84	1.00	0.13	0.04	0.03	1.00

	bonus	exercised_stock_options	from_poi_to_this_person	long_term_incentive	shared_receipt_with_poi	total_stock_value
count	78.00	97.00	83.00	62.00	83.00	121.00
mean	983892.55	2486132.18	58.34	660750.77	1117.60	2799027.50
std	907348.31	4259595.40	71.66	768428.64	1146.03	4680926.62
min	70000.00	9803.00	0.00	69223.00	2.00	28798.00
25%	406250.00	506765.00	10.00	275000.00	224.00	494136.00
50%	750000.00	1104054.00	35.00	395480.50	728.00	1080988.00
75%	1100000.00	2282768.00	65.00	751112.00	1667.00	2493616.00
max	5249999.00	30766064.00	305.00	5145434.00	5521.00	30766064.00