In [1]:
from __future__ import division, print_function
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
# Record the directory the notebook was started from; `path` is inspected
# later to decide where the dataset file lives.
# TODO: change the working directory based on the OS.
import os

path = os.getcwd()
print(path)
In [3]:
# Load the dataset.
# The file location is chosen from the working directory captured in `path`:
# two known machine-specific checkouts, otherwise a path relative to the cwd.
if path == "/Users/stefan/Code/nanodegree/p5/final_project":
    f = "./final_project_dataset.pkl"
elif "v093216" in path:
    f = "D:/DATA/v093216/GIT/nanodegree/p5/final_project/final_project_dataset.pkl"
else:
    f = "./final_project/final_project_dataset.pkl"

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point this at a dataset file from a trusted source.
with open(f, "rb") as data_file:
    data_dict = pickle.load(data_file)

# Entries removed before the analysis.
excluded_keys = [
    "TOTAL",               # presumably an aggregate row, not a person - confirm
    "LAY KENNETH L",       # previously identified outlier
    "SKILLING JEFFREY K",  # previously identified outlier
    "LAVORATO JOHN J",     # outlier identified during analysis
    "BELFER ROBERT",       # outlier identified during analysis
]
for key in excluded_keys:
    # pop() with a default tolerates keys that are already absent, so the cell
    # can be re-run safely. Doing the removals in a loop also means the cell no
    # longer ends on a bare expression that echoes the removed record as output.
    data_dict.pop(key, 0)
Out[3]:
In [4]:
# Extract features: turn {person: {feature: value}} into {feature: [values]}.
from collections import defaultdict

features = defaultdict(list)
# .items() works on both Python 2 and Python 3; the previous .iteritems()
# exists only on Python 2 dicts and raises AttributeError on Python 3.
for person, record in data_dict.items():
    for feature_name, value in record.items():
        # The dataset marks missing values with the string "NaN"; store a real
        # NaN so pandas treats the entry as missing.
        features[feature_name].append(np.nan if value == "NaN" else value)
In [5]:
def _fmt_two_decimals(value):
    """Render a float with two fixed decimals instead of scientific notation."""
    return '%.2f' % value


# Suppress scientific notation when pandas displays floats.
pd.set_option('display.float_format', _fmt_two_decimals)

# Load the per-feature value lists into a DataFrame, one column per feature.
df = pd.DataFrame(features)

print(df.shape)
print(df.dtypes)
In [6]:
# Drop the e-mail address column.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer fails because the column has
# already been removed.
df = df.drop(["email_address"], axis=1, errors="ignore")
In [7]:
# Print summary statistics for every column, one column at a time.
for column_name in df.columns:
    column_summary = df[column_name].describe()
    print(column_summary)
In [8]:
# Create new features.

# Sum of bonus, exercised stock options and total stock value.
df["performance_compensation"] = (
    df["bonus"] + df["exercised_stock_options"] + df["total_stock_value"]
)

# Ratio of POI-related message counts to all message counts.
# The previous expression had no parentheses, so operator precedence divided
# only from_this_person_to_poi by from_messages and added the remaining terms
# (including the "+ 1") to the result. The "+ 1" is the zero-division guard and
# therefore belongs in the denominator.
poi_messages = df["from_poi_to_this_person"] + df["from_this_person_to_poi"]
all_messages = df["from_messages"] + df["to_messages"]
df["poi_communication"] = poi_messages / (all_messages + 1)

df.shape
Out[8]:
In [11]:
# Pairwise correlation between the columns; "pearson" is pandas' default
# method, spelled out here for the reader.
df.corr(method="pearson")
Out[11]:
In [15]:
# Drop columns with low correlation.
to_drop = [
    "deferral_payments",
    "deferred_income",
    "director_fees",
    "expenses",
    "loan_advances",
    "other",
    "restricted_stock",
    "restricted_stock_deferred",
    "salary",
    "to_messages",
    "total_payments",
    "from_messages",
    "from_this_person_to_poi",
    "performance_compensation",
    "poi_communication",
]

# drop() returns a new frame; `df` itself keeps all of its columns.
df_features = df.drop(labels=to_drop, axis=1)
df_features.shape
Out[15]:
In [16]:
# Summary statistics of the retained features; the quartiles listed are the
# percentiles pandas reports by default.
df_features.describe(percentiles=[0.25, 0.5, 0.75])
Out[16]:
In [35]:
# Notes from the analysis:
# - the outlier seems to be LAVORATO JOHN J
# - the negative total_stock_value is associated with BELFER ROBERT
#
# Print the raw records of two further entries for inspection.
# Direct key lookups replace the former scan over data_dict.iteritems(), which
# is Python 2 only; records are now printed in the order listed here.
for name in ("THE TRAVEL AGENCY IN THE PARK", "LOCKHART EUGENE E"):
    if name in data_dict:
        print(name, data_dict[name])
In [99]:
# One histogram per feature, arranged in a grid of 4 rows by 2 columns.
# Binding the result to `_` keeps the returned axes from being echoed.
_ = df_features.hist(
    layout=(4, 2),
    bins=10,
    figsize=(96, 96),
    xlabelsize=45,
    ylabelsize=45,
    xrot=40,
)
In [31]:
# Heatmap of the pairwise feature correlations; the colour scale is capped at 0.5.
sns.heatmap(data=df_features.corr(), vmax=0.5, cmap="YlOrRd")
Out[31]:
In [100]:
# scatter_matrix is provided by pandas.plotting in current pandas releases;
# the top-level `from pandas import scatter_matrix` used before fails there.
# Fall back to the legacy module so older pandas installs keep working.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of all retained features.
_ = scatter_matrix(df_features, alpha=0.5, figsize=(20, 20))
In [101]:
# Pairwise scatter plots restricted to four selected columns.
_ = scatter_matrix(
    df_features[["exercised_stock_options", "bonus", "total_stock_value", "poi"]],
    figsize=(16, 16),
    alpha=0.5,
)