In [2]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from ggplot import *
from IPython.display import Image
import warnings
from feature_engineering.feature_format import feature_format, target_feature_split
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'
# Read the pickled project dataset into data_dict.
# NOTE(review): pickle.load can execute arbitrary code, so only point this at a trusted file.
dataset_path = "./data/final_project_dataset.pkl"
with open(dataset_path, "rb") as data_file:
    data_dict = pickle.load(data_file)
In [3]:
# Build df from the records stored in data_dict, using the dictionary keys as row labels.
records = list(data_dict.values())
row_labels = data_dict.keys()
df = pd.DataFrame.from_records(records, index=row_labels)
# (disabled) df = df.replace('NaN', 0).drop(['email_address'], axis=1)
In [5]:
# Remove the 'poi' column from df in place (axis=1 selects columns).
# NOTE(review): a later cell reads df.poi to count POIs / non-POIs, and features_list
# still names 'poi'; after this in-place drop those steps cannot find the column on a
# fresh Restart & Run All. Confirm whether this drop is still intended.
df.drop('poi', inplace=True, axis=1)
In [8]:
# Display the column labels of df as an array.
df.columns.values
Out[8]:
In [13]:
# 'poi' goes first, followed by every other column of df.
# Presumably target_feature_split treats the first entry as the label - confirm.
# Filtering 'poi' out of the column list keeps it from being listed twice (once as
# the leading entry and once as a feature) whenever df still contains that column.
features_list = ['poi'] + [column for column in df.columns if column != 'poi']
In [14]:
# Turn df into a {row label: {column: value}} mapping, pass it through the project's
# feature_format helper with the chosen feature order, then split the result into
# labels and features with target_feature_split.
my_dataset = df.to_dict(orient='index')
data = feature_format(my_dataset, features_list, sort_keys=True)
labels, features = target_feature_split(data)
In [15]:
# Number of entries in `features` as returned by target_feature_split.
len(features)
Out[15]:
In [16]:
def counts(col, tag):
    """Return how many elements of ``col`` compare equal to ``tag``.

    Parameters
    ----------
    col : iterable
        Values to scan (for example one DataFrame column).
    tag : object
        Value to look for; elements are matched with ``==``.

    Returns
    -------
    int
        Number of matching elements (0 for an empty ``col``).
    """
    return sum(1 for each in col if each == tag)
# Per-column count of the string placeholder 'NaN'; `args` forwards the tag to counts().
df.apply(counts, axis=0, args=('NaN',))
Out[16]:
In [17]:
# Tabulate how many columns of df share each dtype.
# After reset_index the first column holds df's column names and the second their dtypes;
# the grouped count then turns "Count" into the number of columns per dtype.
dtype_df = df.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
dtype_df.groupby("Column Type").count().reset_index()
Out[17]:
In [18]:
# Print pandas' summary of df via DataFrame.info().
df.info()
In [19]:
# Treat the string placeholder 'NaN' as 0 throughout df (modifies df in place).
df.replace('NaN', 0, inplace=True)

# Class balance: number of rows flagged as POI versus not.
# NOTE(review): this reads df.poi, so df must still contain a 'poi' column here.
POI_type = {
    'POI': int((df.poi == True).sum()),
    'non POIs': int((df.poi == False).sum()),
}
pd.DataFrame(list(POI_type.items()), columns=['Class', 'Counts'])
Out[19]:
In [20]:
# Annotated heatmap of the pairwise correlations between df's columns.
sns.set(font_scale=1.4)
f, ax = plt.subplots(figsize=(14, 11))
cmap = sns.diverging_palette(220, 20, sep=20, as_cmap=True)
ax = sns.heatmap(
    df.corr(),
    ax=ax,  # draw on the axes created above
    cmap=cmap,
    vmin=-.3,
    vmax=.5,
    center=0,
    square=True,
    linewidths=.5,
    cbar=False,
    annot=True,
    annot_kws={"size": 8},
)
plt.show()
In [21]:
# Keep every row except the one labelled 'TOTAL'
# (presumably an aggregate row rather than an individual - confirm).
# .copy() makes the result an independent frame, so later column assignments such as
# df['log_bonus'] = ... do not operate on a slice of the original frame.
df = df.loc[df.index != 'TOTAL'].copy()
In [22]:
# Histogram of 'bonus' in 500,000-wide bins, with x-axis ticks every 1,500,000.
h = (
    ggplot(aes(x='bonus'), data=df)
    + geom_histogram(binwidth=500000, fill='deeppink', color='black', alpha=0.5)
    + theme(plot_title=element_text(size=20))
    + scale_x_continuous(breaks=range(0, 8000000, 1500000))
    + ggtitle('Bonus distribution')
)

# Black-and-white theme with a larger font and figure size.
# Note: _rcParams is a private attribute of the theme object.
t = theme_bw()
t._rcParams['font.size'] = 20
t._rcParams['figure.figsize'] = (10, 6)
h + t
Out[22]:
As we can see, the data is highly skewed, with most values situated between 0 and 1,500,000. Let's look at the data within this range.
In [23]:
# log10 of the bonus; the 0.1 offset keeps a bonus of 0 finite (log10(0.1) = -1).
df['log_bonus'] = np.log10(df['bonus'] + 0.1)

# Histogram of the log-scaled bonus. The x-axis is limited to 4-8, so rows with a
# bonus of 0 (log value -1) fall outside the plotted range.
h = (
    ggplot(aes(x='log_bonus'), data=df)
    + geom_histogram(binwidth=.5, fill='deeppink', color='black', alpha=0.5)
    + theme(plot_title=element_text(size=20))
    + scale_x_continuous(limits=(4, 8))
    + ggtitle('Bonus distribution (Log Scale)')
)

# Black-and-white theme with a larger font and figure size.
# Note: _rcParams is a private attribute of the theme object.
t = theme_bw()
t._rcParams['font.size'] = 20
t._rcParams['figure.figsize'] = (10, 6)
h + t
Out[23]:
On the log scale the distribution looks roughly normal, apart from the people with a bonus of 0.
In [24]:
# Histogram of 'long_term_incentive' in 500,000-wide bins.
h = (
    ggplot(aes(x='long_term_incentive'), data=df)
    + geom_histogram(binwidth=500000, fill='darkgreen', color='black', alpha=0.5)
    + theme(plot_title=element_text(size=20))
    + ggtitle('Long term incentive distribution')
)

# Black-and-white theme with a larger font and figure size.
# Note: _rcParams is a private attribute of the theme object.
t = theme_bw()
t._rcParams['font.size'] = 20
t._rcParams['figure.figsize'] = (10, 6)
h + t
Out[24]:
In [30]:
# Show the saved image ./img/importance_random_forest.png.
# `filename=` embeds the image data in the cell output. With `url=` the image is only
# linked, and `retina=True` has no effect because it needs the embedded data to read
# the image size.
Image(filename="./img/importance_random_forest.png", retina=True)
Out[30]:
In [31]:
# Show the saved image ./img/importance_xgboost.png.
# `filename=` embeds the image data in the cell output. With `url=` the image is only
# linked, and `retina=True` has no effect because it needs the embedded data to read
# the image size.
Image(filename="./img/importance_xgboost.png", retina=True)
Out[31]:
In [32]:
# Load the pickled object stored in 'final_df.pkl'. Later cells use it as a DataFrame
# with 'poi', 'poi_interaction' and 'deferred_income' columns.
# NOTE(review): the step that writes this file is not shown here - confirm where it comes
# from, and only unpickle files from a trusted source.
new_df = pd.read_pickle('final_df.pkl')
We can see from the feature importances that `poi_interaction` and `deferred_income` are the two most important features. Let's explore these variables.
In [39]:
# Histogram of 'poi_interaction' from new_df, using the default binning.
h = (
    ggplot(aes(x='poi_interaction'), data=new_df)
    + geom_histogram(fill='orange', color='black', alpha=0.5)
    + theme(plot_title=element_text(size=20))
    + ggtitle('Poi Interaction distribution')
)

# Black-and-white theme with a larger font and figure size.
# Note: _rcParams is a private attribute of the theme object.
t = theme_bw()
t._rcParams['font.size'] = 20
t._rcParams['figure.figsize'] = (10, 6)
h + t
Out[39]:
In [ ]:
In [40]:
# Histogram of 'deferred_income' from new_df, using the default binning.
# The title now names the plotted variable; it previously repeated the
# 'Poi Interaction distribution' title of the preceding plot.
h = (
    ggplot(aes(x='deferred_income'), data=new_df)
    + geom_histogram(fill='orange', color='black', alpha=0.5)
    + theme(plot_title=element_text(size=20))
    + ggtitle('Deferred income distribution')
)

# Black-and-white theme with a larger font and figure size.
# Note: _rcParams is a private attribute of the theme object.
t = theme_bw()
t._rcParams['font.size'] = 20
t._rcParams['figure.figsize'] = (10, 6)
h + t
Out[40]:
In [59]:
# Summary statistics per 'poi' group, restricted to the 'poi_interaction' column.
new_df.groupby('poi').describe()['poi_interaction'].reset_index()
Out[59]:
In [64]:
# Bar plot of poi_interaction for each value of poi, drawn from new_df.
sns.set_style("whitegrid")
sns.barplot(x='poi', y='poi_interaction', data=new_df).set_title(
    'Distribution of POIs for poi_interaction'
)
plt.show()
In [ ]: