In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import helper
import keras
helper.info_gpu()
#sns.set_palette("Reds")
helper.reproducible(seed=0)  # set up reproducible results from run to run with Keras
%matplotlib inline
%load_ext autoreload
%autoreload
In [2]:
data_path = 'data/enron_financial_data.pkl'
target = ['poi']
df = pd.read_pickle(data_path)  # dict of {person: {feature: value}}
df = pd.DataFrame.from_dict(df, orient='index')  # one row per person
In [3]:
helper.info_data(df, target)
Imbalanced target: since only a small fraction of persons are POIs, the evaluation metric used in this problem is the Area Under the ROC Curve (ROC AUC)
poi = person of interest (boolean)
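With so few positive samples, accuracy is a misleading score: always predicting "not POI" is already right most of the time, while the ROC AUC of such a classifier stays at 0.5. A quick sanity check (a sketch; 'poi' is the boolean target loaded above):

positive_rate = df['poi'].astype(bool).mean()
print('positive rate: {:.1%} \t majority-class accuracy: {:.1%}'.format(
    positive_rate, 1 - positive_rate))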
In [4]:
df.head(3)
Out[4]:
In [5]:
# delete the 'TOTAL' row (at the bottom)
if 'TOTAL' in df.index:
    df.drop('TOTAL', axis='index', inplace=True)
# convert dataframe values (objects) to numerical. There are no categorical features
df = df.apply(pd.to_numeric, errors='coerce')
In [6]:
helper.missing(df)
Features with a high fraction of missing values, such as 'loan_advances', are kept: they are still needed to obtain better models
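helper.missing is defined elsewhere; a minimal equivalent that reports the per-column fraction of missing values (an assumption about what it shows) would be:

missing_ratio = df.isnull().mean().sort_values(ascending=False)
print(missing_ratio.head(10))  # 'loan_advances' is missing for most persons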
In [7]:
# 'email_address' is a text identifier with no predictive value
df.drop('email_address', axis='columns', inplace=True)
In [8]:
num = list(df.select_dtypes(include=[np.number]))
df = helper.classify_data(df, target, numerical=num)
helper.get_types(df)
Out[8]:
In [9]:
# Replace NaN values with the median
df.fillna(df.median(), inplace=True)
#helper.fill_simple(df, target, inplace=True) # same result
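The median is used instead of the mean because the financial features are strongly skewed by a few very large payouts. An equivalent imputation with scikit-learn (a sketch, assuming scikit-learn >= 0.20 for SimpleImputer):

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')  # robust to the extreme payout outliers
df[num] = imputer.fit_transform(df[num])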
In [10]:
df.describe(percentiles=[0.5]).astype(int)
Out[10]:
In [11]:
helper.show_numerical(df, kde=True, ncols=5)
In [12]:
helper.show_target_vs_numerical(df, target, jitter=0.05, point_size=50, ncols=5)
In [13]:
# df.plot.scatter(x='salary', y='total_stock_value')
# df.plot.scatter(x='long_term_incentive', y='total_stock_value')
# sns.lmplot(x="salary", y="total_stock_value", hue='poi', data=df)
# sns.lmplot(x="long_term_incentive", y="total_stock_value", hue='poi', data=df)
g = sns.PairGrid(
    df,
    y_vars=["total_stock_value"],
    x_vars=["salary", "long_term_incentive", "from_this_person_to_poi"],
    hue='poi',
    size=4)
g.map(sns.regplot).add_legend()
plt.ylim(0, 0.5e8)
#sns.pairplot(df, hue='poi', vars=['long_term_incentive', 'total_stock_value', 'from_poi_to_this_person'], kind='reg', size=3)
Out[13]:
Persons of interest seem to have higher stock values relative to salary and long-term incentive, especially when the stock value is high. There is no clear dependency between POI status and the number of emails exchanged with another person of interest.
In [14]:
helper.correlation(df, target)
In [15]:
droplist = []  # features to drop from the model
# use a copy named 'data' (instead of 'df') for the model
data = df.copy()
data.drop(droplist, axis='columns', inplace=True)
data.head(3)
Out[15]:
In [16]:
data, scale_param = helper.scale(data)
There are no categorical variables
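helper.scale is defined elsewhere; a standard-scaling sketch with scikit-learn (assuming scale_param stores the fitted parameters so that new samples can be transformed consistently):

from sklearn.preprocessing import StandardScaler
features = [c for c in data.columns if c not in target]
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])  # zero mean, unit variance
scale_param = (scaler.mean_, scaler.scale_)  # keep to rescale future samples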
In [17]:
test_size = 0.4
random_state = 9
x_train, y_train, x_test, y_test = helper.simple_split(data, target, True, test_size,
                                                       random_state)
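helper.simple_split is a thin wrapper; a comparable split with scikit-learn (a sketch; stratifying on the rare 'poi' class keeps the class ratio similar in both sets, and note that train_test_split returns the sets in a different order):

from sklearn.model_selection import train_test_split
X, y = data.drop(columns=target), data[target]
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, stratify=y, random_state=random_state)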
In [18]:
y_train, y_test = helper.one_hot_output(y_train, y_test)
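helper.one_hot_output presumably converts the binary labels into two-column one-hot arrays for the softmax output of the network; a minimal sketch with Keras:

from keras.utils import to_categorical
y_train = to_categorical(y_train, num_classes=2)  # e.g. 1 -> [0, 1]
y_test = to_categorical(y_test, num_classes=2)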
In [19]:
print("train size \t X:{} \t Y:{}".format(x_train.shape, y_train.shape))
print("test size \t X:{} \t Y:{} ".format(x_test.shape, y_test.shape))
In [20]:
helper.dummy_clf(x_train, y_train, x_test, y_test)
Out[20]:
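helper.dummy_clf is defined elsewhere; scikit-learn's DummyClassifier gives the same kind of naive baseline that any real model must beat (a sketch):

from sklearn.dummy import DummyClassifier
dummy = DummyClassifier(strategy='most_frequent')  # always predicts "not POI"
dummy.fit(x_train, y_train[:, 1])
print('baseline accuracy: {:.2f}'.format(dummy.score(x_test, y_test[:, 1])))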
In [21]:
# class weights for the imbalanced target
cw = helper.get_class_weight(y_train[:,1])
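helper.get_class_weight likely mirrors scikit-learn's balanced weighting, which makes errors on the rare POI class cost proportionally more during training; a sketch:

from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(y_train[:, 1])
weights = compute_class_weight(class_weight='balanced', classes=classes,
                               y=y_train[:, 1])
cw = {int(c): w for c, w in zip(classes, weights)}  # Keras expects {class: weight}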
In [29]:
model_path = os.path.join("models", "enron_scandal.h5")
model = None
model = helper.build_nn_clf(x_train.shape[1], y_train.shape[1], dropout=0.3, summary=True)
helper.train_nn(model, x_train, y_train, class_weight=cw, path=model_path)
from sklearn.metrics import roc_auc_score
y_pred_train = model.predict(x_train, verbose=0)
print('\nROC_AUC train:\t{:.2f} \n'.format(roc_auc_score(y_train, y_pred_train)))
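helper.build_nn_clf is defined elsewhere; a plausible sketch of a small dense classifier with dropout (the layer size is an assumption, not the actual helper code):

from keras.models import Sequential
from keras.layers import Dense, Dropout

def build_nn_clf_sketch(input_size, output_size, dropout=0.3):
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=input_size))
    model.add(Dropout(dropout))  # regularization for a very small dataset
    model.add(Dense(output_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model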
In [30]:
# Dataset too small for separate train, validation, and test sets. More data is needed for a proper evaluation
y_pred = model.predict(x_test, verbose=0)
helper.binary_classification_scores(y_test[:, 1], y_pred[:, 1], return_dataframe=True, index="DNN")
Out[30]:
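helper.binary_classification_scores aggregates the usual metrics; computing them directly with scikit-learn (a sketch, thresholding the positive-class probability at 0.5):

from sklearn.metrics import f1_score, precision_score, recall_score
y_hat = (y_pred[:, 1] > 0.5).astype(int)
print('AUC: {:.2f}  precision: {:.2f}  recall: {:.2f}  F1: {:.2f}'.format(
    roc_auc_score(y_test[:, 1], y_pred[:, 1]), precision_score(y_test[:, 1], y_hat),
    recall_score(y_test[:, 1], y_hat), f1_score(y_test[:, 1], y_hat)))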
In [31]:
helper.ml_classification(x_train, y_train[:,1], x_test, y_test[:,1])
Out[31]:
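helper.ml_classification compares classical scikit-learn models with the network; a reduced sketch with two common baselines (the model choices are illustrative):

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

for clf in (LogisticRegression(class_weight='balanced'),
            RandomForestClassifier(n_estimators=100, class_weight='balanced')):
    clf.fit(x_train, y_train[:, 1])
    proba = clf.predict_proba(x_test)[:, 1]
    print('{}: ROC AUC = {:.2f}'.format(type(clf).__name__,
                                        roc_auc_score(y_test[:, 1], proba)))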