In [1]:
import sys
import pickle
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
# The following function is used to create counts and percentages in the pie
def make_autopct(values):
    def my_autopct(pct):
        total = sum(values)
        val = int(round(pct * total / 100.0))
        return '{p:.2f}% ({v:d})'.format(p=pct, v=val)
    return my_autopct
In [2]:
# Load data
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
my_dataset = data_dict
df = pd.DataFrame.from_dict(my_dataset, orient='index')
#Convert fields to numeric fields.
numeric_fields = ['salary', 'deferral_payments', 'total_payments',
                  'exercised_stock_options', 'bonus', 'restricted_stock',
                  'total_stock_value', 'expenses', 'loan_advances',
                  'long_term_incentive', 'deferred_income',
                  'from_this_person_to_poi', 'from_poi_to_this_person',
                  'shared_receipt_with_poi', 'to_messages', 'from_messages',
                  'director_fees', 'other', 'restricted_stock_deferred']
for field in numeric_fields:
    df[field] = pd.to_numeric(df[field], errors='coerce')
In [3]:
print "Total number of data points: " + str(len(df))
In [4]:
poiVsNotPoi = df.groupby('poi')['poi'].count()
poiVsNotPoi.plot.pie(figsize=(8, 8),
                     autopct=make_autopct(poiVsNotPoi))
Out[4]: [pie chart of POI vs. non-POI counts and percentages]
In [5]:
print "feature and data types"
print df.dtypes
print "\nNumber of features (poi is not a feature): " + str(len(list(df.columns.values)) - 1)
In [6]:
print "Missing values"
print df.isnull().sum()
In [7]:
poiIsNull = df[df['poi'] == True].isnull().sum()
poiPctNull = poiIsNull / len(df[df['poi'] == True])
notPoiIsNull = df[df['poi'] != True].isnull().sum()
notPoiPctNull = notPoiIsNull / len(df[df['poi'] != True])
# Label the columns so the printed table is readable.
print pd.concat([poiIsNull, poiPctNull, notPoiIsNull, notPoiPctNull, poiPctNull - notPoiPctNull],
                axis=1, keys=['poi_null', 'poi_pct', 'non_poi_null', 'non_poi_pct', 'pct_diff'])
There is a lot of data missing.
Although it looks like there are some trends here (for example, several financial fields are null at noticeably different rates for POIs than for non-POIs).
Trends like this make me think that for the financial data I should make some counters (for and against being a POI); a sketch follows the list below.
For testing features I will run tests with several classifiers... I want to test adding/changing features independent of the classifiers.
I am going to use the email data:
to_messages, from_messages, shared_receipt_with_poi, from_this_person_to_poi, from_poi_to_this_person
And use the financial fields as counters:
poi for fields where a non-null value makes a person more likely a POI; non_poi for fields where a non-null value makes a person more likely not a POI.
salary (poi)
bonus (poi)
restricted_stock (poi)
restricted_stock_deferred (non_poi)
expenses (poi)
other (poi)
director_fees (non_poi)
deferred_income (poi)
long_term_incentive (poi)
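As a minimal sketch of the counter idea (only the positive counter ends up being used later, in In [13]; the non_poi counter is shown for completeness):
# Sketch: count non-null values in the POI-leaning fields and in the
# non-POI-leaning fields, following the groupings above.
poi_fields = ['salary', 'bonus', 'restricted_stock', 'expenses',
              'other', 'deferred_income', 'long_term_incentive']
non_poi_fields = ['restricted_stock_deferred', 'director_fees']
df['poi_field_count'] = df[poi_fields].notnull().sum(axis=1)
df['non_poi_field_count'] = df[non_poi_fields].notnull().sum(axis=1)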
In [8]:
email_data = ['to_messages',
              'from_messages',
              'from_poi_to_this_person',
              'from_this_person_to_poi',
              'shared_receipt_with_poi']
print "Distributions\n------------------"
for field in email_data:
print df[field].describe()
sortedDF = df.sort(field, ascending=0)
print sortedDF[field].head()
print "\n"
For these, there don't appear to be any real standout outliers.
There is a row called 'TOTAL' and a row called 'THE TRAVEL AGENCY IN THE PARK' which should be removed.
In [9]:
df = df.drop('TOTAL')
df = df.drop('THE TRAVEL AGENCY IN THE PARK')
For the following I need to figure out what to do with null values: 0 or mean?
to_messages, from_messages, shared_receipt_with_poi, from_this_person_to_poi, from_poi_to_this_person
My thought is to_messages and from_messages should receive the mean (everyone sends emails), while the POI-related email counts with null get a 0.
In [10]:
email_data = ['to_messages',
              'from_messages',
              'from_poi_to_this_person',
              'from_this_person_to_poi',
              'shared_receipt_with_poi']
#Fill all NAs with 0s
df_all_zero = df.copy()
df_all_zero[email_data] = df_all_zero[email_data].fillna(0)
#Fill all NAs with the mean of the column
#(fill from df, not df_all_zero, which has no NAs left to fill)
df_all_mean = df.copy()
df_all_mean[email_data] = df[email_data].fillna(df[email_data].mean())
#Fill some NAs with the mean, and others with 0
df_mixed = df.copy()
df_mixed[email_data[:2]] = df_mixed[email_data[:2]].fillna(df[email_data[:2]].mean())
df_mixed[email_data[2:]] = df_mixed[email_data[2:]].fillna(0)
features = ['poi'] + email_data
import ml_test
#Import the classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
gnb = GaussianNB()
dt = tree.DecisionTreeClassifier()
svc = svm.SVC()
ada = AdaBoostClassifier(n_estimators=100)
rf = RandomForestClassifier(n_estimators=100)
clfs = [gnb,dt,svc,ada,rf]
#Test each data set with the classifiers
print "All zero:"
ml_test.testMany(df_all_zero,features,clfs,0.3)
print "All mean:"
ml_test.testMany(df_all_mean,features,clfs,0.3)
print "Mixed:"
ml_test.testMany(df_mixed,features,clfs,0.3)
Clearly SVM is just guessing false for everyone. That is why its accuracy is high but its recall is 0.
Using 0s gives the best recall for NB and DT, so let's go with that.
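For reference, ml_test is a small local helper that isn't shown in this notebook. A hypothetical reconstruction of what its testMany is assumed to do (split the frame, fit each classifier, and print accuracy/precision/recall); only the signature is taken from the calls above, the body is an assumption:
# Hypothetical reconstruction of ml_test.testMany -- the real module is
# local to this project and not shown here.
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

def testMany(frame, feature_list, classifiers, test_size):
    labels = frame[feature_list[0]].values.astype(bool)
    feats = frame[feature_list[1:]].fillna(0).values.astype(np.float64)
    f_train, f_test, l_train, l_test = train_test_split(
        feats, labels, test_size=test_size, random_state=42)
    for clf in classifiers:
        clf.fit(f_train, l_train)
        pred = clf.predict(f_test)
        print "%s: acc=%.3f, precision=%.3f, recall=%.3f" % (
            clf.__class__.__name__,
            accuracy_score(l_test, pred),
            precision_score(l_test, pred),
            recall_score(l_test, pred))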
In [11]:
df = df_all_zero
In [12]:
gnb = GaussianNB()
dt = tree.DecisionTreeClassifier()
svc = svm.SVC()
ada = AdaBoostClassifier(n_estimators=10)
rf = RandomForestClassifier(n_estimators=10)
clfs = [gnb,dt,svc,ada,rf]
#Test the data set with one feature removed at a time;
#each label names the feature that was removed.
print "without to_messages:"
ml_test.testMany(df, features[:1] + features[2:], clfs, 0.3)
print "without from_messages:"
ml_test.testMany(df, features[:2] + features[3:], clfs, 0.3)
print "without from_poi_to_this_person:"
ml_test.testMany(df, features[:3] + features[4:], clfs, 0.3)
print "without from_this_person_to_poi:"
ml_test.testMany(df, features[:4] + features[5:], clfs, 0.3)
print "without shared_receipt_with_poi:"
ml_test.testMany(df, features[:5], clfs, 0.3)
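The same one-feature-at-a-time ablation can be written as a loop (an equivalent sketch):
# Equivalent ablation loop: drop one email feature at a time and re-test.
for i in range(1, len(features)):
    ablated = features[:i] + features[i + 1:]
    print "without %s:" % features[i]
    ml_test.testMany(df, ablated, clfs, 0.3)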
In [13]:
finance_fields = ['salary',
                  'bonus',
                  'restricted_stock',
                  'expenses',
                  'other',
                  'deferred_income',
                  'long_term_incentive']
# Count how many of the POI-leaning financial fields are populated.
df['finance_field_count'] = df[finance_fields].notnull().sum(axis=1)
features_added_pos = features + ['finance_field_count']
print "Current features"
ml_test.testMany(df[features],features,clfs,0.3)
print "Added positive feature"
ml_test.testMany(df[features_added_pos],features_added_pos,clfs,0.3)
In [14]:
df.loc[df.to_messages == 0, 'to_rate'] = 0
df.loc[df.to_messages != 0, 'to_rate'] = df[df.to_messages != 0]['from_poi_to_this_person'] / df[df.to_messages != 0]['to_messages']
df.loc[df.from_messages == 0, 'from_rate'] = 0
df.loc[df.from_messages != 0, 'from_rate'] = df[df.from_messages != 0]['from_this_person_to_poi'] / df[df.from_messages != 0]['from_messages']
rateFeatures = ['poi',
                'to_rate',
                'from_rate',
                'shared_receipt_with_poi',
                'finance_field_count']
print "Current features"
ml_test.testMany(df[features],features,clfs,0.3)
print "Rate features"
ml_test.testMany(df[rateFeatures],rateFeatures,clfs,0.3)
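For what it's worth, the rates can also be computed more compactly with np.where (an equivalent sketch; pandas yields inf for the zero-denominator rows, which np.where then discards):
# Equivalent, more compact rate computation.
df['to_rate'] = np.where(df['to_messages'] != 0,
                         df['from_poi_to_this_person'] / df['to_messages'], 0)
df['from_rate'] = np.where(df['from_messages'] != 0,
                           df['from_this_person_to_poi'] / df['from_messages'], 0)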
In [15]:
feature_list = rateFeatures
In [28]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
labelIndex = feature_list[0]
featureIndexes = feature_list[1:]
labels = df[labelIndex].values[:].astype(bool)
features = df[featureIndexes].values[:,:].astype(np.float64)
clf = RandomForestClassifier(n_estimators=10)
from sklearn.cross_validation import StratifiedShuffleSplit
kf = StratifiedShuffleSplit(labels, 100, random_state = 42)
recallScore = cross_val_score(clf, features, labels, scoring='recall',cv=kf).mean()
perScore = cross_val_score(clf, features, labels, scoring='precision',cv=kf).mean()
accuracyScore = cross_val_score(clf, features, labels, scoring='accuracy',cv=kf).mean()
print "Acc: %f, Precision: %f, Recall: %f" % (accuracyScore,perScore,recallScore)
First, I will autoscale the inputs and see if that helps.
In [24]:
from sklearn import preprocessing
labelIndex = feature_list[0]
featureIndexes = feature_list[1:]
labels = df[labelIndex].values[:].astype(bool)
features = df[featureIndexes].values[:,:].astype(np.float64)
featuresScaled = preprocessing.scale(features)
clf = RandomForestClassifier(n_estimators=10)
recallScore = cross_val_score(clf, featuresScaled, labels, scoring='recall',cv=kf).mean()
perScore = cross_val_score(clf, featuresScaled, labels, scoring='precision',cv=kf).mean()
accuracyScore = cross_val_score(clf, featuresScaled, labels, scoring='accuracy',cv=kf).mean()
print "Acc: %f, Precision: %f, Recall: %f" % (accuracyScore,perScore,recallScore)
In [25]:
from sklearn.decomposition import PCA
pca = PCA(n_components=len(featuresScaled[0]) - 1)
pcaFeatures = pca.fit_transform(featuresScaled)
clf = RandomForestClassifier(n_estimators=10)
# Score against the PCA-transformed features (not the merely scaled ones).
recallScore = cross_val_score(clf, pcaFeatures, labels, scoring='recall', cv=kf).mean()
perScore = cross_val_score(clf, pcaFeatures, labels, scoring='precision', cv=kf).mean()
accuracyScore = cross_val_score(clf, pcaFeatures, labels, scoring='accuracy', cv=kf).mean()
print "Acc: %f, Precision: %f, Recall: %f" % (accuracyScore, perScore, recallScore)
PCA seemed a little inconsistent: at best it does as well, and at worst it does much worse. Don't use it in the final script.
By adding rates for POI emails, I already reduced the dimensionality manually.
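One quick way to sanity-check the PCA step (not part of the original run) is to look at how much variance each component captures:
# Variance captured by each principal component, and cumulatively.
print pca.explained_variance_ratio_
print pca.explained_variance_ratio_.cumsum()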
In [29]:
from sklearn.grid_search import GridSearchCV
# For RandomForestClassifier, max_features='auto' is the same as 'sqrt',
# so only 'sqrt' is listed.
param_grid = {'max_depth': np.arange(2, 10),
              'min_samples_split': np.arange(5, 20),
              'class_weight': [None, 'balanced'],
              'max_features': [None, 'sqrt']}
clf = GridSearchCV(RandomForestClassifier(n_estimators=5), param_grid, scoring='recall', n_jobs=-1, cv=kf)
clf.fit(featuresScaled,labels)
print "Best score: " + str(clf.best_score_)
print(clf.best_params_)
print(featureIndexes)
print(clf.best_estimator_.feature_importances_)
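Pairing each feature name with its importance makes the printout above easier to read (a small convenience, not in the original run):
# Print feature names alongside their importances.
for name, imp in zip(featureIndexes, clf.best_estimator_.feature_importances_):
    print "%s: %.3f" % (name, imp)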
In [31]:
from sklearn.grid_search import GridSearchCV
param_grid = {'max_depth': np.arange(2, 10),
              'min_samples_split': np.arange(5, 20),
              'class_weight': [None, 'balanced'],
              'max_features': [None, 'sqrt']}
clf = GridSearchCV(RandomForestClassifier(n_estimators=5), param_grid, scoring='precision', n_jobs=-1, cv=kf)
clf.fit(featuresScaled,labels)
print "Best score: " + str(clf.best_score_)
print(clf.best_params_)
print(featureIndexes)
print(clf.best_estimator_.feature_importances_)
I want to optimize for maximum recall. I find that the parameter values tuned for recall also give the best outcome for precision.
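Finally, a sketch of how the tuned estimator could be handed off with the tester helper imported in In [1]. Note this glosses over writing the engineered features (to_rate, from_rate, finance_field_count) back into my_dataset, which tester.py would need:
# Export the tuned classifier, dataset, and feature list for tester.py.
# Assumes clf is the recall-optimized GridSearchCV from above.
clf_final = clf.best_estimator_
dump_classifier_and_data(clf_final, my_dataset, feature_list)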