In [27]:
import sys
import os
from time import time
import re
import pickle
sys.path.append("ud120-projects/tools/")
sys.path.append("ud120-projects/final_project/")
#sys.path.append("ud120-projects/maildir/")
import numpy as np
import pandas as pd
#from matplotlib import pyplot as plt
#import seaborn as sns
#%matplotlib inline
In [28]:
from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_union
from sklearn.pipeline import make_pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
In [29]:
### Load the dictionary containing the dataset
# use a context manager so the file handle is closed, and binary mode ("rb")
# so unpickling works on every platform (text mode can corrupt pickle bytes)
with open("ud120-projects/final_project/final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)
In [30]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features
### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
# baseline: untuned Gaussian Naive Bayes on 'salary' alone
clf = GaussianNB() # Provided to give you a starting point. Try a variety of classifiers.
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
test_classifier(clf, my_dataset, features_list)
### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(clf, my_dataset, features_list)
In [31]:
print my_dataset.keys()[0]
my_dataset.itervalues().next()
Out[31]:
In [32]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
names = np.array(my_dataset.keys())
print "number of names in data: ", names.shape
print "\nfirst five names:\n", names[:5]
features_list = my_dataset.itervalues().next().keys()
features_list.sort()
features_list.remove('poi')
features_list.insert(0, 'poi')
features_list.remove('email_address')
print "\nfeatures:\n", features_list
In [33]:
### convert dictionary to pandas dataframe
# one row per person, columns ordered like features_list ('poi' first)
df = pd.DataFrame([entry for entry in my_dataset.itervalues()])
df = df.drop('email_address', axis=1)
df = df[features_list]
#df.dtypes
#df.describe()
#df.count()
# poi comes in as a boolean; cast to 0/1
df.poi = df.poi.astype('int')
# coerce the remaining object columns (which hold 'NaN' strings) to numeric;
# NOTE(review): convert_objects is deprecated in later pandas versions
df = df.convert_objects(convert_numeric=True)
for col in list(df.columns):
    df[col] = df[col].round(decimals=3)
print "POI Count:\n", df.poi.value_counts()
df.head()
Out[33]:
In [34]:
# summary statistics for all numeric columns
df.describe()
Out[34]:
In [35]:
# create labels
y = df.poi.values
print y.shape
print y[:5]
In [36]:
# create initial features
X = df.drop('poi', axis=1).values
print X.shape
print X[:3]
In [37]:
### Task 2: Remove outliers
# hand-tuned to remove ~5% (in this case, 7%)
num_rows = X.shape[0]
num_cols = X.shape[1]
rows_to_remove = set()
for i in xrange(num_cols):
point_five_percentile = np.percentile(X[:,i], 0.5)
ninety_nine_point_five_percentile = np.percentile(X[:,i], 99.5)
for j in xrange(num_rows):
if X[j,i] < point_five_percentile or X[j,i] > ninety_nine_point_five_percentile:
rows_to_remove.add(j)
print X.shape
X = np.delete(X, list(rows_to_remove), axis=0)
y = np.delete(y, list(rows_to_remove))
print "names associated with outlier-containing rows to remove:"
for i in rows_to_remove:
print "\t",names[i], " (poi? {})".format(y[i])
names = np.delete(names, list(rows_to_remove))
print "\nnew X shape: ", X.shape
print "\nnew y shape: ", y.shape
print "\ntotal rows removed: ", len(rows_to_remove), "({})".format(round(len(rows_to_remove)/float(num_rows), 2))
In [38]:
# impute 'NaN' values to column means
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
imp.fit(X)
X = imp.transform(X)
print X[:3]
imp_values = imp.statistics_
In [39]:
### Task 3: Create new feature(s)
def selectkbest():
# select K best to explore feature engineering possibilities
selector = SelectKBest().fit(X, y)
features = features_list[1:]
high_scores = []
print "SelectKBest SCORES:"
selectkbest_scores = np.round(selector.scores_, 2)
for i in xrange(len(features)):
print "\t", features[i], ": ", selectkbest_scores[i]
if selectkbest_scores[i] > 2:
high_scores.append(i)
print "\nSelectKBest HIGH SCORES:"
for i in high_scores:
print "\t", features[i], "[{}]".format(i), ": ", selectkbest_scores[i]
In [40]:
selectkbest()
In [41]:
def create_new_feature(X, col1, col2, operation, feature_name):
    """Append a derived column to X built from columns col1 and col2.

    operation    -- '*' for element-wise product, '/' for element-wise
                    (float) division; anything else raises ValueError.
    feature_name -- name recorded for the new column.
    Side effect: appends feature_name to the module-level features_list.
    Returns a new array with the derived column stacked on the right.
    """
    features_list.append(feature_name)
    if operation == '*':
        new_col = X[:, col1] * X[:, col2]
    elif operation == '/':
        # true_divide always yields floats; zeros in col2 produce inf/nan
        new_col = np.true_divide(X[:, col1], X[:, col2])
    else:
        # the original fell through with new_col = [] and crashed with an
        # opaque AttributeError on a list shape assignment; fail loudly instead
        raise ValueError("unsupported operation: {!r}".format(operation))
    return np.hstack((X, new_col.reshape(-1, 1)))
In [42]:
# engineer a product feature from X columns 0 and 14, then re-score
# NOTE(review): 0 and 14 index X's columns, i.e. features_list[1:];
# presumably the top SelectKBest scorers printed above -- confirm
X = create_new_feature(X, 0, 14, '*', 'selectkbest_product')
selectkbest()
In [43]:
# wrap features plus the label in a DataFrame to compute label correlations
corr_df = pd.DataFrame(X)
corr_df['label'] = y
In [44]:
# correlation of each feature column with the label ([:-1] drops label-vs-label)
corr_df.corr()['label'].values[:-1]
Out[44]:
In [45]:
# keep the per-feature label correlations for thresholding below
corr = corr_df.corr()['label'].values[:-1]
In [46]:
for i in xrange(len(corr)):
if abs(corr[i]) > 0.2:
print i, abs(corr[i])
In [47]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
def grid_searcher(clf, pca_skb, output):
    """Grid-search `clf` inside a scaling + feature-transformation pipeline.

    clf     -- estimator to tune (e.g. GaussianNB(), DecisionTreeClassifier())
    pca_skb -- which transformer(s) to include: "pca", "skb", or "pca_skb"
               (a FeatureUnion of RandomizedPCA and SelectKBest)
    output  -- when True, print best score/params and evaluation reports

    Uses module-level X, y, and features_list. Returns the best pipeline
    found by GridSearchCV, refit on a single 80/20 train split.
    """
    t0 = time()
    # candidate hyperparameter values shared by the grids below
    even_range = range(2,X.shape[1],2)
    random_state = [42]
    t_or_f = [True, False]
    #powers_of_ten = [10**x for x in range(-5,5)]
    logspace = np.logspace(-5, 5, 10)
    #kernels = ['linear', 'poly', 'rbf', 'sigmoid'] # takes too long, unfortunately
    kernels = ['rbf']
    criteria = ['gini', 'entropy']
    splitters = ['best', 'random']
    max_features = ['auto', 'sqrt', 'log2', None]
    # modify features, remove features via pipeline
    pipeline = []
    params = dict()
    pipeline_clf = ""
    if pca_skb == "pca_skb":
        #pipeline = make_pipeline(MinMaxScaler(), make_union(RandomizedPCA(), SelectKBest()), clf)
        pipeline = make_pipeline(StandardScaler(), make_union(RandomizedPCA(), SelectKBest()), clf)
        params = dict(featureunion__randomizedpca__n_components = even_range,
                      featureunion__randomizedpca__iterated_power = [1, 2, 3],
                      featureunion__randomizedpca__whiten = t_or_f,
                      featureunion__randomizedpca__random_state = random_state,
                      featureunion__selectkbest__k = even_range)
    elif pca_skb == "pca":
        #pipeline = make_pipeline(MinMaxScaler(), RandomizedPCA(), clf)
        pipeline = make_pipeline(StandardScaler(), RandomizedPCA(), clf)
        params = dict(randomizedpca__n_components = [4],
                      randomizedpca__iterated_power = [1, 2, 3],
                      randomizedpca__whiten = t_or_f,
                      randomizedpca__random_state = random_state)
    elif pca_skb == "skb":
        #pipeline = make_pipeline(MinMaxScaler(), SelectKBest(), clf)
        pipeline = make_pipeline(StandardScaler(), SelectKBest(), clf)
        params = dict(selectkbest__k = [4])
    # lower-cased name of the final pipeline step (e.g. 'svc'), used to build
    # step-prefixed parameter names below
    pipeline_clf = pipeline.steps[2][0]
    if pipeline_clf == 'decisiontreeclassifier' or pipeline_clf == 'randomforestclassifier':
        params["{}__criterion".format(pipeline_clf)] = criteria
        #params["{}__splitter".format(pipeline_clf)] = splitters
        params["{}__max_features".format(pipeline_clf)] = max_features
        #params["{}__min_samples_split".format(pipeline_clf)] = even_range
        params["{}__class_weight".format(pipeline_clf)] = ['auto', None]
        params["{}__random_state".format(pipeline_clf)] = random_state
    if pipeline_clf == 'svc':
        params['svc__C'] = logspace
        params['svc__kernel'] = kernels
        #params['svc__degree'] = [1,2,3,4,5] # for use with 'poly'
        params['svc__gamma'] = logspace
        params['svc__random_state'] = random_state
    # cross validation
    cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=random_state[0])
    # tune parameters
    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)
    grid_search.fit(X, y)
    if output == True:
        print "*"*15, pipeline_clf.upper(), "*"*15
        print "\nBEST SCORE: ", grid_search.best_score_, "\n"
        print "\nBEST PARAMS: ", grid_search.best_params_, "\n"
    # split into training and testing data for reporting results
    if output == True:
        print "#"*50
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state[0])
    if output == True:
        print "\nBEST ESTIMATOR:"
    clf = grid_search.best_estimator_
    if output == True:
        print clf
    clf.fit(X_train, y_train)
    if pca_skb == "skb" or pca_skb == "pca_skb":
        if output == True:
            print "\nSelectKBest SCORES:"
        features = features_list[1:]
        # for "pca_skb" the SelectKBest step lives inside the FeatureUnion
        selectkbest_scores = clf.steps[1][1].scores_ if pca_skb == "skb" else clf.steps[1][1].transformer_list[1][1].scores_
        selectkbest_scores = np.round(selectkbest_scores, 2)
        for i in xrange(len(features)):
            if output == True:
                print "\t", features[i], ": ", selectkbest_scores[i]
    if pipeline_clf == 'decisiontreeclassifier' or pipeline_clf == 'randomforestclassifier':
        if output == True:
            print "\n{} FEATURE IMPORTANCES:".format(pipeline_clf.upper())
            print clf.steps[2][1].feature_importances_
    if output == True:
        print "\n", "#"*50
        print "\nPREDICTIONS:"
        print "\nground truth:\n\t", y_test
    y_pred = clf.predict(X_test)
    if output == True:
        print "\npredictions:\n\t", y_pred
        print "\nscore: ", clf.score(X_test, y_test)
        print "\nEVALUATIONS:"
        print "\nconfusion matrix:\n", confusion_matrix(y_test, y_pred)
        print "\nclassification report:\n", classification_report(y_test, y_pred, target_names=["non-poi", "poi"])
    print "ELAPSED TIME: ", round(time()-t0,3), "s"
    return clf
In [48]:
# prepare for Udacity tester
# remove emails (free-text field, not a usable feature)
for key in my_dataset.keys():
    my_dataset[key].pop('email_address')
# remove outliers from original data set
# (`names` was filtered down to the surviving rows in the outlier cell)
for key in my_dataset.keys():
    if key not in names:
        my_dataset.pop(key)
# replace 'NaN's with the column medians learned by the Imputer
for key in my_dataset.keys():
    for sub_key in my_dataset[key].keys():
        if my_dataset[key][sub_key] == 'NaN':
            # -1 because df's first column is 'poi', which was dropped from X
            i = (df.columns.get_loc(sub_key) - 1)
            my_dataset[key][sub_key] = imp_values[i]
# add created feature (last column of X)
# NOTE(review): assumes my_dataset.keys() iterates in the same order as the
# rows of X were built -- confirm, since a mismatch silently assigns wrong values
i = 0
for key in my_dataset.keys():
    my_dataset[key]['selectkbest_product'] = X[i,-1]
    i += 1
In [49]:
def udacity_tester():
# use Udacity tester
print "\nUDACITY TESTER RESULTS: "
test_classifier(clf, my_dataset, features_list)
In [50]:
# full grid search + Udacity evaluation for every classifier/transformer combo
for classifier in [GaussianNB(), DecisionTreeClassifier()]:
    for transform_option in ['pca', 'skb', 'pca_skb']:
        clf = grid_searcher(classifier, transform_option, output=True)
        udacity_tester()
In [2724]:
#dump_classifier_and_data(clf, my_dataset, features_list)
In [51]:
# re-run every classifier/transformer combination quietly and collect each
# pipeline's test-set predictions and classification report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clfs = dict()
for classifier in [GaussianNB(), DecisionTreeClassifier()]:
    clfs[str(classifier)] = dict()
    for transformer in ['pca', 'skb', 'pca_skb']:
        clf = grid_searcher(classifier, transformer, output=False)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred, target_names=["non-poi", "poi"])
        clfs[str(classifier)][transformer] = {'predictions': y_pred,
                                              'clf_report': report}
In [146]:
# inspect the raw test-set predictions for one classifier/transformer combo
clfs['GaussianNB()']['pca']['predictions']
Out[146]:
In [152]:
# inspect the stored classification-report string for the same combo
clfs['GaussianNB()']['pca']['clf_report']
Out[152]:
In [154]:
# pick, for each classifier, the transformer whose classification report
# shows the highest f1 score, and keep its predictions
best_predictions = dict()
for clsfr in clfs.keys():
    best_predictions[clsfr] = dict()
    #print clsfr
    best_f1_score = 0.0
    for transformer in clfs[clsfr].keys():
        clf_report = clfs[clsfr][transformer]['clf_report']
        # strip lowercase letters and newlines from each space-separated token,
        # leaving only the numeric fields of the report
        clf_report = [re.sub(r"[a-z]|\n", '', x) for x in clf_report.split(" ")]
        clf_report = filter(None, clf_report)
        #print "\t", transformer
        # [-2]: the report's last row ends "... f1-score support", so the
        # second-to-last number is presumably the avg/total f1 -- confirm
        # against sklearn's report layout; this parsing is version-fragile
        f1_score = float(clf_report[-2])
        #print f1_score
        if f1_score > best_f1_score:
            best_f1_score = f1_score
            best_predictions[clsfr]['transformer'] = transformer
            best_predictions[clsfr]['f1_score'] = f1_score
            best_predictions[clsfr]['predictions'] = clfs[clsfr][transformer]['predictions']
In [155]:
averaged_best_recall_predictions = np.zeros((28,))
for clsfr in best_predictions.keys():
print clsfr[:10]
print "\tbest transformer: ", best_predictions[clsfr]['transformer']
print "\tbest f1 score: ", best_predictions[clsfr]['f1_score']
print "\tbest predictions: ", best_predictions[clsfr]['predictions']
averaged_best_recall_predictions = np.maximum(averaged_best_recall_predictions, best_predictions[clsfr]['predictions'])
print "\naveraged best predictions: {}".format(averaged_best_recall_predictions.astype('int'))
print "\nresulting confusion matrix:\n", confusion_matrix(y_test, averaged_best_recall_predictions)
print "\nresulting classification report:\n", classification_report(y_test, averaged_best_recall_predictions, target_names=["non-poi", "poi"])
I had hoped to use email-text data along with the financial data, but clearly there is a discrepancy between the individuals represented by the financial data and those represented by the email data.
In [2725]:
# compare individuals represented by the financial data and by the email-corpus data
directory_names = []
poi_directory_names = []
true_count = 0
false_count = 0
for key in my_dataset.keys():
names = key.lower().split(' ')
dirname = names.pop(0)
if len(names) > 0:
dirname = dirname + "-" + names[0][0]
exist = os.path.exists('/Users/excalibur/Dropbox/datasets/maildir/{}'.format(dirname))
#print dirname, "\n\temails exist: ", exist, names
directory_names.append(dirname)
if exist:
true_count += 1
if my_dataset[key]['poi'] == True:
poi_directory_names.append(dirname)
else:
false_count += 1
print "email directories matching individuals represented by financial data:"
print "\texist: ", true_count, "(POIs: {})".format(len(poi_directory_names))
print "\tdon't exist: ", false_count
#print sorted(directory_names)
Unfortunately, it seems clear that it would be difficult to join the two data sets in a meaningful way due to their lack of overlap.
After this project, I may spend some time with text-vectorization of the email corpus and examine things like word frequencies (I have started working on some of the initial code for that process below); however, with such an apparently low number of known POI emails available (only 3!?), it is unclear how useful such an endeavor would be for identifying POIs, although there are surely other interesting insights to be gleaned.
In [2726]:
# root of the Enron email corpus
# NOTE(review): hardcoded absolute local path -- not portable
email_corpus_dir = '/Users/excalibur/Dropbox/datasets/maildir/'
In [2727]:
# one directory per person in the corpus
email_dirs = os.listdir(email_corpus_dir)
print "number of email directories: ", len(email_dirs)
print "\nfirst five email directories:\n", email_dirs[:5]
In [2728]:
# concatenate the body text of every email under each person's directory,
# keyed by directory name; the first 15 lines of each file (headers) are dropped
email_text = dict()
for email_dir in email_dirs:
    for dirpath, dirnames, filenames in os.walk(email_corpus_dir + email_dir):
        # os.walk already pairs each dirpath with the files directly inside
        # it; the original joined dirpath with each *subdirectory* name and
        # the parent's filenames, building mostly nonexistent paths (silently
        # skipped by isfile) and missing/duplicating real files
        for filename in filenames:
            path = dirpath + "/" + filename
            if os.path.isfile(path):
                with open(path, 'r') as f:
                    read_data = f.readlines()
                if email_dir not in email_text:
                    email_text[email_dir] = ''.join(read_data[15:])
                else:
                    email_text[email_dir] += ''.join(read_data[15:])
#print email_text['white-s']