In [398]:
import sys
from time import time
import pickle
sys.path.append("ud120-projects/tools/")
sys.path.append("ud120-projects/final_project/")
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
In [399]:
from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
In [400]:
### Load the dictionary containing the dataset
data_dict = pickle.load(open("ud120-projects/final_project/final_project_dataset.pkl", "rb"))
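For context, the pickle deserializes to a plain dict keyed by person name, with each value itself a dict of features, as the inspection cells further down confirm. A quick sanity check (a minimal sketch, assuming the standard ud120 dataset layout):

print "number of people:", len(data_dict)
print "features per person:", len(data_dict.itervalues().next())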
In [401]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features
### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
clf = GaussianNB() # Provided to give you a starting point. Try a variety of classifiers.
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation (a simplified sketch of that evaluation
### loop follows this cell). For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
test_classifier(clf, my_dataset, features_list)
### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(clf, my_dataset, features_list)
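To make the referenced evaluation concrete: tester.py pools confusion-matrix counts over many stratified shuffle splits and derives precision and recall from the totals. A simplified sketch of that loop (not the verbatim tester.py code; the fold count and names are illustrative):

from sklearn.cross_validation import StratifiedShuffleSplit

def stratified_eval_sketch(clf, features, labels, folds=100):
    # every fold keeps the POI/non-POI ratio; counts are pooled across folds
    features, labels = np.asarray(features), np.asarray(labels)
    true_pos = false_pos = false_neg = 0
    for train_idx, test_idx in StratifiedShuffleSplit(labels, folds, random_state=42):
        clf.fit(features[train_idx], labels[train_idx])
        for truth, pred in zip(labels[test_idx], clf.predict(features[test_idx])):
            true_pos += int(truth == 1 and pred == 1)
            false_pos += int(truth == 0 and pred == 1)
            false_neg += int(truth == 1 and pred == 0)
    if true_pos + false_pos == 0 or true_pos + false_neg == 0:
        print "no positive predictions or labels; precision/recall undefined"
        return
    print "precision:", true_pos / float(true_pos + false_pos)
    print "recall:   ", true_pos / float(true_pos + false_neg)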
In [402]:
clf
Out[402]:
In [403]:
print my_dataset.keys()[0]
my_dataset.itervalues().next()
Out[403]:
In [404]:
features_list
Out[404]:
In [405]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
names = np.array(my_dataset.keys())
print names.shape, names[:5], "\n"
# derive the full feature list from the first record, then force 'poi' to the front
features_list = my_dataset.itervalues().next().keys()
features_list.sort()
features_list.remove('poi')
features_list.insert(0, 'poi')
features_list.remove('email_address')  # a string, not a usable numeric feature
print features_list
In [406]:
### convert dictionary to pandas dataframe
df = pd.DataFrame([entry for entry in my_dataset.itervalues()])
df = df.drop('email_address', axis=1)
df = df[features_list]
#df.dtypes
#df.describe()
#df.count()
df.poi = df.poi.astype('int')
# coerce string values (including the literal 'NaN' placeholders) to numeric dtypes
df = df.convert_objects(convert_numeric=True)
for col in list(df.columns):
    df[col] = df[col].round(decimals=3)
print "POI Count:\n", df.poi.value_counts()
df.head()
Out[406]:
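Before imputing anything, it is worth seeing how sparse each column is; once convert_objects has turned the 'NaN' strings into real NaNs (as above), a one-liner suffices (a sketch):

# count missing values per column
print df.isnull().sum()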
In [407]:
# create labels
y = df.poi.values
print y.shape
print y[:5]
In [408]:
# create initial features
X = df.drop('poi', axis=1).values
print X.shape
In [409]:
# imputation for 'NaN' values
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
X = imp.transform(X)
print X[:5]
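Mean imputation is sensitive to the heavy right tails of these financial features; a median-based variant is a one-line change (a sketch only; whether it helps here is untested):

# hypothetical alternative: median is robust to the extreme salaries/bonuses
imp_median = Imputer(missing_values='NaN', strategy='median', axis=0)
X_alt = imp_median.fit_transform(df.drop('poi', axis=1).values)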
In [410]:
### Task 2: Remove outliers
num_rows = X.shape[0]
num_cols = X.shape[1]
rows_to_remove = set()
# flag any row that falls outside the 0.5th-99.5th percentile band in any column
for i in xrange(num_cols):
    point_five_percentile = np.percentile(X[:,i], 0.5)
    ninety_nine_point_five_percentile = np.percentile(X[:,i], 99.5)
    for j in xrange(num_rows):
        if X[j,i] < point_five_percentile:
            #print "\tlow outlier: ", "row: ", j, "col: ", i, " -> ", X[j,i]
            rows_to_remove.add(j)
        elif X[j,i] > ninety_nine_point_five_percentile:
            #print "\thigh outlier: ", "row: ", j, "col: ", i, " -> ", X[j,i]
            rows_to_remove.add(j)
X = np.delete(X, list(rows_to_remove), axis=0)
y = np.delete(y, list(rows_to_remove))
print "names associated with outlier-containing rows to remove:"
for i in rows_to_remove:
    print "\t", names[i]
names = np.delete(names, list(rows_to_remove))
print "\nnew X shape: ", X.shape
print "\ntotal rows removed: ", len(rows_to_remove), "({} of all rows)".format(round(len(rows_to_remove)/float(num_rows), 2))
In [411]:
# split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print X_train.shape, X_test.shape, y_train.shape, y_test.shape
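With only a handful of POIs, a plain shuffle can leave the test set with almost no positive examples. A stratified holdout preserves the class ratio; a sketch using the same (era-appropriate) cross_validation module:

from sklearn.cross_validation import StratifiedShuffleSplit
# one stratified 80/20 split; indices come back as arrays
for train_idx, test_idx in StratifiedShuffleSplit(y, n_iter=1, test_size=0.20, random_state=42):
    X_train_s, X_test_s = X[train_idx], X[test_idx]
    y_train_s, y_test_s = y[train_idx], y[test_idx]
print y_train_s.mean(), y_test_s.mean()  # POI fraction should now match across splits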
In [412]:
### Task 3: Create new feature(s)
# this cell covers feature scaling; a sketch of an engineered feature follows after it
# fit MinMaxScaler on the training split only, then apply it to both splits to avoid leakage
scaler = MinMaxScaler()
scaler = scaler.fit(X_train)
X_train = scaler.transform(X_train)
print X_train.shape
X_test = scaler.transform(X_test)
print X_test.shape
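As for actually engineering a feature, a common choice in this project is the fraction of each person's messages exchanged with POIs, computed on the raw dict before featureFormat runs. A sketch (the field names are the dataset's own; fraction_of is a hypothetical helper, and the new keys would still need appending to features_list):

def fraction_of(numerator, denominator):
    # the raw dict stores missing values as the string 'NaN'
    if numerator == 'NaN' or denominator == 'NaN' or denominator == 0:
        return 0.
    return numerator / float(denominator)

for person in my_dataset.itervalues():
    person['fraction_from_poi'] = fraction_of(person['from_poi_to_this_person'], person['to_messages'])
    person['fraction_to_poi'] = fraction_of(person['from_this_person_to_poi'], person['from_messages'])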
In [413]:
X_train
Out[413]:
In [414]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
classifiers = dict()

def grid_searcher(clf):
    # TODO: fold the MinMaxScaler into this pipeline as a first step
    t0 = time()
    # candidate hyperparameter values for the grid
    even_range = range(2, X.shape[1], 2)
    random_state = [42]
    t_or_f = [True, False]
    #powers_of_ten = [10**x for x in range(-5,5)]
    logspace = np.logspace(-5, 5, 10)
    #kernels = ['linear', 'poly', 'rbf', 'sigmoid'] # takes too long, unfortunately
    kernels = ['rbf']
    criteria = ['gini', 'entropy']
    splitters = ['best', 'random']
    max_features = ['auto', 'sqrt', 'log2', None]
    inits = ['k-means++', 'random']
    # PCA, then SelectKBest, then the classifier under test
    pipeline = make_pipeline(RandomizedPCA(), SelectKBest(), clf)
    params = dict(randomizedpca__n_components = even_range,
                  randomizedpca__whiten = t_or_f,
                  randomizedpca__random_state = random_state,
                  selectkbest__k = ['all'])
    # add classifier-specific parameters, keyed by the step name make_pipeline assigns
    if pipeline.steps[2][0] == 'decisiontreeclassifier':
        params['decisiontreeclassifier__criterion'] = criteria
        params['decisiontreeclassifier__splitter'] = splitters
        params['decisiontreeclassifier__max_features'] = max_features
        params['decisiontreeclassifier__random_state'] = random_state
    if pipeline.steps[2][0] == 'svc':
        params['svc__C'] = logspace
        params['svc__kernel'] = kernels
        #params['svc__degree'] = [1,2,3,4,5] # for use with 'poly'
        params['svc__gamma'] = logspace
        params['svc__random_state'] = random_state
    if pipeline.steps[2][0] == 'kmeans':
        params['kmeans__n_clusters'] = [2]
        params['kmeans__init'] = inits
        params['kmeans__random_state'] = random_state
    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=4)
    grid_search = grid_search.fit(X_train, y_train)
    print "*"*15, pipeline.steps[2][0].upper(), "*"*15
    #print "\nbest estimator: ", grid_search.best_estimator_, "\n"
    print "\nBEST SCORE: ", grid_search.best_score_, "\n"
    #print "\nbest params: ", grid_search.best_params_, "\n"
    #print "#"*50
    print "\nBEST ESTIMATOR:"
    # refit the winning pipeline (redundant with GridSearchCV's refit=True, but harmless)
    clf = grid_search.best_estimator_.fit(X_train, y_train)
    #classifiers[pipeline.steps[2][0]] = clf
    # push X_test through the fitted PCA and SelectKBest steps by hand
    X_test_pca = clf.steps[0][1].transform(X_test)
    X_test_skb = clf.steps[1][1].transform(X_test_pca)
    print "new X_test shape: ", X_test_skb.shape
    #print "#"*50
    print "\nPREDICTIONS:"
    #test_classifier(clf, my_dataset, features_list)
    print "\nground truth:\n", y_test
    y_pred = clf.steps[2][1].predict(X_test_skb)
    print "\npredictions:\n", y_pred
    #print "#"*50
    print "\nEVALUATIONS:"
    print "\nconfusion matrix:\n", confusion_matrix(y_test, y_pred)
    print "\nclassification report:\n", classification_report(y_test, y_pred, target_names=["non-poi", "poi"])
    print "ELAPSED TIME: ", round(time()-t0, 3), "s"
In [415]:
grid_searcher(GaussianNB())
In [416]:
grid_searcher(DecisionTreeClassifier())
In [417]:
grid_searcher(SVC())
In [418]:
grid_searcher(KMeans())
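The precision_score and recall_score imports above are never used. To compare the four searches against the project's 0.3 bar on the local holdout, something like the following works, assuming grid_searcher is modified to end with return clf (hypothetical; as written it returns None):

best_clf = grid_searcher(DecisionTreeClassifier())
y_pred = best_clf.predict(X_test)  # Pipeline.predict chains PCA, SelectKBest, and the classifier
print "precision:", precision_score(y_test, y_pred)
print "recall:   ", recall_score(y_test, y_pred)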