In [442]:

    
import sys
from time import time
import pickle
sys.path.append("ud120-projects/tools/")
sys.path.append("ud120-projects/final_project/")
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline



In [443]:

    
from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.grid_search import GridSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report



In [444]:

    
### Load the dictionary containing the dataset
# A context manager closes the file handle as soon as the pickle has been read
# instead of leaving it open until garbage collection.
# NOTE(review): unpickling can execute arbitrary code; only load this file from
# a trusted copy of the project repository.
with open("ud120-projects/final_project/final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)



In [445]:

    
### Baseline cell: runs the unmodified starter workflow end to end with a
### single feature so later cells have a reference score to compare against.

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features

### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
### Note: this binds a second name to the same dict object; it is not a copy.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

clf = GaussianNB()    # Provided to give you a starting point. Try a variety of classifiers.

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

test_classifier(clf, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.

dump_classifier_and_data(clf, my_dataset, features_list)









    



GaussianNB()
	Accuracy: 0.25560	Precision: 0.18481	Recall: 0.79800	F1: 0.30011	F2: 0.47968
	Total predictions: 10000	True positives: 1596	False positives: 7040	False negatives:  404	True negatives:  960



In [446]:

    
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

features_list = my_dataset.itervalues().next().keys()
features_list.sort()
features_list.remove('poi')
features_list.insert(0, 'poi')
features_list.remove('email_address')
print features_list









    



['poi', 'bonus', 'deferral_payments', 'deferred_income', 'director_fees', 'exercised_stock_options', 'expenses', 'from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances', 'long_term_incentive', 'other', 'restricted_stock', 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi', 'to_messages', 'total_payments', 'total_stock_value']



In [447]:

    
### convert dictionary to pandas dataframe

# One row per record: the dict values of my_dataset become the rows and the
# keys inside each record become the columns.
df = pd.DataFrame([entry for entry in my_dataset.itervalues()])
df = df.drop('email_address', axis=1)
# Select and order the columns by features_list ('poi' first).  This expects
# the full list built from the record keys, not the two-item starter list.
df = df[features_list]
#df.dtypes
#df.describe()
#df.count()
# Legacy pandas API.  NOTE(review): presumably this coerces the textual
# entries to numbers and leaves NaN where that fails -- confirm against the
# installed pandas version.
df = df.convert_objects(convert_numeric=True)

# Round every column, one at a time, to three decimals.
for col in list(df.columns):
    df[col] = df[col].round(decimals=3)
    
# Class balance of the label column, then a preview of the first rows.
print "POI Count:\n", df.poi.value_counts()
df.head()









    



POI Count:
False    128
True      18
dtype: int64






    Out[447]:






  
    
      
      poi
      bonus
      deferral_payments
      deferred_income
      director_fees
      exercised_stock_options
      expenses
      from_messages
      from_poi_to_this_person
      from_this_person_to_poi
      loan_advances
      long_term_incentive
      other
      restricted_stock
      restricted_stock_deferred
      salary
      shared_receipt_with_poi
      to_messages
      total_payments
      total_stock_value
    
  
  
    
      0
      False
      600000
      NaN
      NaN
      NaN
      NaN
      94299
      29
      38
      1
      NaN
      NaN
      1740
      585062
      NaN
      365788
      702
      807
      1061827
      585062
    
    
      1
      False
      1200000
      1295738
      -1386055
      NaN
      6680544
      11200
      NaN
      NaN
      NaN
      NaN
      1586055
      2660303
      3942714
      NaN
      267102
      NaN
      NaN
      5634343
      10623258
    
    
      2
      False
      350000
      NaN
      -400729
      NaN
      4890344
      78552
      NaN
      NaN
      NaN
      NaN
      NaN
      12961
      1788391
      NaN
      170941
      NaN
      NaN
      211725
      6678735
    
    
      3
      False
      NaN
      NaN
      NaN
      NaN
      651850
      NaN
      12
      10
      0
      NaN
      NaN
      NaN
      386335
      NaN
      NaN
      58
      764
      NaN
      1038185
    
    
      4
      True
      1500000
      NaN
      -3117011
      NaN
      5538001
      34039
      32
      32
      21
      NaN
      1617011
      11350
      853064
      NaN
      243293
      1035
      1045
      288682
      6391065



In [448]:

    
# create labels
# The 'poi' column as a NumPy array; one entry per DataFrame row.
y = df['poi'].values
print(y.shape)
y[:5]









    



(146,)






    Out[448]:





array([False, False, False, False,  True], dtype=bool)



In [449]:

    
# create initial features
# Every column except the 'poi' label, as a 2-D NumPy array.
X = df[df.columns.drop('poi')].values
print(X.shape)



In [450]:

    
# imputation for 'NaN' values
# Missing entries are replaced by the mean of their column (axis=0).  The
# fitted imputer stays available as `imp`.
# NOTE(review): the means are learned from all rows, before the train/test
# split further down.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X = imp.fit_transform(X)
X









    Out[450]:





array([[  6.00000000e+05,   1.64267415e+06,  -1.14047514e+06, ...,
          8.07000000e+02,   1.06182700e+06,   5.85062000e+05],
       [  1.20000000e+06,   1.29573800e+06,  -1.38605500e+06, ...,
          2.07386047e+03,   5.63434300e+06,   1.06232580e+07],
       [  3.50000000e+05,   1.64267415e+06,  -4.00729000e+05, ...,
          2.07386047e+03,   2.11725000e+05,   6.67873500e+06],
       ..., 
       [  6.00000000e+05,   2.27449000e+05,  -1.14047514e+06, ...,
          5.33000000e+02,   1.09266300e+06,   3.74504800e+06],
       [  8.00000000e+05,   1.64267415e+06,  -3.00000000e+05, ...,
          8.65000000e+02,   8.75760000e+05,   1.08098800e+06],
       [  6.00000000e+05,   1.64267415e+06,  -1.14047514e+06, ...,
          8.73000000e+02,   1.27228400e+06,   7.78546000e+05]])



In [451]:

    
### Task 2: Remove outliers
num_rows = X.shape[0]
num_cols = X.shape[1]
rows_to_remove = set()

for i in xrange(num_cols):
    point_five_percentile = np.percentile(X[:,i], 0.5)
    ninety_nine_point_five_percentile = np.percentile(X[:,i], 99.5)
    
    for j in xrange(num_rows):
        if X[j,i] < point_five_percentile:
            #print "\tlow outlier: ", "row: ", j, "col: ", i, " -> ", X[j,i]
            rows_to_remove.add(j)
        elif X[j,i] > ninety_nine_point_five_percentile:
            #print "\thigh outlier: ", "row: ", j, "col: ", i, " -> ", X[j,i]
            rows_to_remove.add(j)

X = np.delete(X, list(rows_to_remove), axis=0)
y = np.delete(y, list(rows_to_remove))
print "new X shape: ", X.shape
print "\ntotal rows removed: ", len(rows_to_remove), "({})".format(round(len(rows_to_remove)/float(num_rows), 2))









    



new X shape:  (130, 19)

total rows removed:  16 (0.11)



In [452]:

    
# split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print X_train.shape, X_test.shape, y_train.shape, y_test.shape









    



(104, 19) (26, 19) (104,) (26,)



In [453]:

    
### Task 3: Create new feature(s)
# scale
# The scaler is fitted on the training rows only; the same fitted mapping is
# then applied to both partitions.
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)



In [454]:

    
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# pca
#pca = RandomizedPCA().fit(X_train)      
#print [round(x, 4) for x in pca.explained_variance_ratio_]
#X_train = pca.transform(X_train)
#print X_train.shape
#X_test = pca.transform(X_test)
#print X_test.shape



In [455]:

    
# select K best
#selector = SelectKBest().fit(X_train, y_train)
#print selector.scores_
#X_train = selector.transform(X_train)
#print X_train.shape
#X_test = selector.transform(X_test)  
#print X_test.shape



In [472]:

    
# Preview of the parameter grid used by the grid-search cell.  The value lists
# are defined here as well so this cell runs on a fresh kernel; they were
# previously only defined further down, which raised NameError on
# Restart & Run All.
even_range = range(2, X.shape[1], 2)
random_state = [42]
t_or_f = [True, False]

dict(randomizedpca__n_components = even_range,
     randomizedpca__whiten = t_or_f,
     randomizedpca__random_state = random_state,
     selectkbest__k = even_range)









    Out[472]:





{'randomizedpca__n_components': [2, 4, 6, 8, 10, 12, 14, 16, 18],
 'randomizedpca__random_state': 42,
 'randomizedpca__whiten': [True, False],
 'selectkbest__k': [2, 4, 6, 8, 10, 12, 14, 16, 18]}



In [489]:

    
even_range = range(2,X.shape[1],2)
random_state = [42]
t_or_f = [True, False]

best_clf = []

#for clf in [GaussianNB(), SVC(), DecisionTreeClassifier(), KMeans()]:
for clf in [GaussianNB()]:
    
    pipeline = make_pipeline(RandomizedPCA(), SelectKBest(), clf)
    
    #print pipeline
    
    params = dict(randomizedpca__n_components = even_range,
                  randomizedpca__whiten = t_or_f,
                  randomizedpca__random_state = random_state,
                  selectkbest__k = ['all'])
    
    grid_search = GridSearchCV(pipeline, param_grid=params, verbose=1)

    grid_search.fit(X, y)
    
    best_clf = grid_search.best_estimator_

    print "\nbest estimator: ", best_clf, "\n" 
    print "\nbest score: ", grid_search.best_score_, "\n"
    print "\nbest params: ", grid_search.best_params_, "\n"
    
clf = best_clf.fit(X_train, y_train)
test_classifier(clf, my_dataset, features_list)









    



[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.2s






    



Fitting 3 folds for each of 18 candidates, totalling 54 fits

best estimator:  Pipeline(steps=[('randomizedpca', RandomizedPCA(copy=True, iterated_power=3, n_components=2, random_state=42,
       whiten=True)), ('selectkbest', SelectKBest(k='all', score_func=<function f_classif at 0x11381a500>)), ('gaussiannb', GaussianNB())]) 


best score:  0.9 


best params:  {'randomizedpca__whiten': True, 'selectkbest__k': 'all', 'randomizedpca__n_components': 2, 'randomizedpca__random_state': 42} 

Pipeline(steps=[('randomizedpca', RandomizedPCA(copy=True, iterated_power=3, n_components=2, random_state=42,
       whiten=True)), ('selectkbest', SelectKBest(k='all', score_func=<function f_classif at 0x11381a500>)), ('gaussiannb', GaussianNB())])
	Accuracy: 0.87207	Precision: 0.54341	Recall: 0.25350	F1: 0.34572	F2: 0.28378
	Total predictions: 15000	True positives:  507	False positives:  426	False negatives: 1493	True negatives: 12574







    



[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:    0.2s finished

GaussianNB()
     Accuracy: 0.25560     Precision: 0.18481     Recall: 0.79800     F1: 0.30011     F2: 0.47968
     Total predictions: 10000     True positives: 1596     False positives: 7040     False negatives: 404
     True negatives: 960



In [435]:

    
#print("Fitting the classifier to the training set")
#t0 = time()
#param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
#              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
#clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
#clf = clf.fit(X_train_pca, y_train)
#print("done in %0.3fs" % (time() - t0))
##print("Best estimator found by grid search:")
#print(clf.best_estimator_)

# Gaussian naive Bayes with default settings, scored on the held-out split and
# then passed to the project's tester script.
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(clf.score(X_test, y_test))
for metric in (precision_score, recall_score):
    print(metric(y_test, y_pred, average='binary'))
print(classification_report(y_test, y_pred, target_names=['non-poi','poi']))

test_classifier(clf, my_dataset, features_list)









    



0.884615384615
0.0
0.0
             precision    recall  f1-score   support

    non-poi       0.92      0.96      0.94        24
        poi       0.00      0.00      0.00         2

avg / total       0.85      0.88      0.87        26

GaussianNB()
	Accuracy: 0.33620	Precision: 0.14720	Recall: 0.83000	F1: 0.25006	F2: 0.43056
	Total predictions: 15000	True positives: 1660	False positives: 9617	False negatives:  340	True negatives: 3383



In [418]:

    
# Support-vector classifier with default settings, scored on the held-out
# split and then passed to the project's tester script.
clf = SVC().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(clf.score(X_test, y_test))
for metric in (precision_score, recall_score):
    print(metric(y_test, y_pred, average='binary'))
print(classification_report(y_test, y_pred, target_names=['non-poi','poi']))

test_classifier(clf, my_dataset, features_list)









    



0.923076923077
0.0
0.0
             precision    recall  f1-score   support

    non-poi       0.92      1.00      0.96        24
        poi       0.00      0.00      0.00         2

avg / total       0.85      0.92      0.89        26

Got a divide by zero when trying out: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)






    



/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:958: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:958: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)



In [419]:

    
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print clf.score(X_test, y_test)
print precision_score(y_test, y_pred, average='binary')
print recall_score(y_test, y_pred, average='binary')
print classification_report(y_test, y_pred, target_names=['non-poi','poi'])

test_classifier(clf, my_dataset, features_list)









    



0.846153846154
0.25
0.5
             precision    recall  f1-score   support

    non-poi       0.95      0.88      0.91        24
        poi       0.25      0.50      0.33         2

avg / total       0.90      0.85      0.87        26

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')
	Accuracy: 0.78920	Precision: 0.20113	Recall: 0.19550	F1: 0.19828	F2: 0.19660
	Total predictions: 15000	True positives:  391	False positives: 1553	False negatives: 1609	True negatives: 11447



In [420]:

    
clf = KMeans()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print clf.score(X_test, y_test)
print precision_score(y_test, y_pred, average='macro')
print recall_score(y_test, y_pred, average='macro')
#print classification_report(y_test, y_pred, target_names=['non-poi','poi'])

test_classifier(clf, my_dataset, features_list)









    



-4.15617151539
0.288461538462
0.34375
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=8, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
	Accuracy: 0.93740	Precision: 0.97610	Recall: 0.91248	F1: 0.94322	F2: 0.92453
	Total predictions: 15000	True positives: 7799	False positives:  191	False negatives:  748	True negatives: 6262







    



/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:960: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)



In [ ]:



In [ ]:

    
###############################################################################
# Train a SVM classification model
# Grid-search C and gamma for an RBF-kernel SVC on the training partition and
# report the winning estimator together with the elapsed time.

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
# X_train_pca is never defined in this notebook (the PCA cell is commented
# out), so the search runs on X_train.
clf = clf.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
# http://scikit-learn.org/stable/auto_examples/feature_stacker.html
# Stack PCA components and univariately selected features side by side.
# FeatureUnion must be imported from sklearn.pipeline in the import cell.
pca = RandomizedPCA(random_state=42)   # seeded so the projection is repeatable
selection = SelectKBest()

combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Fit on the training rows only, then apply the fitted union to both
# partitions.  NOTE: X_train / X_test are rebound to the stacked features, so
# re-running this cell stacks features on top of already-stacked features.
X_train = combined_features.fit(X_train, y_train).transform(X_train)
X_test = combined_features.transform(X_test)



In [ ]:

    
# Display the repr of the combined_features object.
combined_features



In [ ]:

    
# Display the current dimensions of the training matrix.
X_train.shape



In [ ]:

    
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test), precision_score(y_test, y_pred), recall_score(y_test, y_pred)



In [ ]:

    
clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test), precision_score(y_test, y_pred), recall_score(y_test, y_pred)



In [ ]:

    
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test), precision_score(y_test, y_pred), recall_score(y_test, y_pred)



In [ ]:

    
clf = KMeans()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test), precision_score(y_test, y_pred), recall_score(y_test, y_pred)



In [ ]:

    
# Grid-search the number of PCA components and the number of univariately
# selected features feeding a Gaussian naive Bayes classifier.
# Pipeline must be imported from sklearn.pipeline in the import cell.
svm = SVC()
gnb = GaussianNB()   # was commented out although the pipeline below uses it
#tree = DecisionTreeClassifier()
#kmeans = KMeans()

# Do grid search over k and n_components:

pipeline = Pipeline([("features", combined_features), ("gnb", gnb)])

# Candidate counts 1 .. (number of columns - 1), the same values the original
# comprehension produced.
candidate_counts = range(1, X.shape[1])

# The closing parenthesis of this call used to sit inside a trailing comment,
# which left the call unterminated and made the whole cell a SyntaxError.
param_grid = dict(features__pca__n_components=candidate_counts,
                  features__univ_select__k=candidate_counts)

# NOTE(review): the search is fitted on the full X / y, including the rows
# held out as X_test.
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)



In [ ]:

    
print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
         'C': [1e3, 5e3, 1e4, 5e4, 1e5],
          'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
          }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print "training time: ", round(time()-t0,3), "s"
print "Best estimator found by grid search:"
print clf.best_estimator_



In [ ]:

    
# Candidate estimators; only `svm` is wired into the pipeline in this cell.
svm = SVC(kernel='rbf', C=10000.0)
gnb = GaussianNB()
tree = DecisionTreeClassifier(min_samples_split=40)
kmeans = KMeans(n_clusters=2)

# Grid search over the number of selected features (k), the number of PCA
# components and the SVC penalty C.
pipeline = Pipeline([("features", combined_features), ("svm", svm)])

param_grid = {
    'features__pca__n_components': [1, 2, 3],
    'features__univ_select__k': [1, 2],
    'svm__C': [0.1, 1, 10],
}

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10).fit(X, y)
print(grid_search.best_estimator_)



In [ ]:



In [ ]:

    
# Leading slice of the columns for a small pair plot.  Despite the name, this
# is the first sixth of the columns, not the first half.
col_first_half = list(df.columns[0:(len(df.columns)//6)])
# 'poi' is needed as the hue column.  It is already the first column of df, so
# appending it unconditionally listed it twice and df[col_first_half] then
# returned a duplicated 'poi' column.
if 'poi' not in col_first_half:
    col_first_half.append('poi')



In [ ]:

    
# Trailing half of the column labels, starting at the midpoint index.
midpoint = len(df.columns) // 2
col_second_half = df.columns[midpoint:]



In [ ]:

    
# Pair plot of the columns listed in col_first_half, colored by the 'poi' column.
sns.pairplot(df[col_first_half], hue="poi")



In [ ]:

    
### Task 2: Remove outliers
for entry in my_dataset.itervalues():
    for feature in features_list:
        print entry

	poi	bonus	deferral_payments	deferred_income	director_fees	exercised_stock_options	expenses	from_messages	from_poi_to_this_person	from_this_person_to_poi	loan_advances	long_term_incentive	other	restricted_stock	restricted_stock_deferred	salary	shared_receipt_with_poi	to_messages	total_payments	total_stock_value
0	False	600000	NaN	NaN	NaN	NaN	94299	29	38	1	NaN	NaN	1740	585062	NaN	365788	702	807	1061827	585062
1	False	1200000	1295738	-1386055	NaN	6680544	11200	NaN	NaN	NaN	NaN	1586055	2660303	3942714	NaN	267102	NaN	NaN	5634343	10623258
2	False	350000	NaN	-400729	NaN	4890344	78552	NaN	NaN	NaN	NaN	NaN	12961	1788391	NaN	170941	NaN	NaN	211725	6678735
3	False	NaN	NaN	NaN	NaN	651850	NaN	12	10	0	NaN	NaN	NaN	386335	NaN	NaN	58	764	NaN	1038185
4	True	1500000	NaN	-3117011	NaN	5538001	34039	32	32	21	NaN	1617011	11350	853064	NaN	243293	1035	1045	288682	6391065