In [65]:
#!/usr/bin/python
import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
In [66]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
data_dict = pickle.load(data_file)
In [67]:
# Remove outliers
data_dict.pop('TOTAL')
data_dict.pop('THE TRAVEL AGENCY IN THE PARK') # Not a name
data_dict.pop('LOCKHART EUGENE E') # All data is NaN
my_dataset = data_dict
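One way such outliers can be spotted, as a minimal sketch (assumes matplotlib is available; run it before the pops above to see 'TOTAL', the spreadsheet sum row, as the lone extreme point):

import matplotlib.pyplot as plt
for salary, bonus in featureFormat(my_dataset, ['salary', 'bonus']):
    plt.scatter(salary, bonus)
plt.xlabel('salary')
plt.ylabel('bonus')
plt.show()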
In [68]:
# helper: fraction of a person's messages exchanged with a POI
def computeFraction(poi_messages, all_messages):
    """ Given the number of messages to/from a POI (numerator) and the
        number of all messages to/from a person (denominator), return
        the fraction of that person's messages involving a POI.
    """
    # 'NaN' marks a missing email feature (e.g. no known email address);
    # return 0 in that case, and force float division otherwise
    if poi_messages == 'NaN' or all_messages == 'NaN':
        fraction = 0
    else:
        fraction = poi_messages * 1.0 / all_messages
    return fraction
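A quick sanity check of the helper with illustrative values (not taken from the dataset):

assert computeFraction(10, 100) == 0.1     # float division, not integer
assert computeFraction('NaN', 100) == 0    # missing numerator
assert computeFraction(10, 'NaN') == 0     # missing denominator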
In [69]:
# add the two new fraction features to every record in the dataset
for name in my_dataset:
    data_point = my_dataset[name]

    from_poi_to_this_person = data_point["from_poi_to_this_person"]
    to_messages = data_point["to_messages"]
    data_point["fraction_from_poi"] = computeFraction(from_poi_to_this_person, to_messages)

    from_this_person_to_poi = data_point["from_this_person_to_poi"]
    from_messages = data_point["from_messages"]
    data_point["fraction_to_poi"] = computeFraction(from_this_person_to_poi, from_messages)
In [70]:
print 'Total # of people:', len(my_dataset)
In [71]:
print 'Total # of features:', len(my_dataset[my_dataset.keys()[0]])
In [72]:
# count POIs and non-POIs ('poi' is a boolean label)
positive = 0
negative = 0
for name in my_dataset:
    if my_dataset[name]['poi']:
        positive += 1
    else:
        negative += 1
print 'poi: ', positive
print 'non-poi: ', negative
In [73]:
# count 'NaN' values per feature
from collections import defaultdict
na_count = defaultdict(int)
for name in my_dataset:
    for feature in my_dataset[name]:
        if my_dataset[name][feature] == 'NaN':
            na_count[feature] += 1
print dict(na_count)
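Features riddled with 'NaN' are weak candidates for the model; ranking the counts makes the comparison easier to read:

# rank features by missing-value count, most missing first
for feature, count in sorted(na_count.items(), key=lambda kv: -kv[1]):
    print feature, count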
In [74]:
# test_classifier (adapted from tester.py): StratifiedShuffleSplit-based
# evaluation that accumulates a confusion matrix over many small folds
from sklearn.cross_validation import StratifiedShuffleSplit

PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
\tFalse negatives: {:4d}\tTrue negatives: {:4d}"

def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier on the training set, test on the test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives,
                                           false_negatives, true_negatives)
        print ""
    except ZeroDivisionError:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predictions."
In [75]:
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# first pass: all numeric features, including the two engineered fractions
# (the 'poi' label must come first)
features_list = ['poi', 'salary', 'deferral_payments', 'total_payments',
                 'loan_advances', 'bonus', 'restricted_stock_deferred',
                 'deferred_income', 'total_stock_value', 'expenses',
                 'exercised_stock_options', 'other', 'long_term_incentive',
                 'restricted_stock', 'director_fees', 'to_messages',
                 'from_poi_to_this_person', 'from_messages',
                 'from_this_person_to_poi', 'shared_receipt_with_poi',
                 'fraction_from_poi', 'fraction_to_poi']
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
In [76]:
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
              'min_samples_leaf': [1, 2, 5, 8, 10]}
cv = StratifiedShuffleSplit(labels, 100, random_state=42)
clf_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=cv, scoring='f1')
In [77]:
from time import time
t0=time()
clf_grid.fit(features,labels)
print "Fitting done in %0.3fs" % (time() - t0)
In [78]:
print "Best estimator found by grid search:"
print clf_grid.best_estimator_
clf=clf_grid.best_estimator_
test_classifier(clf, my_dataset, features_list, folds = 1000)
clf.feature_importances_
Out[78]:
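The raw feature_importances_ array is hard to read on its own; pairing each value with its feature name (features_list[0] is the 'poi' label, so it is skipped) shows which features drive the tree and motivates the pruned lists that follow:

for feature, importance in zip(features_list[1:], clf.feature_importances_):
    if importance > 0:
        print feature, round(importance, 4)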
In [79]:
# second pass: prune to the features that showed importance in the previous run
features_list = ['poi', 'total_stock_value', 'expenses', 'exercised_stock_options',
                 'restricted_stock', 'from_messages', 'fraction_to_poi']
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
In [80]:
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
              'min_samples_leaf': [1, 2, 5, 8, 10]}
cv = StratifiedShuffleSplit(labels, 100, random_state=42)
clf_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=cv, scoring='f1')
In [81]:
from time import time
t0=time()
clf_grid.fit(features,labels)
print "Fitting done in %0.3fs" % (time() - t0)
In [82]:
print "Best estimator found by grid search:"
print clf_grid.best_estimator_
clf=clf_grid.best_estimator_
test_classifier(clf, my_dataset, features_list, folds = 1000)
clf.feature_importances_
Out[82]:
In [83]:
# third pass: keep only the strongest three features
features_list = ['poi', 'expenses', 'exercised_stock_options', 'fraction_to_poi']
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
In [84]:
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
              'min_samples_leaf': [1, 2, 5, 8, 10]}
cv = StratifiedShuffleSplit(labels, 100, random_state=42)
clf_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=cv, scoring='f1')
In [85]:
from time import time
t0=time()
clf_grid.fit(features,labels)
print "Fitting done in %0.3fs" % (time() - t0)
In [86]:
print "Best estimator found by grid search:"
print clf_grid.best_estimator_
clf=clf_grid.best_estimator_
test_classifier(clf, my_dataset, features_list, folds = 1000)
clf.feature_importances_
Out[86]:
In [87]:
from sklearn.preprocessing import MinMaxScaler
# repeat the six-feature run with min-max scaling; note that decision-tree
# splits are scale-invariant, and test_classifier re-extracts unscaled
# features from my_dataset, so scaling here only affects the grid search
features_list = ['poi', 'total_stock_value', 'expenses', 'exercised_stock_options',
                 'restricted_stock', 'from_messages', 'fraction_to_poi']
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
features = MinMaxScaler().fit_transform(features)
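For reference, MinMaxScaler rescales each column to [0, 1] via (x - min) / (max - min); a tiny standalone demo with made-up values:

import numpy as np
demo = np.array([[1.0], [3.0], [5.0]])
print MinMaxScaler().fit_transform(demo)    # [[ 0. ], [ 0.5], [ 1. ]]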
In [88]:
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
              'min_samples_leaf': [1, 2, 5, 8, 10]}
cv = StratifiedShuffleSplit(labels, 100, random_state=42)
clf_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=cv, scoring='f1')
In [89]:
from time import time
t0=time()
clf_grid.fit(features,labels)
print "Fitting done in %0.3fs" % (time() - t0)
In [90]:
print "Best estimator found by grid search:"
print clf_grid.best_estimator_
clf=clf_grid.best_estimator_
test_classifier(clf, my_dataset, features_list, folds = 1000)
clf.feature_importances_
Out[90]:
In [91]:
# same features minus the engineered fraction_to_poi, to gauge its contribution
features_list = ['poi', 'total_stock_value', 'expenses', 'exercised_stock_options',
                 'restricted_stock', 'from_messages']
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
features = MinMaxScaler().fit_transform(features)
In [92]:
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
              'min_samples_leaf': [1, 2, 5, 8, 10]}
cv = StratifiedShuffleSplit(labels, 100, random_state=42)
clf_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=cv, scoring='f1')
In [93]:
from time import time
t0=time()
clf_grid.fit(features,labels)
print "Fitting done in %0.3fs" % (time() - t0)
In [94]:
print "Best estimator found by grid search:"
print clf_grid.best_estimator_
clf=clf_grid.best_estimator_
test_classifier(clf, my_dataset, features_list, folds = 1000)
clf.feature_importances_
Out[94]:
In [95]:
# baseline comparison: GaussianNB on the same six features; test_classifier
# refits the classifier inside every CV fold, so no separate fit is needed here
from sklearn.naive_bayes import GaussianNB
features_list = ['poi', 'total_stock_value', 'expenses', 'exercised_stock_options',
                 'restricted_stock', 'from_messages', 'fraction_to_poi']
clf = GaussianNB()
test_classifier(clf, my_dataset, features_list, folds=1000)