In [398]:
import sys
from time import time
import pickle
sys.path.append("ud120-projects/tools/")
sys.path.append("ud120-projects/final_project/")
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
In [399]:
from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
In [400]:
### Load the dictionary containing the dataset
data_dict = pickle.load(open("ud120-projects/final_project/final_project_dataset.pkl", "rb"))
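For context, the pickle deserializes to a plain dict keyed by person name, with each value itself a dict of features, as the inspection cells further down confirm. A quick sanity check (a minimal sketch, assuming the standard ud120 dataset layout):

print "number of people:", len(data_dict)
print "features per person:", len(data_dict.itervalues().next())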
In [401]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features
### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
clf = GaussianNB() # Provided to give you a starting point. Try a variety of classifiers.
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation (a simplified sketch of that evaluation
### loop follows this cell). For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
test_classifier(clf, my_dataset, features_list)
### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(clf, my_dataset, features_list)
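To make the referenced evaluation concrete: tester.py pools confusion-matrix counts over many stratified shuffle splits and derives precision and recall from the totals. A simplified sketch of that loop (not the verbatim tester.py code; the fold count and names are illustrative):

from sklearn.cross_validation import StratifiedShuffleSplit

def stratified_eval_sketch(clf, features, labels, folds=100):
    # every fold keeps the POI/non-POI ratio; counts are pooled across folds
    features, labels = np.asarray(features), np.asarray(labels)
    true_pos = false_pos = false_neg = 0
    for train_idx, test_idx in StratifiedShuffleSplit(labels, folds, random_state=42):
        clf.fit(features[train_idx], labels[train_idx])
        for truth, pred in zip(labels[test_idx], clf.predict(features[test_idx])):
            true_pos += int(truth == 1 and pred == 1)
            false_pos += int(truth == 0 and pred == 1)
            false_neg += int(truth == 1 and pred == 0)
    if true_pos + false_pos == 0 or true_pos + false_neg == 0:
        print "no positive predictions or labels; precision/recall undefined"
        return
    print "precision:", true_pos / float(true_pos + false_pos)
    print "recall:   ", true_pos / float(true_pos + false_neg)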
In [402]:
clf
Out[402]:
In [403]:
print my_dataset.keys()[0]
my_dataset.itervalues().next()
Out[403]:
In [404]:
features_list
Out[404]:
In [405]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
names = np.array(my_dataset.keys())
print names.shape, names[:5], "\n"
# derive the full feature list from the first record, then force 'poi' to the front
features_list = my_dataset.itervalues().next().keys()
features_list.sort()
features_list.remove('poi')
features_list.insert(0, 'poi')
features_list.remove('email_address')  # a string, not a usable numeric feature
print features_list
In [406]:
### convert dictionary to pandas dataframe
df = pd.DataFrame([entry for entry in my_dataset.itervalues()])
df = df.drop('email_address', axis=1)
df = df[features_list]
#df.dtypes
#df.describe()
#df.count()
df.poi = df.poi.astype('int')
# coerce string values (including the literal 'NaN' placeholders) to numeric dtypes
df = df.convert_objects(convert_numeric=True)
for col in list(df.columns):
    df[col] = df[col].round(decimals=3)
print "POI Count:\n", df.poi.value_counts()
df.head()
Out[406]:
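Before imputing anything, it is worth seeing how sparse each column is; once convert_objects has turned the 'NaN' strings into real NaNs (as above), a one-liner suffices (a sketch):

# count missing values per column
print df.isnull().sum()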
In [407]:
# create labels
y = df.poi.values
print y.shape
print y[:5]
In [408]:
# create initial features
X = df.drop('poi', axis=1).values
print X.shape
In [409]:
# imputation for 'NaN' values
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
X = imp.transform(X)
print X[:5]
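Mean imputation is sensitive to the heavy right tails of these financial features; a median-based variant is a one-line change (a sketch only; whether it helps here is untested):

# hypothetical alternative: median is robust to the extreme salaries/bonuses
imp_median = Imputer(missing_values='NaN', strategy='median', axis=0)
X_alt = imp_median.fit_transform(df.drop('poi', axis=1).values)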
In [410]:
### Task 2: Remove outliers
num_rows = X.shape[0]
num_cols = X.shape[1]
rows_to_remove = set()
# flag any row that falls outside the 0.5th-99.5th percentile band in any column
for i in xrange(num_cols):
    point_five_percentile = np.percentile(X[:,i], 0.5)
    ninety_nine_point_five_percentile = np.percentile(X[:,i], 99.5)
    for j in xrange(num_rows):
        if X[j,i] < point_five_percentile:
            #print "\tlow outlier: ", "row: ", j, "col: ", i, " -> ", X[j,i]
            rows_to_remove.add(j)
        elif X[j,i] > ninety_nine_point_five_percentile:
            #print "\thigh outlier: ", "row: ", j, "col: ", i, " -> ", X[j,i]
            rows_to_remove.add(j)
X = np.delete(X, list(rows_to_remove), axis=0)
y = np.delete(y, list(rows_to_remove))
print "names associated with outlier-containing rows to remove:"
for i in rows_to_remove:
    print "\t", names[i]
names = np.delete(names, list(rows_to_remove))
print "\nnew X shape: ", X.shape
print "\ntotal rows removed: ", len(rows_to_remove), "({} of all rows)".format(round(len(rows_to_remove)/float(num_rows), 2))
In [411]:
# split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print X_train.shape, X_test.shape, y_train.shape, y_test.shape
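With only a handful of POIs, a plain shuffle can leave the test set with almost no positive examples. A stratified holdout preserves the class ratio; a sketch using the same (era-appropriate) cross_validation module:

from sklearn.cross_validation import StratifiedShuffleSplit
# one stratified 80/20 split; indices come back as arrays
for train_idx, test_idx in StratifiedShuffleSplit(y, n_iter=1, test_size=0.20, random_state=42):
    X_train_s, X_test_s = X[train_idx], X[test_idx]
    y_train_s, y_test_s = y[train_idx], y[test_idx]
print y_train_s.mean(), y_test_s.mean()  # POI fraction should now match across splits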
In [412]:
### Task 3: Create new feature(s)
# this cell covers feature scaling; a sketch of an engineered feature follows after it
# fit MinMaxScaler on the training split only, then apply it to both splits to avoid leakage
scaler = MinMaxScaler()
scaler = scaler.fit(X_train)
X_train = scaler.transform(X_train)
print X_train.shape
X_test = scaler.transform(X_test)
print X_test.shape
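As for actually engineering a feature, a common choice in this project is the fraction of each person's messages exchanged with POIs, computed on the raw dict before featureFormat runs. A sketch (the field names are the dataset's own; fraction_of is a hypothetical helper, and the new keys would still need appending to features_list):

def fraction_of(numerator, denominator):
    # the raw dict stores missing values as the string 'NaN'
    if numerator == 'NaN' or denominator == 'NaN' or denominator == 0:
        return 0.
    return numerator / float(denominator)

for person in my_dataset.itervalues():
    person['fraction_from_poi'] = fraction_of(person['from_poi_to_this_person'], person['to_messages'])
    person['fraction_to_poi'] = fraction_of(person['from_this_person_to_poi'], person['from_messages'])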
In [413]:
X_train
Out[413]:
In [414]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
classifiers = dict()

def grid_searcher(clf):
    # TODO: fold the MinMaxScaler into this pipeline as a first step
    t0 = time()
    # candidate hyperparameter values for the grid
    even_range = range(2, X.shape[1], 2)
    random_state = [42]
    t_or_f = [True, False]
    #powers_of_ten = [10**x for x in range(-5,5)]
    logspace = np.logspace(-5, 5, 10)
    #kernels = ['linear', 'poly', 'rbf', 'sigmoid'] # takes too long, unfortunately
    kernels = ['rbf']
    criteria = ['gini', 'entropy']
    splitters = ['best', 'random']
    max_features = ['auto', 'sqrt', 'log2', None]
    inits = ['k-means++', 'random']
    # PCA, then SelectKBest, then the classifier under test
    pipeline = make_pipeline(RandomizedPCA(), SelectKBest(), clf)
    params = dict(randomizedpca__n_components = even_range,
                  randomizedpca__whiten = t_or_f,
                  randomizedpca__random_state = random_state,
                  selectkbest__k = ['all'])
    # add classifier-specific parameters, keyed by the step name make_pipeline assigns
    if pipeline.steps[2][0] == 'decisiontreeclassifier':
        params['decisiontreeclassifier__criterion'] = criteria
        params['decisiontreeclassifier__splitter'] = splitters
        params['decisiontreeclassifier__max_features'] = max_features
        params['decisiontreeclassifier__random_state'] = random_state
    if pipeline.steps[2][0] == 'svc':
        params['svc__C'] = logspace
        params['svc__kernel'] = kernels
        #params['svc__degree'] = [1,2,3,4,5] # for use with 'poly'
        params['svc__gamma'] = logspace
        params['svc__random_state'] = random_state
    if pipeline.steps[2][0] == 'kmeans':
        params['kmeans__n_clusters'] = [2]
        params['kmeans__init'] = inits
        params['kmeans__random_state'] = random_state
    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=4)
    grid_search = grid_search.fit(X_train, y_train)
    print "*"*15, pipeline.steps[2][0].upper(), "*"*15
    #print "\nbest estimator: ", grid_search.best_estimator_, "\n"
    print "\nBEST SCORE: ", grid_search.best_score_, "\n"
    #print "\nbest params: ", grid_search.best_params_, "\n"
    #print "#"*50
    print "\nBEST ESTIMATOR:"
    # refit the winning pipeline (redundant with GridSearchCV's refit=True, but harmless)
    clf = grid_search.best_estimator_.fit(X_train, y_train)
    #classifiers[pipeline.steps[2][0]] = clf
    # push X_test through the fitted PCA and SelectKBest steps by hand
    X_test_pca = clf.steps[0][1].transform(X_test)
    X_test_skb = clf.steps[1][1].transform(X_test_pca)
    print "new X_test shape: ", X_test_skb.shape
    #print "#"*50
    print "\nPREDICTIONS:"
    #test_classifier(clf, my_dataset, features_list)
    print "\nground truth:\n", y_test
    y_pred = clf.steps[2][1].predict(X_test_skb)
    print "\npredictions:\n", y_pred
    #print "#"*50
    print "\nEVALUATIONS:"
    print "\nconfusion matrix:\n", confusion_matrix(y_test, y_pred)
    print "\nclassification report:\n", classification_report(y_test, y_pred, target_names=["non-poi", "poi"])
    print "ELAPSED TIME: ", round(time()-t0, 3), "s"
In [415]:
grid_searcher(GaussianNB())
In [416]:
grid_searcher(DecisionTreeClassifier())
In [417]:
grid_searcher(SVC())
In [418]:
grid_searcher(KMeans())
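The precision_score and recall_score imports above are never used. To compare the four searches against the project's 0.3 bar on the local holdout, something like the following works, assuming grid_searcher is modified to end with return clf (hypothetical; as written it returns None):

best_clf = grid_searcher(DecisionTreeClassifier())
y_pred = best_clf.predict(X_test)  # Pipeline.predict chains PCA, SelectKBest, and the classifier
print "precision:", precision_score(y_test, y_pred)
print "recall:   ", recall_score(y_test, y_pred)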