In [442]:
import sys
from time import time
import pickle
sys.path.append("ud120-projects/tools/")
sys.path.append("ud120-projects/final_project/")
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
In [443]:
from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
In [444]:
### Load the dictionary containing the dataset
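### (data_dict maps each person's name to a dict of feature name -> value,
### with the string 'NaN' marking missing entries)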
data_dict = pickle.load(open("ud120-projects/final_project/final_project_dataset.pkl", "r") )
In [445]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features
### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
clf = GaussianNB() # Provided to give you a starting point. Try a variety of classifiers.
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
test_classifier(clf, my_dataset, features_list)
### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(clf, my_dataset, features_list)
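In [ ]:
### A rough sketch (not part of the original run) of what the testing script
### does under the hood: repeated stratified shuffle splits, with predictions
### pooled across folds before scoring. The fold count and test size below are
### illustrative assumptions, not the tester's exact settings.
from sklearn.cross_validation import StratifiedShuffleSplit
cv = StratifiedShuffleSplit(labels, n_iter=100, test_size=0.1, random_state=42)
true_all, pred_all = [], []
for train_idx, test_idx in cv:
    features_train = [features[i] for i in train_idx]
    features_test = [features[i] for i in test_idx]
    labels_train = [labels[i] for i in train_idx]
    labels_test = [labels[i] for i in test_idx]
    clf.fit(features_train, labels_train)
    pred_all.extend(clf.predict(features_test))
    true_all.extend(labels_test)
print "pooled precision: ", precision_score(true_all, pred_all)
print "pooled recall: ", recall_score(true_all, pred_all)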
In [446]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = my_dataset.itervalues().next().keys()
features_list.sort()
features_list.remove('poi')
features_list.insert(0, 'poi')
features_list.remove('email_address')
print features_list
In [447]:
### convert dictionary to pandas dataframe
df = pd.DataFrame([entry for entry in my_dataset.itervalues()])
df = df.drop('email_address', axis=1)
df = df[features_list]
#df.dtypes
#df.describe()
#df.count()
df = df.convert_objects(convert_numeric=True)
for col in list(df.columns):
    df[col] = df[col].round(decimals=3)
print "POI Count:\n", df.poi.value_counts()
df.head()
Out[447]:
In [448]:
# create labels
y = df.poi.values
print y.shape
y[:5]
Out[448]:
In [449]:
# create initial features
X = df.drop('poi', axis=1).values
print X.shape
In [450]:
# imputation for 'NaN' values
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
X = imp.transform(X)
X
Out[450]:
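In [ ]:
### An alternative worth noting (a sketch, not used above): in this dataset a
### 'NaN' usually means "no value reported" rather than a randomly missing
### measurement, so filling with 0 instead of the column mean is defensible.
### X_zero_filled is a hypothetical name, shown only for comparison.
X_zero_filled = np.nan_to_num(df.drop('poi', axis=1).values)
print X_zero_filled.shape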
In [451]:
### Task 2: Remove outliers
num_rows = X.shape[0]
num_cols = X.shape[1]
rows_to_remove = set()
for i in xrange(num_cols):
    point_five_percentile = np.percentile(X[:,i], 0.5)
    ninety_nine_point_five_percentile = np.percentile(X[:,i], 99.5)
    for j in xrange(num_rows):
        if X[j,i] < point_five_percentile:
            #print "\tlow outlier: ", "row: ", j, "col: ", i, " -> ", X[j,i]
            rows_to_remove.add(j)
        elif X[j,i] > ninety_nine_point_five_percentile:
            #print "\thigh outlier: ", "row: ", j, "col: ", i, " -> ", X[j,i]
            rows_to_remove.add(j)
X = np.delete(X, list(rows_to_remove), axis=0)
y = np.delete(y, list(rows_to_remove))
print "new X shape: ", X.shape
print "\ntotal rows removed: ", len(rows_to_remove), "({})".format(round(len(rows_to_remove)/float(num_rows), 2))
In [452]:
# split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print X_train.shape, X_test.shape, y_train.shape, y_test.shape
In [453]:
### Task 3: Create new feature(s)
# scale
scaler = MinMaxScaler()
scaler = scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
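In [ ]:
### Task 3 asks for new features, and the cell above only rescales the existing
### ones. A possible engineered feature (a sketch, not used in the arrays
### above): the fraction of each person's messages exchanged with POIs. This
### assumes the standard Enron email-count keys named below are present;
### safe_fraction is a helper introduced here, not part of the starter code.
def safe_fraction(numerator, denominator):
    # treat missing counts ('NaN' or None) and zero denominators as 0
    if numerator in (None, 'NaN', 0) or denominator in (None, 'NaN', 0):
        return 0.
    return float(numerator) / float(denominator)

for entry in my_dataset.itervalues():
    entry['fraction_from_poi'] = safe_fraction(entry.get('from_poi_to_this_person'),
                                               entry.get('to_messages'))
    entry['fraction_to_poi'] = safe_fraction(entry.get('from_this_person_to_poi'),
                                             entry.get('from_messages'))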
In [454]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
# pca
#pca = RandomizedPCA().fit(X_train)
#print [round(x, 4) for x in pca.explained_variance_ratio_]
#X_train = pca.transform(X_train)
#print X_train.shape
#X_test = pca.transform(X_test)
#print X_test.shape
In [455]:
# select K best
#selector = SelectKBest().fit(X_train, y_train)
#print selector.scores_
#X_train = selector.transform(X_train)
#print X_train.shape
#X_test = selector.transform(X_test)
#print X_test.shape
In [472]:
# a candidate parameter grid for the pipeline search below
# (the run below uses selectkbest__k=['all'] instead of even_range)
dict(randomizedpca__n_components = even_range,
     randomizedpca__whiten = t_or_f,
     randomizedpca__random_state = random_state,
     selectkbest__k = even_range)
Out[472]:
In [489]:
even_range = range(2,X.shape[1],2)
random_state = [42]
t_or_f = [True, False]
best_clf = []
#for clf in [GaussianNB(), SVC(), DecisionTreeClassifier(), KMeans()]:
for clf in [GaussianNB()]:
    pipeline = make_pipeline(RandomizedPCA(), SelectKBest(), clf)
    #print pipeline
    params = dict(randomizedpca__n_components = even_range,
                  randomizedpca__whiten = t_or_f,
                  randomizedpca__random_state = random_state,
                  selectkbest__k = ['all'])
    grid_search = GridSearchCV(pipeline, param_grid=params, verbose=1)
    grid_search.fit(X, y)
    best_clf = grid_search.best_estimator_
    print "\nbest estimator: ", best_clf, "\n"
    print "\nbest score: ", grid_search.best_score_, "\n"
    print "\nbest params: ", grid_search.best_params_, "\n"
clf = best_clf.fit(X_train, y_train)
test_classifier(clf, my_dataset, features_list)
GaussianNB()
Accuracy: 0.25560    Precision: 0.18481    Recall: 0.79800    F1: 0.30011    F2: 0.47968
Total predictions: 10000    True positives: 1596    False positives: 7040    False negatives: 404    True negatives: 960
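In [ ]:
### A quick look inside the winning pipeline (a sketch; the step names are the
### lowercased class names that make_pipeline generates).
print best_clf.named_steps['randomizedpca'].explained_variance_ratio_
print best_clf.named_steps['selectkbest'].scores_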
In [435]:
#print("Fitting the classifier to the training set")
#t0 = time()
#param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
# 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
#clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
#clf = clf.fit(X_train_pca, y_train)
#print("done in %0.3fs" % (time() - t0))
##print("Best estimator found by grid search:")
#print(clf.best_estimator_)
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test)
print precision_score(y_test, y_pred, average='binary')
print recall_score(y_test, y_pred, average='binary')
print classification_report(y_test, y_pred, target_names=['non-poi','poi'])
test_classifier(clf, my_dataset, features_list)
In [418]:
clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test)
print precision_score(y_test, y_pred, average='binary')
print recall_score(y_test, y_pred, average='binary')
print classification_report(y_test, y_pred, target_names=['non-poi','poi'])
test_classifier(clf, my_dataset, features_list)
In [419]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test)
print precision_score(y_test, y_pred, average='binary')
print recall_score(y_test, y_pred, average='binary')
print classification_report(y_test, y_pred, target_names=['non-poi','poi'])
test_classifier(clf, my_dataset, features_list)
In [420]:
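# Note: KMeans ignores the labels passed to fit, and with the default
# n_clusters=8 the predicted cluster ids are not binary POI flags, so the
# scores below are only a rough sanity check; n_clusters=2 would be needed
# for a label-like comparison.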
clf = KMeans()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test)
print precision_score(y_test, y_pred, average='macro')
print recall_score(y_test, y_pred, average='macro')
#print classification_report(y_test, y_pred, target_names=['non-poi','poi'])
test_classifier(clf, my_dataset, features_list)
In [ ]:
In [ ]:
###############################################################################
# Train a SVM classification model
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
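# Note: X_train_pca is not defined anywhere in this notebook; this cell assumes
# the commented-out PCA transform above was run and its output stored under
# that name.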
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# http://scikit-learn.org/stable/auto_examples/feature_stacker.html
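# FeatureUnion concatenates the PCA components with the K best original
# features into one wider feature matrix.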
pca = RandomizedPCA()
selection = SelectKBest()
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
X_train = combined_features.fit(X_train, y_train).transform(X_train)
X_test = combined_features.transform(X_test)
In [ ]:
combined_features
In [ ]:
X_train.shape
In [ ]:
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test), precision_score(y_test, y_pred), recall_score(y_test, y_pred)
In [ ]:
clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test), precision_score(y_test, y_pred), recall_score(y_test, y_pred)
In [ ]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test), precision_score(y_test, y_pred), recall_score(y_test, y_pred)
In [ ]:
clf = KMeans()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print clf.score(X_test, y_test), precision_score(y_test, y_pred), recall_score(y_test, y_pred)
In [ ]:
svm = SVC()
gnb = GaussianNB()
#tree = DecisionTreeClassifier()
#kmeans = KMeans()
# Do grid search over k, n_components and C:
##pipeline = Pipeline([("features", combined_features), ("svm", svm), ("gnb", gnb), ("tree", tree), ("kmeans", kmeans)])
pipeline = Pipeline([("features", combined_features), ("gnb", gnb)])
param_grid = dict(features__pca__n_components=[x+1 for x in xrange(X.shape[1]-1)],
                  features__univ_select__k=[x+1 for x in xrange(X.shape[1]-1)])
                  #svm__C=[0.1, 1, 10]  # only relevant if an SVC step is added back
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)
In [ ]:
print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print "training time: ", round(time()-t0,3), "s"
print "Best estimator found by grid search:"
print clf.best_estimator_
In [ ]:
svm = SVC(kernel='rbf', C=10000.0)
gnb = GaussianNB()
tree = DecisionTreeClassifier(min_samples_split=40)
kmeans = KMeans(n_clusters=2)
# Do grid search over k, n_components and C:
pipeline = Pipeline([("features", combined_features), ("svm", svm)])
param_grid = dict(features__pca__n_components=[1, 2, 3],
                  features__univ_select__k=[1, 2],
                  svm__C=[0.1, 1, 10])
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)
In [ ]:
In [ ]:
col_first_half = list(df.columns[0:(len(df.columns)/6)])
col_first_half.append('poi')
In [ ]:
col_second_half = df.columns[(len(df.columns)/2):]
In [ ]:
sns.pairplot(df[col_first_half], hue="poi")
In [ ]:
### Task 2: Remove outliers
for entry in my_dataset.itervalues():
    for feature in features_list:
        print entry