Exploring ensemble methods, feature engineering, and implementation in a live system.
Group 8:
Junbo Huang
Khanh "Katie" Le
Justin Shenk
Marie Sindermann
Follow along: https://github.com/JustinShenk/sonic-face
In [ ]:
import os
import numpy as np
import matplotlib
matplotlib.use('TkAgg') # For displaying animation
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from sklearn import svm
from sklearn.externals import joblib
from sklearn import linear_model as lm
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier as mlpc
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              AdaBoostClassifier, BaggingClassifier,
                              GradientBoostingClassifier)
from helper_functions import *
from normalize_data import *
from numpy import array
%matplotlib notebook
In [ ]:
RAW_DATA_DIR = 'sonic_pi_face/data/'
# Get list of data files
data_files = get_data_files(RAW_DATA_DIR)
# Load data into a dictionary
# Note: Checks for incomplete data
data_dict = get_gesture_data(data_files)
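Based on the shapes used throughout the notebook, each dictionary value appears to be a list of samples of shape (frames, 40, 40, 2), with the last axis holding the x and y optical flow components (an assumption inferred from the cells below). A quick sanity check:
In [ ]:
# Print the stacked shape of each gesture's samples
for gesture, samples in data_dict.items():
    print(gesture, np.asarray(samples).shape)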
In [ ]:
gestures = list(data_dict)
print(gestures) # List gestures
sample = data_dict['open-close'][3] # Fourth open-close sample (index 3)
image = sample[4] # Fifth frame of the sample (index 4)
plt.imshow(image[...,0]) # x-component slice of the optical flow
plt.show()
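Slicing the last axis with index 1 gives the vertical flow component; viewing both components side by side makes the layout clear:
In [ ]:
# Compare both optical flow components of the same frame
fig, (ax_x, ax_y) = plt.subplots(1, 2)
ax_x.imshow(image[...,0])
ax_x.set_title('x component')
ax_y.imshow(image[...,1])
ax_y.set_title('y component')
plt.show()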
In [ ]:
sample = data_dict['open-close'][5] # Sixth sample (index 5)
anim = display_frames(sample)
In [ ]:
np.asarray(data_dict['slide-horizontally']).shape
In [ ]:
# FIXME: Complete HoG feature selection
import matplotlib.mlab as mlab
%matplotlib inline
data_slide_v = np.asarray(data_dict['slide-vertically'])
x_values = data_slide_v[:,4,...,0].flatten()
y_values = data_slide_v[:,4,...,1].flatten()
data_slide_h = np.asarray(data_dict['slide-horizontally'])
x_values_h = data_slide_h[:,4,...,0].flatten()
y_values_h = data_slide_h[:,4,...,1].flatten()
fig, axarr = plt.subplots(2,2)
axarr[0,0].set_title('Horizontal motion in slide-vertical')
axarr[0,0].hist(x_values, bins=50, density=True)
axarr[0,0].set_autoscaley_on(False)
axarr[0,0].set_ylim([0,1])
axarr[0,1].set_title('Vertical motion in slide-vertical')
axarr[0,1].hist(y_values, bins=50, density=True, orientation='horizontal')
axarr[0,1].set_autoscalex_on(False)
axarr[0,1].set_xlim([0,.3])
axarr[1,0].set_title('Horizontal motion in slide-horizontal')
axarr[1,0].hist(x_values_h, bins=50, density=True)
axarr[1,0].set_autoscaley_on(False)
axarr[1,0].set_ylim([0,1])
axarr[1,1].set_title('Vertical motion in slide-horizontal')
axarr[1,1].hist(y_values_h, bins=50, density=True, orientation='horizontal')
axarr[1,1].set_autoscalex_on(False)
axarr[1,1].set_xlim([0,.3])
plt.tight_layout()
plt.show()
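The HoG step above is still marked FIXME. A minimal sketch of how it could be completed with scikit-image's hog, assuming scikit-image is available and that the descriptor is computed on each frame's flow magnitude (both assumptions, not the final implementation):
In [ ]:
from skimage.feature import hog

def hog_features(sample):
    # Hypothetical helper: concatenate per-frame HoG descriptors of the flow magnitude
    features = []
    for frame in sample:
        magnitude = np.linalg.norm(np.asarray(frame), axis=-1)  # collapse x/y flow into one channel
        features.append(hog(magnitude, orientations=8,
                            pixels_per_cell=(8, 8), cells_per_block=(1, 1)))
    return np.concatenate(features)

print(hog_features(data_dict['open-close'][0]).shape)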
In [194]:
data_open_close = data_dict['open-close']  # Define the data this cell operates on
# Average frame per sample: mean over each sample's frames
avg_frame = np.zeros((len(data_open_close), 40, 40, 2))
for i in range(len(data_open_close)):
    sum_frame = np.zeros((40, 40, 2))
    for j in range(len(data_open_close[i])):
        sum_frame += data_open_close[i][j]
    avg_frame[i] = sum_frame / len(data_open_close[i])
In [195]:
# Average gesture sequence: mean over samples, one 40x40x2 field per frame
sum_feature = np.zeros((10, 40, 40, 2))
for i in range(len(data_open_close)):
    sum_feature += data_open_close[i]
avg_feature = sum_feature / len(data_open_close)
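Both averaging loops can be written as single reductions once the samples are stacked into one array (assuming all samples share the (frames, 40, 40, 2) shape):
In [ ]:
arr = np.asarray(data_open_close)   # (samples, frames, 40, 40, 2)
avg_frame_vec = arr.mean(axis=1)    # per-sample average frame
avg_feature_vec = arr.mean(axis=0)  # per-frame average across samples
print(np.allclose(avg_frame_vec, avg_frame), np.allclose(avg_feature_vec, avg_feature))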
In [ ]:
# Load all pre-processed data sets if available.
data_sets = []
DATA_DIR = 'data'
# Number of rows and columns to divide each frame into for optical flow feature extraction
divs = [4, 10, 20]
if os.path.exists(DATA_DIR):
    for file in os.listdir(DATA_DIR):
        if file.endswith('.csv'):
            df = pd.read_csv(os.path.join(DATA_DIR, file))
            df = df.drop('Unnamed: 0', axis=1)
            data_sets.append(df)
else:
    # Generate data sets.
    print("Directory not found at {}\nPreprocessing data for "
          "optimization.".format(os.path.join(os.getcwd(), DATA_DIR)))
    data_sets = make_feature_sets(data_dict, divs=divs)
    # Save locally
    save_data_sets(data_sets, divs=divs)
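make_feature_sets lives in helper_functions; presumably it pools each 40 x 40 flow field into a coarser rows x cols grid of mean flow vectors. Since every value in divs divides 40 evenly, one way such a reduction could look (a sketch under that assumption, not the helper's actual code):
In [ ]:
def pool_frame(frame, rows=4, cols=4):
    # Hypothetical grid pooling: average the flow field over a rows x cols grid
    frame = np.asarray(frame)  # e.g. (40, 40, 2)
    h, w, c = frame.shape
    return frame.reshape(rows, h // rows, cols, w // cols, c).mean(axis=(1, 3))

pooled = pool_frame(data_dict['open-close'][0][4])
print(pooled.shape)  # (4, 4, 2) -> 32 values per frame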
In [ ]:
# Example: Reduce the features of one data set.
# DataFrame with 32 dimensions: a 4 x 4 grid with 2 values (x and y flow) per cell
df_red = feature_extract(data_dict,cols=4,rows=4)
# Display comparison of feature reduction levels.
%matplotlib inline
gestures = ['slide-vertically','waving-beauty-pageant-style','empty']
ax = optimize_feature_dimensions(data_sets,divs,method='rf', gestures=gestures)
plt.show()
In [ ]:
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
# specify parameters and distributions to sample from
param_dist = {"max_depth": [None],
              "max_features": sp_randint(5, 25),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# run randomized search
n_iter_search = 60
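To preview what RandomizedSearchCV will draw from these distributions, ParameterSampler can generate a few candidate settings:
In [ ]:
from sklearn.model_selection import ParameterSampler
# Sample three candidate parameter settings for inspection
for params in ParameterSampler(param_dist, n_iter=3, random_state=42):
    print(params)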
In [ ]:
# Prepare data
# List of gestures
gestures=['empty','waving-beauty-pageant-style','slide-vertically']
# Reduced features data set (pandas DataFrame)
data = data_sets[1] # Middle feature set (divs=10), which scored highest above
data = data[data['label'].isin(gestures)]
data, targets = encode_target(data, 'label') # Encode target column
#-------------#
# Raw data analysis for comparison (numpy array)
empty_array = np.asarray(data_dict['empty'])
slide_v_array = np.asarray(data_dict['slide-vertically'])
waving_array = np.asarray(data_dict['waving-beauty-pageant-style'])
data_raw = np.concatenate([empty_array,slide_v_array,waving_array])
#-------------#
# Split into features and target
X, y = class_split(data,gestures=gestures)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Split into features and target (numpy raw data)
X_raw, y_raw = class_split(data_raw, gestures=gestures)
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X_raw, y_raw, random_state=42)
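encode_target is a helper from helper_functions; it presumably maps the string labels to integer classes. A minimal equivalent with scikit-learn's LabelEncoder (a sketch under that assumption):
In [ ]:
# Roughly what encode_target is assumed to do with the label column
le = pp.LabelEncoder()
codes = le.fit_transform(gestures)
print(dict(zip(codes, gestures)))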
In [ ]:
# Classify test data using random forest
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X_train, y_train)
accuracy = clf.score(X_test,y_test)
print("Predictions:\n{}".format(clf.predict(X_test)))
print("Actual:\n{}".format(y_test[:10]))
print("Score:\n{}".format(accuracy))
#FIXME
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
random_search.fit(X.values, y.values)
print("RandomizedSearchCV evaluated %d candidate"
      " parameter settings." % n_iter_search)
report(random_search.cv_results_)
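report also comes from helper_functions; a sketch of such a utility, modeled on the randomized-search example in the scikit-learn docs, prints the top-ranked parameter settings:
In [ ]:
def report_results(results, n_top=3):
    # Hypothetical stand-in for the imported report() helper
    for rank in range(1, n_top + 1):
        for i in np.flatnonzero(results['rank_test_score'] == rank):
            print("Rank {}: mean {:.3f} (std {:.3f})".format(
                rank, results['mean_test_score'][i], results['std_test_score'][i]))
            print("Parameters: {}".format(results['params'][i]))

report_results(random_search.cv_results_)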
In [ ]:
# Classify test data using random forest on raw data (optional)
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X_train_raw, y_train_raw)
accuracy = clf.score(X_test_raw,y_test_raw)
print("Predictions:\n{}".format(clf.predict(X_test_raw)))
print("Actual:\n{}".format(y_test_raw[:10]))
print("Score:\n{}".format(accuracy))
#FIXME
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
random_search.fit(np.asarray(X_raw), np.asarray(y_raw))
print("RandomizedSearchCV evaluated %d candidate"
      " parameter settings." % n_iter_search)
report(random_search.cv_results_)
In [ ]:
clf_adaboost = AdaBoostClassifier(DecisionTreeClassifier())
clf_adaboost = clf_adaboost.fit(X_train, y_train)
accuracy = clf_adaboost.score(X_test, y_test)
print("Predictions:\n{}".format(clf_adaboost.predict(X_test)))
print("Actual:\n{}".format(y_test[:10]))
print("Score:\n{}".format(accuracy))
In [ ]:
clf_bagging = BaggingClassifier()
clf_bagging = clf_bagging.fit(X_train, y_train)
print(clf_bagging.score(X_test, y_test))
In [ ]:
clf_extra_tree = ExtraTreesClassifier()
clf_extra_tree = clf_extra_tree.fit(X_train, y_train)
print(clf_extra_tree.score(X_test, y_test))
In [ ]:
clf_gradient_boosting = GradientBoostingClassifier(n_estimators=100)
clf_gradient_boosting = clf_gradient_boosting.fit(X_train, y_train)
print(clf_gradient_boosting.score(X_test,y_test))
# print("Predictions:\n{}".format(clf_bagging.predict(X_test)))
# print("Actual:\n{}".format(y_test))
In [ ]:
clf_mlpc = mlpc(hidden_layer_sizes=(800,), verbose=True)
clf_mlpc = clf_mlpc.fit(X_train, y_train)
print(clf_mlpc.score(X_test,y_test))
In [ ]:
clf_svm = svm.SVC(kernel='poly', C=1.0)
clf_svm = clf_svm.fit(X_train, y_train)
print(clf_svm.score(X_test,y_test))
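Accuracy alone hides which gestures get confused with each other; a confusion matrix for any fitted classifier makes that visible:
In [ ]:
from sklearn.metrics import confusion_matrix
# Rows are true classes, columns are predicted classes
print(confusion_matrix(y_test, clf_gradient_boosting.predict(X_test)))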
In [ ]:
joblib.dump(clf_gradient_boosting, 'classifier.pkl')
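The persisted model can then be loaded in the live system and used directly:
In [ ]:
# Reload the pickled classifier and verify it still scores the same
clf_loaded = joblib.load('classifier.pkl')
print(clf_loaded.score(X_test, y_test))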
In [ ]:
# specify parameters and distributions to sample from
param_dist_rf = {
    "max_depth": [None],
    "max_features": sp_randint(25, 32),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]}
# run randomized search
n_iter_search_rf = 100
# specify parameters and distributions to sample from
param_dist_adb = {
    "n_estimators": [25, 30, 35, 40, 45, 50, 55, 60],
    "learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5],
    "algorithm": ["SAMME", "SAMME.R"]}
n_iter_search_adb = 80
# specify parameters and distributions to sample from
param_dist_bagging = {
    "n_estimators": [5, 10, 15, 20],
    "max_samples": [0.7, 0.8, 0.9, 1.0],
    "max_features": [0.7, 0.8, 0.9, 1.0],
    "bootstrap": [True, False]}
# run randomized search
n_iter_search_bagging = 50
# specify parameters and distributions to sample from
param_dist_extree = {
    "max_depth": [None],
    "max_features": sp_randint(5, 25),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False]}
# run randomized search
n_iter_search_extree = 400
# specify parameters and distributions to sample from
param_dist_grab = {
    "learning_rate": [0.1, 0.2, 0.3],
    "max_depth": [None],
    "max_features": sp_randint(5, 25),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11)}
# run randomized search
n_iter_search_grab = 30
param_dist = [param_dist_rf, param_dist_adb, param_dist_bagging,
              param_dist_extree, param_dist_grab]
n_iter_search = [n_iter_search_rf, n_iter_search_adb, n_iter_search_bagging,
                 n_iter_search_extree, n_iter_search_grab]
models = [
    RandomForestClassifier(),
    AdaBoostClassifier(DecisionTreeClassifier()),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier()
]
In [ ]:
divs = [4, 10, 20]
data_list = get_data_list(divs=divs)
combis = get_combis(divs)
for index in range(len(models)):
    # Train each model with its randomized parameter search
    random_search = RandomizedSearchCV(models[index], param_distributions=param_dist[index],
                                       n_iter=n_iter_search[index])
    random_search.fit(X_train, y_train)
    score = random_search.score(X_test, y_test)
    # Slice the class name out of str(type()) for a readable title
    model_title = str(type(models[index])).split(".")[-1][:-2][:-len("Classifier")]
    print("Best CV score for {} is {:.3f} (test score {:.3f}),\n"
          "with the parameters {}\n".format(model_title, random_search.best_score_,
                                            score, random_search.best_params_))