In [28]:
from __future__ import division
import os, time, math
import cPickle as pickle
import multiprocessing
import matplotlib.pyplot as plt
import numpy as np
import csv
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.externals import joblib
np.random.seed(seed=1009)
%matplotlib inline
In [29]:
#%qtconsole
In [30]:
file_path = '../data/'

DESKEWED = True
if DESKEWED:
    train_img_filename = 'train-images_deskewed.csv'
    test_img_filename  = 't10k-images_deskewed.csv'
else:
    train_img_filename = 'train-images.csv'
    test_img_filename  = 't10k-images.csv'

train_label_filename = 'train-labels.csv'
test_label_filename  = 't10k-labels.csv'
In [31]:
portion = 1.0   # 1.0 uses all of the data; a value below 1.0 uses that fraction of it
In [32]:
with open(file_path + train_img_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter=',')
    data = [row for row in data_iter]

trainX = np.asarray(data, dtype=np.float64)
trainX = trainX[:int(portion * trainX.shape[0])]   # int(): slice indices must be integers
print("trainX shape: {0}".format(trainX.shape))

with open(file_path + train_label_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter=',')
    data = [row for row in data_iter]

trainY = np.asarray(data, dtype=np.int8)
trainY = trainY[:int(portion * trainY.shape[0])].ravel()
print("trainY shape: {0}".format(trainY.shape))
In [33]:
with open(file_path + test_img_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter=',')
    data = [row for row in data_iter]

testX = np.asarray(data, dtype=np.float64)
testX = testX[:int(portion * testX.shape[0])]
print("testX shape: {0}".format(testX.shape))

with open(file_path + test_label_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter=',')
    data = [row for row in data_iter]

testY = np.asarray(data, dtype=np.int8)
testY = testY[:int(portion * testY.shape[0])].ravel()
print("testY shape: {0}".format(testY.shape))
In [34]:
def print_imgs(images, actual_labels, predicted_labels, starting_index=0, size=6):
    """
    print a grid of images,
    flagging any differences between actual and predicted labels

    images           m x n array of pixels; n is assumed to be a perfect square
    actual_labels    m x 1 array of the actual labels
    predicted_labels m x 1 array of predicted labels
    starting_index   scalar; where in 1...m to start
    size             scalar; the grid of images is size x size
    """
    img_dim = int(pow(images.shape[1], 0.5))   # images assumed to be square
    fig, axs = plt.subplots(size, size, figsize=(img_dim, img_dim),
                            facecolor='w', edgecolor='k')
    fig.subplots_adjust(hspace=0.0001, wspace=0.001)
    axs = axs.ravel()

    for grid_i, img_i in enumerate(xrange(starting_index, starting_index + (size * size))):
        # convert from 1 x flat to img_dim x img_dim; flat = img_dim^2
        img = np.reshape(images[img_i, :], (img_dim, img_dim))
        axs[grid_i].imshow(img, cmap=plt.cm.gray_r, interpolation='nearest')
        if actual_labels[img_i] != predicted_labels[img_i]:
            axs[grid_i].set_title("actual: {0}; predicted: {1}".format(
                                      actual_labels[img_i], predicted_labels[img_i]),
                                  fontsize=16, color='r')
        else:
            axs[grid_i].set_title("label: {0}".format(actual_labels[img_i]),
                                  fontsize=16)
    plt.show()
In [35]:
print_imgs(images           = trainX,
           actual_labels    = trainY.ravel(),
           predicted_labels = trainY.ravel(),   # try np.random.permutation(trainY) to see mismatches flagged in red
           starting_index   = np.random.randint(0, high=trainY.shape[0] - 36, size=1)[0],
           size             = 3)
In [36]:
# default parameters for GradientBoostingClassifier
# =================================================
default_gbm_params = {}
default_gbm_params['loss'] = 'deviance'
default_gbm_params['learning_rate'] = 0.01
default_gbm_params['n_estimators'] = 1650
default_gbm_params['max_depth'] = 4          # fix the tree size
default_gbm_params['max_leaf_nodes'] = None  # or choose tree size by deviance reduction
# Note: max_depth and max_leaf_nodes are mutually exclusive
default_gbm_params['subsample'] = 0.5 # stochastic by observations
default_gbm_params['max_features'] = None # stochastic by columns (similar to random forest)
default_gbm_params['min_samples_split'] = 2
default_gbm_params['min_samples_leaf'] = 1
#default_gbm_params['min_weight_fraction_leaf'] = 0.0
default_gbm_params['init'] = None
default_gbm_params['random_state'] = 1009 # set a random seed
default_gbm_params['verbose'] = 0
default_gbm_params['warm_start'] = False
# set parameters for the estimator
gbm_params = dict(default_gbm_params)
# the classifier
gbm_clf = GradientBoostingClassifier(**gbm_params)
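GridSearchCV is imported in the first cell but never used below. A hedged tuning sketch, assuming a small grid around the chosen values is of interest; at 1,650 estimators a full grid search would be very slow, so the sketch shrinks n_estimators first (the grid values are illustrative, not tuned results):

# hypothetical tuning sketch using the GridSearchCV import from the first cell
tuning_params = dict(default_gbm_params, n_estimators=100)   # smaller ensemble for search speed
param_grid = {'learning_rate': [0.01, 0.1], 'max_depth': [3, 4]}
grid = GridSearchCV(GradientBoostingClassifier(**tuning_params),
                    param_grid, cv=3, n_jobs=-1)
# grid.fit(trainX, trainY); print(grid.best_params_)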
In [37]:
t0 = time.time()
gbm_clf.fit(trainX, trainY)
print("time in minutes {0:.2f}".format((time.time()-t0)/60))
In [38]:
t0 = time.time()
predicted_values = gbm_clf.predict(testX)
print("time in minutes {0:.2f}".format((time.time()-t0)/60))
In [39]:
target_names = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
y_true, y_pred = testY.ravel(), predicted_values
print(classification_report(y_true, y_pred, target_names=target_names))
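The per-class report says nothing about which inputs drive the model; the fitted classifier exposes feature_importances_, one value per pixel, which can be reshaped back into the image grid. A sketch:

# visualize which pixels the boosted trees split on most often (sketch)
img_dim = int(pow(trainX.shape[1], 0.5))
pixel_importance = gbm_clf.feature_importances_.reshape(img_dim, img_dim)
plt.imshow(pixel_importance, cmap=plt.cm.hot, interpolation='nearest')
plt.colorbar()
plt.title('pixel importance')
plt.show()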
In [40]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=plt.cm.winter):
    """
    given a confusion matrix (cm), make a nice plot;
    see the scikit-learn documentation for the original, done for the iris dataset
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
cm = confusion_matrix(y_true, y_pred)
print(cm)
model_accuracy = sum(cm.diagonal())/len(testY)
model_misclass = 1 - model_accuracy
print("\nModel accuracy: {0}, model misclass rate: {1}".format(model_accuracy, model_misclass))
plot_confusion_matrix(cm, target_names)
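The scikit-learn iris example that plot_confusion_matrix is adapted from also plots a row-normalized matrix, which makes off-diagonal confusions comparable across digits. A sketch reusing the function above:

# row-normalize so each row (true class) sums to 1
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plot_confusion_matrix(cm_normalized, target_names, title='Normalized confusion matrix')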