In [28]:
    
from __future__ import division
import os, time, math
import cPickle as pickle
import multiprocessing
import matplotlib.pyplot as plt
import numpy as np
import csv
from sklearn.ensemble         import GradientBoostingClassifier
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.cross_validation import train_test_split
from sklearn.grid_search      import GridSearchCV
from sklearn.metrics          import classification_report, confusion_matrix, f1_score
from sklearn.externals        import joblib
np.random.seed(seed=1009)
%matplotlib inline
    
In [29]:
    
#%qtconsole
    
In [30]:
    
# Data locations: image files come in raw and deskewed variants; the
# label files are shared by both.
file_path = '../data/'
DESKEWED = True

# choose image files according to the DESKEWED flag
train_img_filename = 'train-images_deskewed.csv' if DESKEWED else 'train-images.csv'
test_img_filename  = 't10k-images_deskewed.csv'  if DESKEWED else 't10k-images.csv'

train_label_filename   = 'train-labels.csv'
test_label_filename    = 't10k-labels.csv'
    
In [31]:
    
portion = 1.0  # fraction of the data to use: 1.0 = every row, <1.0 = that fraction of rows
    
In [32]:
    
# Load the training images and labels from CSV into numpy arrays,
# keeping only the first `portion` fraction of the rows.
#
# Fix: the slice bound must be an int. With `from __future__ import division`
# and portion = 1.0, `portion * trainX.shape[0]` is a float, and numpy
# raises on non-integer slice indices (deprecated in 1.11, an error since 1.12).
with open(file_path + train_img_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter=',')
    data      = [row for row in data_iter]
trainX = np.asarray(data, dtype=np.float64)
trainX = trainX[:int(portion * trainX.shape[0])]
print("trainX shape: {0}".format(trainX.shape))

with open(file_path + train_label_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter=',')
    data      = [row for row in data_iter]
trainY = np.asarray(data, dtype=np.int8)
trainY = trainY[:int(portion * trainY.shape[0])].ravel()  # flatten m x 1 -> m
print("trainY shape: {0}".format(trainY.shape))
    
    
In [33]:
    
# Load the test images and labels from CSV into numpy arrays,
# keeping only the first `portion` fraction of the rows.
#
# Fix: the slice bound must be an int. With `from __future__ import division`
# and portion = 1.0, `portion * testX.shape[0]` is a float, and numpy
# raises on non-integer slice indices (deprecated in 1.11, an error since 1.12).
with open(file_path + test_img_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter=',')
    data      = [row for row in data_iter]
testX = np.asarray(data, dtype=np.float64)
testX = testX[:int(portion * testX.shape[0])]
print("testX shape: {0}".format(testX.shape))

with open(file_path + test_label_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter=',')
    data      = [row for row in data_iter]
testY = np.asarray(data, dtype=np.int8)
testY = testY[:int(portion * testY.shape[0])].ravel()  # flatten m x 1 -> m
print("testY shape: {0}".format(testY.shape))
    
    
In [34]:
    
def print_imgs(images, actual_labels, predicted_labels, starting_index = 0, size=6):
    """
    Display a size x size grid of images, flagging in red any image whose
    predicted label differs from its actual label.

    images           m x n array of pixels; n is assumed to be a perfect square
    actual_labels    m x 1 array of the true labels
    predicted_labels m x 1 array of the predicted labels
    starting_index   index of the first image to show
    size             the grid is size x size images
    """
    # images are square: the side length is the square root of the flat length
    side = int(math.sqrt(images.shape[1]))

    fig, axes = plt.subplots(size, size, figsize=(side, side),
                             facecolor='w', edgecolor='k')
    fig.subplots_adjust(hspace = 0.0001, wspace=.001)
    axes = axes.ravel()

    for cell, idx in enumerate(range(starting_index, starting_index + size * size)):
        # un-flatten the pixel row back into a side x side image
        img = np.reshape(images[idx, :], (side, side))
        axes[cell].imshow(img, cmap=plt.cm.gray_r, interpolation='nearest')

        actual    = actual_labels[idx]
        predicted = predicted_labels[idx]
        if actual != predicted:
            # mismatch: show both labels, in red
            axes[cell].set_title("actual: {0}; predicted: {1}".format(actual, predicted),
                                 fontsize=16,
                                 color='r')
        else:
            axes[cell].set_title("label: {0}".format(actual),
                                 fontsize=16)

    plt.show()
    
In [35]:
    
# Show a random 3x3 sample of training images; predictions are set equal
# to the truth here, so no mismatches are flagged.
start = np.random.randint(0, high=trainY.shape[0]-36, size=1)[0]
print_imgs(images           = trainX,
           actual_labels    = trainY.ravel(),
           predicted_labels = trainY.ravel(),
           starting_index   = start,
           size             = 3)
    
    
In [36]:
    
# default parameters for GradientBoostingClassifier
# =================================================
default_gbm_params = {
    'loss':           'deviance',
    'learning_rate':  0.01,
    'n_estimators':   1650,

    'max_depth':      4,       # fix the tree size
    'max_leaf_nodes': None,    # alternative: choose tree size by deviance reduction
                               # (these two settings are mutually exclusive)

    'subsample':      0.5,     # stochastic by observations
    'max_features':   None,    # stochastic by columns (similar to random forest)

    'min_samples_split': 2,
    'min_samples_leaf':  1,
    #'min_weight_fraction_leaf': 0.0,

    'init':           None,
    'random_state':   1009,    # set a random seed
    'verbose':        0,
    'warm_start':     False,
}

# parameters for this run: a copy, so the defaults stay untouched
gbm_params = dict(default_gbm_params)

# the classifier
gbm_clf = GradientBoostingClassifier(**gbm_params)
    
In [37]:
    
# Fit the model to the training data, reporting wall-clock time.
start_time = time.time()
gbm_clf.fit(trainX, trainY)
print("time in minutes {0:.2f}".format((time.time() - start_time)/60))
    
    
In [38]:
    
# Predict labels for the test set, reporting wall-clock time.
start_time = time.time()
predicted_values = gbm_clf.predict(testX)
print("time in minutes {0:.2f}".format((time.time() - start_time)/60))
    
    
In [39]:
    
# Per-digit precision / recall / F1 on the held-out test set.
target_names   = [str(digit) for digit in range(10)]
y_true, y_pred = testY.ravel(), predicted_values
print(classification_report(y_true, y_pred, target_names=target_names))
    
    
In [40]:
    
def plot_confusion_matrix(cm, 
                          target_names,
                          title='Confusion matrix', 
                          cmap=plt.cm.winter):  
    """
    Render a confusion matrix (cm) as a labeled heatmap.
    Adapted from the scikit-learn documentation (original was for the iris dataset).

    cm            square array of counts, true labels (rows) by predicted labels (cols)
    target_names  class labels for the axis ticks
    title         plot title
    cmap          matplotlib colormap for the heatmap
    """
    ticks = np.arange(len(target_names))

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(ticks, target_names, rotation=45)
    plt.yticks(ticks, target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Confusion matrix plus overall accuracy / misclassification summary.
cm = confusion_matrix(y_true, y_pred)
print(cm)

correct        = np.trace(cm)          # diagonal holds the correctly classified counts
model_accuracy = correct / len(testY)
model_misclass = 1 - model_accuracy
print("\nModel accuracy: {0}, model misclass rate: {1}".format(model_accuracy, model_misclass))

plot_confusion_matrix(cm, target_names)