In [1]:
from __future__ import division
import os, time, math
import cPickle as pickle
#import multiprocessing
import matplotlib.pyplot as plt
import numpy as np
import csv
from print_imgs import print_imgs # my own function to print a grid of square images
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
#from sklearn.externals import joblib
np.random.seed(seed=1009)
%matplotlib inline
In [2]:
#%qtconsole
In [3]:
file_path = '../data/'
train_img_deskewed_filename = 'train-images_deskewed.csv'
train_img_original_filename = 'train-images.csv'
test_img_deskewed_filename = 't10k-images_deskewed.csv'
test_img_original_filename = 't10k-images.csv'
train_label_filename = 'train-labels.csv'
test_label_filename = 't10k-labels.csv'
In [4]:
portion = 1.0 # set to less than 1.0 for testing; set to 1.0 to use the entire dataset
In [5]:
# read both trainX files
with open(file_path + train_img_original_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter = ',')
    data = [row for row in data_iter]
trainXo = np.ascontiguousarray(data, dtype = np.float64)

with open(file_path + train_img_deskewed_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter = ',')
    data = [row for row in data_iter]
trainXd = np.ascontiguousarray(data, dtype = np.float64)
# vertically concatenate the two files
trainX = np.vstack((trainXo, trainXd))
trainXo = None
trainXd = None
# scale trainX
scaler = StandardScaler()
scaler.fit(trainX) # find mean/std for trainX
trainX = scaler.transform(trainX) # scale trainX with trainX mean/std
# read trainY twice and vertically concatenate
with open(file_path + train_label_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter = ',')
    data = [row for row in data_iter]
trainYo = np.ascontiguousarray(data, dtype = np.int8)
trainYd = np.ascontiguousarray(data, dtype = np.int8)
trainY = np.vstack((trainYo, trainYd)).ravel()
trainYo = None
trainYd = None
data = None
# shuffle trainX & trainY
trainX, trainY = shuffle(trainX, trainY, random_state=0)
# use less data if specified
if portion < 1.0:
    trainX = trainX[:int(portion * trainX.shape[0])]
    trainY = trainY[:int(portion * trainY.shape[0])].ravel()
print("trainX shape: {0}".format(trainX.shape))
print("trainY shape: {0}\n".format(trainY.shape))
print(trainX.flags)
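Note that StandardScaler simply stores per-column means and standard deviations from trainX and applies (x - mean)/std; the same fitted scaler is reused on testX below so both sets share the training statistics. A minimal sanity-check sketch (the toy values here are made up for illustration):

# verify StandardScaler's (x - mean)/std transform on a tiny toy array
toy = np.array([[1.0, 10.0],
                [3.0, 30.0],
                [5.0, 50.0]])
toy_scaler = StandardScaler().fit(toy)
manual = (toy - toy.mean(axis=0)) / toy.std(axis=0)   # np.std's ddof=0 matches sklearn
assert np.allclose(toy_scaler.transform(toy), manual)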
In [6]:
# read testX
with open(file_path + test_img_deskewed_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter = ',')
    data = [row for row in data_iter]
testX = np.ascontiguousarray(data, dtype = np.float64)
if portion < 1.0:
    testX = testX[:int(portion * testX.shape[0])]
# scale testX
testX = scaler.transform(testX) # scale testX with trainX mean/std
# read testY
with open(file_path + test_label_filename, 'r') as f:
    data_iter = csv.reader(f, delimiter = ',')
    data = [row for row in data_iter]
testY = np.ascontiguousarray(data, dtype = np.int8)
if portion < 1.0:
    testY = testY[:int(portion * testY.shape[0])].ravel()
# shuffle testX, testY
testX, testY = shuffle(testX, testY, random_state=0)
print("testX shape: {0}".format(testX.shape))
print("testY shape: {0}".format(testY.shape))
In [7]:
print_imgs(images           = trainX,
           actual_labels    = trainY.ravel(),
           predicted_labels = trainY.ravel(),
           starting_index   = np.random.randint(0, high=trainY.shape[0]-36, size=1)[0],
           size             = 6)
In [8]:
# default parameters for SVC
# ==========================
default_svc_params = {}
default_svc_params['C'] = 1.0 # penalty
default_svc_params['class_weight'] = None # Set the parameter C of class i to class_weight[i]*C
# set to 'auto' for unbalanced classes
default_svc_params['gamma'] = 0.0 # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
default_svc_params['kernel'] = 'rbf' # 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable
# use of 'sigmoid' is discouraged
default_svc_params['shrinking'] = True # Whether to use the shrinking heuristic.
default_svc_params['probability'] = False # Whether to enable probability estimates.
default_svc_params['tol'] = 0.001 # Tolerance for stopping criterion.
default_svc_params['cache_size'] = 200 # size of the kernel cache (in MB).
default_svc_params['max_iter'] = -1 # limit on iterations within solver, or -1 for no limit.
default_svc_params['random_state'] = 1009
default_svc_params['verbose'] = False
default_svc_params['degree'] = 3 # 'poly' only
default_svc_params['coef0'] = 0.0 # 'poly' and 'sigmoid' only
# set the parameters for the classifier
# =====================================
svc_params = dict(default_svc_params)
svc_params['C'] = 25.595479226995359
svc_params['gamma'] = 0.00068664884500429981
svc_params['cache_size'] = 2000
# create the classifier itself
# ============================
svc_clf = SVC(**svc_params)
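For reference, the decision function these parameters feed is built on the standard RBF kernel:

K(\mathbf{x}, \mathbf{x}') = \exp\left(-\gamma \, \lVert \mathbf{x} - \mathbf{x}' \rVert^2\right)

Larger gamma shrinks each support vector's radius of influence; larger C penalizes margin violations more heavily, trading regularization for training accuracy.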
In [9]:
t0 = time.time()
svc_clf.fit(trainX, trainY.ravel())
print(svc_clf)
print("\ntime in minutes {0:.2f}".format((time.time()-t0)/60))
In [10]:
target_names = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
predicted_values = svc_clf.predict(testX)
y_true, y_pred = testY.ravel(), predicted_values
print(classification_report(y_true, y_pred, target_names=target_names))
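As a quick cross-check on the report's bottom line, overall accuracy can also be computed directly with accuracy_score from sklearn.metrics:

from sklearn.metrics import accuracy_score
print("overall accuracy: {0:.4f}".format(accuracy_score(y_true, y_pred)))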
In [11]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Proportional Confusion matrix',
                          cmap=plt.cm.Paired):
    """
    given a confusion matrix (cm), make a nice plot;
    see the scikit-learn documentation for the original done for the iris dataset
    """
    plt.figure(figsize=(8, 6))
    # normalize each row by its own sum so row i shows the distribution of true class i
    plt.imshow(cm / cm.sum(axis=1)[:, np.newaxis], interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
cm = confusion_matrix(y_true, y_pred)
print(cm)
model_accuracy = sum(cm.diagonal())/len(testY)
model_misclass = 1 - model_accuracy
print("\nModel accuracy: {0}, model misclass rate: {1}".format(model_accuracy, model_misclass))
plot_confusion_matrix(cm, target_names)
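Because each row of the normalized matrix sums to 1, its diagonal is the per-class recall already reported by classification_report; a one-line check using the cm computed above:

# per-class recall = diagonal of the row-normalized confusion matrix
print(cm.diagonal() / cm.sum(axis=1))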
The red line shows how well the model fits the data it was trained on.
The green line shows how well the model predicts the held-out data: if it is still rising, then more training data should produce better predictions.
In [12]:
t0 = time.time()
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed; see the
        sklearn.cross_validation module for the list of possible objects.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure(figsize=(8, 6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std  = np.std(train_scores, axis=1)
    test_scores_mean  = np.mean(test_scores, axis=1)
    test_scores_std   = np.std(test_scores, axis=1)

    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.tight_layout()
    plt.legend(loc="best")
    return plt
C_gamma = "C="+str(np.round(svc_params['C'],4))+", gamma="+str(np.round(svc_params['gamma'],6))
title = "Learning Curves (SVM, RBF, " + C_gamma + ")"
plot_learning_curve(estimator = svc_clf,
title = title,
X = trainX,
y = trainY.ravel(),
ylim = (0.85, 1.01),
cv = ShuffleSplit(n = trainX.shape[0],
n_iter = 5,
test_size = 0.2,
random_state=0),
n_jobs = 8)
plt.show()
print("\ntime in minutes {0:.2f}".format((time.time()-t0)/60))