In [1]:
from __future__ import division
import os, time, math, csv
import cPickle as pickle
from operator import itemgetter
from tabulate import tabulate
import matplotlib.pyplot as plt
import numpy as np
import scipy
from print_imgs import print_imgs # my own function to print a grid of square images
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
np.random.seed(seed=1009)
%matplotlib inline
In [2]:
#%qtconsole
In [3]:
file_path = '../data/'
DESKEWED = True
if DESKEWED:
train_img_filename = 'train-images_deskewed.csv'
test_img_filename = 't10k-images_deskewed.csv'
else:
train_img_filename = 'train-images.csv'
test_img_filename = 't10k-images.csv'
train_label_filename = 'train-labels.csv'
test_label_filename = 't10k-labels.csv'
In [4]:
portion = 1.0 # set to 1.0 for all of it, less than 1.0 for less
In [5]:
# read trainX
with open(file_path + train_img_filename,'r') as f:
data_iter = csv.reader(f, delimiter = ',')
data = [data for data in data_iter]
trainX = np.ascontiguousarray(data, dtype = np.float64)
# read trainY
with open(file_path + train_label_filename,'r') as f:
data_iter = csv.reader(f, delimiter = ',')
data = [data for data in data_iter]
trainY = np.ascontiguousarray(data, dtype = np.int8).ravel()
# shuffle trainX & trainY
trainX, trainY = shuffle(trainX, trainY, random_state=0)
# select a subset if asked
if portion < 1.0:
trainX = trainX[:portion*trainX.shape[0]]
trainY = trainY[:portion*trainY.shape[0]]
print("trainX shape: {0}".format(trainX.shape))
print("trainY shape: {0}\n".format(trainY.shape))
print(trainX.flags)
In [6]:
# read testX
with open(file_path + test_img_filename,'r') as f:
data_iter = csv.reader(f, delimiter = ',')
data = [data for data in data_iter]
testX = np.ascontiguousarray(data, dtype = np.float64)
# read testY
with open(file_path + test_label_filename,'r') as f:
data_iter = csv.reader(f, delimiter = ',')
data = [data for data in data_iter]
testY = np.ascontiguousarray(data, dtype = np.int8).ravel()
# shuffle testX, testY
testX, testY = shuffle(testX, testY, random_state=0)
# select a subset if asked
if portion < 1.0:
testX = testX[:portion*testX.shape[0]]
testY = testY[:portion*testY.shape[0]]
print("testX shape: {0}".format(testX.shape))
print("testY shape: {0}".format(testY.shape))
In [7]:
print_imgs(images = trainX,
actual_labels = trainY,
predicted_labels = trainY,
starting_index = np.random.randint(0, high=trainY.shape[0]-36, size=1)[0],
size = 6)
In [8]:
t0 = time.time()
S = scipy.linalg.svd(trainX, compute_uv=False)
pctvar = np.cumsum((S**2)/np.sum(S**2)) # proportion of variance
pct80 = next(x[0] for x in enumerate(pctvar) if x[1] > 0.80)-1
pct85 = next(x[0] for x in enumerate(pctvar) if x[1] > 0.85)-1
pct90 = next(x[0] for x in enumerate(pctvar) if x[1] > 0.90)-1
pct95 = next(x[0] for x in enumerate(pctvar) if x[1] > 0.95)-1
pct99 = next(x[0] for x in enumerate(pctvar) if x[1] > 0.99)-1
S = None
print("\ntime in minutes {0:.2f}".format((time.time()-t0)/60))
In [9]:
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111)
plt.title("Number of princomps for percent of variance explained")
plt.axvline(x=pct80)
plt.axvline(x=pct85)
plt.axvline(x=pct90)
plt.axvline(x=pct95)
plt.axvline(x=pct99)
plt.axvline(x=trainX.shape[1],linestyle="--")
plt.plot(pctvar)
x1,x2,y1,y2 = plt.axis()
plt.axis((x1, x2, 0.4, 1.1))
ax.annotate(str(pct80), xy=(pct80, 0.8), xycoords='data',
xytext=(-50, 30), textcoords='offset points',
arrowprops=dict(arrowstyle="->")
)
ax.annotate("80%", xy=(pct80, 0.6), xycoords='data',
xytext=(-5, 0), textcoords='offset points')
ax.annotate(str(pct85), xy=(pct85, 0.85), xycoords='data',
xytext=(-50, 30), textcoords='offset points',
arrowprops=dict(arrowstyle="->")
)
ax.annotate("85%", xy=(pct85, 0.7), xycoords='data',
xytext=(-5, 0), textcoords='offset points')
ax.annotate(str(pct90), xy=(pct90, 0.9), xycoords='data',
xytext=(-50, 30), textcoords='offset points',
arrowprops=dict(arrowstyle="->")
)
ax.annotate("90%", xy=(pct90, 0.80), xycoords='data',
xytext=(-5, 0), textcoords='offset points')
ax.annotate(str(pct95), xy=(pct95, 0.95), xycoords='data',
xytext=(-50, 30), textcoords='offset points',
arrowprops=dict(arrowstyle="->")
)
ax.annotate("95%", xy=(pct95, 0.85), xycoords='data',
xytext=(-5, 0), textcoords='offset points')
ax.annotate(str(pct99), xy=(pct99, 0.99), xycoords='data',
xytext=(-50, 30), textcoords='offset points',
arrowprops=dict(arrowstyle="->")
)
ax.annotate("99%", xy=(pct99, 0.95), xycoords='data',
xytext=(-5, 0), textcoords='offset points')
plt.show()
In [10]:
table=[[80,pct80, pct80/trainX.shape[1]*100, (1-pct80/trainX.shape[1])*100],
[85,pct85, pct85/trainX.shape[1]*100, (1-pct85/trainX.shape[1])*100],
[90,pct90, pct90/trainX.shape[1]*100, (1-pct90/trainX.shape[1])*100],
[95,pct95, pct95/trainX.shape[1]*100, (1-pct95/trainX.shape[1])*100],
[99,pct99, pct99/trainX.shape[1]*100, (1-pct99/trainX.shape[1])*100]]
print tabulate(np.round(table), headers=["Pct Variance","Princomps","Percent", "Pct Reduction"])
In [11]:
pca = PCA(n_components=0.85, whiten=True)
trainX = pca.fit_transform(trainX)
testX = pca.transform(testX)
print("trainX shape: {0}".format(trainX.shape))
print("trainY shape: {0}\n".format(trainY.shape))
print("testX shape: {0}".format(testX.shape))
print("testY shape: {0}".format(testY.shape))
In [12]:
# default parameters for SVC
# ==========================
default_svc_params = {}
default_svc_params['C'] = 1.0 # penalty
default_svc_params['class_weight'] = None # Set the parameter C of class i to class_weight[i]*C
# set to 'auto' for unbalanced classes
default_svc_params['gamma'] = 0.0 # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
default_svc_params['kernel'] = 'rbf' # 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable
default_svc_params['shrinking'] = True # Whether to use the shrinking heuristic.
default_svc_params['probability'] = False # Whether to enable probability estimates.
default_svc_params['tol'] = 0.001 # Tolerance for stopping criterion.
default_svc_params['cache_size'] = 200 # size of the kernel cache (in MB).
default_svc_params['max_iter'] = -1 # limit on iterations within solver, or -1 for no limit.
default_svc_params['verbose'] = False
default_svc_params['degree'] = 3 # 'poly' only
default_svc_params['coef0'] = 0.0 # 'poly' and 'sigmoid' only
# set parameters for the estimator
svc_params = dict(default_svc_params)
svc_params['cache_size'] = 2000
# the classifier
svc_clf = SVC(**svc_params)
In [13]:
t0 = time.time()
# search grid
# ===========
search_grid = dict(C = np.logspace(-1, 3, 50),
gamma = np.logspace(-4, 0, 50))
# stratified K-Fold indices
# =========================
SKFolds = StratifiedKFold(y = trainY,
n_folds = 3,
indices = None)
# default parameters for RandomizedSearchCV
# =========================================
default_random_params = {}
default_random_params['scoring'] = None
default_random_params['fit_params'] = None # dict of parameters to pass to the fit method
default_random_params['n_jobs'] = 1
default_random_params['pre_dispatch'] = '2*n_jobs' # memory is copied this many times
# reduce if you're running into memory problems
default_random_params['iid'] = True # assume the folds are iid
default_random_params['refit'] = True # Refit the best estimator with the entire dataset
default_random_params['cv'] = None
default_random_params['verbose'] = 0
default_random_params['n_iter'] = 10
# set parameters for the randomized grid search
# =============================================
random_params = dict(default_random_params)
random_params['verbose'] = 1
random_params['cv'] = SKFolds
random_params['n_jobs'] = -1 # -1 => use all available cores
# one core per fold
# for each point in the grid
random_params['n_iter'] = 100 # choose this many random combinations of parameters
# from the 'search_grid'
# perform the randomized parameter grid search
# ============================================
random_search = RandomizedSearchCV(estimator = svc_clf,
param_distributions = search_grid,
**random_params)
random_search.fit(trainX, trainY)
# ===========================
# save the classifier to disk
# ===========================
pickle.dump(random_search, open('SVC_RBF_PCA.pkl', 'wb'))
print("\ntime in minutes {0:.2f}".format((time.time()-t0)/60))
In [14]:
# =================================
# retrieve the classifier from disk
# =================================
random_search = None
random_search = pickle.load(open('SVC_RBF_PCA.pkl', 'rb'))
# how many duds?
mean_score_list = [score.mean_validation_score for score in random_search.grid_scores_]
print("\nProportion of random scores below 98%: {0:.2f}\n".format(sum(np.array(mean_score_list)<0.98)/len(mean_score_list)))
# what do the best ones look like?
for score in sorted(random_search.grid_scores_, key=itemgetter(1), reverse=True)[:10]:
print score
This script was extensively modified to work with the score results from RandomizedSearchCV
In [15]:
from matplotlib.colors import Normalize
class MidpointNormalize(Normalize):
"""Utility function to move the midpoint of a colormap to be around the values of interest."""
def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
self.midpoint = midpoint
Normalize.__init__(self, vmin, vmax, clip)
def __call__(self, value, clip=None):
x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
return np.ma.masked_array(np.interp(value, x, y))
# --------------------------------------------------------------------------------
# skip this many parameter values on the display axes
tick_step_size_C = math.ceil(len(search_grid['C']) / 15)
tick_step_size_gamma = math.ceil(len(search_grid['gamma']) / 15)
# create 'heatmap'
# ================
# a C x gamma matrix; initially all zeros (black)
heatmap = np.zeros((len(search_grid['C']), len(search_grid['gamma'])))
# for each score, find the index in 'heatmap' of the 'C' and 'gamma' values
# at that index intersection put the mean score
for score in random_search.grid_scores_:
# index of C and gamma in 'search_grid'
ceeinx = search_grid['C'].tolist().index(score[0]['C'])
gaminx = search_grid['gamma'].tolist().index(score[0]['gamma'])
heatmap[ceeinx, gaminx] = score[1]
# display the heatmap
# ===================
plt.figure(figsize=(10, 8))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(heatmap, interpolation='nearest', cmap=plt.cm.hot,
norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
# label the axes
plt.xticks(np.arange(0, len(search_grid['gamma']), tick_step_size_gamma),
search_grid['gamma'][::tick_step_size_gamma],
rotation=45)
plt.yticks(np.arange(0, len(search_grid['C']), tick_step_size_C),
search_grid['C'][::tick_step_size_C])
# cross hairs
ceeinx = search_grid['C'].tolist().index(random_search.best_params_['C'])
plt.axhline(y=ceeinx)
gaminx = search_grid['gamma'].tolist().index(random_search.best_params_['gamma'])
plt.axvline(x=gaminx)
plt.title('Parameter-pair accuracy')
plt.show()
print("\nThe best parameters are %s\nwith a score of %0.2f, misclass of %0.4f"
% (random_search.best_params_, random_search.best_score_, 1-random_search.best_score_))
In [16]:
target_names = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
predicted_values = random_search.predict(testX)
y_true, y_pred = testY, predicted_values
print(classification_report(y_true, y_pred, target_names=target_names))
In [17]:
def plot_confusion_matrix(cm,
target_names,
title='Proportional Confusion matrix',
cmap=plt.cm.Paired):
"""
given a confusion matrix (cm), make a nice plot
see the skikit-learn documentation for the original done for the iris dataset
"""
plt.figure(figsize=(8, 6))
plt.imshow((cm/cm.sum(axis=1)), interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45)
plt.yticks(tick_marks, target_names)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
# --------------------------------------------------------------------------------------------
cm = confusion_matrix(y_true, y_pred)
print(cm)
model_accuracy = sum(cm.diagonal())/len(testY)
model_misclass = 1 - model_accuracy
print("\nModel accuracy: {0}, model misclass rate: {1}".format(model_accuracy, model_misclass))
plot_confusion_matrix(cm, target_names)
The red line shows how well we fit the training data. The larger the score, the lower the bias. We expect the red line to start very near to 1.0 since we ought to be able to fit just a few points very well. We expect the red line to decline slightly since more points to fit requires a more complex model.
The green line shows the accuracy of the predictions of the test set. We expect it to start much lower than the red line but to increase continuously as the amount of training data used to create the model grows. An appropriate algorithm, correctly parameterized should push the green line higher and higher as we train with more training data. The best case is for the red line to decline only very slightly from 1.0 and for the green line to rise to intersect the red line.
A red line that starts below 1.0 and/or declines steeply indicates bias, a model that does not even fit the data it already knows the answer for. In addition to reviewing whether the algorithm is appropriate and whether it is optimally parameterized you may consider ways to increase the number of useful predictor variables.
A red line that hugs the top but for which the green line does not rise to meet it indicates overfitting.
In [18]:
t0 = time.time()
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
"""
Generate a simple plot of the test and training learning curve.
Parameters
----------
estimator : object type that implements the "fit" and "predict" methods
An object of that type which is cloned for each validation.
title : string
Title for the chart.
X : array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape (n_samples) or (n_samples, n_features), optional
Target relative to X for classification or regression;
None for unsupervised learning.
ylim : tuple, shape (ymin, ymax), optional
Defines minimum and maximum yvalues plotted.
cv : integer, cross-validation generator, optional
If an integer is passed, it is the number of folds (defaults to 3).
Specific cross-validation objects can be passed, see
sklearn.cross_validation module for the list of possible objects
n_jobs : integer, optional
Number of jobs to run in parallel (default 1).
"""
plt.figure(figsize=(10, 6))
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation score")
plt.legend(loc="best")
return plt
# --------------------------------------------------------------------------------
C_gamma = "C=" + str(np.round(random_search.best_params_['C'],4)) + \
", gamma=" + str(np.round(random_search.best_params_['gamma'],6))
plot_learning_curve(estimator = random_search.best_estimator_,
title = "Learning Curves (SVM, RBF, " + C_gamma + ")",
X = trainX,
y = trainY,
ylim = (0.85, 1.01),
cv = ShuffleSplit(n = trainX.shape[0],
n_iter = 5,
test_size = 0.2,
random_state = 0),
train_sizes = np.linspace(.1, 1.0, 5),
n_jobs = -1)
plt.show()
print("\ntime in minutes {0:.2f}".format((time.time()-t0)/60))
In [19]:
t0 = time.time()
from sklearn.learning_curve import validation_curve
from sklearn.cross_validation import ShuffleSplit
for param_name, param_range in zip(["gamma","C"], [np.linspace(search_grid['gamma'][0],search_grid['gamma'][-1],10),
np.linspace(search_grid['C'][0],search_grid['C'][-1],10)]):
train_scores, test_scores = validation_curve(estimator = random_search.best_estimator_,
X = trainX,
y = trainY,
param_name = param_name,
param_range = param_range,
cv = ShuffleSplit(n = trainX.shape[0],
n_iter = 5,
test_size = 0.2,
random_state = 0),
scoring = 'accuracy',
n_jobs = -1,
pre_dispatch = '2*n_jobs',
verbose = 0)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.figure(figsize=(10,6))
plt.title("Validation Curve for " + param_name)
plt.xlabel(param_name)
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.semilogx(param_range,
train_scores_mean,
label="training score", color="r")
plt.fill_between(param_range,
train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std,
alpha=0.2, color="r")
plt.semilogx(param_range,
test_scores_mean,
label="test score", color="g")
plt.fill_between(param_range,
test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std,
alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()
print("\nThe best parameters chosen were %s\nwith a score of %0.2f, misclass of %0.4f"
% (random_search.best_params_, random_search.best_score_, 1-random_search.best_score_))
print("\ntime in minutes {0:.2f}".format((time.time()-t0)/60))