In [2]:
# Setup code for this notebook
import random
import numpy as np
import matplotlib.pyplot as plt
# This is a bit of magic gto make matplotlib figures appear inline
# in the notebook rather than in a new window
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
I use the loading function from course code from Stanford University
Run get_datasets.sh in terminal to download the datasets, or download from Alex Krizhevsky.
get_datasets.sh
# Get CIFAR10 wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz tar -xzvf cifar-10-python.tar.gz rm cifar-10-python.tar.gz
The results of the downloading is showed in following figure.
In [3]:
# Write function to load the cifar-10 data
# The original code is from http://cs231n.github.io/assignment1/
# The function is in data_utils.py file for reusing.
import cPickle as pickle
import numpy as np
import os
def load_CIFAR_batch(filename):
""" load single batch of cifar """
with open(filename, 'r') as f:
datadict = pickle.load(f)
X = datadict['data']
Y = datadict['labels']
X = X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
Y = np.array(Y)
return X, Y
def load_CIFAR10(ROOT):
""" load all of cifar """
xs = []
ys = []
for b in range(1,6):
f = os.path.join(ROOT, 'data_batch_%d' % (b, ))
X, Y = load_CIFAR_batch(f)
xs.append(X)
ys.append(Y)
Xtr = np.concatenate(xs)
Ytr = np.concatenate(ys)
del X, Y
Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
return Xtr, Ytr, Xte, Yte
In [4]:
from algorithms.data_utils import load_CIFAR10
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
def get_CIFAR10_data(num_training=49000, num_val=1000, num_test=10000, show_sample=True):
"""
Load the CIFAR-10 dataset, and divide the sample into training set, validation set and test set
"""
cifar10_dir = 'datasets/datasets-cifar-10/cifar-10-batches-py/'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
# subsample the data for validation set
mask = xrange(num_training, num_training + num_val)
X_val = X_train[mask]
y_val = y_train[mask]
mask = xrange(num_training)
X_train = X_train[mask]
y_train = y_train[mask]
mask = xrange(num_test)
X_test = X_test[mask]
y_test = y_test[mask]
return X_train, y_train, X_val, y_val, X_test, y_test
def subset_classes_data(classes):
# Subset 'plane' and 'car' classes to perform logistic regression
idxs = np.logical_or(y_train_raw == 0, y_train_raw == 1)
X_train = X_train_raw[idxs, :]
y_train = y_train_raw[idxs]
# validation set
idxs = np.logical_or(y_val_raw == 0, y_val_raw == 1)
X_val = X_val_raw[idxs, :]
y_val = y_val_raw[idxs]
# test set
idxs = np.logical_or(y_test_raw == 0, y_test_raw == 1)
X_test = X_test_raw[idxs, :]
y_test = y_test_raw[idxs]
return X_train, y_train, X_val, y_val, X_test, y_test
def visualize_sample(X_train, y_train, classes, samples_per_class=7):
"""visualize some samples in the training datasets """
num_classes = len(classes)
for y, cls in enumerate(classes):
idxs = np.flatnonzero(y_train == y) # get all the indexes of cls
idxs = np.random.choice(idxs, samples_per_class, replace=False)
for i, idx in enumerate(idxs): # plot the image one by one
plt_idx = i * num_classes + y + 1 # i*num_classes and y+1 determine the row and column respectively
plt.subplot(samples_per_class, num_classes, plt_idx)
plt.imshow(X_train[idx].astype('uint8'))
plt.axis('off')
if i == 0:
plt.title(cls)
plt.show()
def preprocessing_CIFAR10_data(X_train, y_train, X_val, y_val, X_test, y_test):
# Preprocessing: reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1)) # [49000, 3072]
X_val = np.reshape(X_val, (X_val.shape[0], -1)) # [1000, 3072]
X_test = np.reshape(X_test, (X_test.shape[0], -1)) # [10000, 3072]
# Normalize the data: subtract the mean image
mean_image = np.mean(X_train, axis = 0)
X_train -= mean_image
X_val -= mean_image
X_test -= mean_image
# Add bias dimension and transform into columns
X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))]).T
X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))]).T
X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))]).T
return X_train, y_train, X_val, y_val, X_test, y_test
# Invoke the above functions to get our data
X_train_raw, y_train_raw, X_val_raw, y_val_raw, X_test_raw, y_test_raw = get_CIFAR10_data()
visualize_sample(X_train_raw, y_train_raw, classes)
subset_classes = ['plane', 'car']
X_train_raw, y_train_raw, X_val_raw, y_val_raw, X_test_raw, y_test_raw = subset_classes_data(subset_classes)
X_train, y_train, X_val, y_val, X_test, y_test = preprocessing_CIFAR10_data(X_train_raw, y_train_raw, X_val_raw, y_val_raw, X_test_raw, y_test_raw)
# As a sanity check, we print out th size of the training and test data dimenstion
print 'Train data shape: ', X_train.shape
print 'Train labels shape: ', y_train.shape
print 'Validation data shape: ', X_val.shape
print 'Validation labels shape: ', y_val.shape
print 'Test data shape: ', X_test.shape
print 'Test labels shape: ', y_test.shape
In [1]:
# Test the loss and gradient and compare between two implementations
from algorithms.classifiers import loss_grad_logistic_naive, loss_grad_logistic_vectorized
import time
# generate a rand weights W
W = np.random.randn(1, X_train.shape[0]) * 0.001
tic = time.time()
loss_naive, grad_naive = loss_grad_logistic_naive(W, X_train, y_train, 0)
toc = time.time()
print 'Naive loss: %f, and gradient: computed in %fs' % (loss_naive, toc - tic)
tic = time.time()
loss_vec, grad_vect = loss_grad_logistic_vectorized(W, X_train, y_train, 0)
toc = time.time()
print 'Vectorized loss: %f, and gradient: computed in %fs' % (loss_vec, toc - tic)
# Compare the gradient, because the gradient is a vector, we canuse the Frobenius norm to compare them
# the Frobenius norm of two matrices is the square root of the squared sum of differences of all elements
diff = np.linalg.norm(grad_naive - grad_vect, ord='fro')
# Randomly choose some gradient to check
idxs = np.random.choice(X_train.shape[0], 10, replace=False)
print idxs
print grad_naive[0, idxs]
print grad_vect[0, idxs]
print 'Gradient difference between naive and vectorized version is: %f' % diff
del loss_naive, loss_vec, grad_naive
In [5]:
# file: algorithms/gradient_check.py
def grad_check_sparse(f, x, analytic_grad, num_checks):
"""
sample a few random elements and only return numerical
in this dimensions.
"""
h = 1e-5
print x.shape
for i in xrange(num_checks):
ix = tuple([randrange(m) for m in x.shape])
print ix
x[ix] += h # increment by h
fxph = f(x) # evaluate f(x + h)
x[ix] -= 2 * h # increment by h
fxmh = f(x) # evaluate f(x - h)
x[ix] += h # reset
grad_numerical = (fxph - fxmh) / (2 * h)
grad_analytic = analytic_grad[ix]
rel_error = abs(grad_numerical - grad_analytic) / (abs(grad_numerical) + abs(grad_analytic))
print 'numerical: %f analytic: %f, relative error: %e' % (grad_numerical, grad_analytic, rel_error)
In [6]:
# Check gradient using numerical gradient along several randomly chosen dimenstion
from algorithms.gradient_check import grad_check_sparse
f = lambda w: loss_grad_logistic_vectorized(w, X_train, y_train, 0)[0]
grad_numerical = grad_check_sparse(f, W, grad_vect, 10)
In [8]:
# Training logistic regression classifier using SGD
from algorithms.classifiers import Logistic
# using BGD algorithm
logistic_bgd = Logistic()
tic = time.time()
losses_bgd = logistic_bgd.train(X_train, y_train, method='bgd', batch_size=200, learning_rate=1e-6,
reg = 1e2, num_iters=1000, verbose=True, vectorized=True)
toc = time.time()
print 'Traning time for BGD with vectorized version is %f \n' % (toc - tic)
# Compute the accuracy of training data and validation data using Logistic.predict function
y_train_pred_bgd = logistic_bgd.predict(X_train)[0]
print 'Training accuracy: %f' % (np.mean(y_train == y_train_pred_bgd))
y_val_pred_bgd = logistic_bgd.predict(X_val)[0]
print 'Validation accuracy: %f' % (np.mean(y_val == y_val_pred_bgd))
# # using SGD algorithm
logistic_sgd = Logistic()
tic = time.time()
losses_sgd = logistic_sgd.train(X_train, y_train, method='sgd', batch_size=200, learning_rate=1e-6,
reg = 1e2, num_iters=1000, verbose=True, vectorized=True)
toc = time.time()
print 'Traning time for SGD with vectorized version is %f \n' % (toc - tic)
y_train_pred_sgd = logistic_sgd.predict(X_train)[0]
print 'Training accuracy: %f' % (np.mean(y_train == y_train_pred_sgd))
y_val_pred_sgd = logistic_sgd.predict(X_val)[0]
print 'Validation accuracy: %f' % (np.mean(y_val == y_val_pred_sgd))
In [8]:
# A useful degugging strategy is to plot the loss
# as a function of iteration number:
from ggplot import *
qplot(xrange(len(losses_bgd)), losses_bgd) + labs(x='Iteration number', y='BGD Loss value')
Out[8]:
In [9]:
qplot(xrange(len(losses_sgd)), losses_sgd) + labs(x='Iteration number', y='SGD Loss value')
Out[9]:
From the computing results, we can see that:
In [10]:
# Using validation set to tuen hyperparameters, i.e., learning rate and regularization strength
learning_rates = [1e-5, 1e-8]
regularization_strengths = [10e2, 10e4]
# Result is a dictionary mapping tuples of the form (learning_rate, regularization_strength)
# to tuples of the form (training_accuracy, validation_accuracy). The accuracy is simply the fraction
# of data points that are correctly classified.
results = {}
best_val = -1
best_logistic = None
# Choose the best hyperparameters by tuning on the validation set
i = 0
interval = 5
for learning_rate in np.linspace(learning_rates[0], learning_rates[1], num=interval):
i += 1
print 'The current iteration is %d/%d' % (i, interval)
for reg in np.linspace(regularization_strengths[0], regularization_strengths[1], num=interval):
logistic = Logistic()
logistic.train(X_train, y_train, method='sgd', batch_size=200, learning_rate=learning_rate,
reg = reg, num_iters=1000, verbose=False, vectorized=True)
y_train_pred = logistic.predict(X_train)[0]
y_val_pred = logistic.predict(X_val)[0]
train_accuracy = np.mean(y_train == y_train_pred)
val_accuracy = np.mean(y_val == y_val_pred)
results[(learning_rate, reg)] = (train_accuracy, val_accuracy)
if val_accuracy > best_val:
best_val = val_accuracy
best_logistic = logistic
else:
pass
# Print out the results
for learning_rate, reg in sorted(results):
train_accuracy,val_accuracy = results[(learning_rate, reg)]
print 'learning rate %e and regularization %e, \n \
the training accuracy is: %f and validation accuracy is: %f.\n' % (learning_rate, reg, train_accuracy, val_accuracy)
In [16]:
# Iterate the process num_iter times
num_iter = 5
reg_val_accu = []
unreg_val_accu = []
reg_test_accu = []
unreg_test_accu = []
for i in xrange(num_iter):
print 'The iteration %d/%d' % (i, num_iter)
logistic_reg = Logistic()
logistic_unreg = Logistic()
losses_temp1 = logistic_reg.train(X_train, y_train, method='sgd', batch_size=200, learning_rate=2.507500e-06,
reg = 1.000000e+03, num_iters=1000, verbose=False, vectorized=True)
losses_temp2 = logistic_unreg.train(X_train, y_train, method='sgd', batch_size=200, learning_rate=2.507500e-06,
reg = 0, num_iters=1000, verbose=False, vectorized=True)
y_val_pred_reg = logistic_reg.predict(X_val)[0]
y_val_pred_unreg = logistic_unreg.predict(X_val)[0]
y_test_pred_reg = logistic_reg.predict(X_test)[0]
y_test_pred_unreg = logistic_unreg.predict(X_test)[0]
reg_val_accu.append(np.mean(y_val_pred_reg == y_val))
unreg_val_accu.append(np.mean(y_val_pred_unreg == y_val))
reg_test_accu.append(np.mean(y_test_pred_reg == y_test))
unreg_test_accu.append(np.mean(y_test_pred_unreg == y_test))
print 'Regularized validation accuracy: ', reg_val_accu
print 'Unregularized validation accuracy: ', unreg_val_accu
print 'Regularized test accuracy: ', reg_test_accu
print 'Unregularized test accuracy: ', unreg_test_accu
In [17]:
y_test_predict_result = best_logistic.predict(X_test)
y_test_predict = y_test_predict_result[0]
test_accuracy = np.mean(y_test == y_test_predict)
print 'The test accuracy is: %f' % test_accuracy
y_test_scores = y_test_predict_result[1]
print y_test_scores
qplot(xrange(len(y_test_scores)), y_test_scores) + labs(x='Sample number', y='Scores')
Out[17]:
In [18]:
idx_wrong = np.flatnonzero(y_test != y_test_predict) # 1-dimenstion array
X_test_wrong = X_test_raw[idx_wrong]
y_test_wrong = y_test_raw[idx_wrong]
X_test_right = X_test_raw[-idx_wrong]
y_test_right = y_test_raw[-idx_wrong]
scores_wrong = y_test_scores[idx_wrong]
scores_right = y_test_scores[-idx_wrong]
print X_test_wrong.shape
print X_test_raw.shape
print y_test_wrong.shape
print y_test_raw.shape
qplot(xrange(len(scores_wrong)), scores_wrong) + labs(x='Sample number', y='Wrong Scores')
qplot(xrange(len(scores_right)), scores_right) + labs(x='Sample number', y='Right Scores')
Out[18]:
In [19]:
# show the first 10 image
# clases = ['plane', 'car']
classes = ['plane', 'car']
visualize_sample(X_test_wrong, y_test_wrong, classes)
visualize_sample(X_test_right, y_test_right, classes)
In [20]:
plt.hist(scores_wrong)
plt.xlabel('Wrong Scores')
Out[20]:
In [21]:
plt.hist(scores_right)
plt.xlabel('Right Scores')
Out[21]:
In [22]:
In [ ]: