Adapt from the Stanford CS231n assignment1, find the original version on the course website.
In this exercise we will:
In [1]:
import random
import time
import numpy as np
import matplotlib.pyplot as plt
from cs231n.data_utils import load_CIFAR10
from cs231n.gradient_check import grad_check_sparse
# plotting setting
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
In [2]:
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
"""
Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
it for the linear classifier. These are the same steps as we used for the
SVM, but condensed to a single function.
"""
# Load the raw CIFAR-10 data
cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
# subsample the data
mask = range(num_training, num_training + num_validation)
X_val = X_train[mask]
y_val = y_train[mask]
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]
# Preprocessing: reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_val = np.reshape(X_val, (X_val.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
# Normalize the data: subtract the mean image
mean_image = np.mean(X_train, axis = 0)
X_train -= mean_image
X_val -= mean_image
X_test -= mean_image
# add bias dimension and transform into columns
X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))]).T
X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))]).T
X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))]).T
return X_train, y_train, X_val, y_val, X_test, y_test
# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
print('Train data shape:', X_train.shape)
print('Train labels shape:', y_train.shape)
print('Validation data shape:', X_val.shape)
print('Validation labels shape:', y_val.shape)
print('Test data shape:', X_test.shape)
print('Test labels shape:', y_test.shape)
In [3]:
class LinearClassifier:
def __init__(self):
self.W = None
def train(
self, X, y,
learning_rate=1e-3, reg=1e-5, num_iters=100, batch_size=200,
verbose=False, seed=None
):
"""Train this linear classifier using stochastic gradient descent.
Inputs:
- X: D x N array of training data. Each training point is a
D-dimensional column.
- y: 1-dimensional array of length N with labels 0...K-1 for K classes.
- learning_rate: (float) learning rate for optimization.
- reg: (float) regularization strength.
- num_iters: (integer) number of steps to take when optimizing.
- batch_size: (integer) number of training examples to use
at each step.
- verbose: (boolean) If true, print progress during optimization.
Outputs:
A list containing the value of the loss function at each training
iteration.
"""
dim, num_train = X.shape
# assume y takes values 0...K-1 where K is number of classes
num_classes = np.max(y) + 1
if self.W is None:
# lazily initialize W
self.W = np.random.randn(num_classes, dim) * 0.001
batch_rs = np.random.RandomState(seed)
# Run stochastic gradient descent to optimize W
loss_history = []
for it in range(num_iters):
batch_ix = batch_rs.choice(
np.arange(num_train),
size=batch_size, replace=True
)
X_batch = X[:, batch_ix]
y_batch = y[batch_ix]
# evaluate loss and gradient, internally use self.W
loss, grad = self.loss(X_batch, y_batch, reg)
loss_history.append(loss)
# perform parameter update
self.W -= grad * learning_rate
if verbose and it % 100 == 0:
print('iteration %d / %d: loss %f' % (it, num_iters, loss))
return loss_history
def predict(self, X):
"""
Use the trained weights of this linear classifier to predict labels for
data points.
Inputs:
- X: D x N array of training data. Each column is a D-dimensional point.
Returns:
- y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional
array of length N, and each element is an integer giving the predicted
class.
"""
score_pred = self.W.dot(X) # shape: C x N
y_pred = score_pred.argmax(axis=0)
return y_pred
def loss(self, X_batch, y_batch, reg):
"""
Compute the loss function and its derivative.
Subclasses will override this.
Inputs:
- X_batch: D x N array of data; each column is a data point.
- y_batch: 1-dimensional array of length N with labels 0...K-1, for K classes.
- reg: (float) regularization strength.
Returns: A tuple containing:
- loss as a single float
- gradient with respect to self.W; an array of the same shape as W
"""
raise NotImplementedError
In [4]:
def softmax_loss_naive(W, X, y, reg):
"""
Softmax loss function, naive implementation (with loops)
Inputs:
- W: C x D array of weights
- X: D x N array of data. Data are D-dimensional columns
- y: 1-dimensional array of length N with labels 0...K-1, for K classes
- reg: (float) regularization strength
Returns:
a tuple of:
- loss as single float
- gradient with respect to weights W, an array of same size as W
"""
# Initialize the loss and gradient to zero.
loss = 0.0
dW = np.zeros_like(W)
num_train = X.shape[1]
f = W.dot(X) # shape: C x N
p = np.zeros(num_train, dtype=np.float)
for i in range(num_train):
f_i = f[:, i].copy() # shape C x 1
f_i -= np.max(f_i) # improve numerical stability
f_i = np.exp(f_i)
x_i = X[:, i]
all_class_p_i = f_i / np.sum(f_i)
p[i] = all_class_p_i[y[i]]
# Update gradient
# all_class_p_i no used later, don't copy
dw_x_weight_i = all_class_p_i
dw_x_weight_i[y[i]] -= 1
dW -= dw_x_weight_i[:, np.newaxis] * x_i[np.newaxis, :]
loss += np.mean(-np.log(p))
# Add regularization
loss += 0.5 * reg * np.sum(W * W)
# Gradient
# ref: http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression
dW /= -num_train
dW += reg * W
return loss, dW
In [5]:
%%time
# Generate a random softmax weight matrix and use it to compute the loss.
W = np.random.randn(10, 3073) * 0.0001
loss, grad = softmax_loss_naive(W, X_train, y_train, 0.0)
# As a rough sanity check, our loss should be something close to -log(0.1).
print('loss: %f' % loss)
print('sanity check: %f' % (-np.log(0.1)))
In [6]:
%%time
# Complete the implementation of softmax_loss_naive and implement a (naive)
# version of the gradient that uses nested loops.
loss, grad = softmax_loss_naive(W, X_train, y_train, 0.0)
# As we did for the SVM, use numeric gradient checking as a debugging tool.
# The numeric gradient should be close to the analytic gradient.
f = lambda w: softmax_loss_naive(w, X_train, y_train, 0.0)[0]
grad_numerical = grad_check_sparse(f, W, grad, 5) # increase 5 here to check for more times
In [7]:
def softmax_loss_vectorized(W, X, y, reg):
"""
Softmax loss function, vectorized version.
Inputs and outputs are the same as softmax_loss_naive.
"""
# Initialize the loss and gradient to zero.
loss = 0.0
dW = np.zeros_like(W)
num_train = X.shape[1]
_train_ix = np.arange(num_train) # for sample coord 0...N-1
f = W.dot(X) # shape: C x N
f -= np.max(f, axis=0) # improve numerical stability
f = np.exp(f)
p = f / np.sum(f, axis=0) # shape: C x N
# loss function
loss += np.mean(-np.log(p[y, _train_ix]))
loss += 0.5 * reg * np.sum(W * W)
# gradient
# ref: http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression
dW_x_weight = p # no use p later, don't copy
dW_x_weight[y, _train_ix] -= 1
# CxD -= CxN dot NxD
dW -= dW_x_weight.dot(X.T)
dW /= -num_train
dW += reg * W
return loss, dW
In [8]:
# Now that we have a naive implementation of the softmax loss function and its gradient,
# implement a vectorized version in softmax_loss_vectorized.
# The two versions should compute the same results, but the vectorized version should be
# much faster.
tic = time.time()
loss_naive, grad_naive = softmax_loss_naive(W, X_train, y_train, 0.00001)
toc = time.time()
print('naive loss: %e computed in %fs' % (loss_naive, toc - tic))
tic = time.time()
loss_vectorized, grad_vectorized = softmax_loss_vectorized(W, X_train, y_train, 0.00001)
toc = time.time()
print('vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))
# The losses should match but your vectorized implementation should be much faster.
# The loss is a single number, so it is easy to compare the values computed
# by the two implementations. The gradient on the other hand is a matrix, so
# we use the Frobenius norm to compare them.
grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
print('Loss difference: %f' % np.abs(loss_naive - loss_vectorized))
print('Gradient difference: %f' % grad_difference)
In [9]:
class Softmax(LinearClassifier):
""" A subclass that uses the Softmax + Cross-entropy loss function """
def loss(self, X_batch, y_batch, reg):
return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)
In [10]:
# Now implement SGD in LinearSVM.train() function and run it with the code below
softmax = Softmax()
tic = time.time()
loss_hist = softmax.train(
X_train, y_train,
learning_rate = 5e-8, reg=1e3,
# better params:
# learning_rate=5e-7, reg=5e4,
num_iters=1500, seed=9527, verbose=True
)
toc = time.time()
print('That took %fs' % (toc - tic))
In [11]:
# A useful debugging strategy is to plot the loss as a function of
# iteration number:
plt.plot(loss_hist)
plt.xlabel('Iteration number')
plt.ylabel('Loss value')
plt.show()
In [12]:
y_train_pred = softmax.predict(X_train)
print('training accuracy: {:.3%}'.format(np.mean(y_train == y_train_pred)))
y_val_pred = softmax.predict(X_val)
print('validation accuracy: {:.3%}'.format(np.mean(y_val == y_val_pred)))
In [13]:
# evaluate on test set
# Evaluate the best svm on test set
y_test_pred = softmax.predict(X_test)
test_accuracy = np.mean(y_test == y_test_pred)
print('softmax on raw pixels final test set accuracy: %f' % (test_accuracy, ))
In [14]:
# Visualize the learned weights for each class
w = softmax.W[:,:-1] # strip out the bias
w = w.reshape(10, 32, 32, 3)
w_min, w_max = np.min(w), np.max(w)
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
for i in range(10):
plt.subplot(2, 5, i + 1)
# Rescale the weights to be between 0 and 255
wimg = 255.0 * (w[i].squeeze() - w_min) / (w_max - w_min)
plt.imshow(wimg.astype('uint8'))
plt.axis('off')
plt.title(classes[i])
plt.show()
In [ ]: