In [ ]:
# This tutorial covers the simplest neural network: a multilayer perceptron (MLP),
# also known as a feedforward neural network.
# We will learn to classify MNIST handwritten digit images into their correct label (0-9).
from IPython.display import Image
# First, let's load our data and take a look!
import pickle
# Load our data
# Download and unzip pickled version from here: http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
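# If you don't have the file yet, here is a minimal sketch for fetching it with the standard library
# (kept commented out so it doesn't re-download; assumes a 'data/' directory already exists):
# import urllib, gzip, shutil
# urllib.urlretrieve("http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz", "data/mnist.pkl.gz")
# with gzip.open("data/mnist.pkl.gz", 'rb') as f_in, open("data/mnist.pkl", 'wb') as f_out:
#     shutil.copyfileobj(f_in, f_out)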
(train_x, train_y), (valid_x, valid_y), (test_x, test_y) = pickle.load(open('data/mnist.pkl', 'rb'))
print "Shapes:"
print train_x.shape, train_y.shape
print valid_x.shape, valid_y.shape
print test_x.shape, test_y.shape
print "--------------"
print "Example input:"
print train_x[0]
print "Example label:"
print train_y[0]
In [ ]:
# Show example images - using the tile_raster_images helper function from OpenDeep to turn the 784-element arrays into 28x28 images.
from utils import tile_raster_images
from PIL import Image as pil_img
input_images = train_x[:25]
im = pil_img.fromarray(
    tile_raster_images(input_images,
                       img_shape=(28, 28),
                       tile_shape=(1, 25),
                       tile_spacing=(1, 1))
)
im.save("some_mnist_numbers.png")
Image(filename="some_mnist_numbers.png")
In [ ]:
# Cool, now that we know a little about the input data, let's design the MLP to work with it!
# An MLP looks like this: input -> hiddens -> output classification
# Each stage is just a matrix multiplication with a nonlinear function applied after.
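# Written out, the full model we are about to build is:
#   h = tanh(x.dot(W_x) + b_h)        # input -> hidden
#   y = softmax(h.dot(W_h) + b_y)     # hidden -> output class probabilities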
# Your basic Theano imports.
import theano
import theano.tensor as T
# Inputs are matrices where rows are examples and columns are pixels - so create a symbolic Theano matrix.
x = T.matrix('x')
In [ ]:
# Now let's start building the equation for our MLP!
# The first transformation is the input x -> hidden layer h.
# We define this transformation as h = tanh(x.dot(W_x) + b_h)
# where the learnable model parameters are W_x and b_h.
# Therefore, we will need a weights matrix W_x and a bias vector b_h.
# W_x has shape (input_size, hidden_size) and b_h has shape (hidden_size,).
# Initialization is important in deep learning; we want something random so the model doesn't get stuck early.
# There are many papers on this subject, but for now we will just use a normal distribution with mean=0 and std=0.05.
# Another good option for tanh layers is a uniform distribution over the interval +- sqrt(6/sum(shape))
# (known as Glorot/Xavier initialization) - see the commented sketch below.
# These are hyperparameters to play with.
# Bias starting as zero is fine.
import numpy
import numpy.random as rng
W_x = numpy.asarray(rng.normal(loc=0.0, scale=.05, size=(28*28, 500)), dtype=theano.config.floatX)
b_h = numpy.zeros(shape=(500,), dtype=theano.config.floatX)
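# For comparison, the uniform alternative mentioned above would look like this
# (kept commented out so we stick with the normal initialization; same shapes as above):
# interval = numpy.sqrt(6.0 / (28*28 + 500))   # +- sqrt(6/sum(shape))
# W_x = numpy.asarray(rng.uniform(low=-interval, high=interval, size=(28*28, 500)),
#                     dtype=theano.config.floatX)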
# To update a variable used in an equation (for example, while learning),
# Theano needs it to be in a special wrapper called a shared variable.
# These are the model parameters for our first hidden layer!
W_x = theano.shared(W_x, name="W_x")
b_h = theano.shared(b_h, name="b_h")
# Now, we can finally write the equation to give our symbolic hidden layer h!
h = T.tanh(
    T.dot(x, W_x) + b_h
)
# Side note - if we skipped the hidden layer and applied softmax directly to the input transformation, the model would just be multiclass logistic regression!
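# Quick sanity check (optional): compile h into a callable function and confirm
# the hidden layer has the shape we expect. Not needed for training below.
f_h = theano.function(inputs=[x], outputs=h, allow_input_downcast=True)
print f_h(train_x[:5]).shape  # expect (5, 500)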
In [ ]:
# We have the hidden layer h, let's put that softmax layer on top for classification output y!
# Same deal as before, the transformation is defined as:
# y = softmax(h.dot(W_h) + b_y)
# where the learnable parameters are W_h and b_y.
# W_h has shape (hidden_size, output_size) and b_y has shape (output_size,).
# We will use the same random initialization strategy as before.
W_h = numpy.asarray(rng.normal(loc=0.0, scale=.05, size=(500, 10)), dtype=theano.config.floatX)
b_y = numpy.zeros(shape=(10,), dtype=theano.config.floatX)
# Don't forget to make them shared variables!
W_h = theano.shared(W_h, name="W_h")
b_y = theano.shared(b_y, name="b_y")
# Now write the equation for the output!
y = T.nnet.softmax(
    T.dot(h, W_h) + b_y
)
# The output (due to softmax) is a matrix of class probabilities - one row of 10 probabilities per input example.
# To get the output class 'guess' from the model, just take the index of the highest probability!
y_hat = T.argmax(y, axis=1)
# That's everything! Just four model parameters and one input variable.
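# Quick sanity check (optional): each row of y should be a probability distribution
# summing to 1. Not needed for training below.
f_y = theano.function(inputs=[x], outputs=y, allow_input_downcast=True)
print f_y(train_x[:2]).sum(axis=1)  # expect values very close to 1.0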
In [ ]:
# The variable y_hat represents the output of running our model, but we need a cost function to use for training.
# For a softmax (probability) output, we want to maximize the likelihood P(Y=y|X).
# This means we want to minimize the negative log-likelihood cost! (For a primer, see the Machine Learning course on Coursera.)
# Cost functions always need the truth outputs to compare against (this is supervised learning).
# From before, we saw the labels were a vector of ints - so let's make a symbolic variable for this!
correct_labels = T.ivector("labels") # integer vector
# Now we can compare our output probability from y with the true labels.
# Because the labels are integers, we will use an indexing trick to pick out the probability
# our model assigned to the correct label for each example (see the plain-numpy example at the end of this cell).
log_likelihood = T.log(y)[T.arange(correct_labels.shape[0]), correct_labels]
# We use mean instead of sum to be less dependent on batch size (better for flexibility)
cost = -T.mean(log_likelihood)
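# To see what that indexing trick does, here is the same thing in plain numpy
# (a toy example, separate from the model):
toy_probs = numpy.array([[0.1, 0.7, 0.2],
                         [0.3, 0.3, 0.4]])
toy_labels = numpy.array([1, 2])
print toy_probs[numpy.arange(2), toy_labels]  # [0.7, 0.4] - each row's probability of its correct label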
In [ ]:
# The easiest way to train neural nets is with Stochastic Gradient Descent (SGD).
# This takes a (mini)batch of examples, calculates the gradient of the cost with respect to the parameters,
# and changes the parameters a small amount in the direction opposite the gradient (downhill on the cost).
# Fancier add-ons to stochastic gradient descent reduce the learning rate over time, add a momentum
# factor to the parameter updates, etc. (a momentum sketch appears at the end of this cell).
# Before we can start training, we need to know what the gradients are.
# Luckily we don't have to do any math! Theano has symbolic auto-differentiation which means it can
# calculate the gradients for arbitrary equations with respect to a cost and parameters.
parameters = [W_x, b_h, W_h, b_y]
gradients = T.grad(cost, parameters)
# Now gradients contains the list of derivatives: [d_cost/d_W_x, d_cost/d_b_h, d_cost/d_W_h, d_cost/d_b_y]
# One last thing we need to do before training is to use these gradients to update the parameters!
# Remember how parameters are shared variables? Well, Theano uses something called updates,
# which are just pairs of (shared_variable, new_variable_expression) that tell it how to change their values.
# So, let's create these updates to show how we change the parameter values during training with gradients!
# We use a learning rate to make small steps over time.
learning_rate = 0.01
train_updates = [(param, param - learning_rate*gradient) for param, gradient in zip(parameters, gradients)]
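# As a sketch of the momentum add-on mentioned earlier: give each parameter a
# 'velocity' shared variable that accumulates a running update direction.
# These updates are built but not used below - shown only to illustrate the mechanism.
momentum = 0.9  # a typical value; another hyperparameter to play with
velocities = [theano.shared(numpy.zeros_like(param.get_value())) for param in parameters]
momentum_updates = []
for param, gradient, velocity in zip(parameters, gradients, velocities):
    new_velocity = momentum*velocity - learning_rate*gradient
    momentum_updates.append((velocity, new_velocity))
    momentum_updates.append((param, param + new_velocity))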
In [ ]:
# Now we can create a Theano function that takes in real inputs and trains our model.
f_train = theano.function(inputs=[x, correct_labels], outputs=cost, updates=train_updates, allow_input_downcast=True)
# For testing purposes, we don't want the updates to change the parameters - so create a separate function!
# We also care more about the output guesses, so let's return those instead of the cost.
# (You could also compute a misclassification error symbolically - see the sketch at the end of this cell.)
f_test = theano.function(inputs=[x], outputs=y_hat, allow_input_downcast=True)
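# Sketch of the symbolic error mentioned above: Theano can return the misclassification
# rate directly. This is equivalent to the accuracy bookkeeping done by hand below.
error = T.mean(T.neq(y_hat, correct_labels))
f_error = theano.function(inputs=[x, correct_labels], outputs=error, allow_input_downcast=True)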
In [ ]:
# Our training can begin!
# The two hyperparameters we have for this part are minibatch size (how many examples to process in parallel)
# and the total number of passes over all examples (epochs).
batch_size = 100
epochs = 30
# Given our batch size, compute how many full batches fit in each data set
# (integer division drops any leftover partial batch).
train_batches = len(train_x) // batch_size
valid_batches = len(valid_x) // batch_size
test_batches = len(test_x) // batch_size
# Our main training loop!
for epoch in range(epochs):
    print epoch+1, ":",

    train_costs = []
    train_accuracy = []
    for i in range(train_batches):
        # Grab our minibatch of examples from the whole train set.
        batch_x = train_x[i*batch_size:(i+1)*batch_size]
        batch_labels = train_y[i*batch_size:(i+1)*batch_size]
        # Compute the costs from the train function (which also updates the parameters)
        costs = f_train(batch_x, batch_labels)
        # Compute the predictions from the test function (which does not update parameters)
        preds = f_test(batch_x)
        # Compute the accuracy of our predictions against the correct batch labels
        acc = sum(preds == batch_labels) / float(len(batch_labels))
        train_costs.append(costs)
        train_accuracy.append(acc)
    # Show the mean cost and accuracy across minibatches (the entire train set!)
    print "cost:", numpy.mean(train_costs), "\ttrain:", str(numpy.mean(train_accuracy)*100)+"%",

    valid_accuracy = []
    for i in range(valid_batches):
        batch_x = valid_x[i*batch_size:(i+1)*batch_size]
        batch_labels = valid_y[i*batch_size:(i+1)*batch_size]
        preds = f_test(batch_x)
        acc = sum(preds == batch_labels) / float(len(batch_labels))
        valid_accuracy.append(acc)
    print "\tvalid:", str(numpy.mean(valid_accuracy)*100)+"%",

    test_accuracy = []
    for i in range(test_batches):
        batch_x = test_x[i*batch_size:(i+1)*batch_size]
        batch_labels = test_y[i*batch_size:(i+1)*batch_size]
        preds = f_test(batch_x)
        acc = sum(preds == batch_labels) / float(len(batch_labels))
        test_accuracy.append(acc)
    print "\ttest:", str(numpy.mean(test_accuracy)*100)+"%"
In [ ]: