In [1]:
import numpy as np # For matrix/vector operations
import gzip # For un-zipping the MNIST data
import pickle # For de-serializing the MNIST data
import matplotlib.pyplot as pyplot # For graphing
In [2]:
"""
We use the serialized and zipped copy of the MNIST data available at Michael Nielson's
repository:
https://github.com/mnielsen/neural-networks-and-deep-learning
"""
# Unzip the MNIST Data
with gzip.open('mnist.pkl.gz', 'rb') as f:
# Unpickle (de-serialize) the
mnist_data = pickle.load(f, encoding='bytes');
"""
mnist_data = 3-tuple containing (mnist_training, mnist_validation, mnist_testing)
mnist_training = 2-tuple containing (training_data, training_labels)
training_data = [50,000, 784] ndarray of data for each training example
training_labels = [50,000] ndarray of labels for the training_data
mnist_validation = 2-tuple containing (validation_data, validation_labels)
validation_data = [10,000, 784] ndarray of data for each validation image
validation_labels = [10,000] ndarray of labels for the training_data
mnist_testing = 2-tuple containing (testing_data, testing_labels)
testing_data = [10,000, 784] ndarray of data for each testing image
testing_labels = [10,000] ndarray of labels for testing data
"""
training_data = mnist_data[0][0];
training_labels = mnist_data[0][1];
validation_data = mnist_data[1][0];
validation_labels = mnist_data[1][1];
testing_data = mnist_data[2][0];
testing_labels = mnist_data[2][1];
# Concatenate lables to end of each training example to work with the implementation
# of the train() function
# Now training_examples, validation_examples, and testing_examples are all
# [50,000, 785] or [10,000, 785] arrays where the last column contains the labels
training_examples = np.column_stack((training_data, training_labels));
validation_examples = np.column_stack((validation_data, validation_labels));
testing_examples = np.column_stack((testing_data, testing_labels));
In [3]:
"""
Simple_NN
Class to create a simple neural network
Learning Algorithm: stochastic gradient descent with backpropagation
Activation Function: sigmoid
Cost Function: MSE
"""
class Simple_NN(object):
"""
INITIALIZE THE NETWORK
"""
def __init__(self, layers, activation_function="sigmoid", cost_function="MSE"):
"""
self.layers is a list of numbers where the ith number how many neurons are in
the ith layer of the network.
"""
self.layers = layers;
self.num_layers = len(layers);
"""
self.weights[Layer - 1, input_neuron, output_neuron] =
List of weight matrices for each layer.
self.biases[Layer - 1, neuron] =
List of vectors with biases for each neuron
FOR EXAMPLE:
self.weights[layer, j, i] = weight going into the jth neuron of the lth layer
from the ith neuron of the (l-1)st layer
self.biases[layer, k] = bias on the kth nuron of the lth layer
NOTE: layer 0 is the input layer, so self.weights[0] is the weights going into layer 1
"""
self.weights = [];
self.biases = [];
self.Z = [];
self.activations = [];
# Create matrices with correct dimensions
for layer_num in range(1, self.num_layers):
self.weights.append(np.random.randn(layers[layer_num], layers[layer_num - 1]));
self.biases.append(np.random.randn(layers[layer_num]));
"""
self.activation = string specifying what activation function the neurons will use.
The options are:
sigmoid (default)
self.cost_function = string specifying hat cost function will be used for the
network.
The options are:
MSE (default)
"""
self.activation_function = activation_function;
self.cost_function = cost_function;
self.errors = [];
self.gradC_b = [];
self.gradC_w = [];
"""
TRAINING
Train the network using stochastic gradient descent and backpropagation.
Training data should be given in the following format:
[x11, x12, ..., x1i, y1
x21, x22, ..., x2i, y2
...
xm1, xm1, ..., xmi, ym]
Where each row corrsponds to a training example with i data points
"""
def train(self, training_data, testing_data, batch_size, num_epochs, learning_rate):
for epoch in range(num_epochs):
# Randomize the order of training examples
np.random.shuffle(training_data);
# Separate inputs from outputs
inputs = training_data[:, :-1];
outputs = training_data[:, -1];
testing_inputs = testing_data[:, :-1];
testing_labels = testing_data[:, -1];
# For each epoch, loop through each batch to use as training data
for batch in range(len(training_data))[::batch_size]:
"""
For each batch, we calculate activations and use the backpropagation
algorithm to change the weights and biases using gradient descent
Create matrix out of all training inputs in the batch
To apply W to all input vectors, we can multiply WX where
X is the ixm matrix containing all m training examples as columns
"""
X = inputs[batch : batch + batch_size];
Y = outputs[batch : batch + batch_size];
X = np.transpose(X);
# FEEDFORWARD
self.evaluate(X, training=True);
# BACKPROPAGATION
self.backpropagate(X, Y);
# GRADIENT DESCENT
self.grad_descent();
# For each epoch, compute number of correct outputs out of testing data
print("Epoch %d: %s" % \
((epoch + 1), self.test(testing_inputs, testing_labels)));
"""
EVALUATE
Takes an input column vector X and computes the output of the network
"""
def evaluate(self, X, training=False):
"""
self.Z[layer, neuron, training_example] =
List of vectors with weighted inputs to the neurons
self.activations[layer, neuron, training_Example] =
List of vectors with activations for each neuron
"""
self.Z = [];
self.activations = [];
# Calclate outputs going forwards through the network
for layer in range(self.num_layers - 1):
if layer == 0:
# Feed inputs to the network
prev_activations = X;
else:
prev_activations = self.activations[layer - 1];
# If training, X will be a 2D matrix [input, training_example]. In this case,
# bias_matrix where each column is a copy of the bias vector is needed
# to add bias terms for each training example.
if (training):
one_vector = np.ones(np.shape(X)[1]);
bias_matrix = np.outer(self.biases[layer], one_vector);
else:
# When evaluating single training example, we want a 1D bias vector
bias_matrix = self.biases[layer];
self.Z.append(np.dot(self.weights[layer], prev_activations) + bias_matrix);
self.activations.append(self.activation(self.Z[layer]));
"""
BACKPROPAGATE
Using input vector X and output vector Y, calculates the following instance variables:
self.errors[layer, neuron, training_example] - error in each neuron
self.gradC_b[layer] - average gradient wrt biases over all training examples
self.gradC_w[layer, input_neuron, output_neuron] - average gradient wrt weights
"""
def backpropagate(self, X, Y):
# Calculate output error matrix so the [i, j]th entry contains the
# error for the ith neuron in the output layer for the jth training
# example
output_error = np.multiply(
self.cost_derivative(self.activations[-1], Y),
self.activation_derivative(self.Z[-1]));
# Backpropogate: we create the errors matrix which is indexed
# in the form errors[layer, neuron, training_example]
self.errors = [output_error];
# Note that in the loop, we use negative subscripts to go through the
# layers from output towards input. We therefore start at layer [-2], the
# second to last layer
for layer in range(2, self.num_layers):
# For each layer, calculate errors in previous layer
previous_errors = np.multiply(
np.dot(
np.transpose(self.weights[-layer + 1]),
self.errors[0]),
self.activation_derivative(self.Z[-layer]));
# Add previous errors to the beginning of the error matrix list
self.errors.insert(0, previous_errors);
# Calculate gradients of cost function
self.gradC_b = [];
self.gradC_w = [];
for layer in range(self.num_layers - 1):
"""
gradC_b[layer, neuron] =
Gradient of cost wrt biases for a layer is just the
vector of errors for that layer.
When we compute the average over all training examples using
np.average(matrix, 1), we get a column vector. In order to
correctly subtract this from the row-vector of biases,
we transpose the gradient vectors.
"""
self.gradC_b.append(np.average(self.errors[layer],1));
"""
sum_of_weights[layer, j, k] will contain the partial derivative
of cost wrt the weight from the kth neuron in layer - 1 to the jth
neuron in layer summed over all training examples. That is,
sum_of_weightes[layer, j, k] = [sum over training examples dC/dw_j,k]
"""
if (layer == 0):
prev_activations = X;
else:
prev_activations = self.activations[layer - 1];
sum_of_weights = np.dot(
self.errors[layer],
np.transpose(prev_activations));
"""
gradC_w [layer, input_neuron, output_neuron] is also
averaged over all traininge examples in the batch
"""
self.gradC_w.append((1 / batch_size) * sum_of_weights);
"""
GRADIENT DESCENT
Use the gradients computed in backpropagate() to update the weights and
biases of the network.
"""
def grad_descent(self):
for layer in range(self.num_layers - 1):
self.biases[layer] = np.subtract(self.biases[layer],
np.transpose(np.multiply(learning_rate, self.gradC_b[layer])));
self.weights[layer] = np.subtract(self.weights[layer],
np.multiply(learning_rate, self.gradC_w[layer]));
"""
ACTIVATION FUNCTION
For this network, we use the sigmoid function to calculate neuron activation
In general, we assume the input z will be a matrix where the [i,j]th entry is the
ith neuron in the jth training example
"""
def activation(self, z):
if (self.activation_function == "sigmoid"):
return 1.0 / (1 + np.exp(-z));
def activation_derivative(self, z):
if (self.activation_function == "sigmoid"):
return np.multiply((1 - self.activation(z)), self.activation(z));
"""
COST FUNCTION
We assume that activations is a 2D matrix where each column corresponds to the activations
for a specific training example and each row corresponds to a specific neuron
We assume for now that outputs for the network are disjoint categories, so only
one output neuron should fire at a time. The output_matrix function turns the output
vector where each entry corresponds to a training example into a matrix where each
column corresponds to a training example, with only one nonzero entry per column
corresponding to the output for that example
Both calculate_cost and cost_derivative return a row vector where the ith entry
is the cost/cost derivative for training example i
"""
def output_matrix(self, activations, outputs):
# output_matrix[neuron, training_example]
output_matrix = np.zeros((activations.shape[0], outputs.shape[0]));
for training_example, output in enumerate(outputs):
output_matrix[int(output), training_example] = 1;
return output_matrix;
def calculate_cost(self, activations, outputs):
outputs_matrix = self.output_matrix(activations, outputs);
if (self.cost_function == "MSE"):
squared_errors = np.square(activations - output_matrix);
# Average sum of squared errors over each training example
return 0.5 * np.average(np.sum(squared_errors, 0));
"""
Derivative of cost functions with respect to activations
Each entry [i, j] in the resulting matrix will be the gradient of the cost function
with respect to the activation of the ith neuron in the jth training example
"""
def cost_derivative(self, activations, outputs):
output_matrix = self.output_matrix(activations, outputs);
if (self.cost_function == "MSE"):
return (activations - output_matrix);
"""
PREDICT
predict(x) - Given a data vector x, returns an integer for which class the network
predicts x belongs to. That is, the integer returned is the neuron in the
output layer with the maximum activation, indexed starting at 0.
"""
def predict(self, x):
self.evaluate(x);
return np.argmax(self.activations[-1]);
"""
display_prediction(x) shows an image of the input with the network prediction.
Height and width can be customized, but their defualts are set for the MNIST images.
The show parameter also allows you to not immediately show the prediction, allowing
for subplotting.
"""
def display_prediction(self, x, height=28, width=28, show=True):
prediction = self.predict(x);
image = np.reshape(x, (height,width));
pyplot.title('Prediction: %d' % prediction, fontsize = 12, color = 'white');
pyplot.imshow(image, cmap='Greys_r');
if (show):
pyplot.show();
"""
display_predictions(index) will create a figure with 9 images and their predicted
digits. The images are selected from the testing images beginning with the image
with the given index. Indices must be between 0-10,000
"""
def display_predictions(self, index):
for example_num in range(1, 10):
subplot = 330 + example_num;
pyplot.subplot(subplot);
test.display_prediction(testing_data[index + example_num], show=False);
pyplot.axis('off');
pyplot.tight_layout();
pyplot.show();
"""
test(testing_data, testing_labels) - predicts class for all data in
testing_data and computes accuracy as a percentage using
"""
def test(self, testing_data, testing_labels):
total_examples = len(testing_labels);
total_correct = 0;
for example in range(total_examples):
if (testing_labels[example] == self.predict(testing_data[example])):
total_correct = total_correct + 1;
accuracy = total_correct / total_examples;
return (str(total_correct) + " / " + str(total_examples) + " = " + str(accuracy))
"""
print_network prints the weights, biases, and activations for the entire network.
An argument of 1 enables debugging mode, which prints more information,
including errors and gradients.
"""
def print_network(self, debug=0):
print("Number of layers: ");
print(self.num_layers);
print("Weights: ")
for layer in self.weights:
print(layer)
print("\nBiases:" )
for layer in self.biases:
print(layer)
print("\nActivations:")
for layer in self.activations:
print(layer);
# Print extra info about the network
if (debug == 1):
print("\nWeighted Inputs:")
for layer in self.Z:
print(layer)
print("\nErrors:")
for layer in self.errors:
print(layer)
print("\nBias Gradients:")
for layer in self.gradC_b:
print(layer)
print("\nWeight Gradients")
for layer in self.gradC_w:
print(layer)
"""
print_activations will print the output activations
"""
def print_activations(self, index=0):
image = testing_data[index];
test.predict(image);
print("Output Activations:")
for i in range(0, 10):
print("%d: %f" % (i, test.activations[1][i]))
In [4]:
"""
CREATE AND TRAIN NETWORK
"""
test = Simple_NN([784, 30, 10]);
batch_size = 10;
num_epochs = 5;
learning_rate = 3;
test.train(training_examples, validation_examples, batch_size, num_epochs, learning_rate);
print("Done Training")
In [5]:
"""
TEST NETWORK
"""
result = test.test(testing_data, testing_labels);
print(result)
In [6]:
"""
VISUALIZE RESULTS
"""
index =1670;
test.display_predictions(index);