In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
In this notebook, we will create a neural network model from scratch to solve a multi-class classification problem using TensorFlow. We are going to use the popular MNIST dataset (grayscale images of hand-written digits from 0 to 9). If you already have some idea of how to create and train models using Keras but want to dive a bit into the lower-level workings of neural networks, then this notebook will, hopefully, be useful to you.
In [0]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
assert tf.__version__.startswith('2')
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
print('TensorFlow version:', tf.__version__)
print('Is Executing Eagerly?', tf.executing_eagerly())
Next, let's define a class called NNModel for our neural network model. This class will encompass functionality typical of neural networks, such as forward propagation and computing costs. Once the class is ready, we will be able to use an instance of it to call a single method that trains the model on the training set and then returns predictions on a given test set. When the class is instantiated, we pass in a list with the number of nodes for each layer, including the input and output layers. All the layers are going to be densely connected. If, for example, we have 4-dimensional feature vectors, 3 output classes, and want to use 2 hidden layers with 8 nodes each, we instantiate the class like this:
layers = [4, 8, 8, 3]
model = NNModel(layers)
In [0]:
class NNModel():
    def __init__(self, layers):
        self.costs = []       # for storing in-training costs
        self.W = []           # for storing trainable weights
        self.layers = layers  # a list of layers: each item is number of nodes
        self.L = len(layers)  # total number of layers including input & output
In [0]:
class NNModel(NNModel):
    def initialize_params(self):
        # one weight matrix per layer, shape (nodes in layer, nodes in previous layer);
        # biases are omitted in this model
        for layer in range(1, self.L):
            self.W.append(tf.Variable(tf.random.normal([self.layers[layer],
                                                         self.layers[layer - 1]])))
To perform one step of forward propagation, we will define the function given below. One step means computing on a single batch of training examples. First, we compute the linear outputs, followed by the activation outputs. In this model, we are only going to use the ReLU activation function. Note that the output layer does not need an activation here, since a probability distribution can be computed automatically when we use the categorical_crossentropy loss with from_logits=True later. When the computations for all the layers are done, the function returns the final linear output (the logits).
In [0]:
class NNModel(NNModel):
    def forward_prop(self, x_batch):
        A = []  # activation outputs
        Z = []  # linear outputs
        # the transposed input batch acts as the "activation" of layer 0
        A.append(tf.transpose(a=x_batch))
        # compute linear and activation outputs for all the layers
        for layer in range(1, self.L):
            Z.append(tf.matmul(self.W[layer - 1], A[layer - 1]))
            if layer != self.L - 1:  # no activation is applied to the output layer
                A.append(tf.nn.relu(Z[layer - 1]))
        # return the output-layer logits, one row per example
        return tf.transpose(a=Z[self.L - 2])
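To see the shapes involved, here is a quick hypothetical check (not needed for the rest of the notebook) that initializes a tiny model and runs forward propagation on a random batch; the returned logits should have shape (batch size, number of output classes).
In [0]:
# Hypothetical shape check: a tiny [4, 8, 8, 3] network on a random batch
demo = NNModel([4, 8, 8, 3])
demo.initialize_params()
x_dummy = tf.random.normal([5, 4])       # 5 examples, 4 features each
print(demo.forward_prop(x_dummy).shape)  # expected: (5, 3)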
In [0]:
class NNModel(NNModel):
    def predict(self, x_batch):
        # pick the class with the highest logit for each example
        Z = self.forward_prop(x_batch)
        return tf.argmax(input=Z, axis=1)
Finally, we will write a function to run the training loop. This function takes the training set, the test set, the number of epochs, and the batch size as arguments. We first initialize the weights and then create an instance of the Adam optimizer provided by TensorFlow.
The training loop itself uses a GradientTape context for each batch. Inside this context, we compute the cost of the batch by calling TensorFlow's categorical_crossentropy loss function, which can take a logits tensor directly when the from_logits argument is set to True. We then use the tape to compute gradients of the cost with respect to the weights and let the optimizer update them.
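Before writing the training loop, here is a short illustrative aside with hypothetical toy values (not part of the model). It shows these two pieces in isolation: with from_logits=True the loss applies the softmax internally, so it matches (up to numerical precision) applying tf.nn.softmax ourselves and then passing probabilities, and the GradientTape lets us recover the gradient of the loss with respect to the logits.
In [0]:
# Toy illustration of categorical_crossentropy with from_logits=True
# and of computing gradients with a GradientTape (hypothetical values).
toy_logits = tf.Variable([[2.0, 1.0, 0.1]])
toy_labels = tf.constant([[1.0, 0.0, 0.0]])
with tf.GradientTape() as tape:
    toy_loss = tf.reduce_mean(
        tf.losses.categorical_crossentropy(toy_labels, toy_logits, from_logits=True))
# the same loss computed by applying the softmax explicitly
toy_loss_explicit = tf.reduce_mean(
    tf.losses.categorical_crossentropy(toy_labels, tf.nn.softmax(toy_logits)))
print('loss (from logits):', toy_loss.numpy())
print('loss (explicit softmax):', toy_loss_explicit.numpy())
print('gradient w.r.t. logits:', tape.gradient(toy_loss, toy_logits).numpy())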
In [0]:
class NNModel(NNModel):
    def train(self, x_train, y_train, x_test, y_test, epochs=10, batch_size=128):
        self.initialize_params()
        m = x_train.shape[0]
        optimizer = tf.optimizers.Adam()
        for epoch in range(epochs):
            epoch_cost = 0
            for batch in range(int(m / batch_size)):
                x_batch = x_train[(batch * batch_size):(batch * batch_size + batch_size)]
                y_batch = y_train[(batch * batch_size):(batch * batch_size + batch_size)]
                # Compute the cost for this batch within the GradientTape context
                with tf.GradientTape() as tape:
                    Z = self.forward_prop(x_batch)
                    batch_loss = tf.losses.categorical_crossentropy(y_batch, Z,
                                                                    from_logits=True)
                    batch_cost = tf.reduce_mean(batch_loss)
                # Use the GradientTape context to automatically compute gradients
                grads = tape.gradient(batch_cost, self.W)
                optimizer.apply_gradients(zip(grads, self.W))
                epoch_cost += batch_cost
            self.costs.append(epoch_cost.numpy())
            print('Epoch {}/{}. Cost: {:.2f}'.format(epoch + 1, epochs,
                                                     epoch_cost.numpy()))
        preds = self.predict(x_test)
        return preds
Now that the NNModel class is complete, let's load the dataset. We will convert the labels to their one-hot encoded representations and normalize the pixel values for all examples by dividing by 255 (you can also try normalizing by first subtracting the mean pixel value and then dividing by the range of values, as sketched after the next cell, but simply dividing by the range works well here). Finally, we will reshape the examples in both sets to unroll them from 28 by 28 arrays into 784-dimensional vectors.
In [0]:
# load the data and keep an unprocessed copy of the test images for plotting later
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_test_orig = x_test
# one-hot encode the labels
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
# scale pixel values to [0, 1]
x_train = x_train / 255.
x_test = x_test / 255.
# cast to float32 and unroll the 28x28 images into 784-dimensional vectors
x_train = np.float32(x_train)
x_train = np.reshape(x_train, (60000, 784))
x_test = np.float32(x_test)
x_test = np.reshape(x_test, (10000, 784))
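As mentioned above, an alternative normalization is to subtract the mean pixel value before dividing by the range. A minimal sketch of that approach is given below; the variable names are just for illustration, and the rest of the notebook keeps using the simple division by 255 from the previous cell.
In [0]:
# Sketch of the mean-centering alternative (uses a fresh copy of the raw
# data so it does not interfere with the arrays prepared above).
(x_train_raw, _), (x_test_raw, _) = mnist.load_data()
mean_pixel = x_train_raw.mean()
x_train_centered = (np.float32(x_train_raw) - mean_pixel) / 255.
x_test_centered = (np.float32(x_test_raw) - mean_pixel) / 255.
print('mean after centering:', x_train_centered.mean())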
Now all we have to do is instantiate a model with the list of layer sizes we want and call the train() method, passing in the training set and the test set. The model will return predictions on the test set after training is complete.
In [0]:
model = NNModel([784, 128, 128, 10])
preds = model.train(x_train, y_train, x_test, y_test)
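Since train() returns the predicted classes for the test set, we can also compute the overall test accuracy. Here is a small sketch, assuming y_test is still one-hot encoded as prepared above.
In [0]:
# Overall accuracy on the test set (y_test is one-hot, so compare against its argmax)
test_accuracy = np.mean(preds.numpy() == np.argmax(y_test, axis=1))
print('Test accuracy: {:.4f}'.format(test_accuracy))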
In [0]:
plt.plot(range(1, len(model.costs) + 1), model.costs)
plt.xlabel('Epochs')
plt.ylabel('Cost')
plt.show()
In [0]:
plt.figure(figsize=(10, 10))
for i in range(25):
    plt.subplot(5, 5, i + 1)
    plt.imshow(x_test_orig[i], cmap='binary')
    plt.xticks([])
    plt.yticks([])
    pred = np.squeeze(preds[i])
    label = np.argmax(y_test[i])
    if pred == label:
        col = 'g'  # correct prediction
    else:
        col = 'r'  # wrong prediction
    plt.xlabel('Pred: {} Label: {}'.format(pred, label), color=col)
plt.show()
You should get pretty decent predictions - for me, only 1 out of the 25 images shown was classified incorrectly. Your results may differ slightly, of course, but the predictions should be largely correct.