Momentum

By Victor Zhong

In the tutorial on Feed Forward Neural Networks, we finished with the following implementation:



In [30]:

    
import numpy as np
import matplotlib.pyplot as plt

class NeuralNetwork:
    """Feed-forward network whose topology is given by an adjacency matrix.

    The ``M = I + H + O`` units are numbered in this order: the bias (unit 0),
    the other ``I - 1`` input units, the ``H`` hidden units, then the ``O``
    output units.  ``A[i, j]`` is non-zero when unit ``i`` feeds unit ``j`` and
    ``W[i, j]`` holds the matching weight.  Hidden units are sigmoidal and
    output units are linear.

    Units are evaluated in index order, so connections are expected to run
    from lower- to higher-numbered units only.
    """

    def sigmoid(self, z):
        """Return the element-wise logistic function ``1 / (1 + exp(-z))``."""
        return 1.0 / (1.0 + np.exp(-z))

    def __init__(self, A, I, H, O):
        """Create a network with randomly initialised weights.

        A -- square (I+H+O, I+H+O) adjacency matrix; also used as the mask
             that keeps weights of absent connections at zero.
        I -- number of input units, counting the bias unit.
        H -- number of hidden units.
        O -- number of output units.
        """
        assert A.shape[0] == A.shape[1], 'adjacency matrix must be square'
        assert A.shape[0] == I + H + O, 'adjacency matrix must have I+H+O rows'

        self.A = A
        # Small random weights (normal with mean -0.001, std 0.001), zeroed
        # wherever the adjacency matrix has no connection.
        self.W = np.multiply(np.random.normal(-0.001, 0.001, A.shape), A)
        self.I = I
        self.H = H
        self.O = O
        self.M = I + H + O

        # A single formatted string prints identically on Python 2 and 3.
        print('creating network with %s input units, %s hidden units, and %s output units' % (I, H, O))

    def hidden_units(self):
        """Return the indices of the hidden units."""
        return range(self.I, self.I + self.H)

    def output_units(self):
        """Return the indices of the output units."""
        return range(self.I + self.H, self.M)

    def _create_X_g(self, example_X):
        """Build the initial activity matrices for a batch of examples.

        example_X -- array with one row per example and ``I - 1`` columns.

        Returns ``(X, g)``, both of shape (T, M): ``X`` holds the inbound
        (pre-nonlinearity) and ``g`` the outbound (post-nonlinearity) activity
        of every unit.  Only the bias and input columns are filled in here.
        """
        T = example_X.shape[0]

        X = np.zeros((T, self.M))
        # The bias always emits 1; an input unit simply passes its input on.
        X[:, 0] = 1
        X[:, 1:self.I] = example_X

        # For bias and input units the outbound activity equals the inbound
        # one, and every other column is still zero, so g starts as a copy.
        g = X.copy()

        return X, g

    def forward_propagate(self, X, g):
        """Fill in the hidden and output columns of ``X`` and ``g`` in place.

        Returns the same ``(X, g)`` arrays that were passed in.
        """
        for j in self.hidden_units():
            # Inbound activity: weighted sum of the outbound activities that
            # feed unit j (weights of absent connections are zero).
            X[:, j] = np.dot(g, self.W[:, j])
            # Outbound activity: sigmoid of the inbound activity.
            g[:, j] = self.sigmoid(X[:, j])

        for j in self.output_units():
            X[:, j] = np.dot(g, self.W[:, j])
            g[:, j] = X[:, j]  # linear output units

        return X, g

    def backpropagate(self, X, Y, g):
        """Return dJ/dW for the half mean squared error, averaged over examples.

        X -- (T, M) inbound activities (only its row count is used).
        Y -- targets; must broadcast against the (T, O) output activities.
        g -- (T, M) outbound activities from ``forward_propagate``.

        The result has the shape of ``W`` and is zero wherever ``A`` is zero.
        """
        T = X.shape[0]
        first_output = self.I + self.H

        # Error derivatives w.r.t. each unit's inbound activity, initially zero.
        dJ_dX = np.zeros((T, self.M))

        # Linear outputs with squared error: the derivative is the residual.
        # This covers every output unit at once, so no per-unit loop is needed.
        dJ_dX[:, first_output:self.M] = g[:, first_output:self.M] - Y

        # Walk the hidden units backwards so that every downstream derivative
        # is already available when a unit is visited.
        for m in reversed(self.hidden_units()):
            gprime = g[:, m] * (1 - g[:, m])  # derivative of the sigmoid
            downstream = np.dot(dJ_dX[:, m + 1:self.M], self.W[m, m + 1:self.M])
            dJ_dX[:, m] = downstream * gprime

        dJ_dW = np.dot(np.transpose(g), dJ_dX)
        # Mask out absent connections and average over the examples.
        return np.multiply(dJ_dW, self.A) / T

    def compute_cost(self, target, guess):
        """Return the mean squared difference between ``target`` and ``guess``."""
        return np.mean(np.square(target - guess))

    def train(self, example_X, Y, learning_rate, num_iterations):
        """Train with batch gradient descent.

        example_X     -- array with one row per example and ``I - 1`` columns.
        Y             -- targets, one row per example and one column per
                         output unit.
        learning_rate -- step size applied to the gradient.
        num_iterations -- number of full-batch updates to perform.

        Returns ``(g, costs)``: the (T, M) outbound activities from the last
        forward pass (computed before the final weight update) and the cost
        recorded at every iteration.
        """
        X, g = self._create_X_g(example_X)

        T = X.shape[0]
        first_output = self.I + self.H

        print('training using %s examples and %s iterations with learning rate of %s'
              % (T, num_iterations, learning_rate))

        costs = np.zeros(num_iterations)

        for iteration in range(num_iterations):
            X, g = self.forward_propagate(X, g)

            # Score every output unit, not just the last column, so the cost
            # is also right for networks with more than one output.
            costs[iteration] = self.compute_cost(Y, g[:, first_output:self.M])

            dJ_dW = self.backpropagate(X, Y, g)
            self.W = self.W - learning_rate * dJ_dW

        return g, costs

One aspect of the definition above is that we optimize purely based on a gradient approach. The idea is that the gradient yields the direction of steepest ascent on a smooth error surface, hence to reduce the error we incrementally and iteratively step in the opposite direction of the gradient (gradient descent). Without graphics in hand, I will attempt to describe how the gradient approach doesn't always take the most efficient path, and briefly describe the momentum approach.

Suppose we are given an error surface that resembles a valley, the center of which is the lowest point. Further suppose that we descend into this valley via gradient descent. The ideal valley for this strategy is a circular bowl, for regardless of our location, the reverse of the gradient points directly towards the center of the valley. Conversely, suppose we were so unfortunate as to be located slightly off the far end of a long, oval valley; then stepping in our reverse-gradient direction would mostly cause us to bounce back and forth across the minor axis of this oval valley. In the latter case, a lot of our movement is wasted - we really want to proceed along the major axis of the valley, but we end up bouncing around the minor axis instead.

The momentum concept aims to improve the descent algorithm in the case of such error surfaces. The idea is that we give the weight update process momentum - the weight update in this iteration carries a similar velocity as the weight update in the last iteration.

In the first iteration, we initialize the momentum $V$ as follows:

$$ V = \nabla J $$

In each subsequent iteration, we update the momentum and the weights as follows:

$$ V \leftarrow \alpha V + (1 - \alpha) \nabla J $$

$$ W \leftarrow W - \eta V $$

This can be implemented as follows:



In [41]:

    
class NeuralNetwork:
    """Feed-forward network trained by gradient descent with momentum.

    The ``M = I + H + O`` units are numbered in this order: the bias (unit 0),
    the other ``I - 1`` input units, the ``H`` hidden units, then the ``O``
    output units.  ``A[i, j]`` is non-zero when unit ``i`` feeds unit ``j`` and
    ``W[i, j]`` holds the matching weight.  Hidden units are sigmoidal and
    output units are linear.

    Units are evaluated in index order, so connections are expected to run
    from lower- to higher-numbered units only.

    ``self.momentum`` is the running velocity of the weight update.  It starts
    at zero and is kept on the instance, so it carries over between
    successive calls to ``train``.
    """

    def sigmoid(self, z):
        """Return the element-wise logistic function ``1 / (1 + exp(-z))``."""
        return 1.0 / (1.0 + np.exp(-z))

    def __init__(self, A, I, H, O):
        """Create a network with randomly initialised weights.

        A -- square (I+H+O, I+H+O) adjacency matrix; also used as the mask
             that keeps weights of absent connections at zero.
        I -- number of input units, counting the bias unit.
        H -- number of hidden units.
        O -- number of output units.
        """
        assert A.shape[0] == A.shape[1], 'adjacency matrix must be square'
        assert A.shape[0] == I + H + O, 'adjacency matrix must have I+H+O rows'

        self.A = A
        # Small random weights (normal with mean -0.001, std 0.001), zeroed
        # wherever the adjacency matrix has no connection.
        self.W = np.multiply(np.random.normal(-0.001, 0.001, A.shape), A)
        self.I = I
        self.H = H
        self.O = O
        self.M = I + H + O
        # Velocity of the weight update; the scalar 0 broadcasts against the
        # first gradient and becomes a matrix shaped like W from then on.
        self.momentum = 0.

        # A single formatted string prints identically on Python 2 and 3.
        print('creating network with %s input units, %s hidden units, and %s output units' % (I, H, O))

    def hidden_units(self):
        """Return the indices of the hidden units."""
        return range(self.I, self.I + self.H)

    def output_units(self):
        """Return the indices of the output units."""
        return range(self.I + self.H, self.M)

    def _create_X_g(self, example_X):
        """Build the initial activity matrices for a batch of examples.

        example_X -- array with one row per example and ``I - 1`` columns.

        Returns ``(X, g)``, both of shape (T, M): ``X`` holds the inbound
        (pre-nonlinearity) and ``g`` the outbound (post-nonlinearity) activity
        of every unit.  Only the bias and input columns are filled in here.
        """
        T = example_X.shape[0]

        X = np.zeros((T, self.M))
        # The bias always emits 1; an input unit simply passes its input on.
        X[:, 0] = 1
        X[:, 1:self.I] = example_X

        # For bias and input units the outbound activity equals the inbound
        # one, and every other column is still zero, so g starts as a copy.
        g = X.copy()

        return X, g

    def forward_propagate(self, X, g):
        """Fill in the hidden and output columns of ``X`` and ``g`` in place.

        Returns the same ``(X, g)`` arrays that were passed in.
        """
        for j in self.hidden_units():
            # Inbound activity: weighted sum of the outbound activities that
            # feed unit j (weights of absent connections are zero).
            X[:, j] = np.dot(g, self.W[:, j])
            # Outbound activity: sigmoid of the inbound activity.
            g[:, j] = self.sigmoid(X[:, j])

        for j in self.output_units():
            X[:, j] = np.dot(g, self.W[:, j])
            g[:, j] = X[:, j]  # linear output units

        return X, g

    def backpropagate(self, X, Y, g):
        """Return dJ/dW for the half mean squared error, averaged over examples.

        X -- (T, M) inbound activities (only its row count is used).
        Y -- targets; must broadcast against the (T, O) output activities.
        g -- (T, M) outbound activities from ``forward_propagate``.

        The result has the shape of ``W`` and is zero wherever ``A`` is zero.
        """
        T = X.shape[0]
        first_output = self.I + self.H

        # Error derivatives w.r.t. each unit's inbound activity, initially zero.
        dJ_dX = np.zeros((T, self.M))

        # Linear outputs with squared error: the derivative is the residual.
        # This covers every output unit at once, so no per-unit loop is needed.
        dJ_dX[:, first_output:self.M] = g[:, first_output:self.M] - Y

        # Walk the hidden units backwards so that every downstream derivative
        # is already available when a unit is visited.
        for m in reversed(self.hidden_units()):
            gprime = g[:, m] * (1 - g[:, m])  # derivative of the sigmoid
            downstream = np.dot(dJ_dX[:, m + 1:self.M], self.W[m, m + 1:self.M])
            dJ_dX[:, m] = downstream * gprime

        dJ_dW = np.dot(np.transpose(g), dJ_dX)
        # Mask out absent connections and average over the examples.
        return np.multiply(dJ_dW, self.A) / T

    def compute_cost(self, target, guess):
        """Return the mean squared difference between ``target`` and ``guess``."""
        return np.mean(np.square(target - guess))

    def train(self, example_X, Y, learning_rate, num_iterations, momentum_rate=0.):
        """Train with batch gradient descent, optionally with momentum.

        example_X     -- array with one row per example and ``I - 1`` columns.
        Y             -- targets, one row per example and one column per
                         output unit.
        learning_rate -- step size applied to the velocity.
        num_iterations -- number of full-batch updates to perform.
        momentum_rate -- weight given to the previous velocity; the default
                         of 0 reduces to plain gradient descent.

        Each iteration performs
            V <- momentum_rate * V + (1 - momentum_rate) * dJ/dW
            W <- W - learning_rate * V

        Returns ``(g, costs)``: the (T, M) outbound activities from the last
        forward pass (computed before the final weight update) and the cost
        recorded at every iteration.
        """
        X, g = self._create_X_g(example_X)

        T = X.shape[0]
        first_output = self.I + self.H

        print('training using %s examples and %s iterations with learning rate of %s'
              % (T, num_iterations, learning_rate))

        costs = np.zeros(num_iterations)

        for iteration in range(num_iterations):
            X, g = self.forward_propagate(X, g)

            # Score every output unit, not just the last column, so the cost
            # is also right for networks with more than one output.
            costs[iteration] = self.compute_cost(Y, g[:, first_output:self.M])

            # Blend the previous velocity with the fresh gradient.
            gradient = self.backpropagate(X, Y, g)
            self.momentum = momentum_rate * self.momentum + (1 - momentum_rate) * gradient

            self.W = self.W - learning_rate * self.momentum

        return g, costs

Now let's look at how momentum impacts the learning process:



In [42]:

    
# Sample one period of the sine wave every 0.1 radians.  The inputs are shaped
# into a column vector first, so the targets computed from them come out as a
# column vector too (one row per training example).
X = np.arange(0, 2*np.pi, 0.1).reshape(-1, 1)
Y = np.sin(X)

# Show the curve the network is asked to fit.
plt.scatter(X, Y)
plt.show()



In [43]:

    
# Adjacency matrix: row i, column j is 1 when unit i feeds unit j.
# Unit 0 is the bias, unit 1 the input, units 2-3 are hidden, unit 4 is the output.
A = np.array([
            [0., 0., 1., 1., 1.],
            [0., 0., 1., 1., 0.],
            [0., 0., 0., 0., 1.],
            [0., 0., 0., 0., 1.],
            [0., 0., 0., 0., 0.]])

# Re-seed before constructing each network so both start from identical
# weights; otherwise the comparison mixes the effect of momentum with the
# effect of a different random initialisation.
SEED = 0

np.random.seed(SEED)
nn = NeuralNetwork(A, 2, 2, 1)
g, costs = nn.train(X,Y, learning_rate=0.02, num_iterations=50000)

np.random.seed(SEED)
nn2 = NeuralNetwork(A, 2, 2, 1)
g_with_momentum, costs_with_momentum = nn2.train(X,Y, learning_rate=0.02, num_iterations=50000, momentum_rate=0.5)

# Learning curves: blue is plain gradient descent, red uses momentum.
plt.plot(costs, 'b', label='without momentum')
plt.plot(costs_with_momentum, 'r', label='with momentum')
plt.title('cost vs. iteration')
plt.xlabel('iteration')
plt.ylabel('cost')
plt.legend()
plt.show()

# Fitted curves against the training targets (green).
plt.scatter(X, g[:, -1], color='b', label='without momentum')
plt.scatter(X, g_with_momentum[:, -1], color='r', label='with momentum')
plt.scatter(X,Y,color='g', label='target')
plt.legend()
plt.show()









    



creating network with 2 input units, 2 hidden units, and 1 output units
training using 63 examples and 50000 iterations with learning rate of 0.02
creating network with 2 input units, 2 hidden units, and 1 output units
training using 63 examples and 50000 iterations with learning rate of 0.02

In this particular case, the training with momentum converged earlier to the same result in comparison to the training without momentum.