In [ ]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from classwork_auxiliary import eval_numerical_gradient, eval_numerical_gradient_array, rel_error
We will implement our neural network as a set of layers.
Basically, you can think of a module as a black box that can process input data and produce output data.
This is like applying a function, which is called forward:
output = module.forward(input)
The module should also be able to perform a backward pass: to differentiate the forward function.
Moreover, it should be able to do so when it is part of a chain (chain rule).
The latter implies there is a gradient coming from the next step of the chain:
gradInput = module.backward(input, gradOutput)
Below is a base class for all future modules. You're not required to modify it. A minimal example of a concrete module follows the class definition.
In [ ]:
class Module(object):
    def __init__(self):
        self.output = None
        self.gradInput = None

    def forward(self, input):
        """
        Takes an input object, and computes the corresponding output of the module.
        """
        return self.updateOutput(input)

    def backward(self, input, gradOutput):
        """
        Performs a backpropagation step through the module, with respect to the given input.
        This includes
        - computing a gradient w.r.t. `input` (needed for further backprop),
        - computing a gradient w.r.t. parameters (to update parameters while optimizing).
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """
        Computes the output using the current parameter set of the class and input.
        This function returns the result, which is stored in the `output` field.
        Make sure to both store the data in the `output` field and return it.
        """
        # The easiest case:
        # self.output = input
        # return self.output
        pass

    def updateGradInput(self, input, gradOutput):
        """
        Computes the gradient of the module with respect to its own input.
        This is returned in `gradInput`. Also, the `gradInput` state variable is updated accordingly.
        The shape of `gradInput` is always the same as the shape of `input`.
        Make sure to both store the gradients in the `gradInput` field and return it.
        """
        # The easiest case:
        # self.gradInput = gradOutput
        # return self.gradInput
        pass

    def accGradParameters(self, input, gradOutput):
        """
        Computes the gradient of the module with respect to its own parameters.
        No need to override if the module has no parameters (e.g. ReLU).
        """
        pass

    def zeroGradParameters(self):
        """
        Zeroes the `gradParams` variable if the module has params.
        """
        pass

    def getParameters(self):
        """
        Returns a list with its parameters.
        If the module does not have parameters, returns an empty list.
        """
        return []

    def getGradParameters(self):
        """
        Returns a list with gradients with respect to its parameters.
        If the module does not have parameters, returns an empty list.
        """
        return []

    def __repr__(self):
        """
        Pretty printing. Should be overridden in every module if you want
        to have a readable description.
        """
        return "Module"
In [ ]:
class Linear(Module):
    """
    A module which applies a linear transformation.
    A common name is fully-connected layer (InnerProductLayer in caffe).
    The module should work with 2D input of shape (n_samples, n_features).
    """
    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()
        self.W = <initialize a random weight matrix of size (n_out, n_in)>
        self.b = <initialize a bias vector of size (n_out,)>
        # Here we initialize gradients with zeros. We'll accumulate them later.
        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """Given input X, produce the output."""
        self.output = <affine transform of input using self.W and self.b. Remember to transpose W>
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Given input and dL/d_output, compute dL/d_input."""
        self.gradInput = <gradient of this layer w.r.t. input. You will need gradOutput and self.W>
        assert self.gradInput.shape == input.shape, "wrong shape"
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """Given input and dL/d_output, compute the gradients w.r.t. the parameters."""
        self.gradW = <compute gradient of loss w.r.t. weight matrix. You will need gradOutput and input>
        assert self.gradW.shape == self.W.shape
        self.gradb = <compute gradient of loss w.r.t. bias vector>
        assert self.gradb.shape == self.b.shape
        return self.gradW, self.gradb

    def zeroGradParameters(self):
        self.gradW.fill(0)
        self.gradb.fill(0)

    def getParameters(self):
        return [self.W, self.b]

    def getGradParameters(self):
        return [self.gradW, self.gradb]
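For reference (this follows from the shape convention in the docstring, with $W$ of shape (n_out, n_in) and input $X$ of shape (n_samples, n_in)): the forward pass is $XW^{T} + b$; the backward pass gives $\partial L/\partial X = (\partial L/\partial \text{output})\,W$, $\partial L/\partial W = (\partial L/\partial \text{output})^{T} X$, and $\partial L/\partial b = \sum_{i} (\partial L/\partial \text{output})_{i}$ (a sum over the batch dimension). You can use these to check your implementation against the numerical gradients below.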
In [ ]:
n_in, n_out = 6, 17
x = np.random.randn(10, n_in)
w = np.random.randn(n_out, n_in)
b = np.random.randn(n_out)
dout = np.random.randn(10, n_out)

# helper: build a Linear layer and plug in the given parameters
def make_linear(w, b):
    layer = Linear(n_in, n_out)
    layer.W, layer.b = w, b
    return layer

dx_num = eval_numerical_gradient_array(lambda x: make_linear(w, b).updateOutput(x), x, dout)
dw_num = eval_numerical_gradient_array(lambda w: make_linear(w, b).updateOutput(x), w, dout)
db_num = eval_numerical_gradient_array(lambda b: make_linear(w, b).updateOutput(x), b, dout)

dx = make_linear(w, b).updateGradInput(x, dout)
dw, db = make_linear(w, b).accGradParameters(x, dout)

print('Testing Linear (errors should be < 1e-6):')
print('\t dx error: ', rel_error(dx_num, dx))
print('\t dw error: ', rel_error(dw_num, dw))
print('\t db error: ', rel_error(db_num, db))
In [ ]:
class SoftMax(Module):
    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, input):
        """forward pass of the softmax nonlinearity"""
        # subtract the row-wise max for numerical stability
        input = input - input.max(axis=1, keepdims=True)
        self.output = <compute softmax forward pass>
        return self.output

    def updateGradInput(self, input, gradOutput):
        """backward pass of the same thing"""
        # recompute the stabilized exponentials and their row sums
        exp = np.exp(input - input.max(axis=1, keepdims=True))
        denom = exp.sum(axis=1, keepdims=True)
        # row-wise dot product <exp, gradOutput>, shape (n_samples, 1)
        e = (exp * gradOutput).sum(axis=1, keepdims=True)
        # apply the softmax Jacobian: s * (gradOutput - <s, gradOutput>)
        self.gradInput = (exp * denom * gradOutput - exp * e) / denom ** 2
        return self.gradInput
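For reference, the backward pass above applies the softmax Jacobian row by row: with $s = \mathrm{softmax}(x)$, $\partial L/\partial x_j = s_j\big(\partial L/\partial s_j - \sum_i s_i\,\partial L/\partial s_i\big)$; the code expresses this in terms of the stabilized exponentials and their row sums.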
In [ ]:
x = np.random.randn(10, 6)
dout = np.random.randn(10, 6)

dx_num = eval_numerical_gradient_array(SoftMax().updateOutput, x, dout)
dx = SoftMax().updateGradInput(x, dout)

print('Testing SoftMax (errors should be < 1e-6):')
print('\t dx error: ', rel_error(dx_num, dx))
Your task is to implement the ClassNLLCriterion. It should implement the multiclass log loss. Although there is a sum over y (target) in that formula, remember that targets are one-hot encoded; this fact simplifies the computations a lot.
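For a batch of $N$ samples with predicted class probabilities $p_{ik}$ and one-hot targets $y_{ik}$, the multiclass log loss is $L = -\frac{1}{N}\sum_{i=1}^{N}\sum_{k} y_{ik}\log p_{ik}$ (the $1/N$ averaging is a common convention; match whatever normalization the rest of the classwork assumes). Since each $y_i$ has exactly one non-zero entry, this reduces to $-\frac{1}{N}\sum_i \log p_{i,t_i}$, where $t_i$ is the true class of sample $i$.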
In [ ]:
from classwork_auxiliary import Criterion
class ClassNLLCriterion(Criterion):
    def updateOutput(self, input, target):
        self.output = <Your code goes here>
        return self.output

    def updateGradInput(self, input, target):
        self.gradInput = <Your code goes here>
        return self.gradInput
In [ ]:
x = np.random.randn(10, 6) + 5
target = np.random.randint(0, x.shape[1], x.shape[0]).reshape((-1, 1))

dx_num = eval_numerical_gradient(lambda x: ClassNLLCriterion().updateOutput(x, target), x, verbose=False)
dx = ClassNLLCriterion().updateGradInput(x, target)

print('Testing ClassNLLCriterion (errors should be < 1e-6):')
print('\t dx error: ', rel_error(dx_num, dx))
Use this example to debug your code: start with logistic regression and then test the other layers. You do not need to change anything here; this code is provided for you to test the layers. It is also easy to reuse it for the MNIST task.
In [ ]:
# Generate some data
N = 500
X1 = np.random.randn(N,2) + np.array([2,2])
X2 = np.random.randn(N,2) + np.array([-2,-2])
Y = np.concatenate([np.ones(N), np.zeros(N)])[:,None]
Y = np.hstack([Y, 1-Y])
X = np.vstack([X1,X2])
plt.scatter(X[:,0],X[:,1], c = Y[:,0], cmap='hot')
Here we define a logistic regression for debugging.
In [ ]:
from classwork_auxiliary import Sequential,sgd_momentum
net = Sequential()
net.add(Linear(2, 2))
net.add(SoftMax())
criterion = ClassNLLCriterion()
Start with batch_size = 1000 to make sure every step lowers the loss, then try a stochastic version with smaller batches.
In [ ]:
# Optimizer params
optimizer_config = {'learning_rate' : 1e-1, 'momentum': 0.9}
optimizer_state = {}
# Looping params
n_epoch = 100
batch_size = 1000
In [ ]:
# batch generator
def get_batches(dataset, batch_size):
    X, Y = dataset
    n_samples = X.shape[0]

    # Shuffle at the start of the epoch
    indices = np.arange(n_samples)
    np.random.shuffle(indices)

    for start in range(0, n_samples, batch_size):
        end = min(start + batch_size, n_samples)
        batch_idx = indices[start:end]
        yield X[batch_idx], Y[batch_idx]
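A quick sanity check of the generator (a sketch; the expected shapes assume the toy data defined above):
In [ ]:
# draw a single batch and inspect its shape
x_demo, y_demo = next(get_batches((X, Y), 32))
print(x_demo.shape, y_demo.shape)  # expected: (32, 2) (32, 2)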
Basic training loop. Examine it.
In [ ]:
loss_history = []

for i in range(n_epoch):
    for x_batch, y_batch in get_batches((X, Y), batch_size):
        net.zeroGradParameters()

        # Forward
        predictions = net.forward(x_batch)
        loss = criterion.forward(predictions, y_batch)

        # Backward
        dp = criterion.backward(predictions, y_batch)
        net.backward(x_batch, dp)

        # Update weights
        sgd_momentum(net.getParameters(),
                     net.getGradParameters(),
                     optimizer_config,
                     optimizer_state)

        loss_history.append(loss)

    print('Current loss: %f' % loss)

if loss <= 0.01:
    print("Well done")
else:
    print("Something's wrong!")
In [ ]:
class ReLU(Module):
    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, input):
        self.output = <Your code. Please remember to use np.maximum and not np.max>
        return self.output

    def updateGradInput(self, input, gradOutput):
        self.gradInput = <gradient of loss w.r.t. input (passing through ReLU)>
        return self.gradInput
In [ ]:
x = np.random.randn(10, 6) - 0.5
dout = np.random.randn(10, 6) - 0.5

dx_num = eval_numerical_gradient_array(ReLU().updateOutput, x, dout)
dx = ReLU().updateGradInput(x, dout)

print('Testing ReLU (errors should be < 1e-6):')
print('\t dx error: ', rel_error(dx_num, dx))
Let's now try to build an actual neural network on the MNIST dataset.
In [ ]:
import os
from classwork_auxiliary import load_dataset
X_train,y_train,X_test,y_test = load_dataset()
In [ ]:
from classwork_auxiliary import Sequential,sgd_momentum
In [ ]:
net = Sequential()
net.add(Linear(28*28, 10))  # you may want to replace this single layer with more layers once you get the basic setup working
net.add(SoftMax())
criterion = ClassNLLCriterion()
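Once the single-layer setup works, a deeper model is just more modules added to the Sequential container. A hedged sketch (the hidden size of 100 is arbitrary and it assumes the ReLU module defined above):
In [ ]:
# a deeper alternative (uncomment to try): two-layer perceptron with a ReLU nonlinearity
# net = Sequential()
# net.add(Linear(28*28, 100))
# net.add(ReLU())
# net.add(Linear(100, 10))
# net.add(SoftMax())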
In [ ]:
loss_train_history = []
loss_validation_history = []
optimizer_config = {'learning_rate' : 1e-1, 'momentum': 0.9}
optimizer_state = {}
In [ ]:
n_epoch = 40
batch_size = 1000
learning_rate = 0.001  # try decreasing it over time

for i in range(n_epoch):
    for x_batch, y_batch in get_batches((X_train, y_train), batch_size):
        net.zeroGradParameters()

        predictions = net.forward(x_batch)
        loss_train = criterion.forward(predictions, y_batch)
        loss_train_history.append(loss_train)

        dp = criterion.backward(predictions, y_batch)
        net.backward(x_batch, dp)

        sgd_momentum(net.getParameters(), net.getGradParameters(), optimizer_config, optimizer_state)

    # estimate the validation loss on a random batch from the test set
    test_idx = np.random.randint(0, X_test.shape[0], batch_size)
    loss_test = criterion.forward(net.forward(X_test[test_idx]), y_test[test_idx])
    loss_validation_history.append(loss_test)

    print('epoch %s: rate = %f, loss_train = %f, loss_test = %f' % (i, learning_rate, loss_train, loss_test))
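The two history lists collected above are not visualized anywhere in the notebook; here is a minimal plotting sketch (note that loss_train_history holds one value per batch while loss_validation_history holds one per epoch, hence the separate panels):
In [ ]:
# plot the recorded losses: training loss per batch, validation loss per epoch
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(loss_train_history)
plt.xlabel('batch')
plt.ylabel('train loss')
plt.subplot(1, 2, 2)
plt.plot(loss_validation_history)
plt.xlabel('epoch')
plt.ylabel('validation loss')
plt.show()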
In [ ]: