Layer Gradient Checks

Here we use numerical gradient checking to verify that backpropagation is implemented correctly for every layer in the Layers folder. The reported errors should be very small but nonzero, since the check only approximates the true gradient numerically.
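
For reference, the check used throughout is a centered-difference approximation compared against the analytic gradient via a relative error. The sketch below is only an illustration of what lml.utils.relative_error, numerical_gradient, and numerical_gradient_layer compute; the actual implementations in the library may differ in details such as step size and edge-case handling.

import numpy as np

def relative_error(a, b, eps=1e-8):
    # Elementwise |a - b| scaled by the magnitudes of the values being compared.
    return np.abs(a - b) / np.maximum(np.abs(a) + np.abs(b), eps)

def numerical_gradient(f, x, accuracy=1e-6):
    # Centered differences of a scalar-valued f with respect to the array x.
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        orig = x[idx]
        x[idx] = orig + accuracy
        f_plus = f(x)
        x[idx] = orig - accuracy
        f_minus = f(x)
        x[idx] = orig          # restore the perturbed entry
        grad[idx] = (f_plus - f_minus) / (2 * accuracy)
        it.iternext()
    return grad

def numerical_gradient_layer(f, x, dout, accuracy=1e-6):
    # For an array-valued layer f, contract the numerically estimated Jacobian
    # with the upstream gradient dout (chain rule) by differentiating sum(f(x) * dout).
    return numerical_gradient(lambda v: np.sum(f(v) * dout), x, accuracy)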


In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
import LearnyMcLearnface as lml

Affine Layer

Layers/AffineLayer.py


In [2]:
affine = lml.layers.AffineLayer(30, 10, 1e-2)
test_input = np.random.randn(50, 30)
dout = np.random.randn(50, 10)
_ = affine.forward(test_input)
dx_num = lml.utils.numerical_gradient_layer(lambda x : affine.forward(x, affine.W, affine.b), test_input, dout)
dW_num = lml.utils.numerical_gradient_layer(lambda w : affine.forward(test_input, w, affine.b), affine.W, dout)
db_num = lml.utils.numerical_gradient_layer(lambda b : affine.forward(test_input, affine.W, b), affine.b, dout)
dx = affine.backward(dout)
print('Affine dx error:', np.max(lml.utils.relative_error(dx, dx_num)))
print('Affine dW error:', np.max(lml.utils.relative_error(affine.dW, dW_num)))
print('Affine db error:', np.max(lml.utils.relative_error(affine.db, db_num)))


Affine dx error: 7.96200384655e-08
Affine dW error: 2.70896030674e-05
Affine db error: 1.26310666338e-08

Batch Normalization Layer

Layers/BatchnormLayer.py


In [5]:
batchnorm = lml.layers.BatchnormLayer(10, 0.9)
test_input = np.random.randn(20, 10)
dout = np.random.randn(20, 10)
_ = batchnorm.forward_train(test_input)
dx_num = lml.utils.numerical_gradient_layer(lambda x : batchnorm.forward_train(x), test_input, dout)
dx = batchnorm.backward(dout)
print('Batchnorm dx error:', np.max(lml.utils.relative_error(dx, dx_num)))


Batchnorm dx error: 8.91957543496e-07

Dropout Layer

Layers/DropoutLayer.py


In [3]:
dropout = lml.layers.DropoutLayer(10, 0.6, seed=5684)
test_input = np.random.randn(3, 10)
dout = np.random.randn(3, 10)
_ = dropout.forward_train(test_input)
dx_num = lml.utils.numerical_gradient_layer(lambda x : dropout.forward_train(x), test_input, dout)
dx = dropout.backward(dout)
print('Dropout dx error:', np.max(lml.utils.relative_error(dx, dx_num)))


Dropout dx error: 3.38587871964e-12

PReLU (Parametric Rectified Linear Unit) Layer

Layers/PReLULayer.py


In [4]:
prelu = lml.layers.PReLULayer(10)
test_input = np.random.randn(50, 10)
dout = np.random.randn(50, 10)
_ = prelu.forward(test_input)
dx_num = lml.utils.numerical_gradient_layer(lambda x : prelu.forward(x, prelu.W), test_input, dout)
dW_num = lml.utils.numerical_gradient_layer(lambda w : prelu.forward(test_input, w), prelu.W, dout)
dx = prelu.backward(dout)
print('PReLU dx error:', np.max(lml.utils.relative_error(dx, dx_num)))
print('PReLU dW error:', np.max(lml.utils.relative_error(prelu.dW, dW_num)))


PReLU dx error: 3.27562981159e-12
PReLU dW error: 1.26310647589e-08

ReLU (Rectified Linear Unit) Layer

Layers/ReLULayer.py


In [5]:
relu = lml.layers.ReLULayer(10)
test_input = np.random.randn(50, 10)
dout = np.random.randn(50, 10)
_ = relu.forward(test_input)
dx_num = lml.utils.numerical_gradient_layer(lambda x : relu.forward(x), test_input, dout)
dx = relu.backward(dout)
print('ReLU dx error:', np.max(lml.utils.relative_error(dx, dx_num)))


ReLU dx error: 3.27562093902e-12

Sigmoid Layer

Layers/SigmoidLayer.py


In [6]:
sigmoid = lml.layers.SigmoidLayer(10)
test_input = np.random.randn(50, 10)
dout = np.random.randn(50, 10)
_ = sigmoid.forward(test_input)
dx_num = lml.utils.numerical_gradient_layer(lambda x : sigmoid.forward(x), test_input, dout)
dx = sigmoid.backward(dout)
print('Sigmoid dx error:', np.max(lml.utils.relative_error(dx, dx_num)))


Sigmoid dx error: 7.4464101776e-11

Softmax Loss Layer

Layers/SoftmaxLossLayer.py


In [7]:
softmax = lml.layers.SoftmaxLossLayer(10)
test_scores = np.random.randn(50, 10)
test_classes = np.random.randint(1, 10, 50)
_, dx = softmax.loss(test_scores, test_classes)
dx_num = lml.utils.numerical_gradient(lambda x : softmax.loss(x, test_classes)[0], test_scores)
print('Softmax backprop error:', np.max(lml.utils.relative_error(dx, dx_num)))


Softmax backprop error: 3.65136793454e-07

SVM Loss Layer

Layers/SVMLossLayer.py


In [8]:
svm = lml.layers.SVMLossLayer(10)
test_scores = np.random.randn(50, 10)
test_classes = np.random.randint(1, 10, 50)
_, dx = svm.loss(test_scores, test_classes)
dx_num = lml.utils.numerical_gradient(lambda x : svm.loss(x, test_classes)[0], test_scores)
print('SVM backprop error:', np.max(lml.utils.relative_error(dx, dx_num)))


SVM backprop error: 3.0387355051e-09

Tanh Layer

Layers/TanhLayer.py

In [ ]:
tanh = lml.layers.TanhLayer(10)
test_input = np.random.randn(50, 10)
dout = np.random.randn(50, 10)
_ = tanh.forward(test_input)
dx_num = lml.utils.numerical_gradient_layer(lambda x : tanh.forward(x), test_input, dout)
dx = tanh.backward(dout)
print('Tanh dx error:', np.max(lml.utils.relative_error(dx, dx_num)))

Full Model Gradient Checks

Two Layer Network

This is a gradient check for a simple example network with the following architecture: Affine, ReLU, Affine, SoftmaxLoss.


In [9]:
opts = {
    'input_dim' : 10,
    'data_type' : np.float64
}

nn = lml.NeuralNetwork(opts)
nn.add_layer('Affine', {'neurons':10, 'weight_scale':5e-2})
nn.add_layer('ReLU', {})
nn.add_layer('Affine', {'neurons':10, 'weight_scale':5e-2})
nn.add_layer('SoftmaxLoss', {})
test_scores = np.random.randn(20, 10)
test_classes = np.random.randint(1, 10, 20)
loss, dx = nn.backward(test_scores, test_classes)

print('With regularization off:')
f = lambda _: nn.backward(test_scores, test_classes)[0]
d_b1_num = lml.utils.numerical_gradient(f, nn.layers[0].b, accuracy=1e-8)
d_W1_num = lml.utils.numerical_gradient(f, nn.layers[0].W, accuracy=1e-8)
print('Weight 1 error:', np.max(lml.utils.relative_error(nn.layers[0].dW, d_W1_num)))
print('Bias 1 error:', np.max(lml.utils.relative_error(nn.layers[0].db, d_b1_num)))

d_b2_num = lml.utils.numerical_gradient(f, nn.layers[2].b, accuracy=1e-8)
d_W2_num = lml.utils.numerical_gradient(f, nn.layers[2].W, accuracy=1e-8)
print('Weight 2 error:', np.max(lml.utils.relative_error(nn.layers[2].dW, d_W2_num)))
print('Bias 2 error:', np.max(lml.utils.relative_error(nn.layers[2].db, d_b2_num)))

print('With regularization at lambda = 1.0:')
f = lambda _: nn.backward(test_scores, test_classes, reg_param=1.0)[0]
d_b1_num = lml.utils.numerical_gradient(f, nn.layers[0].b, accuracy=1e-8)
d_W1_num = lml.utils.numerical_gradient(f, nn.layers[0].W, accuracy=1e-8)
print('Weight 1 error:', np.max(lml.utils.relative_error(nn.layers[0].dW, d_W1_num)))
print('Bias 1 error:', np.max(lml.utils.relative_error(nn.layers[0].db, d_b1_num)))

d_b2_num = lml.utils.numerical_gradient(f, nn.layers[2].b, accuracy=1e-8)
d_W2_num = lml.utils.numerical_gradient(f, nn.layers[2].W, accuracy=1e-8)
print('Weight 2 error:', np.max(lml.utils.relative_error(nn.layers[2].dW, d_W2_num)))
print('Bias 2 error:', np.max(lml.utils.relative_error(nn.layers[2].db, d_b2_num)))


With regularization off:
Weight 1 error: 0.000232671597498
Bias 1 error: 6.69321159941e-06
Weight 2 error: 3.11573480685e-05
Bias 2 error: 8.05419756858e-06
With regularization at lambda = 1.0:
Weight 1 error: 0.000310478687826
Bias 1 error: 6.69321159941e-06
Weight 2 error: 0.00012777803333
Bias 2 error: 8.05419756858e-06

Multilayer Fully Connected Network with Augmentations


In [18]:
opts = {
    'input_dim' : 10,
    'data_type' : np.float64,
    'init_scheme' : 'xavier'
}
nn = lml.NeuralNetwork(opts)
nn.add_layer('Affine', {'neurons':10})
nn.add_layer('Batchnorm', {'decay':0.9})
nn.add_layer('PReLU', {})
nn.add_layer('Dropout', {'dropout_param':0.85, 'seed':5684})
nn.add_layer('Affine', {'neurons':10})
nn.add_layer('Batchnorm', {'decay':0.7})
nn.add_layer('PReLU', {})
nn.add_layer('Dropout', {'dropout_param':0.90, 'seed':5684})
nn.add_layer('Affine', {'neurons':10})
nn.add_layer('Batchnorm', {'decay':0.8})
nn.add_layer('PReLU', {})
nn.add_layer('Dropout', {'dropout_param':0.95, 'seed':5684})
nn.add_layer('SoftmaxLoss', {})
test_scores = np.random.randn(20, 10)
test_classes = np.random.randint(1, 10, 20)
loss, dx = nn.backward(test_scores, test_classes)

f = lambda _: nn.backward(test_scores, test_classes, reg_param=0.7)[0]
d_b1_num = lml.utils.numerical_gradient(f, nn.layers[0].b, accuracy=1e-8)
d_W1_num = lml.utils.numerical_gradient(f, nn.layers[0].W, accuracy=1e-8)
print('Weight 1 error:', np.max(lml.utils.relative_error(nn.layers[0].dW, d_W1_num)))
print('Bias 1 error:', np.max(lml.utils.relative_error(nn.layers[0].db, d_b1_num)))

d_gamma1_num = lml.utils.numerical_gradient(f, nn.layers[1].gamma, accuracy=1e-8)
d_beta1_num = lml.utils.numerical_gradient(f, nn.layers[1].beta, accuracy=1e-8)
print('Gamma 1 error:', np.max(lml.utils.relative_error(nn.layers[1].dgamma, d_gamma1_num)))
print('Beta 1 error:', np.max(lml.utils.relative_error(nn.layers[1].dbeta, d_beta1_num)))

d_r1_num = lml.utils.numerical_gradient(f, nn.layers[2].W, accuracy=1e-8)
print('Rectifier 1 error:', np.max(lml.utils.relative_error(nn.layers[2].dW, d_r1_num)))

d_b2_num = lml.utils.numerical_gradient(f, nn.layers[4].b, accuracy=1e-8)
d_W2_num = lml.utils.numerical_gradient(f, nn.layers[4].W, accuracy=1e-8)
print('Weight 2 error:', np.max(lml.utils.relative_error(nn.layers[4].dW, d_W2_num)))
print('Bias 2 error:', np.max(lml.utils.relative_error(nn.layers[4].db, d_b2_num)))

d_gamma2_num = lml.utils.numerical_gradient(f, nn.layers[5].gamma, accuracy=1e-8)
d_beta2_num = lml.utils.numerical_gradient(f, nn.layers[5].beta, accuracy=1e-8)
print('Gamma 2 error:', np.max(lml.utils.relative_error(nn.layers[5].dgamma, d_gamma2_num)))
print('Beta 2 error:', np.max(lml.utils.relative_error(nn.layers[5].dbeta, d_beta2_num)))

d_r2_num = lml.utils.numerical_gradient(f, nn.layers[6].W, accuracy=1e-8)
print('Rectifier 2 error:', np.max(lml.utils.relative_error(nn.layers[6].dW, d_r2_num)))

d_b3_num = lml.utils.numerical_gradient(f, nn.layers[8].b, accuracy=1e-8)
d_W3_num = lml.utils.numerical_gradient(f, nn.layers[8].W, accuracy=1e-8)
print('Weight 3 error:', np.max(lml.utils.relative_error(nn.layers[8].dW, d_W3_num)))
print('Bias 3 error:', np.max(lml.utils.relative_error(nn.layers[8].db, d_b3_num)))

d_gamma3_num = lml.utils.numerical_gradient(f, nn.layers[9].gamma, accuracy=1e-8)
d_beta3_num = lml.utils.numerical_gradient(f, nn.layers[9].beta, accuracy=1e-8)
print('Gamma 3 error:', np.max(lml.utils.relative_error(nn.layers[9].dgamma, d_gamma3_num)))
print('Beta 3 error:', np.max(lml.utils.relative_error(nn.layers[9].dbeta, d_beta3_num)))

d_r3_num = lml.utils.numerical_gradient(f, nn.layers[10].W, accuracy=1e-8)
print('Rectifier 3 error:', np.max(lml.utils.relative_error(nn.layers[10].dW, d_r3_num)))


Weight 1 error: 8.28487521302e-06
Bias 1 error: 1.56125112838e-09
Gamma 1 error: 1.69510018079e-05
Beta 1 error: 2.38134081683e-06
Rectifier 1 error: 0.000153965816538
Weight 2 error: 9.65103604759e-06
Bias 2 error: 3.77302356025e-09
Gamma 2 error: 1.81235947865e-06
Beta 2 error: 0.000133279190625
Rectifier 2 error: 3.04310565055e-06
Weight 3 error: 2.28544553579e-05
Bias 3 error: 2.86229373536e-09
Gamma 3 error: 4.36018210461e-06
Beta 3 error: 1.3767299458e-06
Rectifier 3 error: 4.48608977908e-05