Author(s): kozyr@google.com
We show how to train a very simple neural network from scratch (we're using nothing but numpy!).
In [0]:
import numpy as np
# Set up the data and network:
n_outputs = 5 # We're attempting to learn XOR in this example, so our inputs and outputs will have the same length.
n_hidden_units = 10 # We'll use a single hidden layer with this number of hidden units in it.
n_obs = 500 # How many input-to-output observation pairs will we use for learning?
# How quickly do we want to update our weights?
learning_rate = 0.1
# How many times will we try to use each observation to improve the weights?
epochs = 10 # Think of this as iterations if you like.
# Set random seed so that the exercise works out the same way for everyone:
np.random.seed(42)
In [0]:
# Create the inputs:
training_vectors = np.random.binomial(1, 0.5, (n_obs, n_outputs))
# Each row is a binary vector to learn from.
print('One instance with ' + str(n_outputs) + ' features: ' + str(training_vectors[0]))
# Create the correct XOR outputs (t is for target):
xor_training_vectors = training_vectors ^ 1 # XOR each input with the all-ones vector (a bitwise flip); everything is deterministic.
print('Correct label (simply XOR): ' + str(xor_training_vectors[0]))
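As a quick aside (a sketch, not needed for the rest of the demo): XOR-ing a binary vector with 1 flips every bit, so the target is simply the elementwise NOT of the input.
In [0]:
example = np.array([0, 1, 1, 0, 1])
print(example ^ 1)  # [1 0 0 1 0]
print(1 - example)  # Same thing, written arithmetically.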
In [0]:
# Define an activation function and its derivative:
def activ(x):
    # We'll use a sigmoid function:
    return 1.0 / (1.0 + np.exp(-x))


def activ_prime(x):
    # Derivative of the sigmoid function:
    # d/dx 1 / (1 + exp(-x)) = -(-exp(-x)) * (1 + exp(-x)) ^ (-2)
    return np.exp(-x) / ((1.0 + np.exp(-x)) ** 2)
# Define a loss function and its derivative wrt the pre-sigmoid output (logit):
def loss(prediction, truth):
    # We'll choose cross entropy loss for this demo.
    return -np.mean(truth * np.log(prediction) + (1 - truth) * np.log(1 - prediction))


def loss_prime(prediction, truth):
    # Derivative (elementwise) of cross entropy loss wrt the logit u, where prediction y = sigmoid(u).
    # By the chain rule: d/du (-t log(y) - (1-t) log(1-y)) = (-t/y + (1-t)/(1-y)) * y(1-y)
    #                                                      = -t(1-y) + (1-t)y = y - t
    return prediction - truth
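The derivatives above are easy to sanity-check numerically. The cell below is an optional sketch (not part of the original flow): it compares activ_prime against a centered finite-difference estimate of activ's slope at a few points; the two columns should agree closely.
In [0]:
eps = 1e-6
for x in (-2.0, 0.0, 3.0):
    numeric = (activ(x + eps) - activ(x - eps)) / (2 * eps)
    print('x = %+.1f  analytic = %.8f  numeric = %.8f' % (x, activ_prime(x), numeric))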
In [0]:
# Simplest way to initialize is to choose weights uniformly at random between -1 and 1:
weights1 = np.random.uniform(low=-1, high=1, size=(n_outputs, n_hidden_units))
weights2 = np.random.uniform(low=-1, high=1, size=(n_hidden_units, n_outputs))
# Note: there are much better ways to initialize weights, but our goal is simplicity here.
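For reference, here is a sketch of one of those better ways: Xavier/Glorot uniform initialization, which scales the range by the layer's fan-in and fan-out. It is not used in the rest of this demo, and it draws from a separate RandomState so the global seed's stream stays untouched.
In [0]:
rng = np.random.RandomState(0)  # Separate stream; doesn't disturb np.random.seed(42) above.
limit = np.sqrt(6.0 / (n_outputs + n_hidden_units))  # Fan-in + fan-out is the same for both layers here.
xavier_weights1 = rng.uniform(low=-limit, high=limit, size=(n_outputs, n_hidden_units))
xavier_weights2 = rng.uniform(low=-limit, high=limit, size=(n_hidden_units, n_outputs))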
In [0]:
def forward_prop(x, w1, w2):
    """Implements forward propagation.

    Args:
        x: the input vector.
        w1: first set of weights mapping the input to layer 1.
        w2: second set of weights mapping layer 1 to layer 2.

    Returns:
        u1: unactivated unit values from layer 1 in forward prop
        u2: unactivated unit values from layer 2 in forward prop
        a1: activated unit values from layer 1 in forward prop
        a2: activated unit values from layer 2 in forward prop
        lab: the output label
    """
    u1 = np.dot(x, w1)  # u for unactivated weighted sum unit (other authors might prefer to call it z)
    a1 = activ(u1)  # a for activated unit
    u2 = np.dot(a1, w2)
    a2 = activ(u2)
    # Let's output predicted labels too, by converting continuous a2 to binary:
    lab = (a2 > 0.5).astype(int)
    return u1, u2, a1, a2, lab
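As a quick usage check (a sketch, not part of the original flow), we can push one training vector through the still-untrained network and look at the shapes and the thresholded labels:
In [0]:
u1_demo, u2_demo, a1_demo, a2_demo, lab_demo = forward_prop(training_vectors[0], weights1, weights2)
print('hidden activations shape: ' + str(a1_demo.shape))  # (n_hidden_units,)
print('output activations shape: ' + str(a2_demo.shape))  # (n_outputs,)
print('predicted labels (untrained): ' + str(lab_demo))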
In [0]:
def back_prop(x, t, u1, u2, a1, a2, w1, w2):
    """Implements backward propagation.

    Args:
        x: the input vector.
        t: the desired output vector.
        u1: unactivated unit values from layer 1 in forward prop
        u2: unactivated unit values from layer 2 in forward prop
        a1: activated unit values from layer 1 in forward prop
        a2: activated unit values from layer 2 in forward prop
        w1: first set of weights mapping the input to layer 1.
        w2: second set of weights mapping layer 1 to layer 2.

    Returns:
        d1: gradients for weights w1, used for updating w1
        d2: gradients for weights w2, used for updating w2
    """
    e2 = loss_prime(a2, t)  # e is for error; this is the "error" effect in the final layer (wrt the layer-2 logits)
    d2 = np.outer(a1, e2)  # d is for delta; this is the gradient value for updating weights w2
    e1 = np.dot(w2, e2) * activ_prime(u1)  # e is for error, propagated back through w2 and the sigmoid
    d1 = np.outer(x, e1)  # d is for delta; this is the gradient for updating the first set of weights w1
    return d1, d2  # We only need to output the gradients used for the updates.
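Before training, it's worth checking the gradients. The cell below is an optional sketch (not part of the original demo): it compares one entry of d2 against a centered finite-difference estimate. Note that back_prop's gradients correspond to the cross entropy summed over the outputs (loss() above takes the mean, which only rescales by a constant factor), so the check uses a summed version.
In [0]:
def summed_loss(prediction, truth):
    # Same cross entropy as loss(), but summed rather than averaged over outputs.
    return -np.sum(truth * np.log(prediction) + (1 - truth) * np.log(1 - prediction))

x_chk, t_chk = training_vectors[0], xor_training_vectors[0]
u1_c, u2_c, a1_c, a2_c, _ = forward_prop(x_chk, weights1, weights2)
d1_c, d2_c = back_prop(x_chk, t_chk, u1_c, u2_c, a1_c, a2_c, weights1, weights2)

eps = 1e-6
w2_plus = weights2.copy()
w2_plus[0, 0] += eps
w2_minus = weights2.copy()
w2_minus[0, 0] -= eps
numeric = (summed_loss(forward_prop(x_chk, weights1, w2_plus)[3], t_chk) -
           summed_loss(forward_prop(x_chk, weights1, w2_minus)[3], t_chk)) / (2 * eps)
print('analytic: %.8f  numeric: %.8f' % (d2_c[0, 0], numeric))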
In [0]:
# Train
for epoch in range(epochs):
    loss_tracker = []
    for i in range(training_vectors.shape[0]):
        # Input one obs at a time: x = training_vectors[i] (inputs) and t = xor_training_vectors[i] (targets)
        # Forward propagation:
        u1, u2, a1, a2, labels = forward_prop(training_vectors[i], weights1, weights2)
        # Backward propagation:
        d1, d2 = back_prop(training_vectors[i], xor_training_vectors[i],
                           u1, u2, a1, a2, weights1, weights2)
        # Update the weights:
        weights1 -= learning_rate * d1
        weights2 -= learning_rate * d2
        loss_tracker.append(loss(prediction=a2, truth=xor_training_vectors[i]))
    print('Epoch: %d, Average Loss: %.8f' % (epoch + 1, np.mean(loss_tracker)))
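The loop above updates the weights one observation at a time (stochastic gradient descent). As an aside, the same gradients can be computed for the whole training set at once with matrix products; the sketch below (not part of the original demo) shows one full-batch forward/backward pass, with gradients averaged over observations.
In [0]:
A1 = activ(np.dot(training_vectors, weights1))  # Shape: (n_obs, n_hidden_units)
A2 = activ(np.dot(A1, weights2))                # Shape: (n_obs, n_outputs)
E2 = A2 - xor_training_vectors                  # Error at the output layer (wrt the layer-2 logits)
D2 = np.dot(A1.T, E2) / n_obs                   # Average gradient for weights2
E1 = np.dot(E2, weights2.T) * A1 * (1 - A1)     # Error at the hidden layer; A1 * (1 - A1) is the sigmoid derivative
D1 = np.dot(training_vectors.T, E1) / n_obs     # Average gradient for weights1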
In [0]:
# Print performance to screen:
def get_performance(n_valid, w1, w2):
    """Computes performance and prints it to screen.

    Args:
        n_valid: number of validation instances we'd like to simulate.
        w1: first set of weights mapping the input to layer 1.
        w2: second set of weights mapping layer 1 to layer 2.

    Returns:
        None
    """
    flawless_tracker = []
    validation_vectors = np.random.binomial(1, 0.5, (n_valid, n_outputs))
    xor_validation_vectors = validation_vectors ^ 1
    for i in range(n_valid):
        u1, u2, a1, a2, labels = forward_prop(validation_vectors[i], w1, w2)
        if i < 3:
            print('********')
            print('Challenge ' + str(i + 1) + ': ' + str(validation_vectors[i]))
            print('Predicted ' + str(i + 1) + ': ' + str(labels))
            print('Correct   ' + str(i + 1) + ': ' + str(xor_validation_vectors[i]))
        instance_score = np.array_equal(labels, xor_validation_vectors[i])
        flawless_tracker.append(instance_score)
    print('\nProportion of flawless instances on ' + str(n_valid) +
          ' new examples: ' + str(round(100 * np.mean(flawless_tracker), 0)) + '%')
In [0]:
get_performance(5000, weights1, weights2)