Basics of Neural Networks - Numpy Demo

Author(s): kozyr@google.com

We show how to train a very simple neural network from scratch (we're using nothing but numpy!).

Setup

Identical to keras version


In [0]:
import numpy as np

# Set up the data and network:
n_outputs = 5  # We're attempting to learn XOR in this example, so our inputs and outputs have the same number of features.
n_hidden_units = 10  # We'll use a single hidden layer with this number of hidden units in it.
n_obs = 500  # How many observations of the XOR input to output vector will we use for learning?

# How quickly do we want to update our weights?
learning_rate = 0.1

# How many times will we try to use each observation to improve the weights?
epochs = 10  # Think of this as iterations if you like.

# Set random seed so that the exercise works out the same way for everyone:
np.random.seed(42)

Create some data to learn from

Identical to keras version


In [0]:
# Create the inputs:
training_vectors = np.random.binomial(1, 0.5, (n_obs, n_outputs))
# Each row is a binary vector to learn from.
print('One instance with ' + str(n_outputs) + ' features: ' + str(training_vectors[0]))

# Create the correct XOR outputs (t is for target in the functions below):
xor_training_vectors = training_vectors ^ 1  # XOR with 1 flips each bit, so the mapping is deterministic.
print('Correct label (simply XOR):    ' + str(xor_training_vectors[0]))
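
As a quick sanity check (an illustrative addition, not part of the Keras version), XOR-ing a binary vector with 1 simply flips each bit, so every target is the elementwise complement of its input:


In [0]:
# Illustrative sanity check: XOR with 1 flips every bit of a binary vector.
example_bits = np.array([1, 0, 1, 1, 0])
print('input:  ' + str(example_bits))
print('target: ' + str(example_bits ^ 1))  # Expect [0 1 0 0 1].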

Select activation and loss functions

Only in numpy version


In [0]:
# Define an activation function and its derivative:
def activ(x):
  # We'll use a sigmoid function:
  return 1.0 / (1.0 + np.exp(-x))

def activ_prime(x):
  # Derivative of the sigmoid function:
  # d/dx 1 / (1 + exp(-x)) = -(-exp(-x)) * (1 + exp(-x)) ^ (-2)
  return  np.exp(-x) / ((1.0 + np.exp(-x)) ** 2)
  
# Define a loss function and the gradient we'll use for backprop:
def loss(prediction, truth):
  # We'll choose cross entropy loss for this demo.
  return -np.mean(truth * np.log(prediction) + (1 - truth) * np.log(1 - prediction))

def loss_prime(prediction, truth):
  # Gradient (elementwise) of cross entropy loss wrt the output layer's pre-activation:
  # d/dy (-t log(y) - (1-t)log(1-y)) = -t/y + (1-t)/(1-y) = (y - t) / (y(1-y))
  # Multiplying by the sigmoid derivative y(1-y) cancels the denominator, leaving y - t,
  # which is why back_prop below doesn't multiply this term by activ_prime(u2).
  # (This matches the summed cross entropy; the 1/n from the mean in loss() above only
  # rescales the gradient by a constant factor, which the learning rate absorbs.)
  return prediction - truth
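
To build confidence in the derivatives above, here is a small finite-difference check (an illustrative sketch, not part of the original notebook) comparing activ_prime against a numerical slope of activ; the two numbers should agree to several decimal places:


In [0]:
# Illustrative finite-difference check of the sigmoid derivative (not used in training).
x_check = 0.3
eps = 1e-6
numeric_slope = (activ(x_check + eps) - activ(x_check - eps)) / (2 * eps)
print('analytic: %.8f  numeric: %.8f' % (activ_prime(x_check), numeric_slope))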

Initialize the weights

Only in numpy version


In [0]:
# Simplest way to initialize is to choose weights uniformly at random between -1 and 1:
weights1 = np.random.uniform(low=-1, high=1, size=(n_outputs, n_hidden_units))
weights2 = np.random.uniform(low=-1, high=1, size=(n_hidden_units, n_outputs))
# Note: there are much better ways to initialize weights, but our goal is simplicity here.
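
For reference, one widely used alternative is Xavier/Glorot initialization, which scales the random range by the fan-in and fan-out of the layer. The sketch below is purely illustrative and is not used anywhere else in this demo:


In [0]:
# Illustrative sketch only: Xavier/Glorot-style uniform initialization.
def xavier_uniform(n_in, n_out):
  limit = np.sqrt(6.0 / (n_in + n_out))
  return np.random.uniform(low=-limit, high=limit, size=(n_in, n_out))

# e.g. xavier_uniform(n_outputs, n_hidden_units) has the same shape as weights1 above.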

Forward propagation

Only in numpy version


In [0]:
def forward_prop(x, w1, w2):
  """Implements forward propagation.
  
  Args:
    x: the input vector.
    w1: first set of weights mapping the input to layer 1.
    w2: second set of weights mapping layer 1 to layer 2.

  Returns:
    u1: unactivated unit values from layer 1 in forward prop
    u2: unactivated unit values from layer 2 in forward prop
    a1: activated unit values from layer 1 in forward prop          
    a2: activated unit values from layer 2 in forward prop
    lab: the predicted binary label vector (a2 thresholded at 0.5)
  """
  u1 = np.dot(x, w1)  # u for unactivated weighted sum unit (other authors might prefer to call it z)
  a1 = activ(u1)  # a for activated unit
  u2 = np.dot(a1, w2)
  a2 = activ(u2)
  # Let's output predicted labels too, by converting continuous a2 to binary:
  lab = (a2 > 0.5).astype(int)
  return u1, u2, a1, a2, lab
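
Before training, it can help to run forward prop once on a single instance to confirm that the shapes line up (a sanity check added for illustration; the predictions themselves are meaningless because the weights are still random):


In [0]:
# Illustrative shape check on one instance with the untrained weights.
u1, u2, a1, a2, lab = forward_prop(training_vectors[0], weights1, weights2)
print('hidden layer a1 shape: ' + str(a1.shape))  # Expect (n_hidden_units,).
print('output layer a2 shape: ' + str(a2.shape))  # Expect (n_outputs,).
print('predicted label:       ' + str(lab))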

Backward propagation

Only in numpy version


In [0]:
def back_prop(x, t, u1, u2, a1, a2, w1, w2):
  """Implements backward propagation.
  
  Args:
    x: the input vector
    t: the desired output vector.
    u1: unactivated unit values from layer 1 in forward prop
    u2: unactivated unit values from layer 2 in forward prop
    a1: activated unit values from layer 1 in forward prop          
    a2: activated unit values from layer 2 in forward prop
    w1: first set of weights mapping the input to layer 1.
    w2: second set of weights mapping layer 1 to layer 2.
  Returns: 
    d1: gradients for weights w1, used for updating w1
    d2: gradients for weights w2, used for updating w2
  """
  e2 = loss_prime(a2, t)  # e is for error; this is the "error" at the final layer (the gradient of the loss wrt u2)
  d2 = np.outer(a1, e2)  # d is for delta; this is the gradient value for updating weights w2
  e1 = np.dot(w2, e2) * activ_prime(u1)  # e is for error
  d1 = np.outer(x, e1)  # d is for delta; this is the gradient update for the first set of weights w1
  return d1, d2  # We only need to return the weight updates
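
A standard way to test hand-written backprop is a numerical gradient check. The sketch below (added for illustration, not part of the original demo) perturbs one entry of weights1 and compares the finite-difference slope of the loss against the corresponding entry of d1. Since loss_prime matches the summed (rather than mean) cross entropy, the check uses a summed version of the loss:


In [0]:
# Illustrative gradient check on a single weight, using the first training instance.
def loss_sum(prediction, truth):
  # Summed cross entropy; loss_prime returns the gradient of this quantity.
  return -np.sum(truth * np.log(prediction) + (1 - truth) * np.log(1 - prediction))

x0, t0 = training_vectors[0], xor_training_vectors[0]
u1, u2, a1, a2, _ = forward_prop(x0, weights1, weights2)
d1, d2 = back_prop(x0, t0, u1, u2, a1, a2, weights1, weights2)

eps = 1e-5
w_plus, w_minus = weights1.copy(), weights1.copy()
w_plus[0, 0] += eps
w_minus[0, 0] -= eps
numeric_grad = (loss_sum(forward_prop(x0, w_plus, weights2)[3], t0) -
                loss_sum(forward_prop(x0, w_minus, weights2)[3], t0)) / (2 * eps)
print('analytic: %.8f  numeric: %.8f' % (d1[0, 0], numeric_grad))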

Train the neural network!

Only in numpy version


In [0]:
# Train
for epoch in range(epochs):
  loss_tracker = []

  for i in range(training_vectors.shape[0]):
    # Input one obs at a time: x = training_vectors[i] (inputs) and t = xor_training_vectors[i] (targets)
    # Forward propagation:
    u1, u2, a1, a2, labels = forward_prop(training_vectors[i], weights1, weights2)
    # Backward propagation:
    d1, d2 = back_prop(training_vectors[i], xor_training_vectors[i],
                       u1, u2, a1, a2, weights1, weights2)
    # Update the weights:
    weights1 -= learning_rate * d1
    weights2 -= learning_rate * d2
    loss_tracker.append(loss(prediction=a2, truth=xor_training_vectors[i]))

  print('Epoch: %d, Average Loss: %.8f' % (epoch + 1, np.mean(loss_tracker)))

Validate

Almost identical to keras version


In [0]:
# Print performance to screen:
def get_performance(n_valid, w1, w2):
  """Computes performance and prints it to screen.
  
  Args:
    n_valid: number of validation instances we'd like to simulate.
    w1: first set of weights mapping the input to layer 1.
    w2: second set of weights mapping layer 1 to layer 2.
          
  Returns:
    None
  """
  flawless_tracker = []
  validation_vectors = np.random.binomial(1, 0.5, (n_valid, n_outputs))
  xor_validation_vectors = validation_vectors ^ 1

  for i in range(n_valid):
    u1, u2, a1, a2, labels = forward_prop(validation_vectors[i], w1, w2)
    if i < 3:
      print('********')
      print('Challenge ' + str(i + 1) + ': ' + str(validation_vectors[i]))
      print('Predicted ' + str(i + 1) + ': ' + str(labels))
      print('Correct   ' + str(i + 1) + ': ' + str(xor_validation_vectors[i]))
    instance_score = (np.array_equal(labels, xor_validation_vectors[i]))
    flawless_tracker.append(instance_score)
    
  print('\nProportion of flawless instances on ' + str(n_valid) +
        ' new examples: ' + str(round(100 * np.mean(flawless_tracker), 0)) + '%')

In [0]:
get_performance(5000, weights1, weights2)