Part of iPyMacLern project.

Copyright (C) 2016 by Eka A. Kurniawan

eka.a.kurniawan(ta)gmail(tod)com

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Display Settings


In [1]:
# Display graph inline
%matplotlib inline

# Display graph in 'retina' format for Mac with retina display. Others, use PNG or SVG format.
%config InlineBackend.figure_format = 'retina'
#%config InlineBackend.figure_format = 'PNG'
#%config InlineBackend.figure_format = 'SVG'

Tested On


In [2]:
import sys
print("Python %s" % sys.version)


Python 3.5.2 (default, Jun 27 2016, 03:10:38) 
[GCC 4.2.1 Compatible Apple LLVM 7.0.2 (clang-700.1.81)]

In [3]:
import numpy as np
print("NumPy %s" % np.__version__)


NumPy 1.11.1

In [4]:
import scipy
import scipy.io as sio
print("SciPy %s" % scipy.__version__)


SciPy 0.18.0

Imports


In [5]:
import time

Table of Contents


Dataset

Load dataset from MATLAB formated dataset.$^{[1]}$


In [6]:
def read_dataset(N):
    dataset = sio.loadmat('data.mat')
    struct = dataset['data']
    data = struct[0,0]
    
    print("Dataset Info")
    print("------------")
    
    num_dimensions = data['trainData'].shape[0]
    print("Number dimensions                        :%18s" % num_dimensions)
    D = num_dimensions - 1
    print("Number input dimensions                  :%18s" % D)
    
    # Get training data
    print("Number training data                     :%18s" % data['trainData'].shape[1])
    print("Mini batches size                        :%18s" % N)
    M = int(data['trainData'].shape[1] / N)
    print("Number mini batches                      :%18s" % M)
    
    training_input = data['trainData'][:D, :(N*M)].reshape((D, M, N)) - 1
    print("Training input shape                     :%18s" % str(training_input.shape))
    training_target = data['trainData'][D, :(N*M)].reshape(1, M, N) - 1
    print("Training target shape                    :%18s" % str(training_target.shape))
    
    # Get validation data
    validation_input = data['validData'][:D, :] - 1
    print("Validation input shape                   :%18s" % str(validation_input.shape))
    validation_target = data['validData'][D, :] - 1
    print("Validation target shape                  :%18s" % str(validation_target.shape))
    
    # Get testing data
    testing_input = data['testData'][:D, :] - 1
    print("Testing input shape                      :%18s" % str(testing_input.shape))
    testing_target = data['testData'][D, :] - 1
    print("Testing target shape                     :%18s" % str(testing_target.shape))
    
    # Get vocabulary
    vocabulary = [word[0] for word in data['vocab'][0]]
    print("Vocabulary size                          :%18s" % len(vocabulary))
    
    return training_input, training_target, validation_input, validation_target, \
           testing_input, testing_target, vocabulary

In [7]:
training_input, training_target, validation_input, validation_target, \
    testing_input, testing_target, vocabulary = read_dataset(100)


Dataset Info
------------
Number dimensions                        :                 4
Number input dimensions                  :                 3
Number training data                     :            372550
Mini batches size                        :               100
Number mini batches                      :              3725
Training input shape                     :    (3, 3725, 100)
Training target shape                    :    (1, 3725, 100)
Validation input shape                   :        (3, 46568)
Validation target shape                  :          (46568,)
Testing input shape                      :        (3, 46568)
Testing target shape                     :          (46568,)
Vocabulary size                          :               250

Training


In [8]:
def perform_forwardpropagation(input_batch, \
                               embedding_layer_weights, hidden_layer_weights, output_layer_weights, \
                               hidden_layer_bias, output_layer_bias):
    
    # Setup neural network parameters
    num_words, mini_batch_size = input_batch.shape
    vocabulary_size, embedding_layer_size = embedding_layer_weights.shape
    hidden_layer_size = hidden_layer_weights.shape[1]
    
    # Compute embedding layer state
    embedding_layer_state = np.transpose( \
        embedding_layer_weights[ \
            input_batch.reshape(num_words * mini_batch_size)]).reshape( \
                embedding_layer_size * num_words, mini_batch_size)
    
    # Compute hidden layer state
    hidden_layer_state = 1 / (1 + np.exp(-np.dot( \
                np.transpose(hidden_layer_weights), embedding_layer_state) + hidden_layer_bias))

    # Compute output layer state
    inputs_to_softmax = np.dot(np.transpose(output_layer_weights), hidden_layer_state) + output_layer_bias
    max_inputs_to_softmax = np.max(inputs_to_softmax, axis=0)
    inputs_to_softmax = inputs_to_softmax - max_inputs_to_softmax
    output_layer_state = np.exp(inputs_to_softmax)
    output_layer_state = output_layer_state / np.sum(output_layer_state, axis=0)
    
    return embedding_layer_state, hidden_layer_state, output_layer_state

In [9]:
def perform_training(num_epochs, mini_batch_size):
    tic = time.time()
    
    # Setup neural network and learning parameters
    lmda = 0.1                      # learning rate
    momentum = 0.9
    embedding_layer_size = 50
    hidden_layer_size = 200
    init_sigma = 0.01               # standard deviation of initial weight
    
    # Setup display parameters
    training_interval = 100
    validation_interval = 1000

    # Read dataset
    training_input, training_target, validation_input, validation_target, \
        testing_input, testing_target, vocabulary = read_dataset(mini_batch_size)

    # Setup neural network data
    print("")
    print("Neural Network Info")
    print("-------------------")
        
    [input_layer_size, num_mini_batches, mini_batch_size] = training_input.shape
    output_layer_size = len(vocabulary)

    # Setup embedding layer
    embedding_layer_weights = init_sigma * np.random.randn(output_layer_size, embedding_layer_size)
    print("Embedding layer weights shape            :%18s" % str(embedding_layer_weights.shape))
    embedding_layer_weights_delta = np.zeros([output_layer_size, embedding_layer_size])
    print("Embedding layer weights delta shape      :%18s" % str(embedding_layer_weights_delta.shape))
    embedding_layer_weights_gradient = np.zeros([output_layer_size, embedding_layer_size])
    print("Embedding layer weights gradient shape   :%18s" % str(embedding_layer_weights_gradient.shape))
    
    # Setup hidden layer
    hidden_layer_weights = init_sigma * np.random.randn(input_layer_size * embedding_layer_size, hidden_layer_size)
    print("Hidden layer weights shape               :%18s" % str(hidden_layer_weights.shape))
    hidden_layer_weights_delta = np.zeros([input_layer_size * embedding_layer_size, hidden_layer_size])
    print("Hidden layer weights delta shape         :%18s" % str(hidden_layer_weights_delta.shape))
    hidden_layer_bias = np.zeros((hidden_layer_size, 1))
    print("Hidden layer bias shape                  :%18s" % str(hidden_layer_bias.shape))
    hidden_layer_bias_delta = np.zeros((hidden_layer_size, 1))
    print("Hidden layer bias delta shape            :%18s" % str(hidden_layer_bias_delta.shape))

    # Setup output layer
    output_layer_weights = init_sigma * np.random.randn(hidden_layer_size, output_layer_size)
    print("Output layer weights shape               :%18s" % str(output_layer_weights.shape))
    output_layer_weights_delta = np.zeros([hidden_layer_size, output_layer_size])
    print("Output layer weights delta shape         :%18s" % str(output_layer_weights_delta.shape))
    output_layer_bias = np.zeros((output_layer_size, 1))
    print("Output layer bias shape                  :%18s" % str(output_layer_bias.shape))
    output_layer_bias_delta = np.zeros((output_layer_size, 1))
    print("Output layer bias delta shape            :%18s" % str(output_layer_bias_delta.shape))
    
    expansion = np.eye(output_layer_size)
    print("Expansion shape                          :%18s" % str(expansion.shape))

    # Training epoch
    count = 0
    tiny = np.exp(-30)
    for epoch in range(num_epochs):
        
        print("")
        print("Epoch", epoch + 1)
        this_chunk_CE = 0
        trainset_CE = 0
        
        # Training mini batch
        for m in range(num_mini_batches):
            
            # Perform forwardpropagation
            # --------------------------
            training_input_batch = training_input[:, m, :]
            embedding_layer_state, hidden_layer_state, output_layer_state = \
                perform_forwardpropagation(training_input_batch, \
                                           embedding_layer_weights, hidden_layer_weights, output_layer_weights, \
                                           hidden_layer_bias, output_layer_bias)

            # Compute cost
            training_target_batch = training_target[:, m, :][0]
            expanded_training_target_batch = expansion[:, training_target_batch]
            error_derivative = output_layer_state - expanded_training_target_batch

            # Compute cross entrophy (CE)
            CE = -np.sum(expanded_training_target_batch * np.log(output_layer_state + tiny)) / mini_batch_size

            # Display cross entrophy (CE)
            count = count + 1
            this_chunk_CE = this_chunk_CE + ((CE - this_chunk_CE) / count)
            if (m % training_interval) == 0:
                print("Batch %5d Training CE %6.3f" % (m, this_chunk_CE))
                count = 0
                this_chunk_CE = 0
            trainset_CE = trainset_CE + ((CE - trainset_CE) / (m + 1));
            
            # Perform backpropagation
            # -----------------------

            # Output layer backpropagation
            output_layer_weights_gradient = np.dot(hidden_layer_state, np.transpose(error_derivative))
            output_layer_bias_gradient = error_derivative.sum(axis=1, keepdims=True)
            backpropagation_derivative_1 = np.dot(output_layer_weights, error_derivative) * \
                hidden_layer_state * (1 - hidden_layer_state)

            # Hidden layer backpropagation
            hidden_layer_weights_gradient = np.dot(embedding_layer_state, np.transpose(backpropagation_derivative_1))
            hidden_layer_bias_gradient = backpropagation_derivative_1.sum(axis=1, keepdims=True)
            backpropagation_derivative_2 = np.dot(hidden_layer_weights, backpropagation_derivative_1)

            # Update embedding layer weights
            embedding_layer_weights_gradient[:] = 0
            for w in range(input_layer_size):
                embedding_layer_weights_gradient = embedding_layer_weights_gradient + \
                    np.dot(expansion[:, training_input_batch[w, :]], \
                           np.transpose(backpropagation_derivative_2[w*embedding_layer_size : \
                                                                     ((w+1)*embedding_layer_size), :]))
            embedding_layer_weights_delta = (momentum * embedding_layer_weights_delta) + \
                (embedding_layer_weights_gradient / mini_batch_size)
            embedding_layer_weights = embedding_layer_weights - (lmda * embedding_layer_weights_delta)

            # Update hidden layer weights
            hidden_layer_weights_delta = (momentum * hidden_layer_weights_delta) + \
                (hidden_layer_weights_gradient / mini_batch_size)
            hidden_layer_weights = hidden_layer_weights - (lmda * hidden_layer_weights_delta)

            # Update output layer weights
            output_layer_weights_delta = (momentum * output_layer_weights_delta) + \
                (output_layer_weights_gradient / mini_batch_size)
            output_layer_weights = output_layer_weights - (lmda * output_layer_weights_delta)

            # Update hidden layer bias
            hidden_layer_bias_delta = (momentum * hidden_layer_bias_delta) + \
                (hidden_layer_bias_gradient / mini_batch_size)
            hidden_layer_bias = hidden_layer_bias - (lmda * hidden_layer_bias_delta)

            # Update output layer bias
            output_layer_bias_delta = (momentum * output_layer_bias_delta) + \
                (output_layer_bias_gradient / mini_batch_size)
            output_layer_bias = output_layer_bias - (lmda * output_layer_bias_delta)
            
            # Perform validation
            # ------------------
            if (m % validation_interval) == 0:
                embedding_layer_state, hidden_layer_state, output_layer_state = \
                    perform_forwardpropagation(validation_input, \
                                               embedding_layer_weights, hidden_layer_weights, output_layer_weights, \
                                               hidden_layer_bias, output_layer_bias)
                validation_data_size = validation_input.shape[1]
                expanded_validation_target = expansion[:, validation_target]
                CE = -np.sum(expanded_validation_target * np.log(output_layer_state + tiny)) / validation_data_size
                print("############################## Validation CE %6.3f" % CE)
            
            
    # Perform testing
    # ---------------
    embedding_layer_state, hidden_layer_state, output_layer_state = \
        perform_forwardpropagation(testing_input, \
                                   embedding_layer_weights, hidden_layer_weights, output_layer_weights, \
                                   hidden_layer_bias, output_layer_bias)
    testing_data_size = testing_input.shape[1]
    expanded_test_target = expansion[:, testing_target]
    CE = -np.sum(expanded_test_target * np.log(output_layer_state + tiny)) / testing_data_size
    print("")
    print("##############################")
    print("Final testing CE %6.3f" % CE)
    print("##############################")
    
    toc = time.time()
    print("")
    print("Total runtime %.2f seconds" % (toc - tic))
    
    model = {
        "embedding_layer_weights": embedding_layer_weights,
        "hidden_layer_weights": hidden_layer_weights,
        "output_layer_weights": output_layer_weights,
        "hidden_layer_bias": hidden_layer_bias, 
        "output_layer_bias": output_layer_bias, 
        "vocabulary": vocabulary,    
    }

    return model

In [10]:
model = perform_training(num_epochs=2, mini_batch_size=100)


Dataset Info
------------
Number dimensions                        :                 4
Number input dimensions                  :                 3
Number training data                     :            372550
Mini batches size                        :               100
Number mini batches                      :              3725
Training input shape                     :    (3, 3725, 100)
Training target shape                    :    (1, 3725, 100)
Validation input shape                   :        (3, 46568)
Validation target shape                  :          (46568,)
Testing input shape                      :        (3, 46568)
Testing target shape                     :          (46568,)
Vocabulary size                          :               250

Neural Network Info
-------------------
Embedding layer weights shape            :         (250, 50)
Embedding layer weights delta shape      :         (250, 50)
Embedding layer weights gradient shape   :         (250, 50)
Hidden layer weights shape               :        (150, 200)
Hidden layer weights delta shape         :        (150, 200)
Hidden layer bias shape                  :          (200, 1)
Hidden layer bias delta shape            :          (200, 1)
Output layer weights shape               :        (200, 250)
Output layer weights delta shape         :        (200, 250)
Output layer bias shape                  :          (250, 1)
Output layer bias delta shape            :          (250, 1)
Expansion shape                          :        (250, 250)

Epoch 1
Batch     0 Training CE  5.516
############################## Validation CE  5.330
Batch   100 Training CE  4.596
Batch   200 Training CE  4.533
Batch   300 Training CE  4.508
Batch   400 Training CE  4.533
Batch   500 Training CE  4.516
Batch   600 Training CE  4.569
Batch   700 Training CE  4.581
Batch   800 Training CE  4.541
Batch   900 Training CE  4.633
Batch  1000 Training CE  4.595
############################## Validation CE  4.575
Batch  1100 Training CE  4.608
Batch  1200 Training CE  4.676
Batch  1300 Training CE  4.659
Batch  1400 Training CE  4.684
Batch  1500 Training CE  4.619
Batch  1600 Training CE  4.584
Batch  1700 Training CE  4.466
Batch  1800 Training CE  4.406
Batch  1900 Training CE  4.343
Batch  2000 Training CE  4.290
############################## Validation CE  4.362
Batch  2100 Training CE  4.297
Batch  2200 Training CE  4.262
Batch  2300 Training CE  4.262
Batch  2400 Training CE  4.143
Batch  2500 Training CE  4.054
Batch  2600 Training CE  4.029
Batch  2700 Training CE  3.962
Batch  2800 Training CE  3.863
Batch  2900 Training CE  3.822
Batch  3000 Training CE  3.676
############################## Validation CE  3.720
Batch  3100 Training CE  3.663
Batch  3200 Training CE  3.604
Batch  3300 Training CE  3.521
Batch  3400 Training CE  3.508
Batch  3500 Training CE  3.456
Batch  3600 Training CE  3.464
Batch  3700 Training CE  3.365

Epoch 2
Batch     0 Training CE  0.137
############################## Validation CE  3.363
Batch   100 Training CE  3.382
Batch   200 Training CE  3.371
Batch   300 Training CE  3.320
Batch   400 Training CE  3.290
Batch   500 Training CE  3.250
Batch   600 Training CE  3.288
Batch   700 Training CE  3.264
Batch   800 Training CE  3.206
Batch   900 Training CE  3.201
Batch  1000 Training CE  3.205
############################## Validation CE  3.178
Batch  1100 Training CE  3.163
Batch  1200 Training CE  3.180
Batch  1300 Training CE  3.129
Batch  1400 Training CE  3.173
Batch  1500 Training CE  3.155
Batch  1600 Training CE  3.122
Batch  1700 Training CE  3.110
Batch  1800 Training CE  3.118
Batch  1900 Training CE  3.088
Batch  2000 Training CE  3.049
############################## Validation CE  3.096
Batch  2100 Training CE  3.084
Batch  2200 Training CE  3.078
Batch  2300 Training CE  3.043
Batch  2400 Training CE  3.045
Batch  2500 Training CE  3.042
Batch  2600 Training CE  3.069
Batch  2700 Training CE  3.071
Batch  2800 Training CE  3.030
Batch  2900 Training CE  3.032
Batch  3000 Training CE  2.977
############################## Validation CE  2.994
Batch  3100 Training CE  2.987
Batch  3200 Training CE  2.964
Batch  3300 Training CE  2.946
Batch  3400 Training CE  2.987
Batch  3500 Training CE  2.971
Batch  3600 Training CE  2.978
Batch  3700 Training CE  2.936

##############################
Final testing CE  2.947
##############################

Total runtime 44.72 seconds

Find Nearest Words


In [11]:
def display_nearest_words(word, model, k):
    embedding_layer_weights = model["embedding_layer_weights"]
    vocabulary = model["vocabulary"]
    
    if word in vocabulary:
        idx = vocabulary.index(word)
    else:
        return None
    
    vocabulary_size = len(vocabulary)
    word_representation = embedding_layer_weights[idx, :]
    square_distance = np.sum(np.square(embedding_layer_weights - word_representation), axis=1)
    top_k_sorted_indices = np.argsort(square_distance)[1:k+1]
    for i in top_k_sorted_indices:
        print("%12s %10.3f" % (vocabulary[i], square_distance[i]))

In [12]:
display_nearest_words('city', model, k=10)


      school      0.252
       music      0.304
     country      0.307
     company      0.328
       house      0.337
       state      0.381
  government      0.391
     program      0.406
      market      0.422
      states      0.426

References

  1. G. Hinton, 2016. Neural Networks for Machine Learning. Week 5 Programming Assignment 2: Learning Word Representations. University of Toronto. Coursera. https://www.coursera.org/learn/neural-networks