Part of iPyMacLern project.
Copyright (C) 2016 by Eka A. Kurniawan
eka.a.kurniawan(ta)gmail(tod)com
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.
In [1]:
# Display graph inline
%matplotlib inline
# Display graphs in 'retina' format for Macs with Retina displays. Otherwise, use PNG or SVG format.
%config InlineBackend.figure_format = 'retina'
#%config InlineBackend.figure_format = 'PNG'
#%config InlineBackend.figure_format = 'SVG'
In [2]:
import sys
print("Python %s" % sys.version)
In [3]:
import numpy as np
print("NumPy %s" % np.__version__)
In [4]:
import scipy
import scipy.io as sio
print("SciPy %s" % scipy.__version__)
In [5]:
import time
Load the dataset from the MATLAB-formatted file.$^{[1]}$
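Before parsing the file, its contents can be listed as a quick sanity check. This is a minimal sketch, assuming `data.mat` sits in the working directory:

In [ ]:
# List the variables stored in the MATLAB file as (name, shape, MATLAB class) tuples
print(sio.whosmat('data.mat'))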
In [6]:
def read_dataset(N):
    dataset = sio.loadmat('data.mat')
    struct = dataset['data']
    data = struct[0, 0]
    print("Dataset Info")
    print("------------")
    num_dimensions = data['trainData'].shape[0]
    print("Number dimensions :%18s" % num_dimensions)
    D = num_dimensions - 1
    print("Number input dimensions :%18s" % D)
    # Get training data
    print("Number training data :%18s" % data['trainData'].shape[1])
    print("Mini batches size :%18s" % N)
    M = int(data['trainData'].shape[1] / N)
    print("Number mini batches :%18s" % M)
    # The -1 converts MATLAB's 1-based word indices to 0-based indices
    # (also applied to the validation and testing splits below)
    training_input = data['trainData'][:D, :(N*M)].reshape((D, M, N)) - 1
    print("Training input shape :%18s" % str(training_input.shape))
    training_target = data['trainData'][D, :(N*M)].reshape(1, M, N) - 1
    print("Training target shape :%18s" % str(training_target.shape))
    # Get validation data
    validation_input = data['validData'][:D, :] - 1
    print("Validation input shape :%18s" % str(validation_input.shape))
    validation_target = data['validData'][D, :] - 1
    print("Validation target shape :%18s" % str(validation_target.shape))
    # Get testing data
    testing_input = data['testData'][:D, :] - 1
    print("Testing input shape :%18s" % str(testing_input.shape))
    testing_target = data['testData'][D, :] - 1
    print("Testing target shape :%18s" % str(testing_target.shape))
    # Get vocabulary
    vocabulary = [word[0] for word in data['vocab'][0]]
    print("Vocabulary size :%18s" % len(vocabulary))
    return training_input, training_target, validation_input, validation_target, \
        testing_input, testing_target, vocabulary
In [7]:
training_input, training_target, validation_input, validation_target, \
testing_input, testing_target, vocabulary = read_dataset(100)
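Each training case is a 4-gram: three input words (the context) and the word that follows. The sketch below peeks at a few cases from the first mini batch; the indices are already 0-based after the -1 shift in read_dataset, so they index directly into vocabulary:

In [ ]:
# Print the first three 4-grams of mini batch 0 as words
for n in range(3):
    context = [vocabulary[int(i)] for i in training_input[:, 0, n]]
    target = vocabulary[int(training_target[0, 0, n])]
    print(" ".join(context), "->", target)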
In [8]:
def perform_forwardpropagation(input_batch, \
                               embedding_layer_weights, hidden_layer_weights, output_layer_weights, \
                               hidden_layer_bias, output_layer_bias):
    # Setup neural network parameters
    num_words, mini_batch_size = input_batch.shape
    vocabulary_size, embedding_layer_size = embedding_layer_weights.shape
    hidden_layer_size = hidden_layer_weights.shape[1]
    # Compute embedding layer state: look up the embedding of every input word and stack
    # the word vectors so that rows w*embedding_layer_size to (w+1)*embedding_layer_size
    # hold the embedding of input word w for every case in the mini batch
    embedding_layer_state = embedding_layer_weights[ \
        input_batch.reshape(num_words * mini_batch_size)].reshape( \
        num_words, mini_batch_size, embedding_layer_size).transpose( \
        0, 2, 1).reshape(num_words * embedding_layer_size, mini_batch_size)
    # Compute hidden layer state (logistic activation)
    hidden_layer_state = 1 / (1 + np.exp(-(np.dot( \
        np.transpose(hidden_layer_weights), embedding_layer_state) + hidden_layer_bias)))
    # Compute output layer state (softmax, shifted by the column maximum for numerical stability)
    inputs_to_softmax = np.dot(np.transpose(output_layer_weights), hidden_layer_state) + output_layer_bias
    max_inputs_to_softmax = np.max(inputs_to_softmax, axis=0)
    inputs_to_softmax = inputs_to_softmax - max_inputs_to_softmax
    output_layer_state = np.exp(inputs_to_softmax)
    output_layer_state = output_layer_state / np.sum(output_layer_state, axis=0)
    return embedding_layer_state, hidden_layer_state, output_layer_state
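In equation form, the forward pass above computes (with $e$ the stacked word embeddings, $\sigma$ the logistic function, and $W_h$, $W_o$, $b_h$, $b_o$ the weights and biases):

$$h = \sigma\!\left(W_h^\top e + b_h\right), \qquad z = W_o^\top h + b_o, \qquad y_i = \frac{\exp\!\left(z_i - \max_j z_j\right)}{\sum_k \exp\!\left(z_k - \max_j z_j\right)}$$

Subtracting the column maximum $\max_j z_j$ leaves the softmax probabilities unchanged but keeps the exponentials from overflowing.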
In [9]:
def perform_training(num_epochs, mini_batch_size):
    tic = time.time()
    # Setup neural network and learning parameters
    lmda = 0.1         # learning rate
    momentum = 0.9
    embedding_layer_size = 50
    hidden_layer_size = 200
    init_sigma = 0.01  # standard deviation of the initial weights
    # Setup display parameters
    training_interval = 100
    validation_interval = 1000
    # Read dataset
    training_input, training_target, validation_input, validation_target, \
        testing_input, testing_target, vocabulary = read_dataset(mini_batch_size)
    # Setup neural network data
    print("")
    print("Neural Network Info")
    print("-------------------")
    input_layer_size, num_mini_batches, mini_batch_size = training_input.shape
    output_layer_size = len(vocabulary)
    # Setup embedding layer
    embedding_layer_weights = init_sigma * np.random.randn(output_layer_size, embedding_layer_size)
    print("Embedding layer weights shape :%18s" % str(embedding_layer_weights.shape))
    embedding_layer_weights_delta = np.zeros([output_layer_size, embedding_layer_size])
    print("Embedding layer weights delta shape :%18s" % str(embedding_layer_weights_delta.shape))
    embedding_layer_weights_gradient = np.zeros([output_layer_size, embedding_layer_size])
    print("Embedding layer weights gradient shape :%18s" % str(embedding_layer_weights_gradient.shape))
    # Setup hidden layer
    hidden_layer_weights = init_sigma * np.random.randn(input_layer_size * embedding_layer_size, hidden_layer_size)
    print("Hidden layer weights shape :%18s" % str(hidden_layer_weights.shape))
    hidden_layer_weights_delta = np.zeros([input_layer_size * embedding_layer_size, hidden_layer_size])
    print("Hidden layer weights delta shape :%18s" % str(hidden_layer_weights_delta.shape))
    hidden_layer_bias = np.zeros((hidden_layer_size, 1))
    print("Hidden layer bias shape :%18s" % str(hidden_layer_bias.shape))
    hidden_layer_bias_delta = np.zeros((hidden_layer_size, 1))
    print("Hidden layer bias delta shape :%18s" % str(hidden_layer_bias_delta.shape))
    # Setup output layer
    output_layer_weights = init_sigma * np.random.randn(hidden_layer_size, output_layer_size)
    print("Output layer weights shape :%18s" % str(output_layer_weights.shape))
    output_layer_weights_delta = np.zeros([hidden_layer_size, output_layer_size])
    print("Output layer weights delta shape :%18s" % str(output_layer_weights_delta.shape))
    output_layer_bias = np.zeros((output_layer_size, 1))
    print("Output layer bias shape :%18s" % str(output_layer_bias.shape))
    output_layer_bias_delta = np.zeros((output_layer_size, 1))
    print("Output layer bias delta shape :%18s" % str(output_layer_bias_delta.shape))
    expansion = np.eye(output_layer_size)
    print("Expansion shape :%18s" % str(expansion.shape))
    # Training epoch
    count = 0
    tiny = np.exp(-30)
    for epoch in range(num_epochs):
        print("")
        print("Epoch", epoch + 1)
        this_chunk_CE = 0
        trainset_CE = 0
        # Training mini batch
        for m in range(num_mini_batches):
            # Perform forwardpropagation
            # --------------------------
            training_input_batch = training_input[:, m, :]
            embedding_layer_state, hidden_layer_state, output_layer_state = \
                perform_forwardpropagation(training_input_batch, \
                    embedding_layer_weights, hidden_layer_weights, output_layer_weights, \
                    hidden_layer_bias, output_layer_bias)
            # Compute cost
            training_target_batch = training_target[:, m, :][0]
            expanded_training_target_batch = expansion[:, training_target_batch]
            error_derivative = output_layer_state - expanded_training_target_batch
            # Compute cross entropy (CE)
            CE = -np.sum(expanded_training_target_batch * np.log(output_layer_state + tiny)) / mini_batch_size
            # Display cross entropy (CE)
            count = count + 1
            this_chunk_CE = this_chunk_CE + ((CE - this_chunk_CE) / count)
            if (m % training_interval) == 0:
                print("Batch %5d Training CE %6.3f" % (m, this_chunk_CE))
                count = 0
                this_chunk_CE = 0
            trainset_CE = trainset_CE + ((CE - trainset_CE) / (m + 1))
            # Perform backpropagation
            # -----------------------
            # Output layer backpropagation
            output_layer_weights_gradient = np.dot(hidden_layer_state, np.transpose(error_derivative))
            output_layer_bias_gradient = error_derivative.sum(axis=1, keepdims=True)
            backpropagation_derivative_1 = np.dot(output_layer_weights, error_derivative) * \
                hidden_layer_state * (1 - hidden_layer_state)
            # Hidden layer backpropagation
            hidden_layer_weights_gradient = np.dot(embedding_layer_state, np.transpose(backpropagation_derivative_1))
            hidden_layer_bias_gradient = backpropagation_derivative_1.sum(axis=1, keepdims=True)
            backpropagation_derivative_2 = np.dot(hidden_layer_weights, backpropagation_derivative_1)
            # Update embedding layer weights
            embedding_layer_weights_gradient[:] = 0
            for w in range(input_layer_size):
                embedding_layer_weights_gradient = embedding_layer_weights_gradient + \
                    np.dot(expansion[:, training_input_batch[w, :]], \
                        np.transpose(backpropagation_derivative_2[w*embedding_layer_size : \
                            ((w+1)*embedding_layer_size), :]))
            embedding_layer_weights_delta = (momentum * embedding_layer_weights_delta) + \
                (embedding_layer_weights_gradient / mini_batch_size)
            embedding_layer_weights = embedding_layer_weights - (lmda * embedding_layer_weights_delta)
            # Update hidden layer weights
            hidden_layer_weights_delta = (momentum * hidden_layer_weights_delta) + \
                (hidden_layer_weights_gradient / mini_batch_size)
            hidden_layer_weights = hidden_layer_weights - (lmda * hidden_layer_weights_delta)
            # Update output layer weights
            output_layer_weights_delta = (momentum * output_layer_weights_delta) + \
                (output_layer_weights_gradient / mini_batch_size)
            output_layer_weights = output_layer_weights - (lmda * output_layer_weights_delta)
            # Update hidden layer bias
            hidden_layer_bias_delta = (momentum * hidden_layer_bias_delta) + \
                (hidden_layer_bias_gradient / mini_batch_size)
            hidden_layer_bias = hidden_layer_bias - (lmda * hidden_layer_bias_delta)
            # Update output layer bias
            output_layer_bias_delta = (momentum * output_layer_bias_delta) + \
                (output_layer_bias_gradient / mini_batch_size)
            output_layer_bias = output_layer_bias - (lmda * output_layer_bias_delta)
            # Perform validation
            # ------------------
            if (m % validation_interval) == 0:
                embedding_layer_state, hidden_layer_state, output_layer_state = \
                    perform_forwardpropagation(validation_input, \
                        embedding_layer_weights, hidden_layer_weights, output_layer_weights, \
                        hidden_layer_bias, output_layer_bias)
                validation_data_size = validation_input.shape[1]
                expanded_validation_target = expansion[:, validation_target]
                CE = -np.sum(expanded_validation_target * np.log(output_layer_state + tiny)) / validation_data_size
                print("############################## Validation CE %6.3f" % CE)
    # Perform testing
    # ---------------
    embedding_layer_state, hidden_layer_state, output_layer_state = \
        perform_forwardpropagation(testing_input, \
            embedding_layer_weights, hidden_layer_weights, output_layer_weights, \
            hidden_layer_bias, output_layer_bias)
    testing_data_size = testing_input.shape[1]
    expanded_test_target = expansion[:, testing_target]
    CE = -np.sum(expanded_test_target * np.log(output_layer_state + tiny)) / testing_data_size
    print("")
    print("##############################")
    print("Final testing CE %6.3f" % CE)
    print("##############################")
    toc = time.time()
    print("")
    print("Total runtime %.2f seconds" % (toc - tic))
    model = {
        "embedding_layer_weights": embedding_layer_weights,
        "hidden_layer_weights": hidden_layer_weights,
        "output_layer_weights": output_layer_weights,
        "hidden_layer_bias": hidden_layer_bias,
        "output_layer_bias": output_layer_bias,
        "vocabulary": vocabulary,
    }
    return model
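The quantities computed above follow directly from the code: for a mini batch of $N$ cases with one-hot targets $t$ and softmax outputs $y$, the cross entropy and the momentum update of each parameter $\theta$ are

$$\mathrm{CE} = -\frac{1}{N}\sum_{n=1}^{N}\sum_{i} t_{i,n}\,\log\!\left(y_{i,n} + \epsilon\right), \qquad \Delta\theta \leftarrow \mu\,\Delta\theta + \frac{\partial \mathrm{CE}}{\partial \theta}, \qquad \theta \leftarrow \theta - \lambda\,\Delta\theta$$

where $\mu = 0.9$ is the momentum, $\lambda = 0.1$ the learning rate (lmda), and $\epsilon = e^{-30}$ the small constant tiny that keeps the logarithm finite. The division by mini_batch_size in the delta updates is the $1/N$ of the batch average.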
In [10]:
model = perform_training(num_epochs=2, mini_batch_size=100)
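The returned model dictionary can also be used for next-word prediction. The helper below is a minimal sketch, not part of the original notebook: predict_next_word is a hypothetical name, and the example words are assumed to be in the vocabulary. It forward-propagates a single 3-word context and prints the k most probable next words:

In [ ]:
def predict_next_word(words, model, k=5):
    # Hypothetical helper: forward-propagate one 3-word context and list the top-k next words
    vocabulary = model["vocabulary"]
    idxs = np.array([[vocabulary.index(word)] for word in words])  # shape (3, 1): a mini batch of one
    _, _, output_layer_state = perform_forwardpropagation(idxs, \
        model["embedding_layer_weights"], model["hidden_layer_weights"], model["output_layer_weights"], \
        model["hidden_layer_bias"], model["output_layer_bias"])
    probabilities = output_layer_state[:, 0]
    for i in np.argsort(probabilities)[::-1][:k]:
        print("%12s %10.5f" % (vocabulary[i], probabilities[i]))

predict_next_word(['life', 'in', 'new'], model, k=5)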
In [11]:
def display_nearest_words(word, model, k):
    embedding_layer_weights = model["embedding_layer_weights"]
    vocabulary = model["vocabulary"]
    if word in vocabulary:
        idx = vocabulary.index(word)
    else:
        return None
    vocabulary_size = len(vocabulary)
    word_representation = embedding_layer_weights[idx, :]
    square_distance = np.sum(np.square(embedding_layer_weights - word_representation), axis=1)
    top_k_sorted_indices = np.argsort(square_distance)[1:k+1]
    for i in top_k_sorted_indices:
        print("%12s %10.3f" % (vocabulary[i], square_distance[i]))
In [12]:
display_nearest_words('city', model, k=10)