In [1]:
import numpy as np
np.random.seed(0)
from scipy.io import loadmat
from scipy import optimize
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.image import NonUniformImage
from matplotlib import cm
matplotlib.style.use('ggplot')
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
file_path = '../course_materials/ex4data1.mat'
data = loadmat(file_path)
weights_file_path = '../course_materials/ex4weights.mat'
weights = loadmat(weights_file_path)
print(weights['Theta1'].shape)
print(weights['Theta2'].shape)
print(type(weights))
In [3]:
def get_data(file_path):
    data = loadmat(file_path)
    X = data['X']
    y = data['y']
    return X, y

def get_β(layer):
    '''Generate β-matrix for every layer in Neural Network'''
    β_set = ()
    for i in range(len(layer)-1):
        # recommendation from Andrew Ng: initialisation window is ±(6/(inLayer + outLayer))**0.5
        low, high = -(6/(layer[i]+layer[i+1]))**0.5, (6/(layer[i]+layer[i+1]))**0.5
        β_set += (np.random.uniform(low, high, (layer[i+1], layer[i]+1)),)
        # β_set += (np.zeros((outLayer, inLayer+1)),)
    return β_set

def flatten_β(β_set):
    β_flat = β_set[0].flatten()
    for β in β_set[1:]:
        β_flat = np.concatenate((β_flat, β.flatten()), axis=-1)
    return β_flat

def reshape_β(β, layer):
    splitIndex = 0
    splitIndices = []
    for i in range(len(layer)-2):
        splitIndex += (layer[i]+1)*layer[i+1]
        splitIndices += [splitIndex]
    splitβ = np.split(β, splitIndices)
    reshapedβ = ()
    for i in range(len(splitβ)):
        reshapedβ += (splitβ[i].reshape(layer[i+1], layer[i]+1),)
    return reshapedβ

def get_sigmoid(z):
    return 1/(1+np.exp(-z))

def forward_propagation(β_flat, layer, X_flat, n_samples):
    '''Forward Propagation is the hypothesis function for Neural Networks'''
    β_set = reshape_β(β_flat, layer)
    # H_0 (5000, 400)
    H = X_flat.reshape(n_samples, -1)
    # Z_H = ()
    H_byLayer = ()
    for β in β_set:
        # print(H.shape)
        # Z_l (5000, k_l); l indexes the layer [0, ..., l]; k_l is the number of neurons in layer l
        Z = np.dot(np.insert(H, 0, 1, axis=1), β.T)
        # H_l (5000, k_l); l indexes the layer [0, ..., l]; k_l is the number of neurons in layer l
        H = get_sigmoid(Z)
        # Z_H += ((Z, H),)
        H_byLayer += (H,)
    # H_2 (5000, 10)
    return H_byLayer

def get_sigmoid_gradient(Z):
    return get_sigmoid(Z)*(1-get_sigmoid(Z))

def cost_function(β_flat, layer, X_flat, n_samples, y, yUnique, λ = 0.):
    X = X_flat.reshape(n_samples, -1)
    Y = np.array([yUnique]*y.shape[0]) == y
    β_set = reshape_β(β_flat, layer)
    J = 0
    for n in range(n_samples):
        x_n = X[n:n+1, :]
        y_n = Y[n:n+1, :]
        # hypothesis vector h_n(1, 10)
        h_n = forward_propagation(β_flat, layer, x_n, 1)[len(β_set)-1]
        # cost-function scalar j_n(1, 1) = y_n(1, 10)*h_n.T(10, 1)
        j_n = (- np.dot(y_n, np.log(h_n).T) - np.dot((1-y_n), np.log(1-h_n).T))
        J += j_n
    # regularisation term (R)
    cumulativeR = 0
    for β in β_set:
        cumulativeR += np.sum(β*β)  # element-wise multiplication
    cumulativeR *= λ/(2*n_samples)
    return J[0][0]/n_samples + cumulativeR
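For reference, cost_function above computes the multi-class cross-entropy cost. With m = n_samples, h the final output of forward_propagation and Y the one-hot form of y, it corresponds to

J(β) = (1/m) Σ_n Σ_k [ -Y_nk log(h_nk) - (1-Y_nk) log(1-h_nk) ] + (λ/(2m)) Σ_l Σ_ij (β_ij^(l))²

where, as written in the code, the regularisation sum runs over every entry of every β-matrix, including the bias columns.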
The input-data matrix X(5000, 400) consists of 5000 digit images of 20 by 20 pixels (400 pixels each).
The output-data vector y(5000, 1) holds the assigned digit labels (1 through 10, where 10 represents the digit '0').
The neural network in this work has one input layer (400 neurons), one hidden layer (25 neurons) and one output layer (10 neurons); accordingly, the pre-trained weight matrices loaded above have shapes (25, 401) and (10, 26), the extra column in each accounting for the bias unit.
To initialise a simple neural network, one has to do the following:
In [4]:
# Set number of neurons in every layer (including input and output layers)
layer = 400, 25, 10
# Extract and flatten input matrix X
X, y = get_data(file_path)
n_samples, n_variables = X.shape
X_flat = X.flatten()
yUnique = np.unique(y)
# Initialise β-matrices: β_test from the pre-trained weights, β_initial at random
β_test = flatten_β((weights['Theta1'], weights['Theta2']))
β_initial = flatten_β(get_β(layer))
print(X.shape)
print(y.shape)
for β in get_β(layer): print(β.shape)
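A quick sanity check of the shapes printed above (a minimal sketch; the assertions simply restate the layer sizes set in this cell):

# hypothetical sanity check: the flattened β vector holds one entry per weight, bias columns included
assert X.shape == (5000, 400) and y.shape == (5000, 1)
assert β_initial.size == 25*(400+1) + 10*(25+1)  # 10285 parameters in total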
In [5]:
# either the transformed Y or y together with yUnique can be supplied to a function
# Y = np.array([yUnique]* y.shape[0]) == y
# print(Y[0:0+1,:].shape)
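The broadcasting trick above turns the label column y into a one-hot boolean matrix Y by comparing each label against the row of unique classes. A minimal sketch on toy labels (hypothetical values, not the digit data used here):

# toy example of the one-hot comparison used above
y_toy = np.array([[1], [3], [2]])      # labels as a column vector (3, 1)
classes = np.array([1, 2, 3])          # unique classes
Y_toy = np.array([classes]*y_toy.shape[0]) == y_toy
# Y_toy is a (3, 3) boolean matrix:
# [[ True, False, False],
#  [False, False,  True],
#  [False,  True, False]]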
In [6]:
print(forward_propagation(β_test, layer, X_flat, n_samples)[1].shape)
print(forward_propagation(β_test, layer, X[0:0+1,:], 1)[1].shape)
In [7]:
print(X.shape)
print(X[0][None,:].shape)
# cost_function(β_test, layer, X.flatten(), n_samples, y, yUnique, λ = 0.)
cost_function(β_test, layer, X[0:5000][None,:].flatten(), 5000, y, yUnique, λ = 0.)
Out[7]:
In [8]:
print(cost_function(β_test, layer, X_flat, n_samples, y, yUnique, λ = 0.))
print(cost_function(β_test, layer, X_flat, n_samples, y, yUnique, λ = 1.))
print(cost_function(β_test, layer, X[0][None,:].flatten(), 1, y, yUnique, λ = 0.))
print(cost_function(β_initial, layer, X_flat, n_samples, y, yUnique, λ = 1.))
In [9]:
def back_propagation(β_flat, layer, X_flat, n_samples, y, yUnique):
    Y = np.array([yUnique]*y.shape[0]) == y
    β_set = reshape_β(β_flat, layer)
    deltaSet = ()
    # activations H_l for every layer; the last entry is the hypothesis matrix H(5000, 10)
    H = forward_propagation(β_flat, layer, X_flat, n_samples)
    # error matrix E(5000, 10)
    E = H[len(layer)-2] - Y
    for l in reversed(range(len(layer)-1)):
        E = np.dot(E*get_sigmoid_gradient(H[l]), β_set[l])[:, 1:]
        deltaSet = (np.dot(H[l].T, np.insert(E, 0, 1, axis=1)),) + deltaSet
    flatDelta = flatten_β(deltaSet)
    return β_flat + flatDelta/n_samples
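Since back_propagation is later passed to the optimiser as its jac argument, its return value has to be a flat vector that lines up with β_flat. A minimal check along those lines (a sketch, not part of the original exercise):

# hypothetical alignment check for the back-propagation output
grad = back_propagation(β_test, layer, X_flat, n_samples, y, yUnique)
assert grad.shape == β_test.shape
for d in reshape_β(grad, layer):
    print(d.shape)  # expected: (25, 401) and (10, 26), matching Theta1 and Theta2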
In [10]:
Y = np.array([yUnique]* y.shape[0]) == y
# print(Y.shape)
β_set = reshape_β(β_test, layer)
# print(len(β_set))
deltaSet = ()
# activations H_l for every layer; the last entry is the hypothesis matrix H(5000, 10)
H = forward_propagation(β_test, layer, X_flat, n_samples)
# print (len(H))
# error matrix E(5000, 10)
E = H[len(layer)-2] - Y
# print(E.shape)
for l in reversed(range(len(layer)-1)):
    E = np.dot(E*get_sigmoid_gradient(H[l]), β_set[l])[:, 1:]
    print(E.shape)
    deltaSet = (np.dot(H[l].T, np.insert(E, 0, 1, axis=1)),) + deltaSet
print(len(deltaSet))
print(deltaSet[0].shape)
print(deltaSet[1].shape)
flatDelta = flatten_β(deltaSet)
print(β_test.shape)
f = β_test + flatDelta/n_samples
f[3915]
Out[10]:
In [11]:
β_initial = flatten_β(get_β(layer))
a = back_propagation(β_test, layer, X_flat, n_samples, y, yUnique)
print(a.shape)
print(a[3915])
print(np.sum(a))
print(cost_function(a,layer, X_flat, n_samples, y, yUnique, λ = 0.))
In [13]:
def check_gradient(β_flat, layer, X_flat, n_samples, y, yUnique, epsilon):
    # the analytical gradient does not depend on i, so compute it once
    gradient = back_propagation(β_flat, layer, X_flat, n_samples, y, yUnique)
    for i in np.random.randint(β_flat.size, size=10):
        epsilonVector = np.zeros(β_flat.size)
        epsilonVector[i] = epsilon
        # perturb copies of β_flat in both directions (chained assignment would alias the same array)
        βPlus = β_flat + epsilonVector
        costPlus = cost_function(βPlus, layer, X_flat, n_samples, y, yUnique, λ = 0.)
        βMinus = β_flat - epsilonVector
        costMinus = cost_function(βMinus, layer, X_flat, n_samples, y, yUnique, λ = 0.)
        approximateGradient = (costPlus-costMinus)/(2*epsilon)
        print(i, '\t', approximateGradient, '\t', gradient[i])

epsilon = 0.0001
check_gradient(β_test, layer, X_flat, n_samples, y, yUnique, epsilon)
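Comparing the two printed columns by eye works, but a single summary number is easier to scan. A minimal sketch of the relative-difference measure often used for gradient checking (hypothetical helper, not part of the exercise code):

def relative_difference(analytic, numeric):
    '''Scaled distance between two gradient estimates; values close to zero indicate agreement.'''
    analytic, numeric = np.asarray(analytic), np.asarray(numeric)
    return np.linalg.norm(analytic - numeric) / np.linalg.norm(analytic + numeric)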
In [14]:
def optimise_β_1(β_flat, X_flat, n_samples, y, yUnique, λ=0.):
    β_optimised = optimize.minimize(cost_function, β_flat, args=(layer, X_flat, n_samples, y, yUnique),
                                    method=None, jac=back_propagation, options={'maxiter': 50})
    # β_optimised = optimize.fmin_cg(cost_function, fprime=back_propagation, x0=β_flat,
    #                                args=(layer, X_flat, n_samples, y, yUnique),
    #                                maxiter=50, disp=True, full_output=True)
    return β_optimised
In [15]:
def optimise_β_2(β_flat, X_flat, n_samples, y, yUnique, λ=0.):
    # β_optimised = optimize.minimize(cost_function, β_flat, args=(layer, X_flat, n_samples, y, yUnique),
    #                                 method=None, jac=back_propagation, options={'maxiter': 50})
    β_optimised = optimize.fmin_cg(cost_function, fprime=back_propagation, x0=β_flat,
                                   args=(layer, X_flat, n_samples, y, yUnique),
                                   maxiter=50, disp=True, full_output=True)
    return β_optimised
In [17]:
a = optimise_β_1(β_initial, X_flat, n_samples, y, yUnique, λ=0.)
In [18]:
b = optimise_β_2(β_initial, X_flat, n_samples, y, yUnique, λ=0.)
In [19]:
def quality_control(β_optimised, layer, X_flat, n_samples, y, yUnique, λ = 0.):
    X = X_flat.reshape(n_samples, -1)
    yAssignmentVector = []
    misAssignedIndex = []
    for n in range(n_samples):
        x = X[n]
        # predicted class = output neuron with the highest activation (+1 because labels run 1-10)
        yAssignment = np.argmax(forward_propagation(β_optimised, layer, x, 1)[1]) + 1
        if yAssignment == y[n]:
            yAssignmentVector += [True]
        else:
            yAssignmentVector += [False]
            misAssignedIndex += [n]
    return sum(yAssignmentVector)/n_samples
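quality_control above scores one sample at a time; since forward_propagation already handles the whole data matrix in one call, the same accuracy can be computed in a vectorised way. A minimal sketch (assuming the same layer, X_flat and y as above):

# hypothetical vectorised variant of quality_control
def quality_control_vectorised(β_optimised, layer, X_flat, n_samples, y):
    H_out = forward_propagation(β_optimised, layer, X_flat, n_samples)[-1]  # (5000, 10)
    yAssignment = np.argmax(H_out, axis=1) + 1                              # labels run 1-10
    return np.mean(yAssignment == y.ravel())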
In [20]:
# neuralNetworkClassifier(, X_flat, n_samples, y, yUnique, λ=0.)
# optimize.minimize returns an OptimizeResult; the optimised β vector is in a['x']
quality_control(a['x'], layer, X_flat, n_samples, y, yUnique, λ = 0.)
Out[20]:
In [21]:
# fmin_cg with full_output=True returns a tuple; the optimised β vector is b[0]
quality_control(b[0], layer, X_flat, n_samples, y, yUnique, λ = 0.)
Out[21]:
In [ ]: