In [1]:
import numpy as np
np.random.seed(0)

from scipy.io import loadmat
from scipy import optimize

import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.image import NonUniformImage
from matplotlib import cm
matplotlib.style.use('ggplot')
%matplotlib inline

%load_ext autoreload
%autoreload 2

0 Data Structure


In [2]:
file_path = '../course_materials/ex4data1.mat'
data = loadmat(file_path)
eights_file_path = '../course_materials/ex4weights.mat'
weights = loadmat(eights_file_path)
print(weights['Theta1'].shape)
print(weights['Theta2'].shape)
print(type(weights))


(25, 401)
(10, 26)
<class 'dict'>

1 Nural Network

1.1 Forward Porpagation


In [3]:
def get_data(file_path):
    data = loadmat(file_path)
    X = data['X']
    y = data['y']
    return X, y

def get_β(layer):
    '''Generate β-matrix for every layer in Neural Network'''
    β_set = ()
    for i in range(len(layer)-1):
#         recommendation from Andrew Ng window is ±(6/(inLayer + outLayer))**0.5
        low, high = -(6/(layer[i]+layer[i+1]))**0.5, (6/(layer[i]+layer[i+1]))**0.
        β_set += (np.random.uniform(low,high,(layer[i+1], layer[i]+1)),)
#         β_set += (np.zeros((outLayer, inLayer+1)),)
    return β_set

def flatten_β(β_set):
    β_flat = β_set[0].flatten()
    for β in β_set[1:]:
        β_flat = np.concatenate((β_flat, β.flatten()), axis=-1)
    return β_flat

def reshape_β(β, layer):
    splitIndex = 0
    splitIndices = []
    for i in range(len(layer)-2):
        splitIndex += (layer[i]+1)*layer[i+1]
        splitIndices += [splitIndex]
    splitβ = np.split(β, splitIndices)
    reshapedβ = ()
    for i in range(len(splitβ)):
        reshapedβ += (splitβ[i].reshape(layer[i+1],layer[i]+1),)
    return reshapedβ
    
def get_sigmoid(z):
    return 1/(1+np.exp(-z))

def forward_propagation(β_flat, layer, X_flat, n_samples):
    '''Forward Propagation is the hypothesis function for Neural Networks'''
    β_set = reshape_β(β_flat, layer)
#     H_0 (5000, 400)
    H = X_flat.reshape(n_samples, -1)
#     Z_H = ()
    H_byLayer = ()
    for β in β_set:
#         print(H.shape)
#         Z_l (5000, k_l); l is the number of layers [0, ...,l]; k is the number of neurons in a layer l [1,...,k]
        Z = np.dot(np.insert(H, 0, 1, axis=1), β.T)
#         H_l (5000, k_l); l is the number of layers [0, ...,l]; k is the number of neurons in a layer l [1,...,k]
        H = get_sigmoid(Z)
#         Z_H += ((Z, H),)
        H_byLayer += (H,)
#     H_2 (5000, 10)
    return H_byLayer

def get_sigmoid_gradient(Z):
    return get_sigmoid(Z)*(1-get_sigmoid(Z))

def cost_function(β_flat, layer, X_flat, n_samples, y, yUnique, λ = 0.):
    X = X_flat.reshape(n_samples, -1)
    Y = np.array([yUnique]* y.shape[0]) == y
    β_set = reshape_β(β_flat, layer)
    J = 0
    for n in range(n_samples):
        x_n = X[n:n+1,:]
        y_n = Y[n:n+1,:]
#         hypothesis vector h_n(1, 10)
        h_n = forward_propagation(β_flat, layer, x_n, 1)[len(β_set)-1]
#         cost function scalar j_n(1, 1) = y_n(1, 10)*h_n.T(10, 1)
        j_n = (- np.dot(y_n, np.log(h_n).T) - np.dot((1-y_n), np.log(1-h_n).T))
        J += j_n
#     regularisation term (R)
    cummulativeR = 0
    for β in β_set:
        cummulativeR += np.sum(β*β) #element-wise multiplication
    cummulativeR *= λ/(2*n_samples)
    return J[0][0]/n_samples + cummulativeR

1.1.1 Neural Network Initialisation

The input-data matrix X(5000, 400) is comprised of 5000 digit images 20 by 20 pixels (400 pixels).
The output-data vector Y(5000,1) is comprised of 5000 assigned digits (1 through 10; 10 represents figure '0').
The neural network in this work has 1 input layer (400 neurons), one hidden layer (25 neurons) and an output layer (10 neurons). To initialise a simple neural network, one has to do the following:

  1. set the number of neurons in every layer (including input and output layers)
  2. extract and flatten input matrix X
  3. transform output Y
  4. initialise Beat matrix

In [4]:
# Set number of neurons in every layer (including input and output layers)
layer = 400, 25, 10
# Extract and flatten input matrix X
X, y = get_data(file_path)
n_samples, n_variables = X.shape
X_flat = X.flatten()
yUnique = np.unique(y)
# Initialise Beat matrix
β_test = flatten_β((weights['Theta1'], weights['Theta2']))
β_initial = flatten_β(get_β(layer))
print(X.shape)
print(y.shape)
for β in get_β(layer): print(β.shape)


(5000, 400)
(5000, 1)
(25, 401)
(10, 26)

1.1.2 Forward-Propagation Test


In [5]:
# either transformed Y or y together with yUnique can be suplied to a function
# Y = np.array([yUnique]* y.shape[0]) == y
# print(Y[0:0+1,:].shape)

In [6]:
print(forward_propagation(β_test, layer, X_flat, n_samples)[1].shape)
print(forward_propagation(β_test, layer, X[0:0+1,:], 1)[1].shape)


(5000, 10)
(1, 10)

In [7]:
print(X.shape)
print(X[0][None,:].shape)
# cost_function(β_test, layer, X.flatten(), n_samples, y, yUnique, λ = 0.)
cost_function(β_test, layer, X[0:5000][None,:].flatten(), 5000, y, yUnique, λ = 0.)


(5000, 400)
(1, 400)
Out[7]:
0.2876291651613188

1.1.3 Cost-Function Test

The outputs of the cost_function should be as follows:<br> β_test, X, λ=0. — 0.287629 (Andrew Ng)<br> β_test, X, λ=1. — 0.383770 (Andrew Ng)<br> β_test, X, λ=0. — 0.0345203898838<br> β_initial, X, λ=1. — 65.5961451562


In [8]:
print(cost_function(β_test, layer, X_flat, n_samples, y, yUnique, λ = 0.))
print(cost_function(β_test, layer, X_flat, n_samples, y, yUnique, λ = 1.))
print(cost_function(β_test, layer, X[0][None,:].flatten(), 1, y, yUnique, λ = 0.))
print(cost_function(β_initial, layer, X_flat, n_samples, y, yUnique, λ = 1.))


0.2876291651613188
0.38448779624289386
0.03452038988383936
65.5961451562497

1.2 Back Propagation

$\delta^l = H^l - Y$
$\delta^{l-1} = (\beta^{l-1})^T\delta^l\cdot g'(h^{l-1})$


In [9]:
def back_propagation(β_flat, layer, X_flat, n_samples, y, yUnique):
    Y = np.array([yUnique]* y.shape[0]) == y
    β_set = reshape_β(β_flat, layer)

    deltaSet = ()
#     hypothesis matrix E(5000, 10)
    H = forward_propagation(β_flat, layer, X_flat, n_samples)
#     error matrix E(5000, 10)
    E = H[len(layer)-2] - Y
    for l in reversed(range(len(layer)-1)):
        E = np.dot(E*get_sigmoid_gradient(H[l]), β_set[l])[:,1:]
        deltaSet = (np.dot(H[l].T, np.insert(E, 0, 1, axis=1)),) + deltaSet
    flatDelta = flatten_β(deltaSet)
    return β_flat + flatDelta/n_samples

In [10]:
Y = np.array([yUnique]* y.shape[0]) == y
# print(Y.shape)
β_set = reshape_β(β_test, layer)
# print(len(β_set))
deltaSet = ()
#     hypothesis matrix E(5000, 10)
H = forward_propagation(β_test, layer, X_flat, n_samples)
# print (len(H))
#     error matrix E(5000, 10)
E = H[len(layer)-2] - Y
# print(E.shape)
for l in reversed(range(len(layer)-1)):
    E = np.dot(E*get_sigmoid_gradient(H[l]), β_set[l])[:,1:]
    print(E.shape)
    deltaSet = (np.dot(H[l].T, np.insert(E, 0, 1, axis=1)),) + deltaSet
print(len(deltaSet))
print(deltaSet[0].shape)
print(deltaSet[1].shape)
flatDelta = flatten_β(deltaSet)
print(β_test.shape)
f = β_test + flatDelta/n_samples
f[3915]


(5000, 25)
(5000, 400)
2
(25, 401)
(10, 26)
(10285,)
Out[10]:
0.14300247142022265

In [11]:
β_initial = flatten_β(get_β(layer))
a = back_propagation(β_test, layer, X_flat, n_samples, y, yUnique)
print(a.shape)
print(a[3915])
print(np.sum(a))
print(cost_function(a,layer, X_flat, n_samples, y, yUnique, λ = 0.))


(10285,)
0.14300247142022265
-77.68979297139275
0.32830159921277324

In [13]:
def check_gradient(β_flat, layer, X_flat, n_samples, y, yUnique, epsilon):
    for i in np.random.randint(β_flat.size, size=10):
        epsilonVector = np.zeros(β_flat.size)
        epsilonVector[i] = epsilon
        
        gradient = back_propagation(β_flat, layer, X_flat, n_samples, y, yUnique)
        
        βPlus = βMinus = β_flat
#         βPlus = β + epsilonVector
        βPlus += epsilonVector
        costPlus = cost_function(βPlus,layer, X, n_samples, y, yUnique, λ = 0.)
#         βMinus = β - epsilonVector
        βMinus -= epsilonVector
        costMinus = cost_function(βMinus,layer, X, n_samples, y, yUnique, λ = 0.)
        approximateGradient = (costPlus-costMinus)/(2*epsilon)
        print (i, '\t', approximateGradient, '\t', gradient[i])

epsilon = 0.0001
check_gradient(β_test, layer, X_flat, n_samples, y, yUnique, epsilon)


7014 	 3.416238375519853e-05 	 -0.0281727493987568
3905 	 1.523521031554509e-06 	 0.016395658791554347
1589 	 -6.722464251929239e-06 	 0.022441850182514538
7949 	 -0.0001419845119254859 	 0.45346675743341425
9639 	 1.564271212561863e-06 	 -0.005225646712133616
33 	 4.110603524232204e-06 	 -0.01368051128447383
6058 	 2.2626955864524234e-07 	 -0.0007352434658607858
1981 	 3.0074051582396066e-06 	 -0.008847181911195554
611 	 5.569122940585203e-06 	 -0.027306578772908468
3848 	 -5.17336948280267e-05 	 0.17538764386857092

In [14]:
def optimise_β_1(β_flat, X_flat, n_samples, y, yUnique, λ=0.):

    β_optimised = optimize.minimize(cost_function, β_flat, args=(layer, X_flat, n_samples, y, yUnique),
                                      method=None, jac=back_propagation, options={'maxiter':50})

#     β_optimised = optimize.fmin_cg(cost_function, fprime=back_propagation, x0=β_flat,
#                                      args=(layer, X_flat, n_samples, y, yUnique),
#                                      maxiter=50,disp=True,full_output=True)
    return(β_optimised)

In [15]:
def optimise_β_2(β_flat, X_flat, n_samples, y, yUnique, λ=0.):

#     β_optimised = optimize.minimize(cost_function, β_flat, args=(layer, X_flat, n_samples, y, yUnique),
#                                       method=None, jac=back_propagation, options={'maxiter':50})

    β_optimised = optimize.fmin_cg(cost_function, fprime=back_propagation, x0=β_flat,
                                     args=(layer, X_flat, n_samples, y, yUnique),
                                     maxiter=50,disp=True,full_output=True)
    return(β_optimised)

In [17]:
a = optimise_β_1(β_initial, X_flat, n_samples, y, yUnique, λ=0.)

In [18]:
b = optimise_β_2(β_initial, X_flat, n_samples, y, yUnique, λ=0.)


Warning: Desired error not necessarily achieved due to precision loss.
         Current function value: 5.279260
         Iterations: 2
         Function evaluations: 90
         Gradient evaluations: 78

In [19]:
def quality_control(β_optimised, layer, X_flat, n_samples, y, yUnique, λ = 0.):
    X = X_flat.reshape(n_samples,-1)
    yAssignmentVector = []
    misAssignedIndex = []
    for n in range(n_samples):
        x = X[n]
        yAssignment =  np.argmax(forward_propagation(β_optimised, layer, X[n], 1)[1]) + 1
        if yAssignment == y[n]:
            yAssignmentVector += [True]
        else:
            yAssignmentVector += [False]
            misAssignedIndex += [n]
    return (sum(yAssignmentVector)/n_samples)

In [20]:
# neuralNetworkClassifier(, X_flat, n_samples, y, yUnique, λ=0.)
quality_control(a['x'], layer, X_flat, n_samples, y, yUnique, λ = 0.)


Out[20]:
0.1

In [21]:
quality_control(b[0], layer, X_flat, n_samples, y, yUnique, λ = 0.)


Out[21]:
0.1

In [ ]: