Part of iPyMacLern project.
Copyright (C) 2015 by Eka A. Kurniawan
eka.a.kurniawan(ta)gmail(tod)com
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.
In [1]:
import sys
print("Python %d.%d.%d" % (sys.version_info.major, \
sys.version_info.minor, \
sys.version_info.micro))
In [2]:
import numpy as np
print("NumPy %s" % np.__version__)
In [3]:
import scipy
import scipy.io as sio
from scipy.special import expit
from scipy.optimize import minimize
print("SciPy %s" % scipy.__version__)
In [4]:
import matplotlib
import matplotlib.pyplot as plt
print("matplotlib %s" % matplotlib.__version__)
In [5]:
import time
import math
from random import randint
import matplotlib.cm as cm
import pylab
In [6]:
# Display graph inline
%matplotlib inline
# Display graph in 'retina' format for Mac with retina display. Others, use PNG or SVG format.
%config InlineBackend.figure_format = 'retina'
#%config InlineBackend.figure_format = 'PNG'
#%config InlineBackend.figure_format = 'SVG'
Randomly display digit images with their labels from the training examples.
In [7]:
def display_images(images, labels, ttl_subplots=10):
    m = len(images)
    f, axarr = plt.subplots(ttl_subplots, ttl_subplots, figsize=(8, 8))
    for pi in range(ttl_subplots):
        for pj in range(ttl_subplots):
            i = randint(0, m-1)
            axarr[pi, pj].set_title(str(labels[i][0]))
            axarr[pi, pj].imshow(np.resize(images[i], (20,20)).transpose(),
                                 cmap=cm.Greys_r)
            axarr[pi, pj].axes.get_xaxis().set_visible(False)
            axarr[pi, pj].axes.get_yaxis().set_visible(False)
            axarr[pi, pj].axes.get_xaxis().set_ticks([])
            axarr[pi, pj].axes.get_yaxis().set_ticks([])
    plt.tight_layout()
    plt.show()
In [8]:
training_examples = sio.loadmat('ex4data1.mat')
X = training_examples['X']
y = training_examples['y']
In [9]:
display_images(X, y, ttl_subplots=10)
In [10]:
sigmoid = lambda z: 1 / (1 + math.exp(-z))
sigmoid_v = np.vectorize(sigmoid, otypes=[float])
SciPy's expit function is a built-in implementation of the sigmoid function.
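As a quick sanity check (not an original notebook cell, and assuming the sigmoid_v and expit defined above are in scope), the two implementations can be compared directly:

# Hypothetical check: the hand-rolled vectorised sigmoid and SciPy's expit
# should agree to machine precision over a modest range of inputs.
test_zs = np.linspace(-10.0, 10.0, num=201)
print(np.allclose(sigmoid_v(test_zs), expit(test_zs)))  # expected: True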
In [11]:
zs = np.linspace(-10.0, 10.0, num=201)
gs = expit(zs)
In [12]:
plt.scatter(zs, gs, color='red', s=2)
plt.title('Logistic/Sigmoid Function')
plt.xlabel('zs')
plt.ylabel('gs')
plt.show()
Visualisation of $-\log(h_\theta(x))$ for $y = 1$
In [13]:
nlog = lambda x: -math.log(x)
nlog_v = np.vectorize(nlog, otypes=[float])
In [14]:
htx = np.linspace(0.01, 0.99, num=200)
lhtx = nlog_v(htx)
In [15]:
plt.scatter(htx, lhtx, color='red', s=2)
plt.title('Cost Function for y=1')
plt.xlabel('$h_\Theta(x)$')
plt.ylabel('$-log(h_\Theta(x))$')
plt.show()
Visualisation of $-\log(1-h_\theta(x))$ for $y = 0$
In [16]:
nlog_inv = lambda x: -math.log(1 - x)
nlog_inv_v = np.vectorize(nlog_inv, otypes=[float])
In [17]:
htx = np.linspace(0.01, 0.99, num=200)
lhtx = nlog_inv_v(htx)
In [18]:
plt.scatter(htx, lhtx, color='red', s=2)
plt.title('Cost Function for y=0')
plt.xlabel('$h_\Theta(x)$')
plt.ylabel('$-log(1-h_\Theta(x))$')
plt.show()
Mathematical representation of the cost function.
Cost function for all 10 digit labels ($K = 10$).
Finally, add regularisation to the cost function.
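The equations themselves did not survive in this export; as a sketch consistent with the compute_cost_grad function below, the regularised cost over all $K$ labels has the standard form
$$ J(\Theta) = \frac{1}{m} \sum_{i=1}^{m} \sum_{k=1}^{K} \left[ -y_k^{(i)} \log\left(h_\Theta(x^{(i)})_k\right) - \left(1 - y_k^{(i)}\right) \log\left(1 - h_\Theta(x^{(i)})_k\right) \right] + \frac{\lambda}{2m} \sum_{l} \sum_{i} \sum_{j} \left(\Theta_{ij}^{(l)}\right)^2 $$
where the bias columns of $\Theta^{(1)}$ and $\Theta^{(2)}$ are excluded from the regularisation term (theta1_wob and theta2_wob in the code).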
First derivative of the sigmoid function.
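The formula referenced below (reconstructed here, since the original equation is missing from this export) is
$$ g'(z) = \frac{d}{dz} g(z) = g(z)\left(1 - g(z)\right) $$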
In [19]:
sgrad_in = np.random.rand(5000, 25)
Use our own implementation based on the formula above.
In [20]:
sigmoid_grad_v = np.vectorize(lambda z: sigmoid_v(z) * (1 - sigmoid_v(z)),
                              otypes=[float])
In [21]:
sigmoid_grad_v(np.array([0,1,2,0]))
Out[21]:
In [22]:
tic = time.time()
sgv_out = sigmoid_grad_v(sgrad_in)
toc = time.time()
print('Runtime: %s seconds' % int(toc - tic))
sgv_out.shape
Out[22]:
Use the built-in SciPy expit sigmoid function.
In [23]:
expit_grad_v = np.vectorize(lambda z: expit(z) * (1 - expit(z)),
                            otypes=[float])
In [24]:
expit_grad_v(np.array([0,1,2,0]))
Out[24]:
In [25]:
tic = time.time()
egv_out = expit_grad_v(sgrad_in)
toc = time.time()
print('Runtime: %s seconds' % int(toc - tic))
egv_out.shape
Out[25]:
Use the built-in SciPy logistic PDF as the sigmoid gradient function.
The CDF of the logistic distribution is the sigmoid function. It is available as the cdf method of scipy.stats.logistic, but cdf eventually calls expit, so there is no point in using that method. You can use the pdf method to compute the derivative of the sigmoid function, or the _pdf method, which has less overhead, but "rolling your own" is faster.
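In other words (a short derivation, not in the original text): the standard logistic CDF is exactly the sigmoid, so its density equals the sigmoid gradient,
$$ F(z) = \frac{1}{1 + e^{-z}} = g(z), \qquad f(z) = F'(z) = g(z)\left(1 - g(z)\right) = g'(z) $$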
In [26]:
from scipy.stats import logistic
In [27]:
logistic._pdf(np.array([0,1,2,0]))
Out[27]:
In [28]:
tic = time.time()
sgpdfv_out = logistic._pdf(sgrad_in)
toc = time.time()
print('Runtime: %s seconds' % int(toc - tic))
sgpdfv_out.shape
Out[28]:
Delta for the output layer ($\delta_k^{(3)}$) is calculated for each digit label ($k$) of all digits ($K$).
$$ \delta_k^{(3)} = a_k^{(3)} - y_k $$
Delta for the hidden layer ($\delta^{(2)}$) is calculated as follows, where the product with $g'(z^{(2)})$ is element-wise.
$$ \delta^{(2)} = (\theta^{(2)})^T \delta^{(3)} \times g'(z^{(2)}) $$
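The deltas are then accumulated into the regularised parameter gradients. The original export omits this equation; a sketch consistent with compute_cost_grad below is
$$ \frac{\partial J}{\partial \Theta^{(l)}} = \frac{1}{m} \left(\delta^{(l+1)}\right)^T a^{(l)} + \frac{\lambda}{m} \Theta^{(l)} $$
with the bias column of $\Theta^{(l)}$ excluded from the regularisation term.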
In [29]:
def compute_cost_grad(parameters,
                      input_layer_size, hidden_layer_size, num_labels,
                      Xb, y_k, m, lmda):
    # Extract parameters
    theta1_size = hidden_layer_size * (input_layer_size + 1)
    theta1 = parameters[:theta1_size].reshape(hidden_layer_size, input_layer_size + 1)
    theta2 = parameters[theta1_size:].reshape(num_labels, hidden_layer_size + 1)
    theta1_wob = theta1[:, 1:]  # Theta 1 without bias unit
    theta2_wob = theta2[:, 1:]  # Theta 2 without bias unit
    # Compute cost function (feedforward)
    z2 = np.dot(Xb, theta1.transpose())
    a2 = expit(z2)
    a2b = np.column_stack([np.ones(m), a2])  # a2 with bias unit
    z3 = np.dot(a2b, theta2.transpose())
    h_theta = expit(z3)
    J = (sum(sum((y_k * nlog_v(h_theta)) + ((1 - y_k) * nlog_inv_v(h_theta)))) / m) + \
        ((sum(sum(theta1_wob ** 2)) + sum(sum(theta2_wob ** 2))) * lmda / (2 * m))
    # Compute gradient using backpropagation algorithm
    delta_3 = h_theta - y_k
    delta_2 = np.dot(theta2.transpose(), delta_3.transpose()).transpose()[:, 1:] * logistic._pdf(z2)
    theta1_grad = (np.dot(delta_2.transpose(), Xb) / m) + \
                  np.hstack([np.zeros((hidden_layer_size, 1)), ((lmda / m) * theta1_wob)])
    theta2_grad = (np.dot(delta_3.transpose(), a2b) / m) + \
                  np.hstack([np.zeros((num_labels, 1)), ((lmda / m) * theta2_wob)])
    grad = np.hstack([theta1_grad.flatten(), theta2_grad.flatten()])
    return J, grad
In [30]:
input_layer_size = 400 # 20x20px digit image as 400 units input layer
hidden_layer_size = 25 # 25 units hidden layer
num_labels = 10 # 10 labels or units output layer
In [31]:
training_examples = sio.loadmat('ex4data1.mat')
X = training_examples['X']
y = training_examples['y'].transpose()[0]
m = len(X) # total training samples
Xb = np.column_stack([np.ones(m), X]) # X with bias unit
y_k = np.zeros((m, num_labels))
y_k[np.arange(m), y-1] = 1
In [32]:
X.shape # total training samples and input features
Out[32]:
In [33]:
y.shape # total training samples and output variables
Out[33]:
In [34]:
m
Out[34]:
In [35]:
# Get precomputed parameters for testing
parameters = sio.loadmat('ex4weights.mat')
theta1 = parameters['Theta1']
theta2 = parameters['Theta2']
parameters = np.hstack([theta1.flatten(), theta2.flatten()])
In [36]:
theta1.shape # total hidden units and input units plus one bias unit
Out[36]:
In [37]:
theta2.shape # total output units and hidden units plus one bias unit
Out[37]:
Generate random parameters of size $L_{out} \times (L_{in} + 1)$ with values between $-\epsilon_{init}$ and $\epsilon_{init}$. For learning efficiency, $\epsilon_{init}$ is set to 0.12.
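Written out (a small sketch matching the code below), each entry is drawn uniformly,
$$ \Theta_{ij}^{(l)} \sim \mathcal{U}(-\epsilon_{init},\ \epsilon_{init}) $$
which the code implements as $\mathrm{rand}() \cdot 2\epsilon_{init} - \epsilon_{init}$.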
In [38]:
def generate_random_parameters(L_in, L_out, epsilon_init=0.12):
    return np.random.rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init
In [39]:
epsilon_init = 0.12
initial_theta1 = generate_random_parameters(input_layer_size, hidden_layer_size, epsilon_init)
initial_theta2 = generate_random_parameters(hidden_layer_size, num_labels, epsilon_init)
initial_parameters = np.hstack([initial_theta1.flatten(), initial_theta2.flatten()])
In [40]:
initial_theta1.shape
Out[40]:
In [41]:
initial_theta2.shape
Out[41]:
In [42]:
initial_parameters.shape
Out[42]:
In [43]:
input_layer_size = 400 # 20x20px digit image as 400 units input layer
hidden_layer_size = 25 # 25 units hidden layer
num_labels = 10 # 10 labels or units output layer
training_examples = sio.loadmat('ex4data1.mat')
X = training_examples['X']
y = training_examples['y'].transpose()[0]
m = len(X) # total training samples
Xb = np.column_stack([np.ones(m), X]) # X with bias unit
y_k = np.zeros((m, num_labels))
y_k[np.arange(m), y-1] = 1
# Get precomputed parameters for testing
parameters = sio.loadmat('ex4weights.mat')
theta1 = parameters['Theta1']
theta2 = parameters['Theta2']
parameters = np.hstack([theta1.flatten(), theta2.flatten()])
In [44]:
lmda = 0 # regularisation weight (lambda) = 0
# expected J = 0.287629
tic = time.time()
J, grad = compute_cost_grad(parameters,
                            input_layer_size, hidden_layer_size, num_labels,
                            Xb, y_k, m, lmda)
toc = time.time()
print('Runtime: %s seconds' % int(toc - tic))
In [45]:
J
Out[45]:
In [46]:
grad
Out[46]:
In [47]:
lmda = 1 # regularisation weight (lambda) = 1
# expected J = 0.383769
tic = time.time()
J, grad = compute_cost_grad(parameters,
                            input_layer_size, hidden_layer_size, num_labels,
                            Xb, y_k, m, lmda)
toc = time.time()
print('Runtime: %s seconds' % int(toc - tic))
In [48]:
J
Out[48]:
In [49]:
grad
Out[49]:
In [50]:
input_layer_size = 3
hidden_layer_size = 5
num_labels = 3
m = 5
lmda = 3 # regularisation weight (lambda)
Generate gradients using the compute_cost_grad function.
In [51]:
def generate_debug_weight(fan_out, fan_in):
    w = np.zeros((fan_out, 1 + fan_in))
    return np.reshape(np.sin(range(w.size)), w.shape) / 10
In [52]:
# Generate dummy parameters
theta1 = generate_debug_weight(hidden_layer_size, input_layer_size)
theta2 = generate_debug_weight(num_labels, hidden_layer_size)
parameters = np.hstack([theta1.flatten(), theta2.flatten()])
# Generate dummy input
X = np.zeros((m, input_layer_size))
X = np.reshape(np.sin(range(X.size)), X.shape) / 10
y = 1 + np.mod(range(m), num_labels)
Xb = np.column_stack([np.ones(m), X]) # X with bias unit
y_k = np.zeros((m, num_labels))
y_k[np.arange(m), y-1] = 1
cost, grad = compute_cost_grad(parameters,
                               input_layer_size, hidden_layer_size, num_labels,
                               Xb, y_k, m, lmda)
Generate the expected gradients using finite differences.
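The approximation used below is the central difference for each parameter $\theta_p$ (written out here as a reference):
$$ \frac{\partial J}{\partial \theta_p} \approx \frac{J(\theta + \epsilon\, e_p) - J(\theta - \epsilon\, e_p)}{2\epsilon}, \qquad \epsilon = 10^{-4} $$
where $e_p$ is the unit vector for parameter $p$ (the perturb array in the code).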
In [53]:
def compute_numerical_gradient(theta):
    numgrad = np.zeros(theta.shape)
    perturb = np.zeros(theta.shape)
    e = 1e-4
    for p in range(len(theta)):
        perturb[p] = e
        loss1, g = compute_cost_grad(theta - perturb,
                                     input_layer_size, hidden_layer_size, num_labels,
                                     Xb, y_k, m, lmda)
        loss2, g = compute_cost_grad(theta + perturb,
                                     input_layer_size, hidden_layer_size, num_labels,
                                     Xb, y_k, m, lmda)
        numgrad[p] = (loss2 - loss1) / (2*e)
        perturb[p] = 0
    return numgrad
In [54]:
numgrad = compute_numerical_gradient(parameters)
Compare our backpropagation gradients with the expected numerical gradients.
In [55]:
grad, numgrad
Out[55]:
Visual comparison.
In [56]:
plt.scatter(range(len(numgrad)), numgrad, label='finite differences',
            color='red', marker="o", s=50)
plt.scatter(range(len(grad)), grad, label='backpropagation',
            color='blue', marker="+", s=50)
plt.xlabel('#parameter')
plt.ylabel('gradient')
plt.legend()
plt.show()
Numerical comparison. A good result should be less than $10^{-9}$.
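The quantity computed in the next cell is the relative difference between the two gradient vectors:
$$ \frac{\lVert \mathrm{numgrad} - \mathrm{grad} \rVert}{\lVert \mathrm{numgrad} + \mathrm{grad} \rVert} $$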
In [59]:
np.linalg.norm(numgrad-grad) / np.linalg.norm(numgrad+grad) # Should be less than 1e-9
Out[59]:
In [60]:
input_layer_size = 400 # 20x20px digit image as 400 units input layer
hidden_layer_size = 25 # 25 units hidden layer
num_labels = 10 # 10 labels or units output layer
training_examples = sio.loadmat('ex4data1.mat')
X = training_examples['X']
y = training_examples['y'].transpose()[0]
m = len(X) # total training samples
lmda = 1 # regularisation weight (lambda)
Xb = np.column_stack([np.ones(m), X]) # X with bias unit
y_k = np.zeros((m, num_labels))
y_k[np.arange(m), y-1] = 1
tic = time.time()
parameters_result = minimize(fun=compute_cost_grad, x0=initial_parameters,
                             method='Newton-CG', jac=True,
                             args=(input_layer_size, hidden_layer_size, num_labels,
                                   Xb, y_k, m, lmda),
                             options={'maxiter': 50})
toc = time.time()
print('Runtime: %s seconds' % int(toc - tic))
In [61]:
trained_parameters = parameters_result['x']
In [62]:
theta1_size = hidden_layer_size * (input_layer_size + 1)
theta1 = trained_parameters[:theta1_size].reshape(hidden_layer_size, input_layer_size + 1)
theta2 = trained_parameters[theta1_size:].reshape(num_labels, hidden_layer_size + 1)
In [63]:
def display_weights(weights):
    ttl_subplots = 5
    f, axarr = plt.subplots(ttl_subplots, ttl_subplots, figsize=(8, 8))
    for pi in range(ttl_subplots):
        for pj in range(ttl_subplots):
            i = pi * 5 + pj
            axarr[pi, pj].imshow(np.resize(weights[i], (20,20)).transpose(),
                                 cmap=cm.Greys_r)
            axarr[pi, pj].axes.get_xaxis().set_visible(False)
            axarr[pi, pj].axes.get_yaxis().set_visible(False)
            axarr[pi, pj].axes.get_xaxis().set_ticks([])
            axarr[pi, pj].axes.get_yaxis().set_ticks([])
    plt.tight_layout()
    plt.show()
In [64]:
display_weights(theta1[:,1:])
In [65]:
def display_image(i, size=5):
    plt.figure(figsize=(size, size))
    plt.imshow(np.resize(X[i], (20,20)).transpose(), cmap=cm.Greys_r)
    plt.axis('off')
    plt.show()
In [66]:
def compute_feedforward(x, theta1, theta2):
    h1 = expit(np.dot(np.hstack([1, x]), theta1.transpose()))
    h2 = expit(np.dot(np.hstack([1, h1]), theta2.transpose()))
    return h2
In [67]:
def plot_output(h):
    plt.bar(range(num_labels), h, color='gray', width=0.7, align="center")
    plt.xticks(range(num_labels), list(map(str, list(range(1, num_labels)) + [0])))
    plt.ylim([0.0, 1.0])
    plt.xlabel('label')
    plt.ylabel('h')
    plt.show()
In [68]:
input_layer_size = 400 # 20x20px digit image as 400 units input layer
hidden_layer_size = 25 # 25 units hidden layer
num_labels = 10 # 10 labels or units output layer
training_examples = sio.loadmat('ex4data1.mat')
X = training_examples['X']
y = training_examples['y'].transpose()[0]
m = len(X) # total training samples
In [69]:
i = randint(0, m-1)
display_image(i, 5)
print("Expected label: %d" % y[i])
In [70]:
h2 = compute_feedforward(X[i], theta1, theta2)
plot_output(h2)
In [71]:
# Get prediction labels
a2 = expit(np.dot(Xb, theta1.transpose()))
a2b = np.column_stack([np.ones(m), a2]) # a2 with bias unit
h_theta = expit(np.dot(a2b, theta2.transpose()))
# Compare predicted labels to the given labels and calculate the accuracy
prediction_labels = np.argmax(h_theta, axis=1)+1
accuracy = np.mean(prediction_labels == y) * 100
print("Training accuracy: %.2f%%" % accuracy)
In [72]:
mispredicted_indices = np.where(prediction_labels != y)[0]
mispredicted_indices
Out[72]:
In [78]:
for mispredicted_index in mispredicted_indices:
    display_image(mispredicted_index, 2)
    mispredicted_h2 = compute_feedforward(X[mispredicted_index], theta1, theta2)
    plot_output(mispredicted_h2)
    print("Predicted %d should be %d\n\n" % (prediction_labels[mispredicted_index], y[mispredicted_index]))