In [17]:
%matplotlib inline
In [18]:
import scipy.optimize
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml
In [19]:
def normalize_features(train, test):
    """Normalizes train set features to a standard normal distribution
    (zero mean and unit variance). The same procedure is then applied
    to the test set features.
    """
    train_mean = train.mean(axis=0)
    # +0.1 to avoid division by zero in this specific case
    train_std = train.std(axis=0) + 0.1
    train = (train - train_mean) / train_std
    test = (test - train_mean) / train_std
    return train, test
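As a quick sanity check (not in the original tutorial), we can verify on synthetic data that the train features come out with roughly zero mean and, because of the +0.1 guard in the denominator, a standard deviation slightly below one:
In [ ]:
# illustrative sanity check on synthetic data
rng = np.random.RandomState(0)
fake_train, fake_test = normalize_features(rng.rand(1000, 3) * 255, rng.rand(100, 3) * 255)
print(fake_train.mean(axis=0))  # ~0
print(fake_train.std(axis=0))   # slightly below 1 due to the +0.1 term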
First get and preprocess the data.
In [20]:
# get data: contains 70k samples of which the last 10k are meant for testing
# (fetch_mldata has been removed from scikit-learn; fetch_openml serves the
# same dataset, with the labels delivered as strings)
mnist = fetch_openml('mnist_784', version=1, as_frame=False, data_home='./data')
# prepare for concat
y_all = mnist.target.astype(np.float64)[:, np.newaxis]
# intercept term to be added
intercept = np.ones_like(y_all)
# normalize the data (zero mean and unit variance)
train_normalized, test_normalized = normalize_features(
    mnist.data[:60000, :],
    mnist.data[60000:, :],
)
# concat intercept, X, and y so that shuffling is easier in a next step
train_all = np.hstack((
    intercept[:60000],
    train_normalized,
    y_all[:60000],
))
test_all = np.hstack((
    intercept[60000:],
    test_normalized,
    y_all[60000:],
))
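A quick shape check confirms the layout (one intercept column, 784 pixel columns for the 28x28 images, and one label column):
In [ ]:
print(train_all.shape)  # expected: (60000, 786) = 1 intercept + 784 pixels + 1 label
print(test_all.shape)   # expected: (10000, 786)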
I don't think this randomization step is really needed in our case, but let's stick with the UFLDL tutorial here.
In [21]:
np.random.shuffle(train_all)
np.random.shuffle(test_all)
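Note that np.random.shuffle permutes in place along the first axis only, so every row keeps its intercept, pixel values, and label together. A small demo:
In [ ]:
demo = np.arange(8).reshape(4, 2)
np.random.shuffle(demo)
print(demo)  # rows reordered, but each row's pair of values stays intact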
Now prepare the final train and test datasets. Let's only pick the data for the digits 0 and 1.
In [22]:
# train data
train_mask = np.logical_or(train_all[:, -1] == 0, train_all[:, -1] == 1)
train_X = train_all[train_mask, :-1]
train_y = train_all[train_mask, -1]
# test data
test_mask = np.logical_or(test_all[:, -1] == 0, test_all[:, -1] == 1)
test_X = test_all[test_mask, :-1]
test_y = test_all[test_mask, -1]
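It can be worth checking how many samples of each class survive the filtering; a quick sketch using np.bincount:
In [ ]:
# rough class balance check (illustrative)
print(np.bincount(train_y.astype(int)))  # counts of 0s and 1s in the train set
print(np.bincount(test_y.astype(int)))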
In [23]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
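A side note: for large negative z, np.exp(-z) overflows and numpy emits a warning (the result still saturates correctly to 0.0). If that bothers you, scipy ships a numerically robust implementation, scipy.special.expit, which could replace the hand-rolled version:
In [ ]:
from scipy.special import expit  # numerically robust sigmoid
print(sigmoid(np.array([-1e3, 0.0, 1e3])))  # overflow warning for -1e3
print(expit(np.array([-1e3, 0.0, 1e3])))    # [0., 0.5, 1.] without the warning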
In [24]:
def cost_function(theta, X, y):
    h = sigmoid(X.dot(theta))
    return -np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
In [25]:
def gradient(theta, X, y):
    errors = sigmoid(X.dot(theta)) - y
    return errors.dot(X)
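Hand-derived gradients are a classic source of bugs, so it can pay off to compare the analytic gradient against a finite-difference approximation with scipy.optimize.check_grad. A sketch on a tiny random problem (names and sizes are illustrative):
In [ ]:
# gradient sanity check on a small random problem
rng = np.random.RandomState(42)
X_small, y_small = rng.rand(20, 5), rng.randint(0, 2, 20)
err = scipy.optimize.check_grad(cost_function, gradient, rng.rand(5) * 0.001, X_small, y_small)
print(err)  # should be small, e.g. < 1e-4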
In [27]:
J_history = []
t0 = time.time()
res = scipy.optimize.minimize(
    fun=cost_function,
    x0=np.random.rand(train_X.shape[1]) * 0.001,
    args=(train_X, train_y),
    method='L-BFGS-B',
    jac=gradient,
    options={'maxiter': 100, 'disp': True},
    callback=lambda x: J_history.append(cost_function(x, train_X, train_y)),
)
t1 = time.time()
print('Optimization took {s} seconds'.format(s=t1 - t0))
optimal_theta = res.x
In [28]:
plt.plot(J_history, marker='o')
plt.xlabel('Iterations')
plt.ylabel('J(theta)')
Out[28]:
In [29]:
def accuracy(theta, X, y):
    correct = np.sum(np.equal(y, sigmoid(X.dot(theta)) > 0.5))
    return correct / y.size
In [30]:
print('Training accuracy: {acc}'.format(acc=accuracy(res.x, train_X, train_y)))
print('Test accuracy: {acc}'.format(acc=accuracy(res.x, test_X, test_y)))
I actually had to use the L-BFGS-B optimization method for it to work.
Had I used the expected BFGS method, nan and inf values due to log(0) would have caused trouble.
Why? I can think of two reasons:
1. For large |z| the sigmoid saturates to exactly 0.0 or 1.0 in floating point, so np.log(h) or np.log(1 - h) evaluates to log(0) = -inf.
2. Even when its factor y or (1 - y) is 0, the log(0) expression is still evaluated by numpy. And unfortunately: 0 * -np.inf = np.nan.
One way to counteract those issues is to substitute the troubling values:
In [31]:
def safe_log(x, nan_substitute=-1e+4):
    l = np.log(x)
    l[np.logical_or(np.isnan(l), np.isinf(l))] = nan_substitute
    return l
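A quick look at what the substitution does:
In [ ]:
print(safe_log(np.array([0.0, 0.5, 1.0])))  # [-1e4, -0.693..., 0.]; the -inf from log(0) is replaced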
In [32]:
def cost_function_safe(theta, X, y):
    h = sigmoid(X.dot(theta))
    return -np.sum(y * safe_log(h) + (1 - y) * safe_log(1 - h))
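An alternative that avoids log(0) altogether, rather than patching it up afterwards, is to rewrite the cost in terms of np.logaddexp using the identity -log(sigmoid(z)) = log(1 + exp(-z)). A sketch, not part of the original tutorial (cost_function_stable is a hypothetical name):
In [ ]:
def cost_function_stable(theta, X, y):
    # for y in {0, 1}, -[y*log(h) + (1-y)*log(1-h)] simplifies to logaddexp(0, z) - y*z
    z = X.dot(theta)
    return np.sum(np.logaddexp(0, z) - y * z)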
In [33]:
J_history = []
t0 = time.time()
res = scipy.optimize.minimize(
    fun=cost_function_safe,
    x0=np.random.rand(train_X.shape[1]) * 0.001,
    args=(train_X, train_y),
    method='BFGS',
    jac=gradient,
    options={'maxiter': 100, 'disp': True},
    callback=lambda x: J_history.append(cost_function_safe(x, train_X, train_y)),
)
t1 = time.time()
print('Optimization took {s} seconds'.format(s=t1 - t0))
optimal_theta = res.x
Notice that the above optimization procedure doesn't converge: because of the substitutions, the gradient stops improving (i.e. getting smaller) at some point. The optimizer therefore used up all 100 allowed iterations.
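The returned OptimizeResult makes this easy to confirm; success, message, and nit (the iteration count) are documented fields of scipy's result object:
In [ ]:
print(res.success)  # False: the optimizer stopped without meeting its tolerance
print(res.message)
print(res.nit)      # 100, i.e. the maxiter budget was exhausted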
In [34]:
plt.plot(J_history, marker='o')
plt.xlabel('Iterations')
plt.ylabel('J(theta)')
Out[34]:
In [35]:
print('Training accuracy: {acc}'.format(acc=accuracy(res.x, train_X, train_y)))
print('Test accuracy: {acc}'.format(acc=accuracy(res.x, test_X, test_y)))