In [17]:
%matplotlib inline

In [18]:
import scipy.optimize
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_mldata

In [19]:
def normalize_features(train, test):
    """Normalizes train set features to a standard normal distribution
    (zero mean and unit variance). The same procedure is then applied
    to the test set features.
    """
    train_mean = train.mean(axis=0)
    # +0.1 to avoid division by zero in this specific case
    train_std = train.std(axis=0) + 0.1
    
    train = (train - train_mean) / train_std
    test = (test - train_mean) / train_std
    return train, test

First get and preprocess the data.


In [20]:
# get data: contains 70k samples of which the last 10k are meant for testing
mnist = fetch_mldata('MNIST original', data_home='./data')

# prepare for concat
y_all = mnist.target[:, np.newaxis]

# intercept term to be added
intercept = np.ones_like(y_all)

# normalize the data (zero mean and unit variance)
train_normalized, test_normalized = normalize_features(
    mnist.data[:60000, :],
    mnist.data[60000:, :],
)

# concat intercept, X, and y so that shuffling is easier in a next step
train_all = np.hstack((
    intercept[:60000],
    train_normalized,
    y_all[:60000],
))
test_all = np.hstack((
    intercept[60000:],
    test_normalized,
    y_all[60000:],
))

I don't think this randomization step is really needed in our case, but let's stick with the ufldl tutorial here.


In [21]:
np.random.shuffle(train_all)
np.random.shuffle(test_all)

Now prepare the final train and test datasets. Let's only pick the data for the digits 0 and 1.


In [22]:
# train data
train_X = train_all[np.logical_or(train_all[:, -1] == 0, train_all[:, -1] == 1), :-1]
train_y = train_all[np.logical_or(train_all[:, -1] == 0, train_all[:, -1] == 1), -1]

# test data
test_X = test_all[np.logical_or(test_all[:, -1] == 0, test_all[:, -1] == 1), :-1]    
test_y = test_all[np.logical_or(test_all[:, -1] == 0, test_all[:, -1] == 1), -1]

In [23]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [24]:
def cost_function(theta, X, y):
    h = sigmoid(X.dot(theta))
    return -sum(y * np.log(h) + (1 - y) * np.log(1 - h))

In [25]:
def gradient(theta, X, y):
    errors = sigmoid(X.dot(theta)) - y
    return errors.dot(X)

In [27]:
J_history = []

t0 = time.time()
res = scipy.optimize.minimize(
    fun=cost_function,
    x0=np.random.rand(train_X.shape[1]) * 0.001,
    args=(train_X, train_y),
    method='L-BFGS-B',
    jac=gradient,
    options={'maxiter': 100, 'disp': True},
    callback=lambda x: J_history.append(cost_function(x, train_X, train_y)),
)
t1 = time.time()

print('Optimization took {s} seconds'.format(s=t1 - t0))
optimal_theta = res.x


Optimization took 1.385227918624878 seconds
/Users/fhartl/anaconda/envs/py34/lib/python3.4/site-packages/IPython/kernel/__main__.py:3: RuntimeWarning: divide by zero encountered in log
  app.launch_new_instance()
/Users/fhartl/anaconda/envs/py34/lib/python3.4/site-packages/IPython/kernel/__main__.py:3: RuntimeWarning: invalid value encountered in multiply
  app.launch_new_instance()
/Users/fhartl/anaconda/envs/py34/lib/python3.4/site-packages/IPython/kernel/__main__.py:2: RuntimeWarning: overflow encountered in exp
  from IPython.kernel.zmq import kernelapp as app

In [28]:
plt.plot(J_history, marker='o')
plt.xlabel('Iterations')
plt.ylabel('J(theta)')


Out[28]:
<matplotlib.text.Text at 0x109f83e80>

In [29]:
def accuracy(theta, X, y):
    correct = np.sum(np.equal(y, (sigmoid(X.dot(theta))) > 0.5))
    return correct / y.size

In [30]:
print('Training accuracy: {acc}'.format(acc=accuracy(res.x, train_X, train_y)))
print('Test accuracy: {acc}'.format(acc=accuracy(res.x, test_X, test_y)))


Training accuracy: 1.0
Test accuracy: 0.9995271867612293

Looking good, right? Well, look closer...

I actually had to use the L-BFGS-B optimization method for it to work.
Had I used the expected BFGS method, nan and inf values due to log(0) would have made trouble.
Why? I can think of two reasons:

  1. Even if being multiplied with 0, the log(0) expression is still evaluated by numpy. And unfortunately: 0 * np.nan = np.nan.
  2. Floating point arithmetic limits which don't exist in Mathematics.

One way to counteract those issues is to substitute the troubling values:


In [31]:
def safe_log(x, nan_substitute=-1e+4):
    l = np.log(x)
    l[np.logical_or(np.isnan(l), np.isinf(l))] = nan_substitute
    return l

In [32]:
def cost_function_safe(theta, X, y):
    h = sigmoid(X.dot(theta))
    return -sum(y * safe_log(h) + (1 - y) * safe_log(1 - h))

In [33]:
J_history = []

t0 = time.time()
res = scipy.optimize.minimize(
    fun=cost_function_safe,
    x0=np.random.rand(train_X.shape[1]) * 0.001,
    args=(train_X, train_y),
    method='BFGS',
    jac=gradient,
    options={'maxiter': 100, 'disp': True},
    callback=lambda x: J_history.append(cost_function_safe(x, train_X, train_y)),
)
t1 = time.time()

print('Optimization took {s} seconds'.format(s=t1 - t0))
optimal_theta = res.x


Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.001535
         Iterations: 100
         Function evaluations: 114
         Gradient evaluations: 114
Optimization took 5.0561230182647705 seconds
/Users/fhartl/anaconda/envs/py34/lib/python3.4/site-packages/IPython/kernel/__main__.py:2: RuntimeWarning: overflow encountered in exp
  from IPython.kernel.zmq import kernelapp as app
/Users/fhartl/anaconda/envs/py34/lib/python3.4/site-packages/IPython/kernel/__main__.py:2: RuntimeWarning: divide by zero encountered in log
  from IPython.kernel.zmq import kernelapp as app


Notice that the above optimization procedure doesn't converge due to the substitutions (which doesn't allow the gradients to further improve (= get smaller) at some point). Therefore, it used all 100 allowed iterations.


In [34]:
plt.plot(J_history, marker='o')
plt.xlabel('Iterations')
plt.ylabel('J(theta)')


Out[34]:
<matplotlib.text.Text at 0x11e1409b0>

In [35]:
print('Training accuracy: {acc}'.format(acc=accuracy(res.x, train_X, train_y)))
print('Test accuracy: {acc}'.format(acc=accuracy(res.x, test_X, test_y)))


Training accuracy: 1.0
Test accuracy: 0.9985815602836879