MNIST classification


In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import tensorflow as tf

from gpflow.likelihoods import MultiClass
from gpflow.kernels import RBF, White
from gpflow.models.svgp import SVGP
from gpflow.training import AdamOptimizer

from scipy.stats import mode
from scipy.cluster.vq import kmeans2

from doubly_stochastic_dgp.dgp import DGP

import time

def get_mnist_data(data_path='/data'):
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets(data_path+'/MNIST_data/', one_hot=False)

    X, Y = mnist.train.next_batch(mnist.train.num_examples)
    Xval, Yval = mnist.validation.next_batch(mnist.validation.num_examples)
    Xtest, Ytest = mnist.test.next_batch(mnist.test.num_examples)

    # labels as (N, 1) float arrays of integer class indices
    Y, Yval, Ytest = [np.array(y, dtype=float)[:, None] for y in [Y, Yval, Ytest]]

    # fold the validation set into the training set
    X = np.concatenate([X, Xval], 0)
    Y = np.concatenate([Y, Yval], 0)
    
    return X.astype(float), Y.astype(float), Xtest.astype(float), Ytest.astype(float)

X, Y, Xs, Ys = get_mnist_data()


Extracting /data/MNIST_data/train-images-idx3-ubyte.gz
Extracting /data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting /data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting /data/MNIST_data/t10k-labels-idx1-ubyte.gz
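
The tensorflow.examples.tutorials module used above is deprecated in recent TensorFlow versions. If it isn't available, a sketch like the following (an alternative, not part of the original run) builds equivalent arrays from tf.keras.datasets, assuming the models expect flattened 784-dimensional inputs scaled to [0, 1] and integer class labels of shape (N, 1):

# Alternative loader sketch using tf.keras.datasets (assumption: not used in the
# original notebook). Keras's 60k training images correspond to train + validation above.
def get_mnist_data_keras():
    (Xtr, Ytr), (Xte, Yte) = tf.keras.datasets.mnist.load_data()
    Xtr = Xtr.reshape(-1, 784).astype(float) / 255.   # flatten and scale to [0, 1]
    Xte = Xte.reshape(-1, 784).astype(float) / 255.
    Ytr = Ytr.astype(float)[:, None]                  # labels as (N, 1) floats
    Yte = Yte.astype(float)[:, None]
    return Xtr, Ytr, Xte, Yte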

We'll use 100 inducing points, initialised with k-means on the training inputs.


In [2]:
M = 100
Z = kmeans2(X, M, minit='points')[0]

We'll compare three models: an ordinary sparse GP and DGPs with 2 and 3 layers.

We'll use a minibatch size of 1000 for all models.


In [3]:
m_sgp = SVGP(X, Y, RBF(784, lengthscales=2., variance=2.), MultiClass(10), 
             Z=Z, num_latent=10, minibatch_size=1000, whiten=True)

def make_dgp(L):
    # one kernel per layer: the first acts on the 784-d inputs, the rest on the 30-d inner layers
    kernels = [RBF(784, lengthscales=2., variance=2.)]
    for l in range(L-1):
        kernels.append(RBF(30, lengthscales=2., variance=2.))
    model = DGP(X, Y, Z, kernels, MultiClass(10), 
                minibatch_size=1000,
                num_outputs=10)
    
    # start the inner layers nearly deterministic by shrinking their variational covariance
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5 
    
    return model

m_dgp2 = make_dgp(2)
m_dgp3 = make_dgp(3)

For the SGP model we'll calculate accuracy by simply taking the class with the largest predicted mean:


In [4]:
def assess_model_sgp(model, X_batch, Y_batch):
    m, v = model.predict_y(X_batch)
    l = model.predict_density(X_batch, Y_batch)
    a = (np.argmax(m, 1).reshape(Y_batch.shape).astype(int)==Y_batch.astype(int))
    return l, a

For the DGP models the predictions are stochastic. We need a single prediction for each datum, so we take $S$ samples of the class predictions ($(S, N, 10)$ arrays for the mean and variance), take the argmax over the class means (giving an $(S, N)$ matrix of predicted labels), and finally take the modal class over the samples (giving a vector of length $N$):

We'll use 100 samples.


In [5]:
S = 100
def assess_model_dgp(model, X_batch, Y_batch):
    m, v = model.predict_y(X_batch, S)
    l = model.predict_density(X_batch, Y_batch, S)
    a = (mode(np.argmax(m, 2), 0)[0].reshape(Y_batch.shape).astype(int)==Y_batch.astype(int))
    return l, a
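
As a quick sanity check of the shapes involved, here is a toy version of that aggregation (not part of the original notebook; the sizes are made up):

# Toy shape check for the aggregation in assess_model_dgp: an (S, N, 10) array
# of sampled class means is reduced to one hard label per datum.
S_demo, N_demo = 4, 3
m_demo = np.random.rand(S_demo, N_demo, 10)                    # sampled class means
per_sample_labels = np.argmax(m_demo, 2)                       # (S, N): argmax over classes
final_labels = mode(per_sample_labels, 0)[0].reshape(N_demo)   # modal class per datum
print(final_labels.shape)  # (3,)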

We need to make predictions in batches (otherwise we might run out of memory).


In [6]:
def batch_assess(model, assess_model, X, Y):
    n_batches = max(int(len(X)/1000), 1)
    lik, acc = [], []
    for X_batch, Y_batch in zip(np.split(X, n_batches), np.split(Y, n_batches)):
        l, a = assess_model(model, X_batch, Y_batch)
        lik.append(l)
        acc.append(a)
    lik = np.concatenate(lik, 0)
    acc = np.array(np.concatenate(acc, 0), dtype=float)
    return np.average(lik), np.average(acc)

Now we're ready to go.

The sparse GP:


In [7]:
iterations = 20000

In [8]:
AdamOptimizer(0.01).minimize(m_sgp, maxiter=iterations)
l, a = batch_assess(m_sgp, assess_model_sgp, Xs, Ys)
print('sgp test lik: {:.4f}, test acc {:.4f}'.format(l, a))


sgp test lik: -0.1092, test acc 0.9698

Using more inducing points improves performance, but at the expense of much slower computation (500 inducing points takes about a day to train).
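
As a concrete sketch of that larger configuration (shown for reference only, not run or used below):

# Hypothetical larger-M variant: same construction as m_sgp above, but with
# 500 inducing points initialised by k-means. Not trained in this notebook.
M_large = 500
Z_large = kmeans2(X, M_large, minit='points')[0]
m_sgp_large = SVGP(X, Y, RBF(784, lengthscales=2., variance=2.), MultiClass(10),
                   Z=Z_large, num_latent=10, minibatch_size=1000, whiten=True)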

The two layer DGP:


In [9]:
AdamOptimizer(0.01).minimize(m_dgp2, maxiter=iterations)
l, a = batch_assess(m_dgp2, assess_model_dgp, Xs, Ys)
print('dgp2 test lik: {:.4f}, test acc {:.4f}'.format(l, a))


dgp2 test lik: -0.0731, test acc 0.9794

And the three layer:


In [10]:
AdamOptimizer(0.01).minimize(m_dgp3, maxiter=iterations)
l, a = batch_assess(m_dgp3, assess_model_dgp, Xs, Ys)
print('dgp3 test lik: {:.4f}, test acc {:.4f}'.format(l, a))


dgp3 test lik: -0.0709, test acc 0.9799

With the deeper models we see a small improvement in accuracy and a larger improvement in test log likelihood.