In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import tensorflow as tf
from gpflow.likelihoods import MultiClass
from gpflow.kernels import RBF, White
from gpflow.models.svgp import SVGP
from gpflow.training import AdamOptimizer
from scipy.stats import mode
from scipy.cluster.vq import kmeans2
from doubly_stochastic_dgp.dgp import DGP
import time
def get_mnist_data(data_path='/data'):
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets(data_path+'/MNIST_data/', one_hot=False)

    X, Y = mnist.train.next_batch(mnist.train.num_examples)
    Xval, Yval = mnist.validation.next_batch(mnist.validation.num_examples)
    Xtest, Ytest = mnist.test.next_batch(mnist.test.num_examples)
    Y, Yval, Ytest = [np.array(y, dtype=float)[:, None] for y in [Y, Yval, Ytest]]

    # fold the validation set into the training set
    X = np.concatenate([X, Xval], 0)
    Y = np.concatenate([Y, Yval], 0)
    return X.astype(float), Y.astype(float), Xtest.astype(float), Ytest.astype(float)

X, Y, Xs, Ys = get_mnist_data()
We'll use 100 inducing points, with the locations initialised by k-means.
In [2]:
M = 100
Z = kmeans2(X, M, minit='points')[0]
We'll compare three models: an ordinary sparse GP and DGPs with 2 and 3 layers.
We'll use a minibatch size of 1000 for all models.
In [3]:
m_sgp = SVGP(X, Y, RBF(784, lengthscales=2., variance=2.), MultiClass(10),
             Z=Z, num_latent=10, minibatch_size=1000, whiten=True)
def make_dgp(L):
    kernels = [RBF(784, lengthscales=2., variance=2.)]
    for l in range(L-1):
        kernels.append(RBF(30, lengthscales=2., variance=2.))
    model = DGP(X, Y, Z, kernels, MultiClass(10),
                minibatch_size=1000,
                num_outputs=10)

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return model
m_dgp2 = make_dgp(2)
m_dgp3 = make_dgp(3)
For the SGP model we'll calculate accuracy by simply taking the class with the largest mean prediction:
In [4]:
def assess_model_sgp(model, X_batch, Y_batch):
    m, v = model.predict_y(X_batch)
    l = model.predict_density(X_batch, Y_batch)
    # predicted class is the one with the highest predicted mean
    a = (np.argmax(m, 1).reshape(Y_batch.shape).astype(int) == Y_batch.astype(int))
    return l, a
For the DGP models the predictions are stochastic, and we need a single prediction for each datum. To get one we draw $S$ samples of the per-class predictions (giving $(S, N, 10)$ arrays for the mean and variance), take the argmax over the class means (giving an $(S, N)$ array of predicted classes), and finally take the modal class over the samples (giving a vector of length $N$):
We'll use 100 samples.
In [5]:
S = 100
def assess_model_dgp(model, X_batch, Y_batch):
    m, v = model.predict_y(X_batch, S)
    l = model.predict_density(X_batch, Y_batch, S)
    # argmax over the class means for each sample, then the modal class over the S samples
    a = (mode(np.argmax(m, 2), 0)[0].reshape(Y_batch.shape).astype(int) == Y_batch.astype(int))
    return l, a
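To make the shapes concrete, here is a small illustrative sketch of that argmax-then-mode reduction (the array below is random dummy data, not model output):

dummy_m = np.random.rand(5, 3, 10)          # (S, N, 10): S=5 samples, N=3 points, 10 classes
per_sample_class = np.argmax(dummy_m, 2)    # (S, N): predicted class under each sample
final_class = mode(per_sample_class, 0)[0]  # modal class across the S samples, one per datum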
We need to make predictions in batches, otherwise we might run out of memory.
In [6]:
def batch_assess(model, assess_model, X, Y):
    n_batches = max(int(len(X)/1000), 1)
    lik, acc = [], []
    for X_batch, Y_batch in zip(np.split(X, n_batches), np.split(Y, n_batches)):
        l, a = assess_model(model, X_batch, Y_batch)
        lik.append(l)
        acc.append(a)
    lik = np.concatenate(lik, 0)
    acc = np.array(np.concatenate(acc, 0), dtype=float)
    return np.average(lik), np.average(acc)
Now we're ready to go.
The sparse GP:
In [7]:
iterations = 20000
In [8]:
AdamOptimizer(0.01).minimize(m_sgp, maxiter=iterations)
l, a = batch_assess(m_sgp, assess_model_sgp, Xs, Ys)
print('sgp test lik: {:.4f}, test acc {:.4f}'.format(l, a))
Using more inducing points improves things, but at the expense of much slower computation (500 inducing points takes about a day).
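Purely as an illustrative sketch (not run in this notebook; M=500 is just the example from the sentence above), a larger sparse GP would be built in exactly the same way:

# hypothetical sketch: an SGP with 500 inducing points (much slower to train)
Z_500 = kmeans2(X, 500, minit='points')[0]
m_sgp_500 = SVGP(X, Y, RBF(784, lengthscales=2., variance=2.), MultiClass(10),
                 Z=Z_500, num_latent=10, minibatch_size=1000, whiten=True)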
The two layer DGP:
In [9]:
AdamOptimizer(0.01).minimize(m_dgp2, maxiter=iterations)
l, a = batch_assess(m_dgp2, assess_model_dgp, Xs, Ys)
print('dgp2 test lik: {:.4f}, test acc {:.4f}'.format(l, a))
And the three layer:
In [10]:
AdamOptimizer(0.01).minimize(m_dgp3, maxiter=iterations)
l, a = batch_assess(m_dgp3, assess_model_dgp, Xs, Ys)
print('dgp3 test lik: {:.4f}, test acc {:.4f}'.format(l, a))
With the deeper models we see a small improvement in accuracy, and a larger improvement in test log likelihood.