What's remarkable about Rasmus et al. (2015) is that they achieve state-of-the-art performance on permutation-invariant MNIST without dropout (though the denoising step may be providing a similar kind of regularization). Unlike previous work with autoencoders, they reach this performance using the same semi-supervised cost function for the entire training process.
It's also worth noting that the model claims by far the best semi-supervised performance, roughly 0.75% error with only 500 labeled MNIST digits.
In [1]:
import time
import numpy as np
import theano
import theano.tensor as T
import peano
import peano.pops as P
from pylearn2.space import CompositeSpace, VectorSpace
dtype = theano.config.floatX
In [2]:
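# Encoder: five Linear + BatchNormalization blocks (784 -> 1000 -> 500 -> 250 -> 250 -> 250)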
z1 = P.nnet.Sequential('z1')
z1.add(P.nnet.Linear(784, 1000))
z1.add(P.nnet.BatchNormalization(1000))
z2 = P.nnet.Sequential('z2')
z2.add(P.nnet.Linear(1000, 500))
z2.add(P.nnet.BatchNormalization(500))
z3 = P.nnet.Sequential('z3')
z3.add(P.nnet.Linear(500, 250))
z3.add(P.nnet.BatchNormalization(250))
z4 = P.nnet.Sequential('z4')
z4.add(P.nnet.Linear(250, 250))
z4.add(P.nnet.BatchNormalization(250))
z5 = P.nnet.Sequential('z5')
z5.add(P.nnet.Linear(250, 250))
z5.add(P.nnet.BatchNormalization(250))
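# Lateral connections for the input, the five hidden layers, and the 10-way output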
ll0 = P.nnet.Lateral(784)
ll1 = P.nnet.Lateral(1000)
ll2 = P.nnet.Lateral(500)
ll3 = P.nnet.Lateral(250)
ll4 = P.nnet.Lateral(250)
ll5 = P.nnet.Lateral(250)
ll6 = P.nnet.Lateral(10)
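# Decoder: linear layers running top-down from the 10-way class layer back to the 784-dim input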
u6 = P.nnet.Linear(10, 250)
u5 = P.nnet.Linear(250, 250)
u4 = P.nnet.Linear(250, 250)
u3 = P.nnet.Linear(250, 500)
u2 = P.nnet.Linear(500, 1000)
u1 = P.nnet.Linear(1000, 784)
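# Softmax classifier on top of the last encoder layer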
sl = P.nnet.Sequential('sl')
sl.add(P.nnet.Linear(250, 10))
sl.add(T.nnet.softmax)
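# Encoder pass: ReLU after each Linear+BatchNorm block on the (possibly noise-corrupted) input xt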
xt = T.matrix(dtype=dtype)
z1f = z1.apply(xt)
h1 = T.nnet.relu(z1f)
z2f = z2.apply(h1)
h2 = T.nnet.relu(z2f)
z3f = z3.apply(h2)
h3 = T.nnet.relu(z3f)
z4f = z4.apply(h3)
h4 = T.nnet.relu(z4f)
z5f = z5.apply(h4)
h5 = T.nnet.relu(z5f)
y_s = sl.apply(z5f)
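# Decoder pass: each Lateral layer combines its encoder activation with the top-down decoder signal (0. at the top)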
zh6 = ll6.apply(y_s, 0.)
u6f = u6.apply(zh6)
zh5 = ll5.apply(z5f, u6f)
u5f = u5.apply(zh5)
zh4 = ll4.apply(z4f, u5f)
u4f = u4.apply(zh4)
zh3 = ll3.apply(z3f, u4f)
u3f = u3.apply(zh3)
zh2 = ll2.apply(z2f, u3f)
u2f = u2.apply(zh2)
zh1 = ll1.apply(z1f, u2f)
u1f = u1.apply(zh1)
xh = ll0.apply(xt, u1f)
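One note on the Lateral layers: they implement the combinator that merges each encoder activation z with the top-down decoder signal u. peano's implementation isn't shown in this post, so purely for intuition, here is a minimal Theano sketch of the kind of gated ("vanilla") combinator described in the ladder network paper; the function and parameter names are made up, and this is not necessarily what peano.pops does internally.
# hypothetical sketch of a ladder-style gated combinator (not peano's actual Lateral)
#   zhat = (z - mu(u)) * v(u) + mu(u)
# where mu and v are learned element-wise, sigmoid-gated affine functions of u
def make_vanilla_combinator(dim, name):
    # ten element-wise parameter vectors; zero initialization is arbitrary, for illustration only
    a = [theano.shared(np.zeros(dim, dtype=dtype), name='%s_a%d' % (name, i))
         for i in range(10)]

    def combine(z, u):
        mu = a[0]*T.nnet.sigmoid(a[1]*u + a[2]) + a[3]*u + a[4]
        v = a[5]*T.nnet.sigmoid(a[6]*u + a[7]) + a[8]*u + a[9]
        return (z - mu)*v + mu

    return combine, a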
Gather the parameters and construct the cost functions
In [3]:
params = []
for l in [z1,z2,z3,z4,z5,ll0,ll1,ll2,ll3,ll4,ll5,ll6,u6,u5,u4,u3,u2,u1,sl]:
params += l.params
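# Clean input, one-hot targets, and learning rate, plus the two cost terms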
x_true = T.matrix(dtype=dtype)
y_true = T.matrix(dtype=dtype)
lr = T.scalar(dtype=dtype)
r_cost = P.cost.mean_squared_error(x_true, xh)
s_cost = P.cost.cross_entropy(y_true, y_s)
cost = s_cost + 500.*r_cost
misclass_cost = T.neq(T.argmax(y_true, axis=1), T.argmax(y_s, axis=1)).mean()
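Written out, the objective combines the supervised cross-entropy on the softmax output with a heavily weighted (weight 500) reconstruction term on the input. Up to the exact normalization inside peano's cross_entropy and mean_squared_error (assumed here to be a mean over the N examples in the batch), this is

$$C \;=\; -\frac{1}{N}\sum_{n=1}^{N}\sum_{k=1}^{10} y_{nk}\,\log \hat{y}_{nk} \;+\; 500\cdot\frac{1}{N}\sum_{n=1}^{N}\lVert x_n - \hat{x}_n\rVert^2$$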
In [4]:
gparams = T.grad(cost, wrt=params)
updates = peano.optimizer.adam_update(params, gparams, alpha=lr)
learn_mlp_fn = theano.function(inputs = [xt, x_true, y_true, lr],
outputs = cost,
updates = updates)
misclass_mlp_fn = theano.function(inputs = [xt, y_true],
outputs = misclass_cost)
encode_mlp_fn = theano.function(inputs = [xt],
outputs = xh)
decode_mlp_fn = theano.function(inputs = [xt, y_s],
outputs = xh)
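As a quick sanity check (not part of the original notebook), the compiled functions can be exercised on a tiny random batch; this mainly documents the expected shapes: 784-dimensional rows for the inputs and one-hot 10-dimensional rows for the targets.
# hypothetical smoke test: 4 random "images" and random one-hot labels
Xb = np.random.randn(4, 784).astype(dtype)
yb = np.eye(10, dtype=dtype)[np.random.randint(10, size=4)]
print learn_mlp_fn(Xb, Xb, yb, 0.002)   # combined cost for this batch (parameters updated as a side effect)
print misclass_mlp_fn(Xb, yb)           # fraction of the batch misclassified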
In accordance with the paper, we train on the entire MNIST training set (all 60,000 digits). After 100 epochs we evaluate on the MNIST test set (10,000 digits). Since this is the actual test set, we are not allowed to tweak anything based on it; the test-set error is the final error for this model.
In [5]:
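# Load MNIST through pylearn2 and one-hot encode the test labels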
from pylearn2.datasets import mnist
ds = mnist.MNIST(which_set = 'train', start=0, stop=60000)
val = mnist.MNIST(which_set = 'test', start=0, stop=10000)
val_X, val_y = val.get_data()
val_y = np.squeeze(np.eye(10)[val_y]).astype(dtype)
data_space = VectorSpace(dim=784)
label_space = VectorSpace(dim=10)
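# Learning-rate schedule for the second half of training: linear decay from 0.002 to 0 over 50 epochs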
lrd = np.linspace(.002,0.,50).astype(dtype)
for i in range(100):
    ds_iter = ds.iterator(mode='sequential', batch_size=100,
                          data_specs=(CompositeSpace((data_space, label_space)),
                                      ('features', 'targets')))
    t0 = time.time()
    for X, y in ds_iter:
        # corrupt the input with Gaussian noise (std 0.3); the clean X is the reconstruction target
        X_noisy = X + 0.3*np.random.randn(*X.shape).astype(dtype)
        if i < 50:
            learn_mlp_fn(X_noisy, X, y, 0.002)
        else:
            learn_mlp_fn(X_noisy, X, y, lrd[i-50])
    print 'epoch', i, time.time()-t0, 'seconds'
print 'Test set error:', misclass_mlp_fn(val_X, val_y)
So when all is said and done, the performance we achieve is about 0.92% error on the test set, which is very good but short of the 0.68% claimed in the paper. Finicky details such as weight initialization might account for the difference.
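As an optional follow-up (not in the original notebook), the compiled encode_mlp_fn, which returns the reconstruction xh, can be used to eyeball a few denoised test digits; matplotlib is assumed to be available.
import matplotlib.pyplot as plt
# compare 8 test digits (top row) with their reconstructions (bottom row)
recon = encode_mlp_fn(val_X[:8].astype(dtype))
fig, axes = plt.subplots(2, 8, figsize=(12, 3))
for j in range(8):
    axes[0, j].imshow(val_X[j].reshape(28, 28), cmap='gray')
    axes[1, j].imshow(recon[j].reshape(28, 28), cmap='gray')
    axes[0, j].axis('off')
    axes[1, j].axis('off')
plt.show()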