In [31]:
import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np
from load import mnist

In [32]:
srng = RandomStreams()

In [33]:
def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)
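
In [ ]:
## Illustrative check (not in the original notebook): floatX casts arrays to
## Theano's configured dtype, typically 'float32'
floatX(np.random.randn(2)).dtype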

In [34]:
def init_weights(shape):
    ## Small-scale Gaussian initialization (sigma = 0.01)
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))

In [35]:
## Rectifier: elementwise max(X, 0); negative values become 0, positive values pass through
def rectify(X):
    return T.maximum(X, 0.)
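
In [ ]:
## Illustrative check (not in the original notebook): negatives clamp to 0,
## positives pass through unchanged
v = T.fvector()
theano.function([v], rectify(v), allow_input_downcast=True)(np.array([-1., 0., 2.]))
## -> array([ 0., 0., 2.], dtype=float32)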

In [36]:
## Numerically stable softmax: subtract each row's max before exponentiating
## so that exp() cannot overflow
def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')
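
In [ ]:
## Illustrative check (not in the original notebook): each output row is a
## valid probability distribution
m = T.fmatrix()
theano.function([m], softmax(m), allow_input_downcast=True)(np.array([[1., 2., 3.]]))
## -> roughly array([[ 0.09, 0.24, 0.67]]); rows sum to 1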

In [37]:
def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        ## A running (exponentially decaying) average of the squared gradient
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        ## Scale the gradient by its root-mean-square; epsilon guards against
        ## division by zero
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates
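
In [ ]:
## Illustrative NumPy sketch (not in the original notebook) of a single RMSprop
## step on hypothetical gradients, with the same rho and epsilon defaults:
## gradients of very different scales end up taking similar-sized steps
g = np.array([0.5, -50.0])
acc = 0.9 * 0. + 0.1 * g ** 2         ## first update of the running average
print(g / np.sqrt(acc + 1e-6))        ## -> ~[ 3.16, -3.16]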

In [38]:
## Randomly zero activations with probability p and scale survivors by 1/(1-p),
## so the expected activation is unchanged between train and test time
def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X
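
In [ ]:
## Illustrative check (not in the original notebook): with p=0.5, roughly half
## the units are zeroed and the survivors are scaled by 1/(1-p) = 2
m = T.fmatrix()
theano.function([m], dropout(m, 0.5), allow_input_downcast=True)(np.ones((1, 8)))
## -> e.g. array([[ 2., 0., 2., 2., 0., 0., 2., 2.]], dtype=float32)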

In [39]:
## Changes from the previous model:
## 1. Dropout noise injected into the input and hidden layers
## 2. Rectifier activations
## 3. Two hidden layers
def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(T.dot(X, w_h))

    h = dropout(h, p_drop_hidden)
    h2 = rectify(T.dot(h, w_h2))

    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(T.dot(h2, w_o))
    
    return h, h2, py_x
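
In [ ]:
## Illustrative shape check (not in the original notebook) with tiny
## hypothetical layer sizes (4 -> 3 -> 3 -> 2)
tiny_x = T.fmatrix()
tiny_w = [init_weights(s) for s in [(4, 3), (3, 3), (3, 2)]]
_, _, tiny_py = model(tiny_x, tiny_w[0], tiny_w[1], tiny_w[2], 0., 0.)
theano.function([tiny_x], tiny_py.shape)(floatX(np.zeros((5, 4))))
## -> array([5, 2]): one probability row per input example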

In [40]:
## Flattened 28x28 images (rows of 784 pixels) with one-hot encoded labels
train_x, test_x, train_y, test_y = mnist(onehot=True)

In [41]:
X = T.fmatrix()  ## batch of input images
Y = T.fmatrix()  ## batch of one-hot labels

In [42]:
w_h = init_weights((784, 625))   ## input -> hidden layer 1
w_h2 = init_weights((625, 625))  ## hidden layer 1 -> hidden layer 2
w_o = init_weights((625, 10))    ## hidden layer 2 -> output

In [44]:
## Two instantiations sharing the same weights: a noisy one (dropout active)
## for training, and a deterministic one for prediction
noise_h, noise_h2, noise_py_x = model(X, w_h, w_h2, w_o, 0.2, 0.5)
h, h2, py_x = model(X, w_h, w_h2, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)  ## predicted class per example

In [45]:
## Cross-entropy between the noisy (training-time) predictions and the one-hot targets
cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, w_h2, w_o]
updates = RMSprop(cost, params, lr=0.001)

In [46]:
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

In [49]:
def run_model(max_iter=100):
    for i in range(max_iter):
        ## Train in mini-batches of 128; this zip drops the final partial batch
        for start, end in zip(range(0, len(train_x), 128), range(128, len(train_x), 128)):
            cost = train(train_x[start:end], train_y[start:end])
        ## Report test-set accuracy after each pass over the training data
        print(np.mean(np.argmax(test_y, axis=1) == predict(test_x)))

In [50]:
run_model(10)


0.9696
0.972
0.974
0.971
0.9751
0.9738
0.9758
0.9763
0.9773
0.9769
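
In [ ]:
## Illustrative follow-up (not in the original run): after training, predict
## returns one class index per test row
predict(test_x[:5])
## compare with np.argmax(test_y[:5], axis=1)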
