In [1]:
import theano
from theano import tensor as T
import numpy as np
from load import mnist
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.conv import conv2d
from theano.tensor.signal.downsample import max_pool_2d
In [2]:
srng = RandomStreams()
In [3]:
def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)
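Theano shared variables and compiled functions expect data typed as theano.config.floatX (typically float32 when running on a GPU), so this helper casts numpy arrays once up front. A quick sanity check, assuming the default config:

print(theano.config.floatX)        # e.g. 'float32'
print(floatX(np.zeros(3)).dtype)   # matches floatX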
In [4]:
def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))
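Weights start as small Gaussian noise (standard deviation 0.01) wrapped in a shared variable, so they live on the device and can be updated in place by the training function. A throwaway sketch of the behavior (w_demo is hypothetical and unused below):

w_demo = init_weights((100, 100))
print(w_demo.get_value().shape)    # (100, 100)
print(w_demo.get_value().std())    # roughly 0.01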
In [5]:
def rectify(X):
    return T.maximum(X, 0.)
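rectify is the ReLU nonlinearity, max(0, x) applied elementwise. A minimal symbolic check (x and f are hypothetical test names):

x = T.vector()
f = theano.function([x], rectify(x), allow_input_downcast=True)
print(f([-1., 0., 2.]))            # [ 0.  0.  2.]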
In [6]:
def softmax(X):
    # subtract the row max before exponentiating for numerical stability
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')
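Subtracting the per-row maximum leaves the softmax unchanged (the exp(-max) factor cancels between numerator and denominator) but prevents exp from overflowing on large logits; dimshuffle(0, 'x') adds a broadcastable axis so each row statistic divides its own row. A quick check (xm and sm are hypothetical test names):

xm = T.matrix()
sm = theano.function([xm], softmax(xm), allow_input_downcast=True)
print(sm([[1., 2., 3.]]).sum())    # 1.0: each row is a probability distribution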
In [7]:
def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        # zero units with probability p, then rescale the survivors
        # so the expected activation is unchanged
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X
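This is inverted dropout: units survive with probability 1 - p and survivors are scaled up by 1 / (1 - p), so the expected activation matches the noise-free network and prediction can simply pass p = 0. A rough sketch of the effect (xd and drop_f are hypothetical test names):

xd = T.matrix()
drop_f = theano.function([xd], dropout(xd, 0.5), allow_input_downcast=True)
print(drop_f(np.ones((1, 10))))    # roughly half zeros, survivors scaled to 2.0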
In [8]:
def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)  # running average of squared gradients
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates
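RMSprop keeps a per-parameter running average of squared gradients, acc_new = rho * acc + (1 - rho) * g**2, and divides each gradient by sqrt(acc_new + epsilon), so the effective step size adapts per parameter. A toy one-parameter sketch (p_demo and the quadratic cost are hypothetical; note the normalized step never fully vanishes, so the value oscillates within about lr of the minimum):

p_demo = theano.shared(floatX(np.array(3.)))
step = theano.function([], [], updates=RMSprop(p_demo ** 2, [p_demo], lr=0.01))
for _ in range(1000):
    step()
print(p_demo.get_value())          # near 0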
In [9]:
def model(X, w, w2, w3, w4, w_o, p_drop_conv, p_drop_hidden):
    # each conv block: convolve -> rectify -> pool -> dropout noise
    l1a = rectify(conv2d(X, w, border_mode='full'))
    l1 = max_pool_2d(l1a, (2, 2))
    l1 = dropout(l1, p_drop_conv)

    l2a = rectify(conv2d(l1, w2))
    l2 = max_pool_2d(l2a, (2, 2))
    l2 = dropout(l2, p_drop_conv)

    l3a = rectify(conv2d(l2, w3))
    l3b = max_pool_2d(l3a, (2, 2))
    l3 = T.flatten(l3b, outdim=2)  # flatten from a 4-tensor to a matrix for the dense layer
    l3 = dropout(l3, p_drop_conv)

    l4 = rectify(T.dot(l3, w4))
    l4 = dropout(l4, p_drop_hidden)

    pyx = softmax(T.dot(l4, w_o))
    return l1, l2, l3, l4, pyx
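For a 28x28 input the spatial sizes trace out as follows, which is where the 128 * 3 * 3 used for w4 below comes from (assuming max_pool_2d's default ignore_border=False, which rounds odd sizes up):

# conv 3x3 'full': 28 -> 30, pool /2: 30 -> 15
# conv 3x3 valid:  15 -> 13, pool /2: 13 -> 7
# conv 3x3 valid:   7 ->  5, pool /2:  5 -> 3   => flatten to 128 * 3 * 3 = 1152 features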
In [10]:
train_x, test_x, train_y, test_y = mnist(onehot=True)
In [11]:
train_x = train_x.reshape(-1, 1, 28, 28)  # reshape into the conv 4-tensor (b, c, 0, 1) format
test_x = test_x.reshape(-1, 1, 28, 28)
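load.mnist returns flat 784-dimensional rows, so the reshape recovers the 2D images and adds a singleton channel axis. Assuming the standard 60k/10k MNIST split:

print(train_x.shape)               # (60000, 1, 28, 28)
print(test_x.shape)                # (10000, 1, 28, 28)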
In [12]:
X = T.ftensor4()  # a 4-tensor of inputs instead of a matrix
Y = T.fmatrix()
In [13]:
w = init_weights((32, 1, 3, 3))       # conv weights: (n_kernels, n_channels, kernel_rows, kernel_cols)
w2 = init_weights((64, 32, 3, 3))
w3 = init_weights((128, 64, 3, 3))
w4 = init_weights((128 * 3 * 3, 625)) # highest conv layer has 128 filters and a 3x3 grid of responses
w_o = init_weights((625, 10))
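Each conv weight's channel dimension matches the kernel count of the layer below (1 -> 32 -> 64 -> 128), and the dense layer consumes the flattened 128 * 3 * 3 = 1152 responses computed above. A quick consistency check:

print(w4.get_value().shape)        # (1152, 625)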
In [14]:
# Build two versions of the graph that share the same weights:
# a noisy one for training (20% dropout on conv layers, 50% on the hidden layer)
noise_l1, noise_l2, noise_l3, noise_l4, noise_py_x = model(X, w, w2, w3, w4, w_o, 0.2, 0.5)
# and a deterministic one (no noise) for prediction
l1, l2, l3, l4, py_x = model(X, w, w2, w3, w4, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)  # most likely digit for each input
In [15]:
cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w, w2, w3, w4, w_o]
updates = RMSprop(cost, params, lr=0.001)
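For one-hot targets, categorical cross-entropy equals the mean negative log-likelihood of the true class; a hypothetical equivalent formulation, shown only as a sanity check and not used below:

cost_manual = -T.mean(T.sum(Y * T.log(noise_py_x), axis=1))   # same value as cost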
In [16]:
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)
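The two compiled functions are all a minibatch loop needs. A minimal training sketch, assuming 128-element minibatches and test-set accuracy as the progress metric (the epoch count is arbitrary):

for epoch in range(10):
    for start in range(0, len(train_x), 128):
        train(train_x[start:start + 128], train_y[start:start + 128])
    accuracy = np.mean(np.argmax(test_y, axis=1) == predict(test_x))
    print(epoch, accuracy)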