Theano Basics


In [1]:
from __future__ import print_function

import theano
import numpy as np

from theano import tensor as T
floatX = theano.config.floatX

In [2]:
# Convention:
#  uppercase: symbolic theano element or function
#  lowercase: numpy array
W = T.vector('w')
X = T.matrix('X')
Y = X.dot(W)
F = theano.function([W,X], Y)

w = np.ones(4)
x = np.ones((10,4))
y = F(w,x)
print(y)


[ 4.  4.  4.  4.  4.  4.  4.  4.  4.  4.]
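
As a quick sanity check (a minimal numpy-only sketch, reusing w, x and y from the cell above), the compiled graph should agree with a plain matrix-vector product:

# the compiled Theano function should match the plain numpy result
assert np.allclose(y, x.dot(w))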

In [3]:
# The most underused tool in machine learning
# AUTODIFF
grad_w = T.grad(Y.sum(), W)
F_grad = theano.function([W,X], grad_w)
g = F_grad(w,x)
# this should equal the column-wise sum of X (do you know how to do matrix calculus?)
print(g)


[ 10.  10.  10.  10.]
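
And a numpy check of the claim in the comment (a sketch, reusing x and g from above): the gradient of (X.dot(W)).sum() with respect to W is the column-wise sum of X.

# each column of x sums to 10, matching the gradient printed above
assert np.allclose(g, x.sum(axis=0))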

In [4]:
# An easier example
B = T.scalar('B')
R = T.sqr(B)
A = T.grad(R, B)
Z = theano.function([B], A)
i = 2
l = Z(i)
print(l)


4.0
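
Because T.grad returns another symbolic expression, it can be differentiated again. A minimal sketch (A2 and Z2 are illustrative names, not part of the original cells): the second derivative of B**2 is the constant 2.

A2 = T.grad(A, B)              # d/dB (2*B) = 2, a constant
Z2 = theano.function([B], A2)
print(Z2(5.0))                 # 2.0, regardless of the input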

In [5]:
# If that didn't blow your mind, well, it should have.
def sharedX(X):
    return theano.shared(X.astype(floatX))

B = sharedX(np.ones(2))
R = T.sqr(B).sum()
A = T.grad(R, B)
Z = theano.function([], R, updates={B: B - .1*A})
for i in range(10):
    print('cost function = {}'.format(Z()))
    print('parameters    = {}'.format(B.get_value()))
# Try to change range to 100 to see what happens


cost function = 2.0
parameters    = [ 0.8  0.8]
cost function = 1.28
parameters    = [ 0.64  0.64]
cost function = 0.8192
parameters    = [ 0.512  0.512]
cost function = 0.524288
parameters    = [ 0.4096  0.4096]
cost function = 0.33554432
parameters    = [ 0.32768  0.32768]
cost function = 0.2147483648
parameters    = [ 0.262144  0.262144]
cost function = 0.137438953472
parameters    = [ 0.2097152  0.2097152]
cost function = 0.0879609302221
parameters    = [ 0.16777216  0.16777216]
cost function = 0.0562949953421
parameters    = [ 0.13421773  0.13421773]
cost function = 0.036028797019
parameters    = [ 0.10737418  0.10737418]
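
The same updates mechanism extends to fancier optimizers. Here is a minimal sketch of SGD with momentum on the same problem, assuming you keep one extra shared variable per parameter for the velocity (V and Z_momentum are illustrative names, not part of the notebook):

V = sharedX(np.zeros(2))          # velocity buffer, same shape as B (illustrative name)
new_V = 0.9 * V - .1 * A          # decay the old velocity and add the current gradient step
Z_momentum = theano.function([], R, updates=[(V, new_V), (B, B + new_V)])
# calling Z_momentum() now takes one momentum step, just like Z() took one plain SGD step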

Neural Nets


In [6]:
""" Now that we now how to sum, we have enough to Deep Learn
 ... I should say something in the board about the Model-View-Controller way we usually
     deep learn with Theano.
     Model      : Neural net parameters and dataset generator
     View       : Logging, graph updates, saving cross-validated best parameters
     Controller : Update algorithm that follows gradient directions to optimize paramters

 Download this dataset : http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
 
"""
%matplotlib inline
import cPickle
from pylab import imshow

train_set, valid_set, test_set = cPickle.load(open('mnist.pkl', 'rb'))
print(len(train_set))
train_x, train_y = train_set
test_x , test_y  = test_set
print(train_x.shape)
print(train_y.shape)
_ = imshow(train_x[0].reshape((28,28)), cmap='gray')


2
(50000, 784)
(50000,)
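
The download link above points to the gzipped pickle. If you did not decompress it first, a minimal sketch (Python 2, assuming the file sits in the working directory) is to read it through gzip directly:

import gzip
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = cPickle.load(f)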

In [7]:
def batch_iterator(x, y, batch_size):
    num_batches = x.shape[0] // batch_size
    for i in xrange(0,num_batches):
        # TODO: use random integers instead of consecutive
        #   values to avoid biased gradients
        first = i * batch_size
        last  = (i+1) * batch_size
        x_batch = x[first:last].astype(floatX)
        y_pre   = y[first:last]
        y_batch = np.zeros((batch_size, 10))
        for row, col in enumerate(y_pre):
            y_batch[row, col] = 1
        yield (x_batch, y_batch.astype(floatX))

for x,y in batch_iterator(train_x, train_y, 10000):
    print('{}, {}'.format(x.shape, y.shape))
print(y[0])
_ = imshow(x[0].reshape((28,28)), cmap='gray')


(10000, 784), (10000, 10)
(10000, 784), (10000, 10)
(10000, 784), (10000, 10)
(10000, 784), (10000, 10)
(10000, 784), (10000, 10)
[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
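
To address the TODO in the iterator above, here is a sketch of a shuffled variant (my own addition, not part of the original notebook): permute the row indices once per pass so each mini-batch sees a random subset.

def shuffled_batch_iterator(x, y, batch_size):
    idx = np.random.permutation(x.shape[0])            # new random order on every pass
    num_batches = x.shape[0] // batch_size
    for i in xrange(num_batches):
        batch_idx = idx[i * batch_size:(i + 1) * batch_size]
        x_batch = x[batch_idx].astype(floatX)
        y_batch = np.zeros((batch_size, 10))
        y_batch[np.arange(batch_size), y[batch_idx]] = 1   # one-hot encode the labels
        yield (x_batch, y_batch.astype(floatX))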

In [13]:
# Define layers
def rectifier(input_dim, output_dim, X):
    W = sharedX(np.random.normal(0, .001, size=(input_dim, output_dim)))
    b = sharedX(np.zeros((output_dim,)))
    Z = T.dot(X,W) + b.dimshuffle('x',0)
    O = T.switch(Z>0, Z, 0)
    return W,b,O

def softmax(input_dim, output_dim, X, Y):
    W = sharedX(np.random.normal(0, .001, size=(input_dim, output_dim)))
    b = sharedX(np.zeros((output_dim,)))
    Z = T.dot(X,W) + b.dimshuffle('x',0)
    O = T.nnet.softmax(Z)
    cost = T.nnet.binary_crossentropy(O, Y).sum(axis=-1).mean()
    return W,b,O,cost

X = T.matrix('X')
Y = T.matrix('Y')
W0, b0, O0 = rectifier(784, 100, X)
W1, b1, O1 = rectifier(100, 100, O0)
W2, b2, O2, cost = softmax(100, 10,  O1, Y)

# Always write tests
F = theano.function([X,Y], [cost, O2])
x = np.zeros((100,784)).astype(floatX)
y = np.ones((100,10)).astype(floatX)
c, z = F(x,y)
assert c>0
assert z.shape == (100,10)
print(z[0])


[ 0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1]
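
A note on the cost: binary_crossentropy treats each of the 10 outputs as an independent Bernoulli. With a softmax output and one-hot targets, the more conventional choice is the categorical cross-entropy; a minimal sketch of swapping it in (same symbolic names as above, cost_cat is illustrative):

# -log of the probability assigned to the correct class, averaged over the batch
cost_cat = T.nnet.categorical_crossentropy(O2, Y).mean()
F_cat = theano.function([X, Y], cost_cat)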

In [14]:
from collections import OrderedDict
params = [W0, b0, W1, b1, W2, b2]
updates = dict()
for p in params:
    updates[p] = p - .01 * T.grad(cost, p)
updates = OrderedDict(updates)
trainer = theano.function([X,Y], cost, updates=updates)
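
T.grad also accepts a list of variables and returns the matching list of gradients, so the loop above can be collapsed into a single call. A sketch of the equivalent formulation:

grads = T.grad(cost, params)   # one gradient expression per parameter
updates = OrderedDict((p, p - .01 * g) for p, g in zip(params, grads))
trainer = theano.function([X, Y], cost, updates=updates)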

In [15]:
num_epochs = 100
for i in range(num_epochs):
    print('-'*10)
    print('Epoch: {}'.format(i))
    for iter,b in enumerate(batch_iterator(train_x, train_y, 128)):
        x = b[0]
        y = b[1]
        last_cost = trainer(x,y)
    print('cost: {}'.format(trainer(x,y)))


----------
Epoch: 0
cost: 3.24799897328
----------
Epoch: 1
cost: 3.24637901577
----------
Epoch: 2
cost: 3.24544521374
----------
Epoch: 3
cost: 3.24489438784
----------
Epoch: 4
cost: 3.24456359659
----------
Epoch: 5
cost: 3.24436235329
----------
Epoch: 6
cost: 3.24423881687
----------
Epoch: 7
cost: 3.24416252048
----------
Epoch: 8
cost: 3.24411520784
----------
Epoch: 9
cost: 3.24408578483
----------
Epoch: 10
cost: 3.24406745273
----------
Epoch: 11
cost: 3.24405601092
----------
Epoch: 12
cost: 3.24404885714
----------
Epoch: 13
cost: 3.24404437278
----------
Epoch: 14
cost: 3.24404155268
----------
Epoch: 15
cost: 3.2440397677
----------
Epoch: 16
cost: 3.24403862545
----------
Epoch: 17
cost: 3.24403788064
----------
Epoch: 18
cost: 3.24403738298
----------
Epoch: 19
cost: 3.24403703693
----------
Epoch: 20
cost: 3.24403678167
----------
Epoch: 21
cost: 3.24403658028
----------
Epoch: 22
cost: 3.244036407
----------
Epoch: 23
cost: 3.2440362482
----------
Epoch: 24
cost: 3.24403609334
----------
Epoch: 25
cost: 3.24403593585
----------
Epoch: 26
cost: 3.24403576784
----------
Epoch: 27
cost: 3.2440355873
----------
Epoch: 28
cost: 3.24403539016
----------
Epoch: 29
cost: 3.24403517261
----------
Epoch: 30
cost: 3.24403493102
----------
Epoch: 31
cost: 3.24403466262
----------
Epoch: 32
cost: 3.24403436311
----------
Epoch: 33
cost: 3.24403402765
----------
Epoch: 34
cost: 3.24403364882
----------
Epoch: 35
cost: 3.24403321959
----------
Epoch: 36
cost: 3.24403272996
----------
Epoch: 37
cost: 3.24403216987
----------
Epoch: 38
cost: 3.24403151759
----------
Epoch: 39
cost: 3.24403076179
----------
Epoch: 40
cost: 3.24402987749
----------
Epoch: 41
cost: 3.24402883166
----------
Epoch: 42
cost: 3.24402754211
----------
Epoch: 43
cost: 3.24402602991
----------
Epoch: 44
cost: 3.2440241948
----------
Epoch: 45
cost: 3.2440219314
----------
Epoch: 46
cost: 3.24401910032
----------
Epoch: 47
cost: 3.24401549934
----------
Epoch: 48
cost: 3.24401083334
----------
Epoch: 49
cost: 3.24400465974
----------
Epoch: 50
cost: 3.24399628209
----------
Epoch: 51
cost: 3.24398459195
----------
Epoch: 52
cost: 3.2439677118
----------
Epoch: 53
cost: 3.24394231887
----------
Epoch: 54
cost: 3.24390217629
----------
Epoch: 55
cost: 3.24383462327
----------
Epoch: 56
cost: 3.24371140817
----------
Epoch: 57
cost: 3.24346018389
----------
Epoch: 58
cost: 3.24285853744
----------
Epoch: 59
cost: 3.24096812154
----------
Epoch: 60
cost: 3.23089552684
----------
Epoch: 61
cost: 3.12590093386
----------
Epoch: 62
cost: 2.95756529172
----------
Epoch: 63
cost: 2.84535398592
----------
Epoch: 64
cost: 2.68907151912
----------
Epoch: 65
cost: 2.52265974165
----------
Epoch: 66
cost: 2.45028472116
----------
Epoch: 67
cost: 2.40018156869
----------
Epoch: 68
cost: 2.33886133861
----------
Epoch: 69
cost: 2.26196722755
----------
Epoch: 70
cost: 2.18171480325
----------
Epoch: 71
cost: 2.10275115781
----------
Epoch: 72
cost: 2.02131642541
----------
Epoch: 73
cost: 1.75852680397
----------
Epoch: 74
cost: 1.63110831839
----------
Epoch: 75
cost: 1.56621211374
----------
Epoch: 76
cost: 1.48378594499
----------
Epoch: 77
cost: 1.39341893482
----------
Epoch: 78
cost: 1.27500592905
----------
Epoch: 79
cost: 1.10222175327
----------
Epoch: 80
cost: 0.99028564102
----------
Epoch: 81
cost: 0.924115294499
----------
Epoch: 82
cost: 0.871967153625
----------
Epoch: 83
cost: 0.823329313166
----------
Epoch: 84
cost: 0.776880251134
----------
Epoch: 85
cost: 0.732881327831
----------
Epoch: 86
cost: 0.693275470382
----------
Epoch: 87
cost: 0.656828014164
----------
Epoch: 88
cost: 0.623514990728
----------
Epoch: 89
cost: 0.59437572681
----------
Epoch: 90
cost: 0.570174970066
----------
Epoch: 91
cost: 0.549500032845
----------
Epoch: 92
cost: 0.530968409387
----------
Epoch: 93
cost: 0.516512140444
----------
Epoch: 94
cost: 0.501774799484
----------
Epoch: 95
cost: 0.489293294937
----------
Epoch: 96
cost: 0.478189165397
----------
Epoch: 97
cost: 0.467420804638
----------
Epoch: 98
cost: 0.455858228147
----------
Epoch: 99
cost: 0.444447902718
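
The cost printed each epoch is just the training cost on the last mini-batch. A sketch of also tracking the validation set (the "View" part of the MVC picture; valid_set was loaded earlier but unused, and F_cost is an illustrative name):

valid_x, valid_y = valid_set
F_cost = theano.function([X, Y], cost)   # forward pass only, no parameter updates
valid_costs = [F_cost(bx, by) for bx, by in batch_iterator(valid_x, valid_y, 500)]
print('validation cost: {}'.format(np.mean(valid_costs)))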

In [16]:
w0 = W0.get_value()
_ = imshow(w0[:,0].reshape((28,28)), cmap='gray')



In [17]:
ERR = T.neq(O2.argmax(axis=-1), Y.argmax(axis=-1))
Ferr = theano.function([X,Y], ERR)
def testnet(x, y):
    testerr = 0.
    for b1,b2 in batch_iterator(x, y, 500):
        testerr += Ferr(b1,b2)
    return testerr.sum()

print('test error: {}, test acc: {}'.format(testnet(test_x, test_y),
       1 - testnet(test_x, test_y) / 10000.))


test error: 618.0, test acc: 0.9382

Convolutional Nets


In [19]:
"""
 We can do much better than this with more hidden neurons and dropout.
 Watch Alec Radford's presentation to see how to do that 
 with Python/Theano: https://www.youtube.com/watch?v=S75EdAcXHKk
 For now, let's move on to convnets.
 
"""
from theano.tensor.nnet.conv import conv2d
from theano.tensor.signal.downsample import max_pool_2d
def conv_rectifier(input_channels, output_channels, filter_dim, X):
    W = sharedX(np.random.normal(0, .001, size=(output_channels,
                                                      input_channels,
                                                      filter_dim,
                                                      filter_dim)))
    b  = sharedX(np.zeros((output_channels,)))
    Z  = conv2d(X,W) + b.dimshuffle('x',0,'x','x')
    DS = max_pool_2d(Z, ds=[2,2])
    O  = T.switch(DS>0, DS, 0)
    return W,b,O

# test
X = T.tensor4('X')
W, b, O = conv_rectifier(1, 9, 5, X)
F = theano.function([X], O)

x = np.ones((5, 1, 28, 28))
print(x.shape)
o = F(x)
o.shape


(5, 1, 28, 28)
Out[19]:
(5, 9, 12, 12)
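
The output shape follows from the layer definition: a 'valid' convolution with a 5x5 filter maps 28 to 28 - 5 + 1 = 24, and the 2x2 max pooling halves that to 12, giving (5, 9, 12, 12). A tiny helper to reproduce the arithmetic (a sketch, not part of the notebook):

def conv_pool_out_dim(in_dim, filter_dim, pool=2):
    # 'valid' convolution shrinks by filter_dim - 1, then non-overlapping pooling divides by pool
    return (in_dim - filter_dim + 1) // pool

print(conv_pool_out_dim(28, 5))                        # 12
print(conv_pool_out_dim(conv_pool_out_dim(28, 5), 5))  # 4, the spatial size after the next layer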

In [21]:
Y = T.matrix('Y')
W0, b0, O0 = conv_rectifier(1, 20, 5, X)
W1, b1, O1 = conv_rectifier(20, 50, 5, O0)

# test
F = theano.function([X], O1)
o = F(x)
print(o.shape)


(128, 50, 4, 4)

In [22]:
W2, b2, O2 = rectifier(50*4*4, 500, O1.flatten(2))
W3, b3, O3, cost = softmax(500, 10,  O2, Y)
# Teeeeeest
x = np.ones((128,1,28,28)).astype(floatX)
y = np.ones((128,10)).astype(floatX)
F = theano.function([X, Y], [O3, cost])
z, c = F(x,y)
assert c>0
assert z.shape == (128,10)

In [23]:
# We need to modify the batch_iterator slightly to serve formatted images
def batch_iterator(x, y, batch_size):
    num_batches = x.shape[0] // batch_size
    for i in xrange(0,num_batches):
        # TODO: use random integers instead of consecutive
        #   values to avoid biased gradients
        first = i * batch_size
        last  = (i+1) * batch_size
        x_batch = x[first:last].reshape((batch_size,1,28,28)).astype(floatX)
        y_pre   = y[first:last]
        y_batch = np.zeros((batch_size, 10))
        for row, col in enumerate(y_pre):
            y_batch[row, col] = 1
        yield (x_batch, y_batch.astype(floatX))

for x,y in batch_iterator(train_x, train_y, 10000):
    print('{}, {}'.format(x.shape, y.shape))
print(y[0])
_ = imshow(x[0].reshape((28,28)), cmap='gray')


(10000, 1, 28, 28), (10000, 10)
(10000, 1, 28, 28), (10000, 10)
(10000, 1, 28, 28), (10000, 10)
(10000, 1, 28, 28), (10000, 10)
(10000, 1, 28, 28), (10000, 10)
[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]

In [24]:
params = [W0, b0, W1, b1, W2, b2, W3, b3]
updates = dict()
for p in params:
    updates[p] = p - .01 * T.grad(cost, p)
updates = OrderedDict(updates)
trainer = theano.function([X,Y], cost, updates=updates)

In [ ]:
num_epochs = 100
for i in range(num_epochs):
    print('-'*10)
    print('Epoch: {}'.format(i))
    for iter,b in enumerate(batch_iterator(train_x, train_y, 128)):
        x = b[0]
        y = b[1]
        last_cost = trainer(x,y)
    print('cost: {}'.format(trainer(x,y)))

In [ ]:
w0 = W0.get_value()
_ = imshow(w0[0,0,:,:].reshape((5,5)), cmap='gray')

In [ ]:
ERR = T.neq(O3.argmax(axis=-1), Y.argmax(axis=-1))
Ferr = theano.function([X,Y], ERR)
def testnet(x, y):
    testerr = 0.
    for b1,b2 in batch_iterator(x, y, 500):
        testerr += Ferr(b1,b2)
    return testerr.sum()

print('test error: {}, test acc: {}'.format(testnet(test_x, test_y),
       1 - testnet(test_x, test_y) / 10000.))