In [2]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.pyplot.gray()
import numpy as np
import theano
import theano.tensor as T
rng = np.random.RandomState(42)
import sys
import time
theano.config.floatX = 'float32'
We will make heavy use of the resources in the Theano deep learning tutorial, which is integrated into our git repository as a submodule. You can clone the repository as follows
(be sure to include the --recursive
flag, or you won't get the Theano deep learning tutorial):
git clone --recursive https://github.com/graphific/DL-Meetup-intro.git
If you already cloned the repository but the DeepLearningTutorials
folder is empty, you need to check out the submodule. Make sure you are in the DL-Meetup-intro folder
and then execute the following command:
git submodule update --init --recursive
Now we have to add this directory to the Python path. Depending on the location of your git repository you might have to change this path.
In [3]:
# add the LISA lab tutorial code to the Python path to make imports easier for us:
sys.path.insert(1,'./DeepLearningTutorials/code')
#sys.path
In [5]:
#load MNIST dataset
from logistic_sgd import load_data
dataset='mnist.pkl.gz'
# if not already cached, this function actually downloads the data
datasets = load_data(dataset)
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
If you get an error here like "ImportError: No module named logistic_sgd",
I repeat: you need to check out the submodule. Make sure you are in the DL-Meetup-intro folder
and then execute the following command:
git submodule update --init --recursive
Theano's shared variables = genius:
data can be shared between the CPU and the GPU without you having to write a single line of code.
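As a quick illustration (a minimal sketch, independent of the MNIST data loaded below), a shared variable simply wraps a NumPy array; get_value() and set_value() move data in and out, and Theano decides where the data lives (CPU or GPU):
# minimal sketch: wrapping a NumPy array in a shared variable
import numpy as np
import theano

demo_data = np.zeros((3, 4), dtype=theano.config.floatX)
shared_data = theano.shared(demo_data, name='shared_data', borrow=True)

print shared_data                       # a symbolic variable, usable in Theano graphs
print shared_data.get_value().shape     # pull the current value back as a NumPy array
shared_data.set_value(np.ones((3, 4), dtype=theano.config.floatX))  # overwrite the contents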
In [6]:
print train_set_x
In [7]:
print train_set_x.get_value()
In [8]:
print "Size of the training data matrix: ", train_set_x.get_value().shape
In [9]:
from utils import tile_raster_images
samples = tile_raster_images(train_set_x.get_value(), img_shape=(28,28),
tile_shape=(3,10), tile_spacing=(0, 0),
scale_rows_to_unit_interval=True,
output_pixel_vals=True)
plt.imshow(samples)
plt.show()
In [10]:
# estimate the mean and std dev from the training data,
# then use these estimates to normalize the data
norm_mean = train_set_x.mean(axis=0)
train_set_x = train_set_x - norm_mean
norm_std = train_set_x.std(axis=0)
norm_std = norm_std.clip(0.00001, norm_std.max())
train_set_x = train_set_x / norm_std
test_set_x = test_set_x - norm_mean
test_set_x = test_set_x / norm_std
valid_set_x = valid_set_x - norm_mean
valid_set_x = valid_set_x / norm_std
In [11]:
# visualize the result
samples = tile_raster_images(train_set_x.eval(), img_shape=(28,28),
tile_shape=(3,10), tile_spacing=(0, 0),
scale_rows_to_unit_interval=True,
output_pixel_vals=True)
plt.imshow(samples)
plt.show()
In [12]:
train_set_x = theano.shared(train_set_x.eval())
valid_set_x = theano.shared(valid_set_x.eval())
test_set_x = theano.shared(test_set_x.eval())
In [13]:
# Step 1. Declare Theano variables
x = T.fmatrix()
y = T.ivector()
# size of our image patches, and number of output units (classes)
W = theano.shared(value=np.zeros((28*28, 10),
dtype=theano.config.floatX),
name='W',
borrow=True
)
b = theano.shared(value=np.zeros((10,), # number of output units (classes)
dtype=theano.config.floatX),
name='b',
borrow=True
)
In [14]:
# Step 2. Construct Theano expression graph
#likelihood to make predictions
p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)
#conversion of the class probabilities to class labels
y_pred = T.argmax(p_y_given_x, axis=1)
#classification error
error = T.mean(T.neq(y_pred, y))
#the negative log likelihood as our cost function to optimize
cost = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
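A quick aside on the indexing trick in the cost above (a minimal NumPy sketch with made-up numbers): the index pair [T.arange(y.shape[0]), y] picks, for each sample, the log-probability assigned to that sample's correct class:
# NumPy equivalent of log(p_y_given_x)[arange(n), y], for 3 toy samples and 4 classes
log_p = np.log(np.array([[0.7, 0.1, 0.1, 0.1],
                         [0.2, 0.5, 0.2, 0.1],
                         [0.25, 0.25, 0.25, 0.25]]))
y_toy = np.array([0, 1, 3])             # correct class of each sample
picked = log_p[np.arange(3), y_toy]     # log-probability of the correct class, per sample
print picked
print -picked.mean()                    # the negative log likelihood, i.e. our cost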
In [15]:
g_W = T.grad(cost=cost, wrt=W)
g_b = T.grad(cost=cost, wrt=b)
learning_rate = 0.1
updates = [(W, W - learning_rate * g_W),
(b, b - learning_rate * g_b)]
In [16]:
# Step 3. Compile expressions to functions
train = theano.function(inputs=[x, y],
outputs=cost,
updates=updates)
validate = theano.function(inputs=[x, y],
outputs=error)
In [17]:
# Step 4. Perform computation
train_monitor = []
val_monitor = []
n_epochs = 10
start_time = time.clock()
for epoch in range(n_epochs):
    loss = train(train_set_x.eval(), train_set_y.eval())
    train_monitor.append(validate(train_set_x.eval(), train_set_y.eval()))
    val_monitor.append(validate(valid_set_x.eval(), valid_set_y.eval()))
    if epoch % 2 == 0:
        print "Iteration: ", epoch
        print "Training error, validation error: ", train_monitor[-1], val_monitor[-1]
end_time = time.clock()
print 'The code ran for %f seconds' % ((end_time - start_time))
In [18]:
plt.plot(train_monitor, c='r')
plt.plot(val_monitor, c='b')
plt.xlabel('Number of Epochs')
plt.ylabel('Misclassification')
plt.show()
In [22]:
# optimize run time
# don't bother understanding this: we do the evaluation once and save the result
# in ordinary (non-shared) variables that we can call our functions with.
# Since we already know the number of epochs we are going to train for,
# we could also fix the size of our monitor variables beforehand and then
# fill them during training (see the short sketch after this cell).
W.set_value(np.zeros((28*28, 10),
dtype=theano.config.floatX))
b.set_value(np.zeros((10,),
dtype=theano.config.floatX))
train_monitor = []
val_monitor = []
n_epochs = 100
trainx = train_set_x.eval()
trainy = train_set_y.eval()
valx = valid_set_x.eval()
valy = valid_set_y.eval()
testx = test_set_x.eval()
testy = test_set_y.eval()
start_time = time.clock()
for epoch in range(n_epochs):
    loss = train(trainx, trainy)
    train_monitor.append(validate(trainx, trainy))
    val_monitor.append(validate(valx, valy))
    if epoch % 10 == 0:
        print "Iteration: ", epoch
        print "Training error, validation error: ", train_monitor[-1], val_monitor[-1]
end_time = time.clock()
print 'The code ran for %f seconds' % ((end_time - start_time))
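As mentioned in the comment above, here is a minimal sketch of pre-allocating the monitor arrays instead of appending to lists. It reuses n_epochs, trainx/trainy, valx/valy and the train/validate functions from the cell above (and, for brevity, does not reset W and b):
# pre-allocate the monitors since the number of epochs is known in advance
train_monitor = np.zeros(n_epochs, dtype=theano.config.floatX)
val_monitor = np.zeros(n_epochs, dtype=theano.config.floatX)

for epoch in range(n_epochs):
    loss = train(trainx, trainy)
    train_monitor[epoch] = validate(trainx, trainy)
    val_monitor[epoch] = validate(valx, valy)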
In [23]:
plt.plot(train_monitor, c='r')
plt.plot(val_monitor, c='b')
plt.xlabel('Number of Epochs')
plt.ylabel('Misclassification')
plt.show()
In [24]:
train = theano.function(inputs=[],
outputs=[cost, error],
updates=updates,
givens={x: train_set_x, y: train_set_y}
)
validate = theano.function(inputs=[],
outputs=error,
givens={x: valid_set_x, y: valid_set_y}
)
In [25]:
# mini-batch training: split the training set into batches of 20 examples (try 20 and then 100 epochs)
batch_size = 20
n_train_batches = train_set_x.shape[0].eval() / batch_size
index = T.iscalar()
train_batch = theano.function(inputs=[index],
outputs=cost,
updates=updates,
givens={
x: train_set_x[index * batch_size: (index + 1) * batch_size],
y: train_set_y[index * batch_size: (index + 1) * batch_size]
}
)
In [28]:
# Step 4. Perform computation
# reset W and b to zero
W.set_value(np.zeros((28*28, 10),
dtype=theano.config.floatX))
b.set_value(np.zeros((10,),
dtype=theano.config.floatX))
train_monitor_batch = []
val_monitor_batch = []
n_epochs = 20
start_time = time.clock()
for epoch in range(n_epochs):
    for batch_index in range(n_train_batches):
        loss = train_batch(batch_index)
    val_monitor_batch.append(validate())
    train_monitor_batch.append(train())
    if epoch % 1 == 0:
        print "Iteration: ", epoch
        #print "Validation error: ", val_monitor_batch[-1]
        print "Training error, validation error: ", train_monitor_batch[-1][1], val_monitor_batch[-1]
end_time = time.clock()
print 'The code ran for %f seconds' % ((end_time - start_time))
In [29]:
#now also draws cost:
#plt.plot(train_monitor_batch, c='r')
#do same with larger batch above..
plt.plot(val_monitor_batch, c='b')
plt.xlabel('Number of Epochs')
plt.ylabel('Misclassification')
plt.show()
In [30]:
# what we had before is left commented out with #
# learning_rate = 0.1
batch_size = 200
learning_rate = np.array(0.1, dtype=theano.config.floatX)
shared_learning_rate = theano.shared(value=learning_rate, name='lr')
#updates = [(W, W - learning_rate * g_W),
# (b, b - learning_rate * g_b)]
updates = [(W, W - shared_learning_rate * g_W),
(b, b - shared_learning_rate * g_b),
(shared_learning_rate, shared_learning_rate * 0.995)]
#unchanged:
train_batch = theano.function(inputs=[index],
outputs=cost,
updates=updates,
givens={
x: train_set_x[index * batch_size: (index + 1) * batch_size],
y: train_set_y[index * batch_size: (index + 1) * batch_size]
}
)
In [31]:
# this gives a learning rate that decays after each update:
for i in xrange(4):
    print shared_learning_rate.get_value()
    train_batch(1)
We'll use the pre-defined class LogisticRegression
from the Theano deep learning tutorial as our last classification layer.
To make our model deeper we need to define a hidden layer.
Below is the definition of the HiddenLayer
class from the Theano deep learning tutorial.
Recap:
the hidden layer computes the function $s(b+Wx)$, where $s$ is our activation function, $b$ is the bias vector and $W$ is the weight matrix.
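As a quick numerical illustration (a minimal NumPy sketch with made-up shapes, separate from the model built below), this is all a hidden layer does in the forward pass:
# forward pass of a single hidden layer: s(b + Wx), here with s = tanh
import numpy as np

rng_demo = np.random.RandomState(0)
x_demo = rng_demo.rand(5, 784)            # 5 samples, 784 input features (28*28 pixels)
W_demo = rng_demo.randn(784, 100) * 0.01  # weight matrix: 784 inputs -> 100 hidden units
b_demo = np.zeros(100)                    # one bias per hidden unit

hidden_out = np.tanh(np.dot(x_demo, W_demo) + b_demo)
print hidden_out.shape                    # (5, 100): one activation vector per sample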
In [32]:
# copied from the Theano DL tutorial:
from logistic_sgd import LogisticRegression
class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        self.input = input
        if W is None:
            W_values = np.asarray(
                rng.uniform(
                    low=-np.sqrt(6. / (n_in + n_out)),
                    high=np.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4
            W = theano.shared(value=W_values, name='W', borrow=True)
        if b is None:
            b_values = np.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)
        self.W = W
        self.b = b
        lin_output = T.dot(input, self.W) + self.b
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        # parameters of the model
        self.params = [self.W, self.b]
Multi-Layer Perceptron:
In [33]:
class MLP(object):
    def __init__(self, rng, input, n_in, n_hidden, n_out):
        self.hiddenLayer = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out
        )
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood
        )
        self.errors = self.logRegressionLayer.errors
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
The input of the logistic regression layer is the output of the hidden layer;
this is how the two layers are connected in the model.
It also means we can simply use the error and loss functions of the logistic regression layer as the error and loss functions of our whole model,
since the error of the logistic regression layer is defined in terms of that layer's input.
This input is now connected to the output of the hidden layer, so computing the gradient of the loss function will unravel the whole network all the way down to the first layer's input, which is our image data.
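A minimal sketch (with made-up layer sizes and a toy cost, separate from the classifier built below) of why this works: once the layers are composed into one expression graph, T.grad can differentiate the cost with respect to any parameter, including those of the bottom layer:
# two stacked layers built as one Theano expression graph
x_sym = T.fmatrix('x_sym')
W1 = theano.shared(np.zeros((784, 100), dtype=theano.config.floatX), name='W1')
W2 = theano.shared(np.zeros((100, 10), dtype=theano.config.floatX), name='W2')

hidden = T.tanh(T.dot(x_sym, W1))            # first layer
output = T.nnet.softmax(T.dot(hidden, W2))   # second layer, fed by the first
toy_cost = -T.mean(T.log(output)[:, 0])      # some scalar cost, just for illustration

# the gradient w.r.t. W1 is computed through the whole graph; no extra work needed
g_W1 = T.grad(toy_cost, W1)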
Now we need to define a new training function along with new functions for testing and validation. We use two separate data sets for validation and testing. The reason is that deep network classifiers have a lot of parameters to tune. We use the training data for the actual gradient descent optimization, but we also have to tune the hyper-parameters, like how many layers to use, how many units per layer, which learning rate, which activation function, etc. In a way this is a whole second level of optimizing our network. This means the validation data can be seen as part of the training data, just used for tuning different parameters than the training data itself. The test data is then there to test the generalization performance of our trained network. If you are strict about it, you are only allowed to compute the test error once all parameters of the network are fixed.
In [34]:
n_hidden = 100
classifier = MLP(
rng=rng,
input=x,
n_in=28 * 28,
n_hidden=n_hidden,
n_out=10
)
cost = classifier.negative_log_likelihood(y)
n_validation_batches= valid_set_x.shape[0].eval() / batch_size
validate_model = theano.function(
inputs=[index],
outputs=classifier.errors(y),
givens={
x: valid_set_x[index * batch_size:(index + 1) * batch_size],
y: valid_set_y[index * batch_size:(index + 1) * batch_size]
}
)
test_model = theano.function(
inputs=[],
outputs=classifier.errors(y),
givens={
x: test_set_x,
y: test_set_y
}
)
gparams = [T.grad(cost, param) for param in classifier.params]
learning_rate = 0.1
updates = [(param, param - learning_rate * gparam)
for param, gparam in zip(classifier.params, gparams)
]
train_model = theano.function(
inputs=[index],
outputs=cost,
updates=updates,
givens={
x: train_set_x[index * batch_size: (index + 1) * batch_size],
y: train_set_y[index * batch_size: (index + 1) * batch_size]
}
)
In [35]:
n_epochs = 20
start_time = time.clock()
for epoch in xrange(n_epochs):
    for minibatch_index in xrange(n_train_batches):
        minibatch_avg_cost = train_model(minibatch_index)
    validation_losses = [validate_model(i) for i
                         in xrange(n_validation_batches)]
    this_validation_loss = np.mean(validation_losses)
    print('epoch %i, minibatches %i, validation error %f %%' %
          (
              epoch,
              n_train_batches,
              this_validation_loss * 100.
          )
          )
end_time = time.clock()
print 'The code ran for %f seconds' % ((end_time - start_time))
print "Test error is %f %%" % (test_model() * 100)
So what's happening in the hidden layer?
Let's visualize it:
In [36]:
filters = tile_raster_images(classifier.hiddenLayer.W.T.eval(), img_shape=(28,28), tile_shape=(10,10), tile_spacing=(0, 0),
scale_rows_to_unit_interval=True,
output_pixel_vals=True)
plt.imshow(filters)
plt.show()
Visualize the activations:
In [37]:
hidden_activation = classifier.hiddenLayer.output
compute_activations = theano.function(inputs=[],
outputs=hidden_activation,
givens={x: valid_set_x})
activation_matrix = compute_activations()
plt.imshow(activation_matrix[:100,:])
plt.xlabel("hidden unit")
plt.ylabel("samples")
plt.show()
According to the universal approximation theorem, a single-hidden-layer neural network with a linear output unit can approximate any continuous function arbitrarily well, given enough hidden units.
The result holds for sigmoid, tanh and many other hidden-layer activation functions. While this is a great result and makes it sound like we don't really need to bother with more layers, in practice it can be very hard to find the right local minimum of such a network, and the theorem doesn't tell us how many hidden units are enough.
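Roughly stated, the theorem says that functions of the following form are dense in the space of continuous functions on a compact set, where $s$ is the hidden activation, $N$ the number of hidden units, and $v_i$, $w_i$, $b_i$ the output weights, input weights and biases:
$F(x) = \sum_{i=1}^{N} v_i \, s(w_i^{T} x + b_i)$
This is exactly a single hidden layer followed by a linear output unit, as described above.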
It is computationally more feasible to stack moderately sized hidden layers on top of each other than to try to optimize one gigantic hidden layer. So finally, here is the generalization of our MLP
class to an arbitrary number of hidden layers:
In [38]:
class MLP_deep(object):
    def __init__(self, rng, input, n_in, n_hidden, n_out):
        self.hiddenLayers = []
        self.params = []
        next_input = input
        next_n_in = n_in
        for n_h in n_hidden:
            hl = HiddenLayer(
                rng=rng,
                input=next_input,
                n_in=next_n_in,
                n_out=n_h,
                activation=T.tanh
            )
            next_input = hl.output
            next_n_in = n_h
            self.hiddenLayers.append(hl)
            self.params += hl.params
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayers[-1].output,
            n_in=n_hidden[-1],
            n_out=n_out
        )
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
        self.errors = self.logRegressionLayer.errors
        self.params += self.logRegressionLayer.params
Again we connect the layers by specifying the output of the lower layer as the input of the new layer on top:
In [39]:
# 2 hidden layers, one with 200 units and the other with 100
n_hidden = [200,100]
classifier = MLP_deep(
rng=rng,
input=x,
n_in=28 * 28,
n_hidden=n_hidden,
n_out=10
)
cost = classifier.negative_log_likelihood(y)
n_validation_batches= valid_set_x.shape[0].eval() / batch_size
validate_model = theano.function(
inputs=[index],
outputs=classifier.errors(y),
givens={
x: valid_set_x[index * batch_size:(index + 1) * batch_size],
y: valid_set_y[index * batch_size:(index + 1) * batch_size]
}
)
test_model = theano.function(
inputs=[],
outputs=classifier.errors(y),
givens={
x: test_set_x,
y: test_set_y
}
)
gparams = [T.grad(cost, param) for param in classifier.params]
learning_rate = 0.1
updates = [(param, param - learning_rate * gparam)
for param, gparam in zip(classifier.params, gparams)
]
train_model = theano.function(
inputs=[index],
outputs=cost,
updates=updates,
givens={
x: train_set_x[index * batch_size: (index + 1) * batch_size],
y: train_set_y[index * batch_size: (index + 1) * batch_size]
}
)
In [40]:
n_epochs = 20
start_time = time.clock()
val_monitor_batch = []
for epoch in xrange(n_epochs):
    for minibatch_index in xrange(n_train_batches):
        minibatch_avg_cost = train_model(minibatch_index)
    validation_losses = [validate_model(i) for i
                         in xrange(n_validation_batches)]
    this_validation_loss = np.mean(validation_losses)
    val_monitor_batch.append(this_validation_loss)
    print('epoch %i, minibatches %i, validation error %f %%' %
          (
              epoch,
              n_train_batches,
              this_validation_loss * 100.
          )
          )
end_time = time.clock()
print 'The code ran for %f seconds' % ((end_time - start_time))
print "Test error is %f %%" % (test_model() * 100)
In [41]:
filters = tile_raster_images(classifier.hiddenLayers[0].W.T.eval(), img_shape=(28,28), tile_shape=(10,10), tile_spacing=(0, 0),
scale_rows_to_unit_interval=True,
output_pixel_vals=True)
plt.imshow(filters)
plt.show()
In [42]:
# if we have time left, let's play with the DNN...
In [43]:
#now also draws cost:
#plt.plot(train_monitor_batch, c='r')
#do same with larger batch above..
plt.plot(val_monitor_batch, c='b')
plt.xlabel('Number of Epochs')
plt.ylabel('Misclassification')
plt.show()
In [ ]: