Seminar 5: Deep Networks

Run the code and read the text boxes carefully!

In [ ]:
import numpy as np
import theano
import theano.tensor as T
import lasagne
import cPickle as pickle
import os
import matplotlib.pyplot as plt
%matplotlib inline
import scipy
from scipy.misc import imread, imsave, imresize
from lasagne.utils import floatX

In [ ]:
from lasagne.layers import InputLayer
from lasagne.layers import DenseLayer
from lasagne.layers import NonlinearityLayer
from lasagne.layers import DropoutLayer
from lasagne.layers import Pool2DLayer as PoolLayer
from lasagne.layers import Conv2DLayer as ConvLayer
from lasagne.nonlinearities import rectify, softmax

IMAGE_W = 224

#vgg19 model
def build_model():
    """Build the VGG-19 layer graph.

    Returns a dict mapping layer names ('input', 'conv1_1', ..., 'pool5',
    'fc6', 'fc6_dropout', 'fc7', 'fc7_dropout', 'fc8', 'prob') to Lasagne
    layer objects. Every layer takes the previously created layer as its
    input, so the network is a single chain ending in a softmax over
    1000 units.
    """
    # (block number, filters per convolution, number of convolutions)
    conv_blocks = (
        (1, 64, 2),
        (2, 128, 2),
        (3, 256, 4),
        (4, 512, 4),
        (5, 512, 4),
    )

    top = InputLayer((None, 3, 224, 224))
    net = {'input': top}

    # Each block is a run of 3x3 convolutions (pad=1) followed by a
    # pooling layer with pool size 2.
    for block, num_filters, depth in conv_blocks:
        for index in range(1, depth + 1):
            top = ConvLayer(top, num_filters, 3, pad=1, flip_filters=False)
            net['conv%d_%d' % (block, index)] = top
        top = PoolLayer(top, 2)
        net['pool%d' % block] = top

    # Two 4096-unit dense layers, each followed by dropout with p=0.5.
    for name in ('fc6', 'fc7'):
        top = DenseLayer(top, num_units=4096)
        net[name] = top
        top = DropoutLayer(top, p=0.5)
        net[name + '_dropout'] = top

    # fc8 is created without a nonlinearity; softmax is applied by 'prob'.
    net['fc8'] = DenseLayer(top, num_units=1000, nonlinearity=None)
    net['prob'] = NonlinearityLayer(net['fc8'], softmax)
    return net

In [ ]:
#classes' names are stored here
# Pickles are byte streams, so the file is opened in binary mode, and the
# context manager closes the handle as soon as the data has been read.
# NOTE: unpickling executes arbitrary code -- only load files you trust.
with open('classes.pkl', 'rb') as classes_file:
    classes = pickle.load(classes_file)
#for example, 10th class is ostrich:
print(classes[9])

You have to implement two functions in the cell below.

The preprocess function should take an image with shape (w, h, 3) and transform it into a tensor with shape (1, 3, 224, 224). Without this transformation, vgg19 won't be able to digest the input image. Additionally, your preprocessing function has to rearrange the channels RGB -> BGR and subtract the mean values from every channel.

In [ ]:
# Per-channel means subtracted from the image after the RGB -> BGR swap.
# NOTE(review): assumed to be listed in BGR order -- confirm against the
# source of the pretrained weights.
MEAN_VALUES = np.array([104, 117, 123])
IMAGE_W = 224

def preprocess(img):
    """Turn an image into a network input of shape (1, 3, IMAGE_W, IMAGE_W).

    img: array of shape (h, w, 3) in RGB channel order. A 2-D grayscale
        image is replicated to three channels and any channel beyond the
        third (e.g. alpha) is dropped.

    The image is resized to IMAGE_W x IMAGE_W if necessary, its channels
    are reversed (RGB -> BGR), MEAN_VALUES is subtracted per channel and
    the axes are rearranged to (batch, channel, row, column).

    Returns an array converted with floatX.
    """
    img = np.asarray(img)
    if img.ndim == 2:
        img = np.dstack([img] * 3)
    img = img[:, :, :3]
    if img.shape[:2] != (IMAGE_W, IMAGE_W):
        # imresize returns an 8-bit image, so it is applied only when the
        # size really differs; correctly sized inputs keep their values.
        img = imresize(img, (IMAGE_W, IMAGE_W))
    # Reverse the channel axis, then subtract the means (broadcast over
    # the last axis).
    bgr = img[:, :, ::-1] - MEAN_VALUES
    # (row, column, channel) -> (1, channel, row, column)
    return floatX(bgr.transpose((2, 0, 1))[np.newaxis])

def deprocess(img):
    """Invert preprocess (except for the resize step).

    img: array of shape (1, 3, h, w) or (3, h, w), as produced by
        preprocess.

    Returns a new float64 array of shape (h, w, 3) in RGB channel order
    with MEAN_VALUES added back. Values are neither clipped nor rounded.
    """
    chw = np.asarray(img, dtype='float64')
    if chw.ndim == 4:
        chw = chw[0]
    bgr = chw.transpose((1, 2, 0)) + MEAN_VALUES
    # copy() returns a contiguous array that callers may modify in place.
    return bgr[:, :, ::-1].copy()

img = np.random.rand(IMAGE_W, IMAGE_W, 3)

print(np.linalg.norm(deprocess(preprocess(img)) - img))

If your implementation is correct, the number above will be small, because the deprocess function is the inverse of the preprocess function.

In [ ]:
#load model weights
#vgg19.npz is available for download at
# build_model() returns a dict of named layers; the parameter list read from
# the archive is pushed into those layers at the end of this cell.
net = build_model()
params = np.load('vgg19.npz')['params']
# Every entry from index 32 onward is replaced by its transpose.
# NOTE(review): presumably the first 32 entries are the W/b pairs of the 16
# convolutional layers and the remaining ones belong to fc6-fc8, stored
# transposed relative to the layout DenseLayer expects -- confirm against
# the weights file.
for i in range(32,len(params)):
    params[i] = params[i].T
lasagne.layers.set_all_param_values(net.values(), params)

In [ ]:
# Symbolic 4-D input that feeds the network.
input_image = T.tensor4('input')
# Output expression of the 'prob' layer. get_output is called without the
# "deterministic" keyword, so the dropout layers of the model stay active.
output = lasagne.layers.get_output(net['prob'], inputs=input_image)
# Compiled function: batch of preprocessed images -> 'prob' layer output.
prob = theano.function(inputs=[input_image], outputs=output)

In the cell below, you can test your preprocessing function on some sample images. If it is implemented correctly, albatross.jpg will be classified as albatross with 99.9% certainty, and for the other pictures the network will produce mostly meaningful results.

You may notice that the network output varies from run to run, because the dropout layers are stochastic. This behaviour can be suppressed with the help of the "deterministic" keyword of the get_output function in the cell above.

In [ ]:
# Classify one sample image and list the five most probable classes.
img = imread('sample_images/albatross.jpg')

p = prob(preprocess(img))

# Indices of the five largest probabilities, highest first.
labels = np.argsort(p.ravel())[::-1][:5]
print('top-5 classes are:')
for class_idx in labels:
    # Class names may list several comma-separated synonyms; show the first.
    short_name = classes[class_idx].split(',')[0]
    print('%3f\t%s' % (p.ravel()[class_idx], short_name))

Now, use vgg19 network and your knowledge of machine learning to classify cats and dogs!


catsvsdogs/val/ validation images

catsvsdogs/val_labels.pickle labels for validation images, sorted by filename

catsvsdogs/test/ test images

You have to implement a classification algorithm, tune it on the validation images, and save the output of your algorithm on the test images in the form of a pickled file, as shown below. Your results, as well as this notebook, have to be attached to your letter to

I expect classification accuracy >95%, or >90% at least

Cheating is not allowed

In [ ]:
def classify(img):
    """Baseline classifier: label an image 'cat' or 'dog' by a coin flip.

    img: the image to classify; this placeholder ignores its content.

    Returns the string 'cat' or 'dog', each with probability 0.5.
    """
    if np.random.rand() > 0.5:
        return 'cat'
    # Must sit outside the `if`: nested behind the first return it would be
    # unreachable and the function would return None on this path.
    return 'dog'

In [ ]:
# Classify every test image, in sorted filename order, and pickle the
# resulting list of labels.
path = 'catsvsdogs/test/'
files = sorted(os.listdir(path))

labels = []

for f in files:
    img = imread(os.path.join(path, f))
    label = classify(img)
    # Collect the prediction; without this the pickled list stays empty.
    labels.append(label)

# The context manager flushes and closes the output file.
with open('test_labels.pickle', 'wb') as labels_file:
    pickle.dump(labels, labels_file)


It is easy to visualize the weights of the first convolutional layer:

In [ ]:
# Copy the conv1_1 filters and rescale them to [0, 1] so they can be shown
# as images.
w = net['conv1_1'].W.eval().copy()
w -= w.min()
w /= w.max()

# One subplot per filter on an 8 x 8 grid; without a subplot call every
# filter would be drawn over the previous one on the same axes.
grid_rows, grid_cols = 8, 8
plt.figure(figsize=(10, 10))
for n in range(min(w.shape[0], grid_rows * grid_cols)):
    plt.subplot(grid_rows, grid_cols, n + 1)
    plt.axis('off')
    # (channel, row, column) -> (row, column, channel) for imshow
    plt.imshow(w[n,:,:,:].transpose((1,2,0)), interpolation='none')

On higher layers, filters have more than 3 channels, so it is impossible to visualize them directly. However, if we want to understand something about the features on higher layers, it is possible to visualize them via optimization of the input image.

Namely, we can solve the following problem

$$J=\underset{I}{\mathrm{argmax}} \left( n^i_{xyc}(I) \right)$$

where $n^i_{xyc}$ is the activation of the neuron on the $i$'th layer at position $x$,$y$,$c$ given the input image $I$. Basically, $J$ is the answer to the question "what is our neuron looking for?"

In [ ]:
# The image being optimised lives in a shared variable, so the compiled
# functions below take no arguments and always read its current value.
generated_image = theano.shared(floatX(np.zeros((1, 3, IMAGE_W, IMAGE_W))))

# Symbolic output of every layer for that image, keyed by layer name.
# NOTE(review): get_output is called without deterministic=True, so outputs
# of layers behind the dropout layers would be stochastic.
layer_outputs = lasagne.layers.get_output(net.values(), generated_image)
gen_features = dict(zip(net.keys(), layer_outputs))

# Neuron to visualise: channel `c` at the spatial centre of `layer_name`.
layer_name = 'pool1'
c = 0
blob_width = gen_features[layer_name].shape[2]
# Floor division: `/` on integer Theano scalars depends on
# theano.config.int_division, while `//` is always integer division.
x = blob_width // 2
y = blob_width // 2
# Heavily weighted squared distance between the chosen activation and 10.
activation_loss = 1e10*(1e1 - gen_features[layer_name][0, c, x, y])**2

# Mean absolute difference between neighbouring pixels along both spatial
# axes; penalises high-frequency noise in the generated image.
tv_loss = T.mean(T.abs_(generated_image[:,:,1:,1:] - generated_image[:,:,:-1,1:]) +
                 T.abs_(generated_image[:,:,1:,1:] - generated_image[:,:,1:,:-1]))

loss = activation_loss + 1.0 * tv_loss

grad = T.grad(loss, generated_image)

f_loss = theano.function([], loss)
f_grad = theano.function([], grad)

# Helper functions to interface with scipy.optimize
def eval_loss(x0):
    """Return the loss for the flat candidate image *x0*.

    x0: array with 3 * IMAGE_W * IMAGE_W elements, as passed by the
        optimiser.

    The candidate is written into the shared variable `generated_image`
    (a side effect), because `f_loss` takes no arguments and reads the
    image from there.

    Returns the loss converted to float64.
    """
    x0 = floatX(x0.reshape((1, 3, IMAGE_W, IMAGE_W)))
    # Without this the loss would not depend on x0 at all.
    generated_image.set_value(x0)
    return f_loss().astype('float64')

def eval_grad(x0):
    """Return the gradient of the loss at the flat candidate image *x0*.

    x0: array with 3 * IMAGE_W * IMAGE_W elements, as passed by the
        optimiser.

    The candidate is written into the shared variable `generated_image`
    (a side effect), because `f_grad` takes no arguments and reads the
    image from there.

    Returns the gradient as a flat float64 array.
    """
    x0 = floatX(x0.reshape((1, 3, IMAGE_W, IMAGE_W)))
    # Without this the gradient would be evaluated at a stale image.
    generated_image.set_value(x0)
    return np.array(f_grad()).flatten().astype('float64')

In [ ]:
#run input image optimization via scipy.optimize.fmin_l_bfgs_b
# A bare `import scipy` does not guarantee that the optimize subpackage is
# loaded, so it is imported explicitly here.
import scipy.optimize

# Start from an all-zero image.
generated_image.set_value(floatX(np.zeros((1, 3, IMAGE_W, IMAGE_W))))
x0 = generated_image.get_value().astype('float64')
# fmin_l_bfgs_b returns (x, f, d); maxfun caps the number of evaluations.
status = scipy.optimize.fmin_l_bfgs_b(eval_loss, x0.flatten(), fprime=eval_grad, maxfun=20)
# Use the minimiser returned by the optimiser: the shared variable only
# holds the point of the most recent evaluation, which can be a line-search
# trial rather than the accepted solution.
x0 = status[0].reshape((1, 3, IMAGE_W, IMAGE_W)).astype('float64')
generated_image.set_value(floatX(x0))

If your deprocess function is implemented correctly, you'll see what the neuron on the first pooling layer is looking for. The result should look like a Gabor filter, similar to the ones found in the first layer of networks with large filters, such as AlexNet.

In [ ]:
#show the results
# Progressively tighter crops around the image centre, shown side by side.
# The centre comes from IMAGE_W, the side length of the generated image.
center = IMAGE_W // 2
crop_half_widths = [112, 64, 32, 16, 8]
full_pic = deprocess(x0)
plt.figure(figsize=(15, 3))
for k, d in enumerate(crop_half_widths):
    # Float copy, so the in-place normalisation below neither leaks into
    # the other crops nor fails on an integer image.
    pic = np.array(full_pic[center-d:center+d, center-d:center+d, :], dtype='float64')
    # Stretch the crop to [0, 1]; a constant crop is left at zero.
    pic -= pic.min()
    peak = pic.max()
    if peak > 0:
        pic /= peak
    plt.subplot(1, len(crop_half_widths), k + 1)
    plt.imshow(pic, interpolation='none')

Optional problem: Adjust the code above to work with neurons on fc8 layer.

fc8 neurons are wired to the output classes, so maximizing a neuron's value will produce an image which contains as much of the given class (from the point of view of the neural network) as possible.

Examples of such images are shown at:

In [ ]: