Deep Learning: Dogs vs Cats Analysis


In [ ]:
%matplotlib inline
import os
import sys
import math
import zipfile
from glob import glob
import numpy as np
from scipy import ndimage
import utils; reload(utils)
from utils import *

from keras.models import Sequential
from keras.layers import Lambda, Dense, Flatten, Dropout
from keras.layers import Convolution2D, MaxPooling2D, BatchNormalization
from keras.optimizers import RMSprop
from keras.preprocessing import image
from keras import backend as K
from matplotlib import pyplot as plt

Create Folder Structure


In [ ]:
%pwd

In [ ]:
#Allow relative imports to directories above this directory
sys.path.insert(1, os.path.join(sys.path[0], '..'))

In [ ]:
zip_ref = zipfile.ZipFile('train.zip', 'r')
zip_ref.extractall('.')
zip_ref.close()

In [ ]:
zip_ref = zipfile.ZipFile('test.zip', 'r')
zip_ref.extractall('.')
zip_ref.close()

In [ ]:
#Create references to important directories we will use over and over
current_dir = os.getcwd()
DATA_HOME_DIR = current_dir

In [ ]:
%cd $DATA_HOME_DIR

In [ ]:
#Create directories
os.mkdir('valid')
os.mkdir('files')
os.mkdir('models')
os.mkdir('sample')
os.mkdir('sample/train')
os.mkdir('sample/valid')
os.mkdir('sample/files')
os.mkdir('sample/models')
os.mkdir('sample/test')
os.mkdir('sample/test/unknown')

In [ ]:
%cd $DATA_HOME_DIR/train

In [ ]:
# We move a certain number of files from the train to the valid directory.
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(2000): os.rename(shuf[i], DATA_HOME_DIR+'/valid/' + shuf[i])

In [ ]:
from shutil import copyfile

In [ ]:
# We copy a certain number of files from the train to the sample/train directory.
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(200): copyfile(shuf[i], DATA_HOME_DIR+'/sample/train/' + shuf[i])

In [ ]:
%cd $DATA_HOME_DIR/valid

In [ ]:
# We copy a certain number of files from the valid to the sample/valid directory.
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(50): copyfile(shuf[i], DATA_HOME_DIR+'/sample/valid/' + shuf[i])

In [ ]:
%cd $DATA_HOME_DIR/test

In [ ]:
# We copy a certain number of files from the test to the sample/test directory.
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(200): copyfile(shuf[i], DATA_HOME_DIR+'/sample/test/' + shuf[i])

In [ ]:
#Divide cat/dog images into separate directories.
#os.rename does not expand wildcards, so we glob and move the files one by one.

%cd $DATA_HOME_DIR/sample/train
os.mkdir('cats')
os.mkdir('dogs')
for f in glob('cat.*.jpg'): os.rename(f, 'cats/' + f)
for f in glob('dog.*.jpg'): os.rename(f, 'dogs/' + f)

%cd $DATA_HOME_DIR/sample/valid
os.mkdir('cats')
os.mkdir('dogs')
for f in glob('cat.*.jpg'): os.rename(f, 'cats/' + f)
for f in glob('dog.*.jpg'): os.rename(f, 'dogs/' + f)

%cd $DATA_HOME_DIR/valid
os.mkdir('cats')
os.mkdir('dogs')
for f in glob('cat.*.jpg'): os.rename(f, 'cats/' + f)
for f in glob('dog.*.jpg'): os.rename(f, 'dogs/' + f)

%cd $DATA_HOME_DIR/train
os.mkdir('cats')
os.mkdir('dogs')
for f in glob('cat.*.jpg'): os.rename(f, 'cats/' + f)
for f in glob('dog.*.jpg'): os.rename(f, 'dogs/' + f)

In [ ]:
# Create single 'unknown' class for test set
%cd $DATA_HOME_DIR/test
os.mkdir('unknown')
for f in glob('*.jpg'): os.rename(f, 'unknown/' + f)

In [ ]:
# Create single 'unknown' class for test set
%cd $DATA_HOME_DIR/sample/test
for f in glob('*.jpg'): os.rename(f, 'unknown/' + f)

In [ ]:
%cd $DATA_HOME_DIR

Things to keep in mind (Troubleshooting)

  1. Always train with verbose=2. Otherwise the notebook may crash.
  2. Monitor the RAM while running the cells. It can fill up quite fast; if it does, wait until it's freed up.
  3. In theory, running import gc; gc.collect() invokes the garbage collector. In practice it doesn't make much difference.
  4. "Memory Error" means you have filled the computer's RAM. Try restarting Jupyter.
  5. "Cannot allocate..." means you have filled the GPU's VRAM. Try restarting Jupyter.
  6. Don't use a batch_size bigger than 4 if you have 8 GB of RAM.
  7. If you set shuffle=True in gen.flow_from_directory when loading the training batches, you may get weird results in the "Removing Dropout" section.
  8. If you disable all Theano optimizations to save memory, you may hit exceptions like: Cuda error 'unspecified launch failure'.
  9. If you switch between optimizers like Adam and RMSprop, you may get weird results. Always use the same one.
  10. IMPORTANT: If you get an accuracy near 0.500 on both the training and validation sets, try reducing the learning rate, e.g. to 0.00001 (see the sketch below).
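
Point 10 in practice: recompiling with a much smaller learning rate is usually enough. A minimal sketch, assuming a compiled Keras model named model:


In [ ]:
# Sketch for point 10: accuracy stuck near 0.5 usually means the learning
# rate is too high, so recompile with a much smaller one.
model.compile(optimizer=RMSprop(lr=0.00001, rho=0.7),
              loss='categorical_crossentropy', metrics=['accuracy'])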

Run the following lines in order to set up the Environment


In [68]:
# We set the "seed" so the results are a bit more predictable.
np.random.seed(1)

In [69]:
# Type 'sample/' if you want to work on a smaller dataset.
path = ''
# Depending on your GPU you should change this. For a GTX 970 this is a good value. 
batch_size = 4

In [70]:
# This is the timestamp that we are going to use when saving files.
timestamp = '102714012017'
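
If you'd rather generate the timestamp than hard-code it, a sketch assuming the HHMMDDMMYYYY pattern used above:


In [ ]:
# Sketch: build the timestamp from the current time, matching the
# HHMMDDMMYYYY format of the hard-coded value above (an assumption).
import time
timestamp = time.strftime('%H%M%d%m%Y')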

In [71]:
# Define some useful paths to save files (e.g weights)
files_path = path + 'files/'
models_path = path + 'models/'

In [72]:
def load_batches(path, shuffle=[False, False, True], augmentation=False):
    """
    Load the batches we'll use in our calculations.
    `shuffle` contains the flags for [valid, test, train], in that order.
    """

    gen = image.ImageDataGenerator()
    val_batches = gen.flow_from_directory(path + 'valid', target_size=(224,224),
                    class_mode='categorical', shuffle=shuffle[0], batch_size=batch_size)
    test_batches = gen.flow_from_directory(path + 'test', target_size=(224,224),
                    class_mode='categorical', shuffle=shuffle[1], batch_size=batch_size)
    
    # We only want Data augmentation for the training set.
    if augmentation:
        gen = image.ImageDataGenerator(rotation_range=20, width_shift_range=0.1, shear_range=0.05,
                                       height_shift_range=0.1, zoom_range=0.1, horizontal_flip=True)
    train_batches = gen.flow_from_directory(path + 'train', target_size=(224,224),
        class_mode='categorical', shuffle=shuffle[2], batch_size=batch_size)

    return train_batches, val_batches, test_batches

In [73]:
def finetune(model):
    """
    Remove the last layer (usually Dense) and replace it with one that fits our data.
    This is useful when using a pre-trained model like VGG.
    NB: relies on the global train_batches to know the number of classes.
    """
    model.pop()
    for layer in model.layers: layer.trainable=False
    model.add(Dense(train_batches.nb_class, activation='softmax'))
    model.compile(optimizer=RMSprop(lr=0.01, rho=0.7),
              loss='categorical_crossentropy', metrics=['accuracy'])

In [74]:
def backpropagation(model):
    """
    "Backpropagation" here means training not only the last Dense layer but also
    some previous ones. Note that we still don't train the Convolutional layers.
    """
    layers = model.layers
    for layer in layers: layer.trainable=False
    # Get the index of the first dense layer...
    first_dense_idx = [index for index,layer in enumerate(layers) if type(layer) is Dense][0]
    # ...and set this and all subsequent layers to trainable
    for layer in layers[first_dense_idx:]: layer.trainable=True
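
After calling backpropagation() you can sanity-check which layers will actually be updated. show_trainable is a hypothetical helper, not part of utils.py:


In [ ]:
# Hypothetical helper: print each layer's index, type and trainable flag.
def show_trainable(model):
    for i, layer in enumerate(model.layers):
        print i, type(layer).__name__, layer.trainable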

In [75]:
def save_weights(model, path, name, timestamp):
    print 'Saving weights: {}.h5'.format(path + name + '_' + timestamp)
    model.save_weights(path + '{}_{}.h5'.format(name, timestamp))

In [76]:
def load_weights(model, filepath):
    print 'Loading weights: {}'.format(filepath)
    model.load_weights(filepath)

In [77]:
def train_model(model, train_batches, val_batches, rules, name, timestamp):
    """
    `rules` is a sequence of (learning_rate, epochs) pairs, e.g.:
        (
            (0.01, 3),
            (0.1, 2),
            ...
        )
    For each pair we recompile with that learning rate, train for the given
    number of epochs, and save the weights after every epoch.
    """
    for lr, epochs in rules:
        model.compile(optimizer=RMSprop(lr=lr, rho=0.7),
              loss='categorical_crossentropy', metrics=['accuracy'])

        for i in range(epochs):
            print 'Lr: {}, Epoch: {}'.format(lr, i + 1)
            model.fit_generator(train_batches, samples_per_epoch=train_batches.nb_sample, verbose=2,
                               nb_epoch=1, validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

            save_weights(model, files_path, '{}_lr{}_epoch{}'.format(
                    name, lr, i+1), timestamp)

In [78]:
def split_conv_fc(model):
    """
    Split Convolutional and Dense Layers.
    """
    layers = model.layers
    last_conv_idx = [index for index,layer in enumerate(layers) 
                     if type(layer) is Convolution2D][-1]
    conv_layers = layers[:last_conv_idx+1]
    fc_layers = layers[last_conv_idx+1:]
    return conv_layers, fc_layers

In [79]:
# Copy the weights from the pre-trained model.
# NB: Since we're removing Dropout(0.5), we halve the weights so the expected
# activations flowing into the next layer stay the same.
def proc_wgts(layer): return [o/2 for o in layer.get_weights()]

In [80]:
def get_fc_model(conv_layers, fc_layers):
    model = Sequential([
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
        Dense(4096, activation='relu'),
        Dropout(0.),
        Dense(4096, activation='relu'),
        Dropout(0.),
        Dense(2, activation='softmax')
        ])

    for l1,l2 in zip(model.layers, fc_layers): l1.set_weights(proc_wgts(l2))
    
    model.compile(optimizer=RMSprop(lr=0.00001, rho=0.7), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

Simple Model (VGG16)

This version has the most basic possible configuration using the VGG16 pre-trained network. Please try to understand everything before moving forward.

What do we want to do?

  1. Create simple model
  2. Load batches (train, valid and test)
  3. Finetune the model (replace the last dense layer by one that has only two outputs in this case)
  4. Train the model

In [ ]:
name = 'default_parameter_vgg16'

In [ ]:
# 1. Create simple model
vgg = Vgg16()

In [ ]:
# 2. Load batches (train, valid and test)
train_batches, val_batches, test_batches = load_batches(path)

In [ ]:
# 3. Finetune the model (replace the last dense layer by one that has only two outputs in this case)
finetune(vgg.model)

In [ ]:
# 4. Train the model
train_model(vgg.model,
            train_batches,
            val_batches,
            ((0.01, 1),),
            name + '_lastlayer',
            timestamp)

In [ ]:
save_weights(vgg.model, files_path, name, timestamp)

Data Augmentation (VGG16)

"Data Augmentation" is a technic to reduce "over-fitting", where a generator slightly modifies the images we load "on-the-fly" so the model cannot adapt too much to our training data.

What do we want to do?

  1. Create simple model
  2. Load batches (train, valid and test) with Data Augmentation (random changes to the images we load)
  3. Finetune the model (replace the last dense layer by one that has only two outputs in this case)
  4. Train the model

In [ ]:
name = 'data_augmentation_vgg16'

In [ ]:
# 1. Create simple model
vgg = Vgg16()

In [ ]:
# 2. Load batches (train, valid and test) with Data Augmentation (random changes to the images we load)
train_batches, val_batches, test_batches = load_batches(path, augmentation=True)

In [ ]:
# 3. Finetune the model (replace the last dense layer by one that has only two outputs in this case)
finetune(vgg.model)

In [ ]:
# 4. Train the model
train_model(vgg.model, train_batches, val_batches, ((0.01, 1), (0.1, 1), (0.001, 1), (0.0001, 1)), name + '_lastlayer', timestamp)

In [ ]:
save_weights(vgg.model, files_path, name, timestamp)

Backpropagation - Only Dense Layers (VGG16)

"Backpropagation" is the method of iteratively changing the weights of previous layers, not only the last one. When doing that for "Convolutional layers" we need to be very careful as it takes A LOT of memory.

What do we want to do?

  1. Create simple model
  2. Load batches (train, valid and test)
  3. Finetune the model (replace the last dense layer by one that has only two outputs in this case)
  4. Train only the last layer of the model first. This improves the overall accuracy.
  5. Set ONLY the dense layers as "trainable".
  6. Train all the dense layers. Keep in mind that the learning rate MUST be really small here, as we assume the pre-trained model is already relatively good.

In [ ]:
name = 'backpropagation_vgg16'

In [ ]:
# 1. Create simple model
vgg = Vgg16()

In [ ]:
# 2. Load batches (train, valid and test)
train_batches, val_batches, test_batches = load_batches(path)

In [ ]:
# 3. Finetune the model (replace the last dense layer by one that has only two outputs in this case)
finetune(vgg.model)

In [ ]:
# 4. Train only the last layer of the model first. This improves the overall accuracy.
train_model(vgg.model,
            train_batches,
            val_batches,
            ((0.01, 1),),
            name + '_lastlayer',
            timestamp)

In [ ]:
# 5. Set ONLY the dense layers as "trainable".
backpropagation(vgg.model)

In [ ]:
# 6. Train all the dense layers. Keep in mind that the learning rate MUST be really small here, as we assume the pre-trained model is already relatively good.
train_model(vgg.model, train_batches, val_batches, ((0.0001, 1), (0.00001, 1)), name + '_denselayers', timestamp)

In [ ]:
save_weights(vgg.model, files_path, name, timestamp)

Data Augmentation + Backpropagation (VGG16)

Here we try the two methods together. Let's see if this improves the accuracy.

What do we want to do?

  1. Create simple model
  2. Load batches (train, valid and test) with Data Augmentation (random changes to the images we load)
  3. Finetune the model (replace the last dense layer by one that has only two outputs in this case)
  4. Train only the last layer of the model first. This improves the overall accuracy.
  5. Set ONLY the dense layers as "trainable".
  6. Train all the dense layers. Keep in mind that the learning rate MUST be really small here, as we assume the pre-trained model is already relatively good.

In [ ]:
name = 'data_augmentation_backpropagation_vgg16'

In [ ]:
# 1. Create simple model
vgg = Vgg16()

In [ ]:
# 2. Load batches (train, valid and test) with Data Augmentation (random changes to the images we load)
train_batches, val_batches, test_batches = load_batches(path, augmentation=True)

In [ ]:
# 3. Finetune the model (replace the last dense layer by one that has only two outputs in this case)
finetune(vgg.model)

In [ ]:
# 4. Train only the last layer of the model first. This improves the overall accuracy.
train_model(vgg.model,
            train_batches,
            val_batches,
            ((0.01, 6), (0.001, 3), (0.0001, 3)),
            name + '_lastlayer', timestamp)

In [ ]:
# 5. Set ONLY the dense layers as "trainable".
backpropagation(vgg.model)

In [ ]:
# 6. Train all the dense layers. Keep in mind that the learning rate MUST be really small here, as we assume the pre-trained model is already relatively good.
train_model(vgg.model,
            train_batches,
            val_batches,
            ((0.0001, 1), (0.00001, 1)),
            name + '_denselayers',
            timestamp)

In [ ]:
save_weights(vgg.model, files_path, name, timestamp)

Remove Dropout (VGG16)

"Dropout" is a regularization method that randomly removes a certain percent of activations from the previous layer. It's commonly used for networks that are "under-fitting", meaning that we are still throwing away useful information.

Why do we calculate the "features" first? Because we don't want to train the convolutional layers (it takes too long). By re-using the output of the convolutional layers, we only have to train a simple linear model, which is extremely fast.
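
A tiny sketch of why the weights get halved once dropout goes away (see proc_wgts above): if dropout zeroes half the activations without rescaling, the next layer sees inputs of roughly half the expected magnitude, so removing dropout doubles them.


In [ ]:
# Sketch: expected input to the next layer with and without Dropout(0.5).
rng = np.random.RandomState(0)
acts = rng.rand(10000)
mask = rng.rand(10000) > 0.5   # dropout keeps roughly half the activations
print acts.mean(), (acts * mask).mean()   # the masked mean is about half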

What do we want to do?

  1. Create model, finetune it and load good weights that we calculated before.
    • And load batches (train, valid and test)
  2. Split the layers into two groups: Convolutional layers and Dense layers.
  3. Create a model with only the Convolutional layers.
  4. Calculate the predictions of our train and valid data using this new model.
    • We'll have something like: [0, 0, 0.12, 0.45, 0,...]
    • The shape of the resulting array will be: (nb_samples, 512, 14, 14). This is a list of filters, so if we have 2000 images, we'll have for each image: 512 filters, where each filter is an array of 14x14.
    • This will be the input of the next linear model.
  5. Get "real" classes for the data using train_batches.classes. e.g 1 0 0 0 1 0 1 0 0 (each number is the class of an image)
  6. Transform those classes in OneHot format. e.g [0,0,1,0...] per each image
  7. Create a new linear model that has this array as an input.
    • Because we removed the Dropout, the layers that followed it now receive activations roughly twice as large as before.
    • To compensate, we halve the weights of those layers, replicating the effect Dropout had. e.g. for l1,l2 in zip(model.layers, fc_layers): l1.set_weights(proc_wgts(l2))
  8. We train the model

In [ ]:
name = 'remove_dropout_vgg16'

In [ ]:
# 1) Create model
vgg = Vgg16()
model = vgg.model

In [ ]:
# 1b) And load batches (train, valid and test)
train_batches, val_batches, test_batches = load_batches(path, shuffle=[False, False, False])

In [ ]:
# 1c) finetune it!
finetune(model)

In [ ]:
# 1d) Load good weights that we calculated before [This is an example, please change the path]
load_weights(model, 'files/data_augmentation_backpropagation_vgg16_lastlayer_lr0.0001_epoch2_144813012017.h5')

In [81]:
# 2) Split the layers into two groups: Convolutional layers and Dense layers (or fully connected).
layers = model.layers
last_conv_idx = [index for index,layer in enumerate(layers) 
                 if type(layer) is Convolution2D][-1]
print last_conv_idx
conv_layers = layers[:last_conv_idx+1]
fc_layers = layers[last_conv_idx+1:]


30

In [82]:
# 3) Create a model with only the Convolutional layers.
conv_model = Sequential(conv_layers)

In [ ]:
conv_model.summary()

In [ ]:
# 4) Calculate the predictions.
# The shape of the resulting array will be: (nb_samples, 512, 14, 14).
# This is a list of filters, so if we have 2000 images, we'll have for
# each image: 512 filters, where each filter is an array of 14x14.
val_features = conv_model.predict_generator(val_batches, val_batches.nb_sample)
train_features = conv_model.predict_generator(train_batches, train_batches.nb_sample)

In [ ]:
# 5) Get "real" classes for the data using train_batches.classes. e.g 1 0 0 0 1 0 1 0 0 (each number is the class of an image)
val_classes = val_batches.classes
train_classes = train_batches.classes

In [ ]:
# 6) Transform those classes into one-hot format, e.g. [0,0,1,0...] per image
val_labels = onehot(val_classes)
train_labels = onehot(train_classes)
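
onehot comes from utils.py. If your utils.py doesn't provide it, a minimal sketch using Keras' to_categorical (an assumption about what utils.py does internally):


In [ ]:
# Minimal sketch of onehot, in case utils.py does not already provide it.
from keras.utils.np_utils import to_categorical
def onehot(x): return to_categorical(x)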

In [ ]:
# Optional: Save features
save_array(models_path + 'train_convlayer_features_{}.bc'.format(timestamp), train_features)
save_array(models_path + 'valid_convlayer_features_{}.bc'.format(timestamp), val_features)

In [ ]:
# Optional: Load features [example filenames, change them to your own]
train_features = load_array(models_path + 'train_convlayer_features_144813012017_2.bc')
val_features = load_array(models_path + 'valid_convlayer_features_144813012017_2.bc')

In [ ]:
train_features.shape

In [ ]:
# Optional. Look at the shape of the input of the model that we are about to create:
conv_layers[-1].output_shape[1:]
# It should match the output shape of the last convolutional layer

In [83]:
# 7) Create a new linear model that takes these features as input.
fc_model = get_fc_model(conv_layers, fc_layers)

In [ ]:
# Optional. Look at the model we've just created:
fc_model.summary()

In [ ]:
# 8) We train the model
fc_model.fit(train_features, train_labels, nb_epoch=1, verbose=2,
          batch_size=batch_size, validation_data=(val_features, val_labels))

In [ ]:
# Optional: We save the weights
save_weights(fc_model, files_path, name + '_9813', timestamp)

In [84]:
# Optional: Load weights
load_weights(fc_model, 'models/no_dropout.h5')


Loading weights: models/no_dropout.h5


Add Data Augmentation to Dropout 0

Now that we are over-fitting, let's add Data Augmentation to the previous method.

What do we want to do?

  1. Load batches (train, valid and test)
  2. Get previous Fully connected model (linear model)
  3. Add this fully connected model to the convolutional model we created before
  4. Check that the new model is correct
  5. Train the model

In [ ]:
name = 'data_augmentation_plus_dropout0_vgg16'

In [ ]:
# 1. Load batches (train, valid and test) with Data Augmentation
train_batches, val_batches, test_batches = load_batches(path, augmentation=True)

In [ ]:
# 2. Get previous Fully connected model (linear model)
# CAREFUL! This replaces the existing weights! Comment these two lines out if you want to re-use the weights loaded above.
conv_model = Sequential(conv_layers)
fc_model = get_fc_model(conv_layers, fc_layers)

In [85]:
# 3. Add this fully connected model to the convolutional model we created before
# We need to do this because we don't want to train the convolutional layers.
#conv_model = Sequential(conv_layers)
for layer in conv_model.layers: layer.trainable = False
# Look how easy it is to connect two models together!
conv_model.add(fc_model)

In [86]:
# 4. Check that the new model is correct
conv_model.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
lambda_1 (Lambda)                (None, 3, 224, 224)   0           lambda_input_1[0][0]             
____________________________________________________________________________________________________
zeropadding2d_1 (ZeroPadding2D)  (None, 3, 226, 226)   0           lambda_1[0][0]                   
                                                                   lambda_1[0][0]                   
                                                                   lambda_1[0][0]                   
                                                                   lambda_1[0][0]                   
____________________________________________________________________________________________________
convolution2d_1 (Convolution2D)  (None, 64, 224, 224)  1792        zeropadding2d_1[0][0]            
                                                                   zeropadding2d_1[1][0]            
                                                                   zeropadding2d_1[2][0]            
                                                                   zeropadding2d_1[3][0]            
____________________________________________________________________________________________________
zeropadding2d_2 (ZeroPadding2D)  (None, 64, 226, 226)  0           convolution2d_1[0][0]            
                                                                   convolution2d_1[1][0]            
                                                                   convolution2d_1[2][0]            
                                                                   convolution2d_1[3][0]            
____________________________________________________________________________________________________
convolution2d_2 (Convolution2D)  (None, 64, 224, 224)  36928       zeropadding2d_2[0][0]            
                                                                   zeropadding2d_2[1][0]            
                                                                   zeropadding2d_2[2][0]            
                                                                   zeropadding2d_2[3][0]            
____________________________________________________________________________________________________
maxpooling2d_1 (MaxPooling2D)    (None, 64, 112, 112)  0           convolution2d_2[0][0]            
                                                                   convolution2d_2[1][0]            
                                                                   convolution2d_2[2][0]            
                                                                   convolution2d_2[3][0]            
____________________________________________________________________________________________________
zeropadding2d_3 (ZeroPadding2D)  (None, 64, 114, 114)  0           maxpooling2d_1[0][0]             
                                                                   maxpooling2d_1[1][0]             
                                                                   maxpooling2d_1[2][0]             
                                                                   maxpooling2d_1[3][0]             
____________________________________________________________________________________________________
convolution2d_3 (Convolution2D)  (None, 128, 112, 112) 73856       zeropadding2d_3[0][0]            
                                                                   zeropadding2d_3[1][0]            
                                                                   zeropadding2d_3[2][0]            
                                                                   zeropadding2d_3[3][0]            
____________________________________________________________________________________________________
zeropadding2d_4 (ZeroPadding2D)  (None, 128, 114, 114) 0           convolution2d_3[0][0]            
                                                                   convolution2d_3[1][0]            
                                                                   convolution2d_3[2][0]            
                                                                   convolution2d_3[3][0]            
____________________________________________________________________________________________________
convolution2d_4 (Convolution2D)  (None, 128, 112, 112) 147584      zeropadding2d_4[0][0]            
                                                                   zeropadding2d_4[1][0]            
                                                                   zeropadding2d_4[2][0]            
                                                                   zeropadding2d_4[3][0]            
____________________________________________________________________________________________________
maxpooling2d_2 (MaxPooling2D)    (None, 128, 56, 56)   0           convolution2d_4[0][0]            
                                                                   convolution2d_4[1][0]            
                                                                   convolution2d_4[2][0]            
                                                                   convolution2d_4[3][0]            
____________________________________________________________________________________________________
zeropadding2d_5 (ZeroPadding2D)  (None, 128, 58, 58)   0           maxpooling2d_2[0][0]             
                                                                   maxpooling2d_2[1][0]             
                                                                   maxpooling2d_2[2][0]             
                                                                   maxpooling2d_2[3][0]             
____________________________________________________________________________________________________
convolution2d_5 (Convolution2D)  (None, 256, 56, 56)   295168      zeropadding2d_5[0][0]            
                                                                   zeropadding2d_5[1][0]            
                                                                   zeropadding2d_5[2][0]            
                                                                   zeropadding2d_5[3][0]            
____________________________________________________________________________________________________
zeropadding2d_6 (ZeroPadding2D)  (None, 256, 58, 58)   0           convolution2d_5[0][0]            
                                                                   convolution2d_5[1][0]            
                                                                   convolution2d_5[2][0]            
                                                                   convolution2d_5[3][0]            
____________________________________________________________________________________________________
convolution2d_6 (Convolution2D)  (None, 256, 56, 56)   590080      zeropadding2d_6[0][0]            
                                                                   zeropadding2d_6[1][0]            
                                                                   zeropadding2d_6[2][0]            
                                                                   zeropadding2d_6[3][0]            
____________________________________________________________________________________________________
zeropadding2d_7 (ZeroPadding2D)  (None, 256, 58, 58)   0           convolution2d_6[0][0]            
                                                                   convolution2d_6[1][0]            
                                                                   convolution2d_6[2][0]            
                                                                   convolution2d_6[3][0]            
____________________________________________________________________________________________________
convolution2d_7 (Convolution2D)  (None, 256, 56, 56)   590080      zeropadding2d_7[0][0]            
                                                                   zeropadding2d_7[1][0]            
                                                                   zeropadding2d_7[2][0]            
                                                                   zeropadding2d_7[3][0]            
____________________________________________________________________________________________________
maxpooling2d_3 (MaxPooling2D)    (None, 256, 28, 28)   0           convolution2d_7[0][0]            
                                                                   convolution2d_7[1][0]            
                                                                   convolution2d_7[2][0]            
                                                                   convolution2d_7[3][0]            
____________________________________________________________________________________________________
zeropadding2d_8 (ZeroPadding2D)  (None, 256, 30, 30)   0           maxpooling2d_3[0][0]             
                                                                   maxpooling2d_3[1][0]             
                                                                   maxpooling2d_3[2][0]             
                                                                   maxpooling2d_3[3][0]             
____________________________________________________________________________________________________
convolution2d_8 (Convolution2D)  (None, 512, 28, 28)   1180160     zeropadding2d_8[0][0]            
                                                                   zeropadding2d_8[1][0]            
                                                                   zeropadding2d_8[2][0]            
                                                                   zeropadding2d_8[3][0]            
____________________________________________________________________________________________________
zeropadding2d_9 (ZeroPadding2D)  (None, 512, 30, 30)   0           convolution2d_8[0][0]            
                                                                   convolution2d_8[1][0]            
                                                                   convolution2d_8[2][0]            
                                                                   convolution2d_8[3][0]            
____________________________________________________________________________________________________
convolution2d_9 (Convolution2D)  (None, 512, 28, 28)   2359808     zeropadding2d_9[0][0]            
                                                                   zeropadding2d_9[1][0]            
                                                                   zeropadding2d_9[2][0]            
                                                                   zeropadding2d_9[3][0]            
____________________________________________________________________________________________________
zeropadding2d_10 (ZeroPadding2D) (None, 512, 30, 30)   0           convolution2d_9[0][0]            
                                                                   convolution2d_9[1][0]            
                                                                   convolution2d_9[2][0]            
                                                                   convolution2d_9[3][0]            
____________________________________________________________________________________________________
convolution2d_10 (Convolution2D) (None, 512, 28, 28)   2359808     zeropadding2d_10[0][0]           
                                                                   zeropadding2d_10[1][0]           
                                                                   zeropadding2d_10[2][0]           
                                                                   zeropadding2d_10[3][0]           
____________________________________________________________________________________________________
maxpooling2d_4 (MaxPooling2D)    (None, 512, 14, 14)   0           convolution2d_10[0][0]           
                                                                   convolution2d_10[1][0]           
                                                                   convolution2d_10[2][0]           
                                                                   convolution2d_10[3][0]           
____________________________________________________________________________________________________
zeropadding2d_11 (ZeroPadding2D) (None, 512, 16, 16)   0           maxpooling2d_4[0][0]             
                                                                   maxpooling2d_4[1][0]             
                                                                   maxpooling2d_4[2][0]             
                                                                   maxpooling2d_4[3][0]             
____________________________________________________________________________________________________
convolution2d_11 (Convolution2D) (None, 512, 14, 14)   2359808     zeropadding2d_11[0][0]           
                                                                   zeropadding2d_11[1][0]           
                                                                   zeropadding2d_11[2][0]           
                                                                   zeropadding2d_11[3][0]           
____________________________________________________________________________________________________
zeropadding2d_12 (ZeroPadding2D) (None, 512, 16, 16)   0           convolution2d_11[0][0]           
                                                                   convolution2d_11[1][0]           
                                                                   convolution2d_11[2][0]           
                                                                   convolution2d_11[3][0]           
____________________________________________________________________________________________________
convolution2d_12 (Convolution2D) (None, 512, 14, 14)   2359808     zeropadding2d_12[0][0]           
                                                                   zeropadding2d_12[1][0]           
                                                                   zeropadding2d_12[2][0]           
                                                                   zeropadding2d_12[3][0]           
____________________________________________________________________________________________________
zeropadding2d_13 (ZeroPadding2D) (None, 512, 16, 16)   0           convolution2d_12[0][0]           
                                                                   convolution2d_12[1][0]           
                                                                   convolution2d_12[2][0]           
                                                                   convolution2d_12[3][0]           
____________________________________________________________________________________________________
convolution2d_13 (Convolution2D) (None, 512, 14, 14)   2359808     zeropadding2d_13[0][0]           
                                                                   zeropadding2d_13[1][0]           
                                                                   zeropadding2d_13[2][0]           
                                                                   zeropadding2d_13[3][0]           
____________________________________________________________________________________________________
sequential_8 (Sequential)        (None, 2)             119554050   convolution2d_13[3][0]           
====================================================================================================
Total params: 134,268,738
Trainable params: 119,554,050
Non-trainable params: 14,714,688
____________________________________________________________________________________________________

In [ ]:
# 5. Train the model
train_model(conv_model,
            train_batches,
            val_batches,
            ((0.000001, 2),),
            name + '_data_augentation_to_zero_dropout',
            timestamp)

In [ ]:
# Optional: We save the weights
save_weights(conv_model, files_path, name, timestamp)

Batch Normalization

Batch normalization ("batchnorm") is a way to ensure that activations don't become too high or too low at any point in the model. Adjusting activations so they are of similar scales is called normalization. It's a must-have, as it can speed up training by up to 10x.
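
A tiny sketch of the core idea (ignoring the learned scale and shift parameters that batchnorm also applies):


In [ ]:
# Sketch: normalize one batch of activations to zero mean and unit variance.
acts = np.array([10., 200., 3000.])
print (acts - acts.mean()) / acts.std()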

What do we want to do?

  1. Check the current shape of the convolutional layers
  2. Create model with Batch Normalization
  3. Finetune it and adjust weights based on the new Dropout number
  4. Train the Batch Normalization model
  5. Create a final model based on the "convolutional layers"
  6. Set all the layers of this final model as "non-trainable"
  7. Add all the new layers from the Batch Normalization model to this final model.
  8. Set the weights of the newly added layers. When you add a layer to a model this way, the weights are not copied across, so this step is required!
  9. Train the last model

In [ ]:
name = 'batch_normalization_vgg16'

In [ ]:
# 1. Check the current shape of the convolutional layers
conv_layers[-1].output_shape[1:]

In [ ]:
def get_bn_layers(p):
    return [
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
        Dense(4096, activation='relu'),
        Dropout(p),
        BatchNormalization(),
        Dense(4096, activation='relu'),
        Dropout(p),
        BatchNormalization(),
        Dense(1000, activation='softmax')
        ]

In [ ]:
# 2. Create model with Batch Normalization
p = 0.6
bn_model = Sequential(get_bn_layers(p))

In [66]:
def proc_wgts_bn(layer, prev_p, new_p):
    scal = (1-prev_p)/(1-new_p)
    return [o*scal for o in layer.get_weights()]
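
E.g. going from p=0.3 to p=0.6 keeps 40% of the activations instead of 70%, so the weights are scaled by 0.7/0.4 = 1.75 to keep the expected activations unchanged.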

In [67]:
# 3. Finetune it and adjust weights based on the new Dropout number
for l in bn_model.layers: 
    if type(l)==Dense: l.set_weights(proc_wgts_bn(l, 0.3, 0.6))

finetune(bn_model)

In [ ]:
# 4. Train the Batch Normalization model
bn_model.fit(train_features, train_labels, nb_epoch=1, verbose=2,
             batch_size=batch_size, validation_data=(val_features, val_labels))

In [ ]:
# Optional: We save the weights
save_weights(bn_model, files_path, name, timestamp)

In [ ]:
# 5. Create a final model based on the "convolutional layers"
bn_layers = get_bn_layers(0.6)
bn_layers.pop()
bn_layers.append(Dense(2,activation='softmax'))

final_model = Sequential(conv_layers)

In [ ]:
# 6. Set all the layers of this final model as "non-trainable"
for layer in final_model.layers: layer.trainable = False

In [ ]:
# 7. Add all the new layers from the Batch Normalization model to this final model.
for layer in bn_layers: final_model.add(layer)

In [ ]:
# 8. Set the weights of the newly added layers. When you add a layer to a model this way, the weights are not copied across, so this step is required!
for l1,l2 in zip(bn_model.layers, bn_layers):
    l2.set_weights(l1.get_weights())

In [ ]:
# 9. Train the last model
train_model(final_model,
            train_batches,
            val_batches,
            ((0.000001, 1),),
            name + '_batch_normalization',
            timestamp)

In [ ]:
# Optional: We save the weights of the final model
save_weights(final_model, files_path, name + '_final', timestamp)

Viewing model prediction examples

  • A few correct labels at random
  • A few incorrect labels at random
  • The most correct labels of each class (i.e. those with the highest probability that are correct)
  • The most incorrect labels of each class (i.e. those with the highest probability that are incorrect)
  • The most uncertain labels (i.e. those with probability closest to 0.5).

In [ ]:
val_batches, probs = vgg.test(path + 'valid', batch_size = batch_size)

filenames = val_batches.filenames
expected_labels = val_batches.classes #0 or 1

#Round our predictions to 0/1 to generate labels
our_predictions = probs[:,0]
our_labels = np.round(1-our_predictions)

In [ ]:
from keras.preprocessing import image

#Helper function to plot images by index in the validation set 
#Plots is a helper function in utils.py
def plots_idx(idx, titles=None):
    plots([image.load_img(path + 'valid/' + filenames[i]) for i in idx], titles=titles)
    
#Number of images to view for each visualization task
n_view = 4

In [ ]:
#1. A few correct labels at random
correct = np.where(our_labels==expected_labels)[0]
print "Found %d correct labels" % len(correct)
idx = permutation(correct)[:n_view]
plots_idx(idx, our_predictions[idx])

In [ ]:
#2. A few incorrect labels at random
incorrect = np.where(our_labels!=expected_labels)[0]
print "Found %d incorrect labels" % len(incorrect)
idx = permutation(incorrect)[:n_view]
plots_idx(idx, our_predictions[idx])

In [ ]:
#3a. The images we were most confident were cats, and are actually cats
correct_cats = np.where((our_labels==0) & (our_labels==expected_labels))[0]
print "Found %d confident correct cat labels" % len(correct_cats)
most_correct_cats = np.argsort(our_predictions[correct_cats])[::-1][:n_view]
plots_idx(correct_cats[most_correct_cats], our_predictions[correct_cats][most_correct_cats])

In [ ]:
#3b. The images we were most confident were dogs, and are actually dogs
correct_dogs = np.where((our_labels==1) & (our_labels==expected_labels))[0]
print "Found %d confident correct dog labels" % len(correct_dogs)
most_correct_dogs = np.argsort(our_predictions[correct_dogs])[:n_view]
plots_idx(correct_dogs[most_correct_dogs], our_predictions[correct_dogs][most_correct_dogs])

In [ ]:
#4a. The images we were most confident were cats, but are actually dogs
incorrect_cats = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
print "Found %d incorrect cats" % len(incorrect_cats)
if len(incorrect_cats):
    most_incorrect_cats = np.argsort(our_predictions[incorrect_cats])[::-1][:n_view]
    plots_idx(incorrect_cats[most_incorrect_cats], our_predictions[incorrect_cats][most_incorrect_cats])

In [ ]:
#4b. The images we were most confident were dogs, but are actually cats
incorrect_dogs = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
print "Found %d incorrect dogs" % len(incorrect_dogs)
if len(incorrect_dogs):
    most_incorrect_dogs = np.argsort(our_predictions[incorrect_dogs])[:n_view]
    plots_idx(incorrect_dogs[most_incorrect_dogs], our_predictions[incorrect_dogs][most_incorrect_dogs])

In [ ]:
#5. The most uncertain labels (ie those with probability closest to 0.5).
most_uncertain = np.argsort(np.abs(our_predictions-0.5))
plots_idx(most_uncertain[:n_view], our_predictions[most_uncertain])

Viewing Data Augmentation


In [ ]:
# dim_ordering='tf' uses the TensorFlow dimension ordering,
#   which is the same order as matplotlib uses for display.
# Therefore, when loading images just for display, this is more convenient.
gen = image.ImageDataGenerator(rotation_range=20, width_shift_range=0.1, shear_range=0.05,
                               height_shift_range=0.1, zoom_range=0.1, horizontal_flip=True,dim_ordering='tf')

In [ ]:
# Create a 'batch' of a single image
img = np.expand_dims(ndimage.imread(path+'test/unknown/87.jpg'),0)
# Request the generator to create batches from this image
aug_iter = gen.flow(img)

In [ ]:
# Get eight examples of these augmented images
aug_imgs = [next(aug_iter)[0].astype(np.uint8) for i in range(8)]

In [ ]:
# The original
plt.imshow(img[0])

In [ ]:
# Augmented data
plots(aug_imgs, (20,7), 2)

In [ ]:
# Ensure that we return to theano dimension ordering
K.set_image_dim_ordering('th')

Confusion Matrix


In [ ]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(expected_labels, our_labels)

In [ ]:
plot_confusion_matrix(cm, val_batches.class_indices)

Predict Test set + create Kaggle submission file (taught in the Course)


In [ ]:
# NB: fc_model only accepts pre-computed conv features, so we predict with the
# full conv_model (conv layers + fc_model) built in the previous section.
predictions = conv_model.predict_generator(test_batches, test_batches.nb_sample)

In [ ]:
isdog = predictions[:,1]
print "Raw Predictions: " + str(isdog[:5])
print "Mid Predictions: " + str(isdog[(isdog < .6) & (isdog > .4)])
print "Edge Predictions: " + str(isdog[(isdog == 1) | (isdog == 0)])

In [ ]:
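# Clip so we never submit exactly 0 or 1: log loss punishes confident mistakes heavily.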
isdog = isdog.clip(min=0.05, max=0.95)

In [ ]:
#Extract imageIds from the filenames in our test/unknown directory 
filenames = test_batches.filenames

ids = np.array([int(f[8:f.find('.')]) for f in filenames])

In [ ]:
subm = np.stack([ids,isdog], axis=1)
subm[:5]

In [ ]:
submission_file_name = 'submission_{}_5.csv'.format(timestamp)
np.savetxt(submission_file_name, subm, fmt='%d,%.5f', header='id,label', comments='')

In [ ]:
from IPython.display import FileLink
FileLink(submission_file_name)

Alternative way to generate Submission file (it has a better score!)


In [87]:
load_weights(conv_model, 'files/data_augmentation_plus_dropout0_vgg16_data_augentation_to_zero_dropout_lr1e-05_epoch1_102714012017.h5')


Loading weights: files/data_augmentation_plus_dropout0_vgg16_data_augentation_to_zero_dropout_lr1e-05_epoch1_102714012017.h5

In [88]:
import csv

def write_submission_csv(submission_file_name, data, columns):
    """
    Write data according to the Kaggle submission format.
    """
    with open(submission_file_name, 'wb') as f:
        w = csv.writer(f)
        w.writerow(columns)
        for key in data.keys():
            w.writerow([key, data[key]])

In [89]:
gen = image.ImageDataGenerator()
test_batches = gen.flow_from_directory(path + 'test', target_size=(224,224),
                                       class_mode=None, shuffle=False, batch_size=batch_size)


Found 12500 images belonging to 1 classes.

In [90]:
predictions = conv_model.predict_generator(test_batches, test_batches.nb_sample)

In [95]:
predictions[0]
#conv_model.summary()


Out[95]:
array([ 0.,  1.], dtype=float32)

In [ ]:
import csv
d = {}
submission_file_name = 'submission_{}_5_new.csv'.format(timestamp)
for idx, filename in enumerate(test_batches.filenames):
    # We only want the ID, so strip the 'unknown/' folder prefix and the file extension.
    result = int(filename[8:-4])
    # We clip so we never submit exactly 0 or 1, but 0.05 and 0.95 instead.
    # This is required because log loss penalizes predictions that are confident and wrong.
    d[result] = predictions[idx][1].clip(min=0.05, max=0.95)
write_submission_csv(submission_file_name, d, ['id', 'label'])
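
To see why clipping matters, a quick sketch of the log loss a single wrong prediction incurs at different confidence levels:


In [ ]:
# Sketch: log loss contribution of one wrong prediction made with confidence p.
for p in (0.6, 0.95, 0.999999):
    print 'p = %f -> log loss %.2f' % (p, -np.log(1 - p))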

In [ ]:
from IPython.display import FileLink
FileLink(submission_file_name)