Transfer learning for computer vision

This notebook introduces the concept of transfer learning in computer vision. The problem of identifying objects in images is now considered to be mostly solved. This is due to several factors, among them the use of deep convolutional networks trained on large amounts of data. More strikingly, the intermediate layers learned by these networks can be used to solve new object recognition problems.

Here we demonstrate how the weights of the convolutional layers learned by the VGG-16 network, trained on the ImageNet data set, can help us to quickly build a strikingly accurate image classifier using a moderately sized data set.


In [ ]:
%matplotlib inline
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

In [ ]:
#################################################################
# Load data
#
# This cell implements a function to load the Caltech 101 data set.
# You can find it at http://www.vision.caltech.edu/Image_Datasets/Caltech101/
# To simplify things a bit, I modified the images so that they all would be 224x224 in size
# You can find the script I used for that in the scripts/ folder (pad_ct101.py)
#################################################################
from scipy import ndimage
import os
from os import listdir
from os.path import isfile, join
import subprocess
import re
import numpy as np

WIDTH = 224
HEIGHT = 224
CHANNELS = 3

# In the paper describing the network we are going to use,
# they say they preprocess the data by subtracting the ImageNet-wide mean
# (one value per R, G, B channel), so we have to do the same here.
IMAGENET_RGB_MEAN = [123.68, 116.779, 103.939]


def read_caltech101(mypath):
    """
    Loads the (padded) Caltech 101 color images stored under mypath.

    Every sub-directory of mypath is treated as one class (except
    BACKGROUND_Google, which is omitted). Labels are assigned in the order
    in which listdir returns the directories.
    NOTE: that order is platform dependent, so always check the returned
    label_map before hard-coding class ids.

    Returns:
      X: float32 array of shape (n_images, WIDTH, HEIGHT, CHANNELS) with the
         ImageNet RGB mean already subtracted.
      y: integer array with the label of each image in X.
      label_map: dict mapping each label to its directory (class) name.
    """
    # We omit the BACKGROUND_Google class. Stray files in the root folder
    # (README, .DS_Store, ...) are not classes either, so we skip them.
    class_dirs = [d for d in listdir(mypath)
                  if d != 'BACKGROUND_Google' and os.path.isdir(join(mypath, d))]

    # float32 so the subtraction does not upcast the images to float64
    mean = np.asarray(IMAGENET_RGB_MEAN, dtype=np.float32)

    label_map = dict()
    X = []
    y = []

    for label, class_dir in enumerate(class_dirs):
        label_map[label] = class_dir
        class_path = join(mypath, class_dir)
        for f in listdir(class_path):
            full_path = join(class_path, f)
            if not isfile(full_path):
                continue
            # NOTE: ndimage.imread was removed in SciPy 1.2; this needs an older SciPy
            img = ndimage.imread(full_path).astype(np.float32)
            # We only consider color images with exactly CHANNELS channels
            # (grayscale and RGBA images would break the final reshape)
            if img.ndim == 3 and img.shape[2] == CHANNELS:
                # Preprocess according to VGG specs: subtract Imagenet RGB mean
                X.append(img - mean)
                y.append(label)

    # Convert to numpy arrays (this also works when no image was found)
    X = np.asarray(X, dtype=np.float32).reshape([len(X), WIDTH, HEIGHT, CHANNELS])
    y = np.array(y, dtype=int)
    return X, y, label_map

In [ ]:
import keras

# Read every image of the padded Caltech 101 folder, together with its
# integer label and the label -> class name dictionary
X, y, label_map = read_caltech101('padded_101')

# One-hot encode the integer labels (one column per class)
num_classes = 101
y_cat = keras.utils.to_categorical(y, num_classes)

print('X: {}. y: {}'.format(X.shape, y.shape))

In [ ]:
# Print, for every class: label id, class name and number of images.
# The count is done by numpy (y == k) instead of filtering y in pure Python.
for k in label_map:
    print('{} {} {}'.format(k, label_map[k], np.count_nonzero(y == k)))

In [ ]:
# Visualize the dataset: show one randomly chosen image and its class
i = np.random.randint(X.shape[0])
# Undo the preprocessing (add the ImageNet mean back to every channel)
# and rescale to [0, 1], which is what imshow expects for float images
x = (X[i] + np.asarray(IMAGENET_RGB_MEAN, dtype=np.float32)) / 255.
print('{} {}'.format(i, label_map[y[i]]))
plt.imshow(x, cmap='gray')

In [ ]:
##################################################################
# Baseline tests:
# We will try out a relatively simple task. We'll only consider
# four classes: Airplane, motorbike, faces, leopard (the most numerous in Caltech 101)
# We'll train a couple of well-known classifiers to see how well we can do
# with a straightforward approach
#
##################################################################

In [ ]:
# Filter the dataset so it only contains the four considered classes
# Warning! Make sure these indices correspond to the classes mentioned above. Check the dictionary object
classes = [0,2,3,5]
# Positions of the samples whose label is one of the selected classes
filtered_ids = np.flatnonzero(np.isin(y, classes))
X_filt = X[filtered_ids,:]
y_filt = y[filtered_ids]
len(filtered_ids)

In [ ]:
def get_splits(X, y, ratio=0.1, cat=False):
    """
    Randomly holds out int(n_samples*ratio) samples (drawn without
    replacement) as a validation set; the rest is the training set.

    X is indexed along its first axis. y is indexed with an explicit second
    axis when cat is True (e.g. one-hot labels) and as a flat array otherwise.
    Returns x_train, y_train, x_val, y_val. Training samples keep their
    original order.
    """
    n_samples = X.shape[0]
    val_ids = np.random.choice(np.arange(n_samples), int(n_samples*ratio), replace=False)

    # Everything that was not drawn for validation goes to training
    in_train = np.ones(n_samples, dtype=bool)
    in_train[val_ids] = False
    train_ids = np.flatnonzero(in_train)

    x_train, x_val = X[train_ids,:], X[val_ids,:]
    if cat:
        y_train, y_val = y[train_ids,:], y[val_ids,:]
    else:
        y_train, y_val = y[train_ids], y[val_ids]
    return x_train, y_train, x_val, y_val

In [ ]:
from sklearn import metrics
def eval(y, preds, classes):
    """
    Given a set of labels y and predictions preds, prints precision, recall
    and F1 for every class in classes (one class against all the others).

    Class names are looked up in the global label_map.
    NOTE: this function shadows the builtin eval; the name is kept because
    the rest of the notebook calls it.
    """
    y = np.asarray(y)
    preds = np.asarray(preds)
    for i in classes:
        # Binarize: 1 where the label/prediction is class i, 0 elsewhere
        y_i = (y == i).astype(int)
        preds_i = (preds == i).astype(int)
        print('Class {}:'.format(label_map[i]))
        print('Precision: {}'.format(metrics.precision_score(y_i, preds_i)))
        print('Recall: {}'.format(metrics.recall_score(y_i, preds_i)))
        print('F1: {}'.format(metrics.f1_score(y_i, preds_i)))
        print('')

In [ ]:
###########################################################################
# Baseline tests: First, we will train a couple of well-known classifiers
# to see how a basic approach does
###########################################################################

In [ ]:
# Hold out 10% of the filtered images as a test set; train on the rest
x_train, y_train, x_test, y_test = get_splits(X_filt, y_filt, ratio=0.1, cat=False)
print('Train: {}. Test: {}'.format(x_train.shape, x_test.shape))

In [ ]:
#====================
# Logistic regression
from sklearn.linear_model import LogisticRegression

# Flatten every image into a single row of raw pixel values
x_train_shaped = x_train.reshape((x_train.shape[0], WIDTH*HEIGHT*CHANNELS))

# Multinomial model; class_weight='balanced' compensates for uneven class sizes
clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', class_weight='balanced')
clf.fit(x_train_shaped, y_train)

In [ ]:
#====================
# Random forest
from sklearn.ensemble import RandomForestClassifier

# Flatten every image into a single row of raw pixel values
x_train_shaped = x_train.reshape((x_train.shape[0], WIDTH*HEIGHT*CHANNELS))

# 100 trees, trained in parallel on 10 jobs
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=10)
clf.fit(x_train_shaped, y_train)

In [ ]:
# Evaluate the trained classifier (whichever one was fitted last)
classes = np.unique(y_train)
print(classes)

# Train split
x_train_shaped = x_train.reshape((x_train.shape[0], WIDTH*HEIGHT*CHANNELS))
preds = clf.predict(x_train_shaped)
print('='*50)
print('Train results:')
eval(y_train, preds, classes)

# Test split
x_test_shaped = x_test.reshape((x_test.shape[0], WIDTH*HEIGHT*CHANNELS))
preds = clf.predict(x_test_shaped)
print('='*50)
print('Test results:')
eval(y_test, preds, classes)

In [ ]:
# Sanity check: predict a random image from the train/test set
# This way we can check visually if our classifier is well trained
import matplotlib.image as mpimg
check_set = x_test
check_y = y_test

i = np.random.choice(np.arange(check_set.shape[0]))
print(i)
img = np.copy(check_set[i])
# The classifier sees the mean-subtracted pixels as one flat row. We copy
# because img is modified in place below in order to display it.
x = np.copy(img.reshape([1, WIDTH*HEIGHT*CHANNELS]))
for j in range(3):
    img[:,:,j] += IMAGENET_RGB_MEAN[j]
# predict returns an array with a single label: take element 0 instead of
# calling int() on the array (deprecated in recent numpy versions)
print('Prediction: {}. Confidence: {}'.format(label_map[clf.predict(x)[0]], np.max(clf.predict_proba(x))))
print('Label: {}'.format(label_map[check_y[i]]))
img /= 255.
plt.imshow(img,cmap='gray')

In [ ]:
# Import test images
# Here we load a bunch of images we got off the Internet (stored in test_imgs),
# to see how well our classifier does in the wild.
# These were images I downloaded and saved in a local directory, so you'll have to do that yourself.
# If you want to, download a bunch of (preferably square) images from the net and scale them to 224x224
# Notice that reusing convolutional nets is not so restrictive, as the convolutional filters can be applied to any
# input size.
# Remember the test I did: I took screenshots from youtube videos and fed them to the classifier. That way I could be
# sure that neither my classifier nor the VGG16 net had ever seen those, so no trick.
from os import listdir
from os.path import isfile, join
# (named test_dir so we do not shadow the builtin dir)
test_dir = 'test_imgs'
# Keep only JPEG files (by extension, any case). Sorted, so that stepping through
# the images by index gives the same order on every run and machine.
test_imgs = sorted(join(test_dir, f) for f in listdir(test_dir)
                   if f.lower().endswith(('.jpg', '.jpeg')) and isfile(join(test_dir, f)))
print(test_imgs)

In [ ]:
# Position in test_imgs of the next image to classify (run this cell to start over)
i = 0

In [ ]:
# Predict images taken from the internet
# Every run of this cell classifies test_imgs[i] and moves on to the next image
import matplotlib.image as mpimg
img=mpimg.imread(test_imgs[i])
i+=1
# Same preprocessing as the training data: float32, minus the ImageNet RGB mean
x = np.array([img]).astype(np.float32)
for j in range(3):
    x[0,:,:,j] -= IMAGENET_RGB_MEAN[j]
x=x.reshape([1, WIDTH*HEIGHT*CHANNELS])
# predict returns an array with a single label: take element 0 instead of
# calling int() on the array (deprecated in recent numpy versions)
print('Prediction: {}. Confidence: {}'.format(label_map[clf.predict(x)[0]], np.max(clf.predict_proba(x))))
plt.imshow(img,cmap='gray')

In [ ]:
###########################################################################
# Transfer learning
# We now adopt an alternative approach. We will take a very deep network trained on ImageNet
# (a data set of about 14M images) and remove the uppermost layers (the classifier). We will
# then train a classifier of our own.
#
# Take into account that if we take all the classes in Caltech 101, the classifier will not be as accurate.
# Some classes are not numerous enough and there is a considerable amount of variability to correctly learn 101 of them.
# However, remember we only tried a logistic regression model in the course. You can also stack a dense neural network
# on top of VGG-16. Perhaps you can do better with that approach in the full 101-class data set.
###########################################################################

In [ ]:
# Load VGG-16 trained on ImageNet
# Look at the arguments:
# - we don't include the top of the network (i.e. we remove the classifier). We just want to
# keep the feature extractors learned in the hidden layers.
# - We also choose the imagenet weights, because we want to benefit from what was learned by the authors
# - We do not pass `classes`: according to the Keras docs it is only meant to be
# specified when the top is included and no pretrained weights are loaded
#
# More info:
# - https://keras.io/applications/#vgg16
# - https://arxiv.org/pdf/1409.1556.pdf
from  keras import applications
from keras.models import Sequential
from keras.layers import Flatten
num_classes=4
pt_model = applications.vgg16.VGG16(include_top=False,
                                    weights='imagenet',
                                    input_shape=(224,224,3),
                                    pooling=None)

In [ ]:
# Print the network architecture
# Notice how big the network is (and we removed the classifier on top, which is by far the most complex part)
# For some reason not yet fully understood, deep neural networks can estimate millions of parameters "correctly"
# if they have enough data
# (pt_model is the VGG-16 network loaded without its top, see the previous cell)
pt_model.summary()

In [ ]:
#Convert to Sequential (because it's what we've seen in the course, 
# but the functional API is more convenient for this purpose)
model = Sequential()
# Re-use the layers of the pretrained network, in the same order
for l in pt_model.layers:
    model.add(l)
# NOTE(review): the layers added above are the very same objects held by pt_model,
# so copying the weights again looks redundant -- confirm before removing
model.set_weights(pt_model.get_weights())

# We add a flattening layer in order to train a classifier on the top
model.add(Flatten())

In [ ]:
# Map both splits to feature space, i.e. run them through the pretrained
# network and keep its (flattened) output as the new representation
print('Predicting train')
x_train_rep = model.predict(x_train, verbose=1)
print('Predicting validation')
x_test_rep = model.predict(x_test, verbose=1)
print('{} {}'.format(x_train_rep.shape, x_test_rep.shape))

In [ ]:
#####################################################
# Train a logistic regression model on feature space
from sklearn.linear_model import LogisticRegression

# One row of 7*7*512 features per training image
x_train_shaped = np.array(x_train_rep).reshape((x_train_rep.shape[0], 7*7*512))

clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')
clf.fit(x_train_shaped, y_train)

In [ ]:
# Per-class precision/recall/F1 of the classifier trained on feature space
classes = np.unique(y_train)
print(classes)

# Train split (x_train_shaped was built when fitting the classifier)
preds = clf.predict(x_train_shaped)
print('Train results:')
eval(y_train, preds, classes)

# Test split
x_test_shaped = x_test_rep.reshape((x_test_rep.shape[0], 7*7*512))
preds = clf.predict(x_test_shaped)
print('Test results:')
eval(y_test, preds, classes)

In [ ]:
# Position in test_imgs of the next image to classify (run this cell to start over)
i=0

In [ ]:
# Predict images from test_imgs with logistic regression on feature space
# Every run of this cell classifies test_imgs[i] and moves on to the next image
import matplotlib.image as mpimg
img=mpimg.imread(test_imgs[i])
i+=1
# Same preprocessing as the training data: float32, minus the ImageNet RGB mean
x = np.array([img]).astype(np.float32)
for j in range(3):
    x[0,:,:,j] -= IMAGENET_RGB_MEAN[j]
# Transform the image to feature space before handing it to the classifier
x=model.predict(x).reshape([1, 7*7*512])
# clf.predict already returns the class label (not scores), so we take it as is.
# Taking np.argmax of that one-element array would always give 0.
prediction = clf.predict(x)[0]

print('Prediction: {}. Confidence: {}'.format(label_map[prediction], np.max(clf.predict_proba(x))))
plt.imshow(img,cmap='gray')

We can also train a deep neural network on top of the VGG-16 features. In fact, for more complex tasks, that can be much better than a simple logistic regression classifier.

There are various ways we can do this. We can place the classifier on top of the VGG network and freeze the VGG layers (see the Keras API). Alternatively, we can train the classifier on the transformed inputs and then stack the resulting (trained) network on top of VGG. This way, transformation and classification happen in a single step (model.predict(x)).


In [ ]:
########################################################
# Classifier on top of VGG-16
# (small version: two hidden layers and a softmax output)

from keras.layers import Dense, Dropout

input_shape = [7*7*512,]
num_classes = 101

clf = Sequential([
    Dense(256, input_shape=input_shape, activation='relu'),
    Dropout(0.25),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

In [ ]:
########################################################
# Classifier on top of VGG-16
# (deeper version: four hidden layers and a softmax output)

input_shape = [7*7*512,]
num_classes = 101

clf = Sequential()
# First hidden layer: it is the one that declares the input size
clf.add(Dense(256, input_shape=input_shape, activation='relu'))
clf.add(Dropout(0.25))
# Remaining hidden layers as (units, dropout rate) pairs
for units, rate in [(128, 0.25), (64, 0.25), (64, 0.5)]:
    clf.add(Dense(units, activation='relu'))
    clf.add(Dropout(rate))
clf.add(Dense(num_classes, activation='softmax'))

In [ ]:
# Cross-entropy loss, Adam with its default settings, and accuracy as the reported metric
clf.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

In [ ]:
# Print the architecture of the classifier we just compiled
clf.summary()

In [ ]:
# The network ends in a softmax over num_classes outputs, so the integer
# labels have to be converted to one-hot vectors of that length
y_train_cat, y_test_cat = (keras.utils.to_categorical(y_train, num_classes),
                           keras.utils.to_categorical(y_test, num_classes))

In [ ]:
# Loss curves, filled in by the training cell (run this cell to reset them)
train_loss, val_loss = [], []

In [ ]:
# Train and monitor progress
# We fit one epoch at a time so the loss curves can be redrawn after every epoch.
# train_loss/val_loss live outside this cell: running it again continues the curves.
fig = plt.figure()
ax = fig.gca()

x_train_shaped = x_train_rep.reshape([x_train_rep.shape[0],7*7*512])
x_test_shaped = x_test_rep.reshape([x_test_rep.shape[0],7*7*512])

# (the loop variable is not called i, so we do not clobber the image cursor used above)
for epoch in range(50):
    print(epoch)
    history = clf.fit(x_train_shaped, y_train_cat, epochs=1, batch_size=128, verbose=1, validation_data=(x_test_shaped, y_test_cat))

    # history.history[...] holds one value per fitted epoch (just one here);
    # extend keeps our curves as flat lists of numbers instead of lists of lists
    train_loss.extend(history.history['loss'])
    val_loss.extend(history.history['val_loss'])

    ax.clear()
    ax.plot(train_loss, color='red', label='Train')
    ax.plot(val_loss, color='blue', label='Validation')
    # Without this call the labels above are never shown
    ax.legend()

    fig.canvas.draw()

In [ ]:
# Save the network to disk: first the full model, then only its weights.
# NOTE(review): `model` is the pretrained feature extractor (VGG-16 + Flatten); the
# classifier trained above lives in `clf` and is not saved here. The file names
# suggest the stacked VGG + classifier network was intended -- confirm.
model.save('vgg_clf.h5')
model.save_weights('vgg_clf_weights.h5')