Based on https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html and the very helpful courses at http://www.fast.ai/ (dogs_cats_redux.ipynb).
General:
Use the keras.preprocessing.image module (from keras.preprocessing import image) for image processing. It takes care of normalization, resizing, augmentation, and conversions to/from arrays.
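A minimal sketch of that round trip (the file path is just an assumption for illustration):
from keras.preprocessing import image
import numpy as np

img = image.load_img('data/train/cats/cat.0.jpg', target_size=(224, 224))  # load and resize
arr = image.img_to_array(img)        # PIL image -> float32 array, shape (224, 224, 3) with 'tf' dim ordering
batch = np.expand_dims(arr, axis=0)  # add a batch dimension -> (1, 224, 224, 3)
img2 = image.array_to_img(batch[0])  # array -> PIL image again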
Pretrained Models for TF:
In [1]:
import random
import time
import sys
stdout = sys.stdout
#sys.stdout = open(CW_DIR+'/keras_output.txt', 'a+')
#sys.stdout = stdout
from glob import glob
import utils
from utils import *
import importlib
importlib.reload(utils)
import twBase
importlib.reload(twBase)
from keras.preprocessing import image
from IPython.core.debugger import Tracer
#Tracer()()
#Allow relative imports to directories above cwd/
sys.path.insert(1, os.path.join(sys.path[0], '..'))
%matplotlib inline
current_dir = os.getcwd()
To start, you will need to download and unzip the competition data from Kaggle and ensure your directory structure looks like this:
└── data
├── results
├── sample
│ ├── results
│ ├── test
│ ├── train
│ │ ├── cats
│ │ └── dogs
│ └── valid
│ ├── cats
│ └── dogs
├── test
│ └── unknown
├── train
│ ├── cats
│ └── dogs
└── valid
├── cats
└── dogs
Download the data into ./data and unzip it:
https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data
In [16]:
#DATA_HOME_DIR = '/Users/q187392/kg/cac/data'
CW_DIR = os.environ['HOME'] + '/kg/cac'
DATA_HOME_DIR = os.environ['HOME'] + '/kg/cac/data'
IS_SAMPLE = False
if IS_SAMPLE:
    DATA_WORK_DIR = DATA_HOME_DIR + '/sample'
    number = 200
    valid_number = 40
    test_number = 40
else:
    DATA_WORK_DIR = DATA_HOME_DIR
    number = 23000
    valid_number = 2000
    test_number = 2000
In [ ]:
#Create directories
%cd $DATA_HOME_DIR
%mkdir valid
%mkdir results
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results
%mkdir -p test/unknown
%mkdir -p sample/test/unknown
In [ ]:
%cd $DATA_HOME_DIR/train
# move some training images into the validation set
g = glob('dog.*.jpg')
shuf = np.random.permutation(g)
for i in range(1000): os.rename(shuf[i], DATA_HOME_DIR+'/valid/' + shuf[i])
g = glob('cat.*.jpg')
shuf = np.random.permutation(g)
for i in range(1000): os.rename(shuf[i], DATA_HOME_DIR+'/valid/' + shuf[i])
In [ ]:
%cd $DATA_HOME_DIR/train
# create sample set
g = glob('dog.*.jpg')
shuf = np.random.permutation(g)
for i in range(100): _ = copyfile(shuf[i], DATA_HOME_DIR+'/sample/train/' + shuf[i])
g = glob('cat.*.jpg')
shuf = np.random.permutation(g)
for i in range(100): _ = copyfile(shuf[i], DATA_HOME_DIR+'/sample/train/' + shuf[i])
In [ ]:
%cd $DATA_HOME_DIR/valid
g = glob('dog.*.jpg')
shuf = np.random.permutation(g)
for i in range(20): _ = copyfile(shuf[i], DATA_HOME_DIR+'/sample/valid/' + shuf[i])
g = glob('cat.*.jpg')
shuf = np.random.permutation(g)
for i in range(20): _ = copyfile(shuf[i], DATA_HOME_DIR+'/sample/valid/' + shuf[i])
%cd $DATA_HOME_DIR
In [ ]:
%cd $DATA_HOME_DIR/test
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(20):
    _ = copyfile(shuf[i], DATA_HOME_DIR+'/sample/test/unknown/' + shuf[i])
%cd $DATA_HOME_DIR
In [ ]:
#Divide cat/dog images into separate directories
%cd $DATA_HOME_DIR/sample/train
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/
%cd $DATA_HOME_DIR/sample/valid
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/
%cd $DATA_HOME_DIR/valid
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/
%cd $DATA_HOME_DIR/train
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/
In [ ]:
# Create single 'unknown' class for test set
%cd $DATA_HOME_DIR/test
%mv *.jpg unknown/
In [5]:
%cd $DATA_WORK_DIR
import random
from keras.applications.vgg16 import preprocess_input

def get_randomImg(path='./train', target_size=(224, 224)):
    random.seed(time.time())
    classtype = random.choice(['cats', 'dogs'])
    if not path.endswith('/'):
        path += '/'
    img_path = random.choice(glob(path + '{}/*.jpg'.format(classtype)))
    img = image.load_img(img_path, target_size=target_size)
    testImg = image.img_to_array(img)
    testImg = np.expand_dims(testImg, axis=0)
    testImg = preprocess_input(testImg)
    plt.imshow(image.array_to_img(testImg[0]))
    plt.suptitle(img_path)
    plt.show()
    return testImg
testImg = get_randomImg('./train')
In [11]:
%cd $DATA_WORK_DIR
size = 224
batch_size = 64
# this is the augmentation configuration we will use for training
augment_datagen = image.ImageDataGenerator(
#rescale=1./255,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True
)
# this is the configuration we will use for validation and testing: no augmentation
pure_datagen = image.ImageDataGenerator(
#rescale=1./255
)
# this is a generator that will read pictures found in
# subfolders of 'data/train', and indefinitely generate
# batches of augmented image data
train_generator = augment_datagen.flow_from_directory(
'train', # this is the target directory
save_to_dir=None,
target_size=(size, size),
batch_size=batch_size,
class_mode='binary', # since we use binary_crossentropy loss, we need binary labels
shuffle=False
)
# this is a similar generator, for validation data
validation_generator = pure_datagen.flow_from_directory(
'valid',
save_to_dir=None,
target_size=(size, size),
batch_size=batch_size,
class_mode='binary',
shuffle=False
)
predict_generator = pure_datagen.flow_from_directory(
'test',
save_to_dir=None,
target_size=(size, size),
batch_size=batch_size,
class_mode=None,
shuffle=False # keep the data in order so predictions line up with the filenames
)
VGG uses BGR instead of RGB channel ordering (as in OpenCV) and requires mean normalization:
# 'RGB'->'BGR'
x = x[:, :, :, ::-1]
# Zero-center by mean pixel
x[:, :, :, 0] -= 103.939
x[:, :, :, 1] -= 116.779
x[:, :, :, 2] -= 123.68
In Keras, preprocess_input (from keras.applications.vgg16 import preprocess_input, decode_predictions) takes care of the necessary transformations.
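As a sanity check, the manual steps above should agree with preprocess_input; a sketch assuming the 'tf' dimension ordering used in this notebook:
import numpy as np
from keras.applications.vgg16 import preprocess_input

x = np.random.random((1, 224, 224, 3)).astype('float32') * 255.  # dummy RGB batch
manual = x[:, :, :, ::-1].copy()  # 'RGB' -> 'BGR'
manual[:, :, :, 0] -= 103.939     # zero-center with the ImageNet mean pixel (BGR order)
manual[:, :, :, 1] -= 116.779
manual[:, :, :, 2] -= 123.68
print(np.allclose(manual, preprocess_input(x.copy())))  # expected: True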
In [7]:
from keras.applications.vgg16 import VGG16, preprocess_input, decode_predictions
full_model = VGG16(weights='imagenet', include_top=True)
#full_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
preds = full_model.predict(testImg)
# decode the results into a list of tuples (class, description, probability)
# (one such list for each sample in the batch)
print('Predicted:', decode_predictions(preds, top=3)[0])
In [6]:
preds = full_model.predict_generator(predict_generator, 40)
In [7]:
results = decode_predictions(preds, top=1)
for result in results:
    print('Predicted:', result)
We extract the features after the convolutional blocks and put a bespoke fully connected dense layer on top. Thus we need to stream all pictures, without labels, through the convolutional layers and predict the features. Data augmentation cannot be used with this approach.
In [12]:
# create vgg16 feature model (without the dense layers)
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
import numpy as np
input_shape = (size, size, 3)
img = Input(shape=input_shape, name='image_input')
base_model = VGG16(weights='imagenet', include_top=False, input_tensor=img)
In [9]:
# only necessary for training
#base_model.compile(optimizer='adam', loss='binary_crossentropy')
#base_model.summary()
In [ ]:
# extract features for the first 32 pics
xxx = base_model.predict_generator(train_generator, 32)
xxx.shape
In [21]:
%%time
%cd $DATA_WORK_DIR
generator = pure_datagen.flow_from_directory(
'train',
save_to_dir=None,
target_size=(size, size),
batch_size=batch_size,
class_mode=None,
shuffle=False
)
bottleneck_features_train = base_model.predict_generator(generator, number - number%batch_size)
np.save('bottleneck_features_train.npy', bottleneck_features_train)
In [22]:
%%time
%cd $DATA_WORK_DIR
generator = pure_datagen.flow_from_directory(
'valid',
save_to_dir=None,
target_size=(size, size),
batch_size=batch_size,
class_mode=None,
shuffle=False
)
bottleneck_features_validation = base_model.predict_generator(generator, valid_number - valid_number%batch_size)
np.save('bottleneck_features_validation.npy', bottleneck_features_validation)
In [32]:
%cd $DATA_WORK_DIR
train_data = np.load('bottleneck_features_train.npy')
# the features were saved in order (cats first, then dogs), so recreating the labels is easy;
# predict_generator dropped the last partial batch, so the second class is short by number%batch_size
#train_labels = np.array([0] * 100 + [1] * 100)
train_labels = np.array([0] * (number//2) + [1] * (number//2 - number%batch_size))
validation_data = np.load('bottleneck_features_validation.npy')
validation_labels = np.array([0] * (valid_number//2) + [1] * (valid_number//2 - valid_number%batch_size))
train_data.shape, validation_data.shape
In [33]:
dense_model = Sequential()
dense_model.add(Flatten(input_shape=train_data.shape[1:]))
dense_model.add(Dense(256, activation='relu'))
dense_model.add(Dropout(0.5))
dense_model.add(Dense(1, activation='sigmoid'))
opt = keras.optimizers.RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08, decay=0.0)
opt = keras.optimizers.Adam(lr=0.0001)  # overrides the RMSprop optimizer above
dense_model.compile(optimizer=opt,
loss='binary_crossentropy',
metrics=['accuracy'])
In [34]:
%%time
sys.stdout = open(CW_DIR+'/keras_output.txt', 'a+')
dense_model.fit(train_data, train_labels,
nb_epoch=10, batch_size=batch_size,
validation_data=(validation_data, validation_labels))
dense_model.save_weights('bottleneck_fc_model.h5')
sys.stdout = stdout
In [35]:
# make prediction
plt.imshow(image.array_to_img(testImg[0]))
dense_model.predict(base_model.predict(testImg))
Now we train the full model, so data augmentation is possible.
Things to take into account (https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html):
in order to perform fine-tuning, all layers should start with properly trained weights: for instance you should not slap a randomly initialized fully-connected network on top of a pre-trained convolutional base. This is because the large gradient updates triggered by the randomly initialized weights would wreck the learned weights in the convolutional base. In our case this is why we first train the top-level classifier, and only then start fine-tuning convolutional weights alongside it.
we choose to only fine-tune the last convolutional block rather than the entire network in order to prevent overfitting, since the entire network would have a very large entropic capacity and thus a strong tendency to overfit. The features learned by low-level convolutional blocks are more general, less abstract than those found higher-up, so it is sensible to keep the first few blocks fixed (more general features) and only fine-tune the last one (more specialized features).
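Note that the cells below freeze the entire convolutional base and train only the top model. If you wanted to fine-tune just the last convolutional block, as described above, a sketch could look like this (the boundary index 15 for block5_conv1 in VGG16 without the top is an assumption; verify it with base_model.summary()):
# freeze everything before the last convolutional block (block5)
for layer in base_model.layers[:15]:
    layer.trainable = False
# leave block5 trainable for fine-tuning
for layer in base_model.layers[15:]:
    layer.trainable = True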
In [37]:
# create top_model to put on top of vgg16
input = Input(shape=base_model.output_shape[1:])
x = Flatten()(input)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)
top_model = Model(input=input, output=x, name='top_model')
top_model_weights_path = 'bottleneck_fc_model.h5'
top_model.load_weights(top_model_weights_path)
top_model.summary()
In [38]:
from keras.optimizers import SGD
#x = base_model.output # shows all vgg16 layers in model.summary()
x = base_model(img) # shows vgg16 as a single layer in model.summary()
x = top_model(x)
# this is the model we will train
#model = Model(input=base_model.input, output=x)
model = Model(input=img, output=x)
model.summary()
# first: train only the top layers (which were randomly initialized)
# i.e. freeze all convolutional layers
for layer in base_model.layers:
    log.info("Freeze layer in base_model", layer=layer.name)
    layer.trainable = False
for layer in top_model.layers:
    log.info("Unfreeze layer in top_model", layer=layer.name)
    layer.trainable = True
# compile the model (should be done *after* setting layers to non-trainable)
model.compile(loss='binary_crossentropy',
optimizer=SGD(lr=1e-5, momentum=0.9),
metrics=['accuracy'])
In [39]:
# now we need the batches WITH labels
%cd $DATA_WORK_DIR
train_generator = augment_datagen.flow_from_directory(
'train',
save_to_dir=None,
target_size=(size, size),
batch_size=batch_size,
class_mode='binary',
shuffle=False
)
# this is a similar generator, for validation data
validation_generator = pure_datagen.flow_from_directory(
'valid',
save_to_dir=None,
target_size=(size, size),
batch_size=batch_size,
class_mode='binary',
shuffle=False
)
In [40]:
%%time
sys.stdout = open(CW_DIR+'/keras_output.txt', 'a+')
model.fit_generator(train_generator,
number,
validation_data=validation_generator,
nb_val_samples=valid_number,
nb_epoch=3)
model.save('model.ft.h5')
sys.stdout = stdout
In [41]:
%%time
sys.stdout = open(CW_DIR+'/keras_output.txt', 'a+')
model.fit_generator(train_generator,
number,
validation_data=validation_generator,
nb_val_samples=valid_number,
nb_epoch=7)
model.save('model.ft.h5')
sys.stdout = stdout
In [ ]:
# Load model
#model = load_model('model.ft.h5')
In [ ]:
# make prediction
plt.imshow(image.array_to_img(testImg[0]))
model.predict(testImg)
This is a simple recipe for customizing VGG16. We take the full VGG16 model, including the dense layers, and just replace the last layer to fit our specific classes.
In [ ]:
# Generate a model with all layers (with top)
vgg16 = VGG16(weights='imagenet', include_top=True)
#Add a layer where input is the output of the second last layer
#x = Dense(8, activation='softmax', name='predictions')(vgg16.layers[-2].output)
x = Dense(1, activation='sigmoid', name='predictions')(vgg16.layers[-2].output)
#Then create the corresponding model
my_model = Model(input=vgg16.input, output=x)
my_model.summary()
In [ ]:
for i, layer in enumerate(my_model.layers[:-1]):
    log.info("Freeze layer", layer=layer.name)
    layer.trainable = False
In [ ]:
# compile the model (should be done *after* setting layers to non-trainable)
my_model.compile(loss='binary_crossentropy',
optimizer=SGD(lr=1e-4, momentum=0.9),
metrics=['accuracy'])
In [41]:
# now we need the batches WITH labels
%cd $DATA_WORK_DIR
train_generator = augment_datagen.flow_from_directory(
'train',
save_to_dir=None,
target_size=(size, size),
batch_size=batch_size,
class_mode='binary',
shuffle=False
)
# this is a similar generator, for validation data
validation_generator = pure_datagen.flow_from_directory(
'valid',
save_to_dir=None,
target_size=(size, size),
batch_size=batch_size,
class_mode='binary',
shuffle=False
)
In [49]:
%%time
sys.stdout = open(CW_DIR+'/keras_output.txt', 'a+')
my_model.fit_generator(train_generator,
number,
validation_data=validation_generator,
nb_val_samples=valid_number,
nb_epoch=3)
my_model.save('model_simple.ft.h5')
sys.stdout = stdout
In [ ]:
# make prediction
plt.imshow(image.array_to_img(testImg[0]))
my_model.predict(testImg)
In [ ]:
%cd $DATA_WORK_DIR
def get_batches(path, gen=image.ImageDataGenerator(), shuffle=True, batch_size=8, class_mode='binary'):
return gen.flow_from_directory(path,
target_size=(224,224),
class_mode=class_mode,
shuffle=shuffle,
batch_size=batch_size)
batch_path = 'test/'
test_batches = get_batches(batch_path, shuffle=False, batch_size=batch_size, class_mode='binary')
In [ ]:
# get predictions
preds = model.predict_generator(test_batches, test_batches.nb_sample)
In [ ]:
# print 5 predictions
start = 0
pics = []
filenames = test_batches.filenames
[print(p) for p in zip(preds[start:start+5, 0], filenames[start:start+5])]
for path in filenames[start:start+5]:
    pics.append(image.load_img(batch_path+path, target_size=(224, 224)))
plots(pics, titles=preds[start:start+5, 0])
In [ ]:
#Save our test results arrays so we can use them again later
save_array(results_path + 'test_preds.dat', preds)
save_array(results_path + 'filenames.dat', filenames)
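save_array and load_array come from the fast.ai utils.py; a rough sketch of what they do with bcolz (an approximation, not necessarily the exact implementation):
import bcolz

def save_array(fname, arr):
    c = bcolz.carray(arr, rootdir=fname, mode='w')  # write a compressed on-disk array
    c.flush()

def load_array(fname):
    return bcolz.open(fname)[:]  # load the whole array back into memory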
Keras' fit() function conveniently shows us the value of the loss function, and the accuracy, after every epoch ("epoch" refers to one full run through all training examples). The most important metrics for us to look at are for the validation set, since we want to check for over-fitting.
As well as looking at the overall metrics, it's also a good idea to look at examples of each of:
1. A few correct labels at random
2. A few incorrect labels at random
3. The most confident correct labels of each class
4. The most confident incorrect labels of each class
5. The most uncertain labels (i.e. those with probability closest to 0.5)
Let's see what we can learn from these examples. (In general, this is a particularly useful technique for debugging problems in the model. However, since this model is so simple, there may not be too much to learn at this stage.)
Calculate predictions on validation set, so we can find correct and incorrect examples:
In [ ]:
vgg.model.load_weights(results_path+latest_weights_filename)
In [ ]:
val_batches, probs = vgg.test(valid_path, batch_size = batch_size)
In [ ]:
filenames = val_batches.filenames
expected_labels = val_batches.classes #0 or 1
#Round our predictions to 0/1 to generate labels
our_predictions = probs[:,0]
our_labels = np.round(1-our_predictions)
In [ ]:
from keras.preprocessing import image
#Helper function to plot images by index in the validation set
#Plots is a helper function in utils.py
def plots_idx(idx, titles=None):
    plots([image.load_img(valid_path + filenames[i]) for i in idx], titles=titles)
#Number of images to view for each visualization task
n_view = 4
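plots itself also lives in the fast.ai utils.py; a comparable matplotlib helper might look like this (a sketch, not the exact implementation):
import matplotlib.pyplot as plt

def plots(ims, figsize=(12, 6), rows=1, titles=None):
    f = plt.figure(figsize=figsize)
    cols = (len(ims) + rows - 1) // rows  # enough columns for all images
    for i in range(len(ims)):
        sp = f.add_subplot(rows, cols, i + 1)
        sp.axis('off')
        if titles is not None:
            sp.set_title(titles[i], fontsize=12)
        plt.imshow(ims[i])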
In [ ]:
#1. A few correct labels at random
correct = np.where(our_labels==expected_labels)[0]
print "Found %d correct labels" % len(correct)
idx = permutation(correct)[:n_view]
plots_idx(idx, our_predictions[idx])
In [ ]:
#2. A few incorrect labels at random
incorrect = np.where(our_labels!=expected_labels)[0]
print "Found %d incorrect labels" % len(incorrect)
idx = permutation(incorrect)[:n_view]
plots_idx(idx, our_predictions[idx])
In [ ]:
#3a. The images we were most confident were cats, and are actually cats
correct_cats = np.where((our_labels==0) & (our_labels==expected_labels))[0]
print("Found %d confident correct cats labels" % len(correct_cats))
most_correct_cats = np.argsort(our_predictions[correct_cats])[::-1][:n_view]
plots_idx(correct_cats[most_correct_cats], our_predictions[correct_cats][most_correct_cats])
In [ ]:
#3b. The images we were most confident were dogs, and are actually dogs
correct_dogs = np.where((our_labels==1) & (our_labels==expected_labels))[0]
print("Found %d confident correct dogs labels" % len(correct_dogs))
most_correct_dogs = np.argsort(our_predictions[correct_dogs])[:n_view]
plots_idx(correct_dogs[most_correct_dogs], our_predictions[correct_dogs][most_correct_dogs])
In [ ]:
#4a. The images we were most confident were cats, but are actually dogs
incorrect_cats = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
print "Found %d incorrect cats" % len(incorrect_cats)
if len(incorrect_cats):
most_incorrect_cats = np.argsort(our_predictions[incorrect_cats])[::-1][:n_view]
plots_idx(incorrect_cats[most_incorrect_cats], our_predictions[incorrect_cats][most_incorrect_cats])
In [ ]:
#4b. The images we were most confident were dogs, but are actually cats
incorrect_dogs = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
print "Found %d incorrect dogs" % len(incorrect_dogs)
if len(incorrect_dogs):
most_incorrect_dogs = np.argsort(our_predictions[incorrect_dogs])[:n_view]
plots_idx(incorrect_dogs[most_incorrect_dogs], our_predictions[incorrect_dogs][most_incorrect_dogs])
In [ ]:
#5. The most uncertain labels (ie those with probability closest to 0.5).
most_uncertain = np.argsort(np.abs(our_predictions-0.5))
plots_idx(most_uncertain[:n_view], our_predictions[most_uncertain])
Perhaps the most common way to analyze the result of a classification model is to use a confusion matrix. Scikit-learn has a convenient function we can use for this purpose:
In [ ]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(expected_labels, our_labels)
We can just print out the confusion matrix, or we can show a graphical view (which is mainly useful for problems with a larger number of categories).
In [ ]:
plot_confusion_matrix(cm, val_batches.class_indices)
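plot_confusion_matrix is another utils.py helper; a sketch of a comparable version (here classes is assumed to be a mapping or list of class names, as returned by val_batches.class_indices):
import itertools
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, list(classes), rotation=45)
    plt.yticks(tick_marks, list(classes))
    # annotate each cell with its count
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')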
Here's the format Kaggle requires for new submissions:
imageId,isDog
1242, .3984
3947, .1000
4539, .9082
2345, .0000
Kaggle wants the imageId followed by the probability of the image being a dog. Kaggle uses a metric called Log Loss to evaluate your submission.
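For a single prediction p with true label y, log loss is -(y*log(p) + (1-y)*log(1-p)), averaged over all images. A quick numpy sketch (the helper name is ours):
import numpy as np

def binary_log_loss(y_true, p, eps=1e-15):
    p = np.clip(p, eps, 1 - eps)  # avoid log(0)
    return -np.mean(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))

print(binary_log_loss(np.array([1, 0]), np.array([0.9, 0.1])))  # ~0.105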
In [ ]:
#Load our test predictions from file
preds = load_array(results_path + 'test_preds.dat')
filenames = load_array(results_path + 'filenames.dat')
In [ ]:
#Grab the dog prediction column
isdog = preds[:,1]
print "Raw Predictions: " + str(isdog[:5])
print "Mid Predictions: " + str(isdog[(isdog < .6) & (isdog > .4)])
print "Edge Predictions: " + str(isdog[(isdog == 1) | (isdog == 0)])
Log Loss doesn't support probability values of 0 or 1--they are undefined (and we have many). Fortunately, Kaggle helps us by offsetting our 0s and 1s by a very small value. So if we upload our submission now we will have lots of .99999999 and .000000001 values. This seems good, right?
Not so. There is an additional twist due to how log loss is calculated--log loss rewards predictions that are confident and correct (p=.9999,label=1), but it punishes predictions that are confident and wrong far more (p=.0001,label=1). See visualization below.
In [ ]:
#Visualize Log Loss when True value = 1
#y-axis is log loss, x-axis is probabilty that label = 1
#As you can see Log Loss increases rapidly as we approach 0
#But increases slowly as our predicted probability gets closer to 1
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import log_loss
x = [i*.0001 for i in range(1,10000)]
y = [log_loss([1],[[i*.0001,1-(i*.0001)]],eps=1e-15) for i in range(1,10000,1)]
plt.plot(x, y)
plt.axis([-.05, 1.1, -.8, 10])
plt.title("Log Loss when true label = 1")
plt.xlabel("predicted probability")
plt.ylabel("log loss")
plt.show()
In [ ]:
#So to play it safe, we use a sneaky trick to round down our edge predictions
#Swap all ones with .95 and all zeros with .05
isdog = isdog.clip(min=0.05, max=0.95)
In [ ]:
#Extract imageIds from the filenames in our test/unknown directory
filenames = batches.filenames
ids = np.array([int(f[8:f.find('.')]) for f in filenames])
Here we join the two columns into an array of [imageId, isDog]
In [ ]:
subm = np.stack([ids,isdog], axis=1)
subm[:5]
In [ ]:
%cd $DATA_HOME_DIR
submission_file_name = 'submission1.csv'
np.savetxt(submission_file_name, subm, fmt='%d,%.5f', header='id,label', comments='')
In [ ]:
from IPython.display import FileLink
%cd $LESSON_HOME_DIR
FileLink('data/redux/'+submission_file_name)
You can download this file and submit on the Kaggle website or use the Kaggle command line tool's "submit" method.
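For instance, with the old kaggle-cli tool a submission looked roughly like this (the flags are from memory and may differ between versions):
kg submit submission1.csv -u your_username -p your_password -c dogs-vs-cats-redux-kernels-edition -m "First submission"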
We do not necessarily need to rely on dropout or other regularization approaches to reduce overfitting. There are other techniques we should try first, since regularization, by definition, biases our model towards simplicity - which we only want to do if we know that's necessary. This is the order that we recommend using for reducing overfitting (more details about each in a moment):
1. Add more data
2. Use data augmentation
3. Use architectures that generalize well
4. Add regularization
5. Reduce architecture complexity
We'll assume that you've already collected as much data as you can, so step (1) isn't relevant (this is true for most Kaggle competitions, for instance). So the next step (2) is data augmentation. This refers to creating additional synthetic data, based on reasonable modifications of your input data. For images, this is likely to involve one or more of: flipping, rotation, zooming, cropping, panning, minor color changes.
Which types of augmentation are appropriate depends on your data. For regular photos, for instance, you'll want to use horizontal flipping, but not vertical flipping (since an upside down car is much less common than a car the right way up, for instance!)
We recommend always using at least some light data augmentation, unless you have so much data that your model will never see the same input twice.
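A sketch of such a light augmentation configuration with keras.preprocessing.image (the values are illustrative starting points, not tuned):
from keras.preprocessing import image

light_augment = image.ImageDataGenerator(
    rotation_range=10,       # small random rotations
    width_shift_range=0.1,   # horizontal panning
    height_shift_range=0.1,  # vertical panning
    zoom_range=0.1,          # mild zooming
    horizontal_flip=True     # flip left/right, but never vertically for regular photos
)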