Competition Link: https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition
In [2]:
%pwd
Out[2]:
In [20]:
#Create references to important directories
import os, sys
from glob import glob
import numpy as np
current_dir = os.getcwd()
LESSON_HOME_DIR = current_dir
DATA_HOME_DIR = current_dir+'/data'
In [5]:
#Allow relative imports to directories above this folder
sys.path.insert(1, os.path.join(sys.path[0], '..'))
#Inline plotting tool
%matplotlib inline
#import modules
from utils import *
from utils.vgg16 import Vgg16
In [50]:
#Need to correctly import utils.py
import bcolz
from numpy.random import random, permutation
In [6]:
%cd $DATA_HOME_DIR
In [7]:
#Create directories
%mkdir results
%mkdir valid
%mkdir -p test/unknown
In [22]:
#Move a sample size into the valid folder
%cd $DATA_HOME_DIR/train
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(2000): os.rename(shuf[i], DATA_HOME_DIR+'/valid/' + shuf[i])
In [23]:
%cd $DATA_HOME_DIR/valid
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/
%cd $DATA_HOME_DIR/train
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/
In [33]:
# Create single 'unknown' class for test set
%cd $DATA_HOME_DIR/test
%mv *.jpg unknown/
In [65]:
%cd $DATA_HOME_DIR
path = DATA_HOME_DIR + '/'
test_path = DATA_HOME_DIR + '/test/'
results_path=DATA_HOME_DIR + '/results/'
train_path=DATA_HOME_DIR + '/train/'
valid_path=DATA_HOME_DIR + '/valid/'
In [27]:
#import Vgg16 helper class
vgg = Vgg16()
In [28]:
#Set constants
batch_size=64
no_of_epochs=5
In [29]:
#Finetune the model
batches = vgg.get_batches(train_path, batch_size=batch_size)
val_batches = vgg.get_batches(valid_path, batch_size=batch_size*2)
vgg.finetune(batches)
#Set the learning rate
vgg.model.optimizer.lr = 0.01
In [30]:
#Passing in the validation dataset to the fit() method
#For each epoch, test our model against the validation set
latest_weights_filename = None
for epoch in range(no_of_epochs):
print "Running epoch: %d" % epoch
vgg.fit(batches, val_batches, nb_epoch=1)
latest_weights_filename = 'ft%d.h5' % epoch
vgg.model.save_weights(results_path+latest_weights_filename)
print "Completed %s fit operations" % no_of_epochs
In [34]:
batches, preds = vgg.test(test_path, batch_size = batch_size*2)
In [35]:
print preds[:5]
filenames = batches.filenames
print filenames[:5]
In [36]:
#Verify the column ordering by viewing some images
from PIL import Image
Image.open(test_path + filenames[2])
Out[36]:
In [95]:
from matplotlib import pyplot as plt
#Need to correctly import utils.py
def save_array(fname, arr):
c = bcolz.carray(arr, rootdir=fname, mode='w')
c.flush()
def load_array(fname):
return bcolz.open(fname)[:]
def plots(ims, figsize=(12,6), rows=1, interp=False, titles=None):
if type(ims[0]) is np.ndarray:
ims = np.array(ims).astype(np.uint8)
if (ims.shape[-1] != 3):
ims = ims.transpose((0,2,3,1))
f = plt.figure(figsize=figsize)
for i in range(len(ims)):
sp = f.add_subplot(rows, len(ims)//rows, i+1)
sp.axis('Off')
if titles is not None:
sp.set_title(titles[i], fontsize=16)
plt.imshow(ims[i], interpolation=None if interp else 'none')
In [46]:
#Save our test results arrays so we can use them again later
save_array(results_path + 'test_preds.dat', preds)
save_array(results_path + 'filenames.dat', filenames)
In [81]:
latest_weights_filename
Out[81]:
In [82]:
vgg.model.load_weights(results_path+latest_weights_filename)
In [83]:
valid_path
Out[83]:
In [75]:
val_batches, probs = vgg.test(valid_path, batch_size = batch_size)
In [84]:
#For every image, vgg.test() generates two probabilities
#based on how we've ordered the cats/dogs directories.
#It looks like column one is cats and column two is dogs
#Our predictions to 0/1 to generate lables
our_predictions = probs[:,0]
our_labels = np.round(1-our_predictions)
#Expected labels
expected_labels = val_batches.classes #0 or 1
filenames = val_batches.filenames
In [85]:
from keras.preprocessing import image
#Helper function to plot images by index in the validation set
#Plots is a helper function in utils.py
def plots_idx(idx, titles=None):
plots([image.load_img(valid_path + filenames[i]) for i in idx], titles=titles)
#Number of images to view for each visualization task
n_view = 4
In [88]:
#1. A few correct labels at random
correct = np.where(our_labels==expected_labels)[0]
print "Found %d correct labels" % len(correct)
idx = permutation(correct)[:n_view]
plots_idx(idx, our_predictions[idx])
In [89]:
#2. A few incorrect labels at random
incorrect = np.where(our_labels!=expected_labels)[0]
print "Found %d incorrect labels" % len(incorrect)
idx = permutation(incorrect)[:n_view]
plots_idx(idx, our_predictions[idx])
In [90]:
#3a. The images we most confident were cats, and are actually cats
correct_cats = np.where((our_labels==0) & (our_labels==expected_labels))[0]
print "Found %d confident correct cats labels" % len(correct_cats)
most_correct_cats = np.argsort(our_predictions[correct_cats])[::-1][:n_view]
plots_idx(correct_cats[most_correct_cats], our_predictions[correct_cats][most_correct_cats])
In [91]:
#3b. The images we most confident were dogs, and are actually dogs
correct_dogs = np.where((our_labels==1) & (our_labels==expected_labels))[0]
print "Found %d confident correct dogs labels" % len(correct_dogs)
most_correct_dogs = np.argsort(our_predictions[correct_dogs])[:n_view]
plots_idx(correct_dogs[most_correct_dogs], our_predictions[correct_dogs][most_correct_dogs])
In [92]:
#4a. The images we were most confident were cats, but are actually dogs
incorrect_cats = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
print "Found %d incorrect cats" % len(incorrect_cats)
if len(incorrect_cats):
most_incorrect_cats = np.argsort(our_predictions[incorrect_cats])[::-1][:n_view]
plots_idx(incorrect_cats[most_incorrect_cats], our_predictions[incorrect_cats][most_incorrect_cats])
In [93]:
#4b. The images we were most confident were dogs, but are actually cats
incorrect_dogs = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
print "Found %d incorrect dogs" % len(incorrect_dogs)
if len(incorrect_dogs):
most_incorrect_dogs = np.argsort(our_predictions[incorrect_dogs])[:n_view]
plots_idx(incorrect_dogs[most_incorrect_dogs], our_predictions[incorrect_dogs][most_incorrect_dogs])
Use te Log loss metric to evaluate the image of being a dog
Format: imageId,isDog
In [96]:
#Load our test predictions from file
preds = load_array(results_path + 'test_preds.dat')
filenames = load_array(results_path + 'filenames.dat')
In [97]:
#Grab the dog prediction column
isdog = preds[:,1]
print "Raw Predictions: " + str(isdog[:5])
print "Mid Predictions: " + str(isdog[(isdog < .6) & (isdog > .4)])
print "Edge Predictions: " + str(isdog[(isdog == 1) | (isdog == 0)])
In [98]:
#Round down our edge predictions to avoid being penalized by Kaggle
#Swap all ones with .95 and all zeros with .05
isdog = isdog.clip(min=0.05, max=0.95)
In [99]:
#Extract imageIds from the filenames in our test/unknown directory
filenames = batches.filenames
ids = np.array([int(f[8:f.find('.')]) for f in filenames])
In [100]:
subm = np.stack([ids,isdog], axis=1)
subm[:5]
Out[100]:
In [101]:
%cd $DATA_HOME_DIR
submission_file_name = 'submission_1.csv'
np.savetxt(submission_file_name, subm, fmt='%d,%.5f', header='id,label', comments='')
In [103]:
from IPython.display import FileLink
%cd $LESSON_HOME_DIR
FileLink('data/'+submission_file_name)
Out[103]:
In [ ]: