Based on the fast.ai dogs_cats_redux notebook, adapted to make my own entry into the Kaggle competition.

https://www.kaggle.com/c/the-nature-conservancy-fisheries-monitoring

My dir structure is similar, but not exactly the same:

utils 
ncfish
  data
    train
    test
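
A quick sanity check of that layout (a minimal sketch; it only checks that the directories above exist relative to this notebook):

In [ ]:
import os
# relative to ncfish/: utils is a sibling, data/train and data/test live inside
for d in ['../utils', 'data/train', 'data/test']:
    print d, 'exists' if os.path.isdir(d) else 'MISSING'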

In [1]:
#Verify we are in the ncfish directory
%pwd


Out[1]:
u'/home/rallen/Documents/PracticalDL4C/kaggle/ncfish'

In [2]:
%matplotlib inline

In [3]:
import os, sys
sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
from utils import *
from vgg16 import Vgg16
from PIL import Image
from keras.preprocessing import image
from sklearn.metrics import confusion_matrix


Using gpu device 0: GeForce GTX 1070 (CNMeM is disabled, cuDNN 5105)
/home/rallen/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.
  warnings.warn(warn)
Using Theano backend.

Note: I had to comment out the vgg16bn import in utils.py (the batch-normalized VGG16 variant from the course, which isn't needed here).


In [4]:
current_dir = os.getcwd()
LESSON_HOME_DIR = current_dir
DATA_HOME_DIR = current_dir+'/data'
categories = sorted([os.path.basename(x) for x in glob(DATA_HOME_DIR+'/train/*')])
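
It is worth confirming the eight class names and their alphabetical order here, since the class order matters later when the submission columns are written:

In [ ]:
# expect: ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
print categories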

Create validation set and sample

ONLY DO THIS ONCE.


In [ ]:
from shutil import copyfile
#Create directories
%cd $DATA_HOME_DIR
# did this once
%mkdir valid
%mkdir results
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results
%mkdir -p test/unknown

In [ ]:
# Create subdirectories
for c in categories:
    %mkdir -p valid/{c}
    %mkdir -p sample/train/{c}
    %mkdir -p sample/valid/{c}

In [ ]:
%cd $DATA_HOME_DIR/train

In [ ]:
# how many images are we talking about?
for c in categories:
    g = glob(c+"/*.jpg")
    print c, len(g)

This was the original output:

ALB 1719
BET 200
DOL 117
LAG 67
NoF 465
OTHER 299
SHARK 176
YFT 734

In [ ]:
validation_ratio = 0.1
for c in categories:
    g = glob(c+"/*.jpg")
    shuf = np.random.permutation(g)
    num_valid = int(validation_ratio*len(g))
    for i in range(num_valid): 
        #print shuf[i], DATA_HOME_DIR+'/valid/' + shuf[i]
        os.rename(shuf[i], DATA_HOME_DIR+'/valid/' + shuf[i])

In [ ]:
# Now, how many images are we talking about?
for c in categories:
    g = glob(c+"/*.jpg")
    print c, len(g), 
    g = glob("../valid/"+c+"/*.jpg")
    print len(g)

In [ ]:
# now create the sample train subset of 10 per category
for c in categories:
    g = glob(c+"/*.jpg")
    shuf = np.random.permutation(g)
    for i in range(10): 
        #print shuf[i], DATA_HOME_DIR+'/sample/train/' + shuf[i]
        copyfile(shuf[i], DATA_HOME_DIR+'/sample/train/' + shuf[i])

In [ ]:
%cd $DATA_HOME_DIR/valid

In [ ]:
# now create the sample valid subset of 2 per category
for c in categories:
    g = glob(c+"/*.jpg")
    shuf = np.random.permutation(g)
    for i in range(2): 
        #print shuf[i], DATA_HOME_DIR+'/sample/valid/' + shuf[i]
        copyfile(shuf[i], DATA_HOME_DIR+'/sample/valid/' + shuf[i])

In [ ]:
!ls {DATA_HOME_DIR}/train/*/* |wc -l
!ls {DATA_HOME_DIR}/valid/*/* |wc -l
!ls {DATA_HOME_DIR}/sample/train/*/* |wc -l
!ls {DATA_HOME_DIR}/sample/valid/*/* |wc -l

Training & validation (10% split) image counts per category:

ALB 1548 171
BET 180 20
DOL 106 11
LAG 61 6
NoF 419 46
OTHER 270 29
SHARK 159 17
YFT 661 73

Rearrange image files into their respective directories

ONLY DO THIS ONCE.


In [ ]:
# Create single 'unknown' class for test set
%cd $DATA_HOME_DIR/test_stg1
%mv *.jpg ../test/unknown/

In [ ]:
!ls {DATA_HOME_DIR}/test

Finetuning and Training

OKAY, ITERATE HERE


In [5]:
%cd $DATA_HOME_DIR

#Set path to sample/ path if desired
path = DATA_HOME_DIR + '/'
#path = DATA_HOME_DIR + '/sample/'
test_path = DATA_HOME_DIR + '/test/' #We use all the test data
results_path=DATA_HOME_DIR + '/results/'
train_path=path + '/train/'
valid_path=path + '/valid/'


/home/rallen/Documents/PracticalDL4C/kaggle/ncfish/data

In [6]:
vgg = Vgg16()


/home/rallen/anaconda2/lib/python2.7/site-packages/keras/layers/core.py:621: UserWarning: `output_shape` argument not specified for layer lambda_1 and cannot be automatically inferred with the Theano backend. Defaulting to output shape `(None, 3, 224, 224)` (same as input shape). If the expected output shape is different, specify it via the `output_shape` argument.
  .format(self.name, input_shape))

In [7]:
#Set constants. You can experiment with no_of_epochs to improve the model
batch_size=64
no_of_epochs=1

In [8]:
#Finetune the model
batches = vgg.get_batches(train_path, batch_size=batch_size)
val_batches = vgg.get_batches(valid_path, batch_size=batch_size*2)
vgg.finetune(batches)

#Not sure if this sets the learning rate for all subsequent fits; see the note after this cell
vgg.model.optimizer.lr = 0.01


Found 3404 images belonging to 8 classes.
Found 373 images belonging to 8 classes.
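
On the learning-rate comment above: plain attribute assignment is what the course notebook does; since the optimizer's lr is, as far as I know, a Keras backend variable, an alternative sketch is to update it in place instead:

In [ ]:
# assumption: vgg.model.optimizer.lr is still the original backend variable
# (i.e. it has not already been overwritten with a plain float above)
from keras import backend as K
K.set_value(vgg.model.optimizer.lr, 0.01)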

In [9]:
#Notice we are passing in the validation dataset to the fit() method
#For each epoch we test our model against the validation set
latest_weights_filename = None

#latest_weights_filename='ft24.h5'
#vgg.model.load_weights(results_path+latest_weights_filename)

If you are training, stay here. If you are loading weights and creating a submission, skip down from here.


In [10]:
# if you have run some epochs already...
epoch_offset=1 # trying again from ft1
for epoch in range(no_of_epochs):
    print "Running epoch: %d" % (epoch + epoch_offset)
    vgg.fit(batches, val_batches, nb_epoch=1)
    latest_weights_filename = 'ft%d.h5' % (epoch + epoch_offset)
    vgg.model.save_weights(results_path+latest_weights_filename)
print "Completed %s fit operations" % no_of_epochs


Running epoch: 0
Epoch 1/1
3404/3404 [==============================] - 114s - loss: 8.6975 - acc: 0.4489 - val_loss: 8.7288 - val_acc: 0.4584
Completed 1 fit operations

In [ ]:
# only run this if you need to reload previously saved weights
latest_weights_filename='ft1.h5'
vgg.model.load_weights(results_path+latest_weights_filename)

Validate Predictions


In [ ]:
val_batches, probs = vgg.test(valid_path, batch_size = batch_size)
filenames = val_batches.filenames
expected_labels = val_batches.classes # 0 - 7

#probs has one column per class; take the highest probability as our
#confidence and its index (0-7) as the predicted label
our_predictions = np.max(probs, axis=1)
our_labels = np.argmax(probs, axis=1)
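
The labels are just indices 0-7; they can be mapped back to class names via the generator's class_indices (the same attribute queried a few cells below):

In [ ]:
# invert class_indices: {'ALB': 0, ...} -> {0: 'ALB', ...}
idx2class = {v: k for k, v in val_batches.class_indices.items()}
print [idx2class[l] for l in our_labels[:5]]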

In [ ]:
cm = confusion_matrix(expected_labels, our_labels)
plot_confusion_matrix(cm, val_batches.class_indices)

In [ ]:
#Helper function to plot images by index in the validation set 
#Plots is a helper function in utils.py
def plots_idx(idx, titles=None):
    plots([image.load_img(valid_path + filenames[i]) for i in idx], titles=titles)
    
#Number of images to view for each visualization task
n_view = 4

In [ ]:
#1. A few correct labels at random
correct = np.where(our_labels==expected_labels)[0]
print "Found %d correct labels" % len(correct)
idx = permutation(correct)[:n_view]
plots_idx(idx, our_predictions[idx])

In [ ]:
#2. A few incorrect labels at random
incorrect = np.where(our_labels!=expected_labels)[0]
print "Found %d incorrect labels" % len(incorrect)
idx = permutation(incorrect)[:n_view]
plots_idx(idx, our_predictions[idx])

In [ ]:
val_batches.class_indices

In [ ]:
#3a. The images we were most confident were X, and actually are X
X='YFT'
Xi=val_batches.class_indices[X]
correct_X = np.where((our_labels==Xi) & (our_labels==expected_labels))[0]
print "Found %d confident correct %s labels" % (len(correct_X), X)
most_correct_X = np.argsort(our_predictions[correct_X])[::-1][:n_view]
plots_idx(correct_X[most_correct_X], our_predictions[correct_X][most_correct_X])

In [ ]:
#4a. The images we were most confident were ALB (class 0), but actually are another class
incorrect_alb = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
print "Found %d images incorrectly labelled ALB" % len(incorrect_alb)
if len(incorrect_alb):
    most_incorrect_alb = np.argsort(our_predictions[incorrect_alb])[::-1][:n_view]
    plots_idx(incorrect_alb[most_incorrect_alb], our_predictions[incorrect_alb][most_incorrect_alb])

In [ ]:
#4b. The images we were most confident were BET (class 1), but actually are another class
incorrect_bet = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
print "Found %d images incorrectly labelled BET" % len(incorrect_bet)
if len(incorrect_bet):
    most_incorrect_bet = np.argsort(our_predictions[incorrect_bet])[::-1][:n_view]
    plots_idx(incorrect_bet[most_incorrect_bet], our_predictions[incorrect_bet][most_incorrect_bet])

In [ ]:
#5. The most uncertain labels (i.e. those with the lowest maximum class probability)
most_uncertain = np.argsort(our_predictions)
plots_idx(most_uncertain[:n_view], our_predictions[most_uncertain])

In [ ]:

Generate Predictions


In [ ]:
batches, preds = vgg.test(test_path, batch_size = batch_size*2)
# Error allocating 3347316736 bytes of device memory (out of memory).
# got this error with batch_size = 128
# with batch_size = 64 memory peaks around 6GB and this takes some time...
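
If memory is still tight, the simplest workaround is a smaller test batch size (slower, but a lower peak footprint), e.g.:

In [ ]:
# smaller batches trade speed for a lower peak GPU memory footprint
batches, preds = vgg.test(test_path, batch_size=32)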

In [ ]:
#For every image, vgg.test() generates eight probabilities, one per class,
#ordered by the class subdirectories (alphabetical: ALB ... YFT)
print preds[:5]

filenames = batches.filenames
print filenames[:5]

In [ ]:
#You can verify the column ordering by viewing some images
Image.open(test_path + filenames[1])

In [ ]:
#Save our test results arrays so we can use them again later
save_array(results_path + 'test_preds.dat', preds)
save_array(results_path + 'filenames.dat', filenames)

Submit Predictions to Kaggle!


In [ ]:
#Load our test predictions from file
preds = load_array(results_path + 'test_preds.dat')
filenames = load_array(results_path + 'filenames.dat')

In [ ]:
#The submission needs a probability for every one of the 8 classes,
#so keep the full preds array rather than a single column
print "Raw Predictions: " + str(preds[:5])
print "Mid predictions (0.4-0.6): %d" % ((preds > .4) & (preds < .6)).sum()
print "Edge predictions (exactly 0 or 1): %d" % ((preds == 1) | (preds == 0)).sum()

In [ ]:
#play it safe, round down our edge predictions
#isdog = isdog.clip(min=0.05, max=0.95)
#isdog = isdog.clip(min=0.02, max=0.98)
isdog = isdog.clip(min=0.01, max=0.99)
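
The reason for clipping: the competition metric is multi-class log loss, which blows up when the probability assigned to the true class approaches 0. A quick look at what the clip buys per image:

In [ ]:
import numpy as np
# log loss contribution of a single image when the true class gets probability p
for p in [1e-15, 0.01, 0.05]:
    print 'p=%.2g -> loss %.2f' % (p, -np.log(p))
# clipping at 0.01 caps the worst-case per-image penalty at about 4.6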

In [ ]:
#Extract the image names from the filenames loaded above (so this also works
#when we only reloaded preds/filenames from disk); the submission expects the
#bare file name, e.g. img_00005.jpg, not an integer id
image_names = [os.path.basename(f) for f in filenames]

In [ ]:
#One row per image: the file name followed by the eight clipped class probabilities
subm = np.column_stack([image_names, np.char.mod('%.5f', preds)])
subm[:5]

In [ ]:
%cd $DATA_HOME_DIR
submission_file_name = 'submission4.csv'
np.savetxt(submission_file_name, subm, fmt='%s', delimiter=',',
           header='image,' + ','.join(categories), comments='')
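
Before uploading, eyeball the first few lines to make sure the header and columns match the sample submission format:

In [ ]:
# header plus the first two data rows
!head -3 {submission_file_name}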

In [ ]:
from IPython.display import FileLink
%cd $LESSON_HOME_DIR
FileLink('data/'+submission_file_name)

In [ ]: