In [8]:
# Render matplotlib figures directly in the notebook output.
%matplotlib inline

In [9]:
from keras import backend
# Select the 'th' image dimension ordering for the Keras backend before any
# model is built.
# NOTE(review): 'th' is presumably the Theano-style channels-first layout that
# the Vgg16 helper used further down expects — confirm against the installed Keras.
backend.set_image_dim_ordering('th')

Test submission to "Dogs vs Cats redux" Kaggle competition
https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition

1. Prepare data

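The data is stored in two zip files (`train.zip` and `test.zip`). First we unzip them and arrange the images as needed: a validation set split off from the training images, plus a small sample set for quick experiments.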


In [10]:
import zipfile
import tempfile
import os

In [11]:
# Scratch directory that receives the unzipped archives and every folder
# derived from them (train/valid/test/sample).
# mkdtemp() leaves cleanup to the caller and no visible cell removes the
# directory, so delete it by hand once the notebook is finished.
tmp_dir = tempfile.mkdtemp()
tmp_dir


Out[11]:
'/var/folders/sn/f79nd7k93ync98sklrcsf9cm0000gn/T/tmpxncswquo'

In [12]:
# Unpack the training archive into the scratch directory.
# The `with` block guarantees the archive is closed as soon as extraction
# finishes (or fails), so no file handle is left open.
with zipfile.ZipFile("../data/redux/train.zip") as zf:
    zf.extractall(tmp_dir)


Out[12]:
<bound method ZipFile.close of <zipfile.ZipFile filename='../data/redux/train.zip' mode='r'>>

In [13]:
# Unpack the test archive into the scratch directory.
# The `with` block guarantees the archive is closed as soon as extraction
# finishes (or fails), so no file handle is left open.
with zipfile.ZipFile("../data/redux/test.zip") as zf:
    zf.extractall(tmp_dir)


Out[13]:
<bound method ZipFile.close of <zipfile.ZipFile filename='../data/redux/test.zip' mode='r'>>

In [14]:
import sys
# Make the helper modules that live in ../../nbs importable
# (utils here; presumably also the vgg16 module imported further down).
sys.path.append('../../nbs')
import utils
# NOTE(review): later cells use `glob`, `np` and `plots` without importing them
# anywhere in this notebook, so they presumably arrive through this star
# import — confirm before narrowing it to explicit names.
from utils import *

In [15]:
import random
import shutil

# Seed the RNG so the shuffles of this file list (sample set, then validation
# split) pick the same images on every Restart & Run All.
SEED = 42
random.seed(SEED)

# sorted(): glob returns matches in arbitrary, filesystem-dependent order,
# which would otherwise defeat the fixed seed.
g = sorted(glob(os.path.join(tmp_dir, 'train', '*.jpg')))

In [16]:
# create sample set: copy 100 randomly picked training images to <tmp_dir>/sample/train
smp_dir = os.path.join(tmp_dir, 'sample')
smp_train_dir = os.path.join(smp_dir, 'train')
os.makedirs(smp_train_dir)
random.shuffle(g)
for src in g[:100]:
    dst = os.path.join(smp_train_dir, os.path.basename(src))
    shutil.copy(src, dst)

In [17]:
# create validation set: move 3750 randomly picked images from the file list into <tmp_dir>/valid
valid_dir = os.path.join(tmp_dir, 'valid')
os.makedirs(valid_dir)
random.shuffle(g)
for src in g[:3750]:
    dst = os.path.join(valid_dir, os.path.basename(src))
    shutil.move(src, dst)

In [18]:
# create validation set inside sample: move 20 randomly picked sample images from train/ to valid/
smp_valid_dir = os.path.join(smp_dir, 'valid')
g = glob(os.path.join(smp_dir, 'train', '*.jpg'))
random.shuffle(g)
os.makedirs(smp_valid_dir)
for src in g[:20]:
    dst = os.path.join(smp_valid_dir, os.path.basename(src))
    shutil.move(src, dst)

In [19]:
# create sample test set: copy 20 randomly picked test images to <sample>/test/class
smp_test_dir = os.path.join(smp_dir, 'test', 'class')
g = glob(os.path.join(tmp_dir, 'test', '*.jpg'))
random.shuffle(g)
os.makedirs(smp_test_dir)
for src in g[:20]:
    dst = os.path.join(smp_test_dir, os.path.basename(src))
    shutil.copy(src, dst)

In [20]:
# move test set for image batch generator: every test image goes one level down, into test/class/
test_class_dir = os.path.join(tmp_dir, 'test', 'class')
g = glob(os.path.join(tmp_dir, 'test', '*.jpg'))
os.makedirs(test_class_dir)
for src in g:
    dst = os.path.join(test_class_dir, os.path.basename(src))
    shutil.move(src, dst)

In [21]:
# separate classes
def sepClasses(path):
    """Sort the JPEGs directly inside `path` into `cats/` and `dogs/` folders.

    Files named ``cat*.jpg`` are moved to ``<path>/cats`` and files named
    ``dog*.jpg`` to ``<path>/dogs``; everything else stays where it is.
    The class folders are created only when missing, so the function can be
    called again on a directory that has already been separated.

    Args:
        path: existing directory holding the unsorted images.

    Raises:
        FileNotFoundError: if `path` itself does not exist.
    """
    for prefix in ('cat', 'dog'):
        class_dir = os.path.join(path, prefix + 's')
        # Only create the folder when absent: a re-run of the cell must not
        # fail with FileExistsError. os.mkdir (not makedirs) keeps a missing
        # `path` an error instead of silently creating it.
        if not os.path.isdir(class_dir):
            os.mkdir(class_dir)
        for file in glob(os.path.join(path, prefix + '*.jpg')):
            shutil.move(file, os.path.join(class_dir, os.path.basename(file)))

In [22]:
# Split each train/valid folder (sample set first, then the full set) into cats/ and dogs/.
for root_dir in (smp_dir, tmp_dir):
    for split in ('train', 'valid'):
        sepClasses(os.path.join(root_dir, split))

2. Train model


In [23]:
# Data root used for the train/valid batches below. The sample set keeps a
# trial run short; swap in the commented line to train on the full data.
path = smp_dir # for sample train
# path = tmp_dir # for real train

In [24]:
# Small batches for the sample run, larger ones for the full data set.
batch_size = 5 if path == smp_dir else 64

In [25]:
import importlib
import vgg16; importlib.reload(vgg16)
from vgg16 import Vgg16

In [30]:
# Build the Vgg16 wrapper and one batch iterator each for <path>/train and <path>/valid.
# NOTE(review): get_batches presumably infers the classes from the cats/ and
# dogs/ sub-folders created earlier — confirm in vgg16.py.
vgg = Vgg16()
batches = vgg.get_batches(os.path.join(path,'train'), batch_size=batch_size)
val_batches = vgg.get_batches(os.path.join(path,'valid'), batch_size=batch_size)

In [32]:
# NOTE(review): presumably adapts the network's output layer to the classes
# found in `batches` (cats/dogs) — confirm in vgg16.py.
vgg.finetune(batches)

In [33]:
# keras output problem workaround: from here on everything written to stdout
# (including whatever the training cell prints) goes to the file keras.out
# instead of the notebook. The previous stream is kept in oldStdout so it can
# be restored afterwards.
oldStdout = sys.stdout
sys.stdout = open("keras.out", 'w')

In [34]:
# Train for two epochs on `batches`; `val_batches` presumably serves as the
# validation data — confirm in vgg16.py.
vgg.fit(batches, val_batches, nb_epoch=2)

In [35]:
# keras output problem workaround: put the notebook's stdout back, then close
# the keras.out handle that was standing in for it so its buffered contents are
# flushed to disk and the file is not leaked.
keras_log = sys.stdout
sys.stdout = oldStdout
if keras_log is not oldStdout:  # guard: re-running this cell must not close the real stdout
    keras_log.close()

In [36]:
# save model, under a file name that tells a sample run apart from a full run
model_name = 'sample_model.h5' if path == smp_dir else 'model.h5'
vgg.model.save(model_name)

3. Predict


In [76]:
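# load model (optional): uncomment one of the lines below to restore previously saved weights instead of retraining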
# vgg.model.load_weights('sample_model.h5')
# vgg.model.load_weights('model.h5')

In [77]:
# Pull a single batch (images and their labels) from the validation iterator.
imgs,labels = next(val_batches)

In [78]:
# Display the batch with the labels as titles.
# NOTE(review): `plots` is not defined in this notebook; presumably it comes from `utils`.
plots(imgs, titles=labels)



In [79]:
# Predict on the batch just plotted. As the recorded output shows, the result
# is a triple: probabilities, class indices, class names.
# NOTE(review): the meaning of the second argument (True) is not visible here — see vgg16.py.
vgg.predict(imgs, True)


Out[79]:
(array([ 0.8188,  0.7688,  0.9728,  0.5169,  1.    ], dtype=float32),
 array([1, 1, 0, 1, 0]),
 ['dogs', 'dogs', 'cats', 'dogs', 'cats'])

In [95]:
# Run the model over the images under <tmp_dir>/test; yields the batch iterator
# and the predictions.
# NOTE(review): this always reads the full test set from tmp_dir, even when
# `path` points at the sample data — confirm that is intended.
test_batches, pred = vgg.test(os.path.join(tmp_dir, 'test'), batch_size=5)


Found 12500 images belonging to 1 classes.

In [96]:
# File names as listed by the test iterator.
# NOTE(review): assumed to line up row-for-row with `pred` — confirm the iterator does not shuffle.
filenames = test_batches.filenames

In [97]:
# Image id = file name without directory and extension.
ids = []
for fname in filenames:
    stem, _ext = os.path.splitext(os.path.basename(fname))
    ids.append(stem)

In [100]:
# Two-column table: image id next to column 1 of the predictions.
# NOTE(review): column 1 is presumably the 'dogs' probability (Out[79] pairs
# class index 1 with 'dogs') — confirm.
# Stacking strings with floats yields an all-string array (see the dtype in the output).
submission = np.stack([ids, pred[:,1]], axis=1)
submission


Out[100]:
array([['1', '0.48160186409950256'],
       ['10', '0.9999433755874634'],
       ['100', '0.9836655259132385'],
       ..., 
       ['9997', '0.13110901415348053'],
       ['9998', '0.9992672801017761'],
       ['9999', '0.9926143884658813']], 
      dtype='<U32')

In [101]:
# comments='' stops np.savetxt from prefixing the header with its default '# ',
# so the first line of the file is exactly 'id,label'.
np.savetxt('submission.csv', submission, header='id,label', fmt='%s,%s', comments='')

In [ ]: