In [1]:
# step to run on theano. Data is mounted at /data
# floyd run --mode jupyter --gpu --env theano:py2 --data matsaleh/datasets/vgg16_model/1:data
In [20]:
# if bcolz gives error, uncomment and run
# !pip install bcolz
# if keras gives error on importing l2, uncomment and run following
# !pip uninstall -y keras
# !pip install keras==1.2.2
In [ ]:
# !wget https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/sample_submission.csv -P data
# !ls -ltr /output/data
In [21]:
# !mkdir data
# !wget https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/test.zip -P data
# !wget https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/train.zip -P data
# !unzip /output/data/test.zip
# !unzip /output/data/train.zip
In [4]:
%matplotlib inline
In [5]:
#Create references to important directories we will use over and over
import os, sys
# Notebook working directory; the dogs-vs-cats data lives under ./data/.
current_dir = os.getcwd()
LESSON_HOME_DIR = current_dir
# NOTE: the trailing '/' is already included here, so downstream string
# concatenation should not append another separator.
DATA_HOME_DIR = current_dir+'/data/'
In [6]:
#Create directories
%cd $DATA_HOME_DIR
# valid/ holds the held-out validation split; results/ stores weights and
# prediction arrays; sample/ mirrors the full layout for fast iteration.
%mkdir valid
%mkdir results
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results
# Keras batch loaders require a class subdirectory even for unlabeled
# test images, hence test/unknown.
%mkdir -p test/unknown
In [7]:
%cd $DATA_HOME_DIR/train
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(2000):
os.rename(shuf[i], DATA_HOME_DIR+'/valid/' + shuf[i])
from shutil import copyfile
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(200):
copyfile(shuf[i], DATA_HOME_DIR+'/sample/train/' + shuf[i])
%cd $DATA_HOME_DIR/valid
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(50):
copyfile(shuf[i], DATA_HOME_DIR+'/sample/valid/' + shuf[i])
#Divide cat/dog images into separate directories
%cd $DATA_HOME_DIR/sample/train
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/
%cd $DATA_HOME_DIR/sample/valid
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/
%cd $DATA_HOME_DIR/valid
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/
%cd $DATA_HOME_DIR/train
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/
# Create single 'unknown' class for test set
%cd $DATA_HOME_DIR/test
%mv *.jpg unknown/
In [8]:
%cd $DATA_HOME_DIR
#Set path to sample/ path if desired
path = DATA_HOME_DIR + '/' #'/sample/'
test_path = DATA_HOME_DIR + '/test/' #We use all the test data
results_path=DATA_HOME_DIR + '/results/'
train_path=path + '/train/'
valid_path=path + '/valid/'
In [9]:
# Core imports for the whole notebook.
# NOTE(review): this cell (In [9]) was executed AFTER the data-split cell
# (In [7]) that already uses np and glob -- move this cell above it so a
# fresh Restart & Run All works without NameErrors.
from __future__ import division,print_function
import os, json
from glob import glob
import numpy as np
# Compact, readable numpy array printing for prediction previews.
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt
In [10]:
# fast.ai course helpers; reload() picks up edits to utils.py without a
# kernel restart (reload is a builtin on Python 2 only).
# NOTE(review): later cells call save_array/load_array from utils, but
# only `plots` is imported here -- they need to be in scope too.
import utils; reload(utils)
from utils import plots
In [13]:
# Batch size: as large as your GPU memory allows, but 64 is the
# recommended ceiling. On an older or cheaper GPU, reduce this to avoid
# running out of memory.
batch_size = 64
# Number of fine-tuning epochs to run.
no_of_epochs = 3
In [12]:
# Import our class, and instantiate
import vgg16; reload(vgg16)  # reload() is a py2 builtin (theano:py2 env)
from vgg16 import Vgg16
# BUG FIX: `vgg` was never created, so the fine-tuning cell below raised
# NameError on a fresh kernel. Instantiate the pretrained VGG16 model here.
vgg = Vgg16()
In [14]:
%%time
#Finetune the model
batches = vgg.get_batches(train_path, batch_size=batch_size)
val_batches = vgg.get_batches(valid_path, batch_size=batch_size*2)
vgg.finetune(batches)
#Not sure if we set this for all fits
vgg.model.optimizer.lr = 0.01
latest_weights_filename = 'ft0d.h5'
vgg.model.save_weights(results_path+latest_weights_filename)
In [15]:
batches, preds = vgg.test(test_path, batch_size = batch_size*2)
In [16]:
#For every image, vgg.test() generates two probabilities
#based on how we've ordered the cats/dogs directories.
#It looks like column one is cats and column two is dogs
# BUG FIX: `print preds[:5]` is a py2 print STATEMENT, but the imports
# cell ran `from __future__ import print_function`, which makes the
# statement form a SyntaxError in this session -- use print() calls.
print(preds[:5])
filenames = batches.filenames
print(filenames[:5])
In [17]:
#Save our test results arrays so we can use them again later
# BUG FIX: save_array was never brought into scope (only `plots` was
# imported from utils), so this cell raised NameError -- import it here.
from utils import save_array
save_array(results_path + 'test_preds.dat', preds)
save_array(results_path + 'filenames.dat', filenames)
In [18]:
#Load our test predictions from file
# BUG FIX: load_array was never brought into scope (only `plots` was
# imported from utils), so this cell raised NameError -- import it here.
from utils import load_array
preds = load_array(results_path + 'test_preds.dat')
filenames = load_array(results_path + 'filenames.dat')
In [ ]: