Wayne Nixalo - 20 May 2017
FAI1 - Practical Deep Learning I - Week 3 HW: Kaggle StateFarm Distracted Driver Competition
In [1]:
import keras
import os, sys
import numpy as np
import pandas as pd
from glob import glob
from keras.optimizers import Adam
from keras.layers.core import Dense
from keras.preprocessing import image
In [2]:
# will need this to access any libraries in superdirectories
sys.path.insert(1, os.path.join(os.getcwd(), '../utils'))
import utils
from vgg16 import Vgg16
In [3]:
import bcolz
def save_array(fname, arr): c=bcolz.carray(arr, rootdir=fname, mode='w'); c.flush()
def load_array(fname): return bcolz.open(fname)[:]
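These helpers cache NumPy arrays on disk via bcolz so expensive results can be reloaded later without recomputing them. A minimal usage sketch (the filename is made up):
arr = np.arange(10)
save_array('example.bc', arr)                     # writes a bcolz carray directory to disk
assert (load_array('example.bc') == arr).all()    # reads it back as a NumPy array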
In [4]:
HOME_DIR = os.getcwd()
DATA_DIR = HOME_DIR + '/data'
TRAIN_DIR = DATA_DIR + '/train'
VAL_DIR = DATA_DIR + '/valid'
TEST_DIR = DATA_DIR + '/test'
In [5]:
# create the validation directories
# os.mkdir(VAL_DIR)
# for i in xrange(10):
#     os.mkdir(VAL_DIR + '/c' + str(i))
# # another way to do this:
# %mkdir $VAL_DIR
# for i in xrange(10):
#     %mkdir $VAL_DIR/c"$i"
Move a random permutation of the training data into the validation set. Redo the split until validation accuracy tracks test accuracy. Also see: http://stackoverflow.com/questions/2632205/how-to-count-the-number-of-files-in-a-directory-using-python
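(For reference, counting the files in a class directory is a one-liner; the path below is just an example.)
print(len(glob(TRAIN_DIR + '/c0/*.jpg')))   # no. of images currently in class c0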
In [5]:
# %cd $TRAIN_DIR
# VAL_PORTION = 0.2
# for i in xrange(10):
#     %cd c"$i"
#     g = glob('*.jpg')
#     number = len(g)
#     shuff = np.random.permutation(g)
#     for n in xrange(int(number * VAL_PORTION)):
#         os.rename(shuff[n], VAL_DIR + '/c' + str(i) + '/' + shuff[n])
#     %cd ..
def reset_valid(verbose=1):
    """Moves all images in the validation set back to
    their respective classes in the training set."""
    counter = 0
    %cd $valid_path
    for i in xrange(10):
        %cd c"$i"
        g = glob('*.jpg')
        for n in xrange(len(g)):
            os.rename(g[n], TRAIN_DIR + '/c' + str(i) + '/' + g[n])
            counter += 1
        %cd ..
    if verbose: print("Moved {} files.".format(counter))
    # alternatively: %mv $VAL_DIR/c"$i"/*.jpg $TRAIN_DIR/c"$i"/
# modified from: http://forums.fast.ai/t/statefarm-kaggle-comp/183/20
def set_valid(number=1, verbose=1):
    """Moves <number> randomly-chosen subjects from the training to the
    validation directories. Verbosity -- 0: silent; 1: print no. of files
    moved; 2: print each move operation."""
    counter = 0
    if number < 0: number = 0
    for n in xrange(number):
        # read the CSV file into a Pandas DataFrame
        dil = pd.read_csv(data_path + 'driver_imgs_list.csv')
        # group the frame by the subject in the image
        grouped_subjects = dil.groupby('subject')
        # pick a subject at random
        subject = grouped_subjects.groups.keys()[np.random.randint(0, high=len(grouped_subjects.groups))]
        # get the group associated with that subject
        group = grouped_subjects.get_group(subject)
        # loop over the group & move its images to the validation dir
        for (subject, clssnm, img) in group.values:
            source = '{}train/{}/{}'.format(data_path, clssnm, img)
            target = source.replace('train', 'valid')
            if verbose > 1: print('mv {} {}'.format(source, target))
            os.rename(source, target)
            counter += 1
    if verbose: print("Files moved: {}".format(counter))
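A quick way to check that the split worked (a small sketch, not part of the original workflow): count the images per class in train vs. valid after moving subjects.
for i in xrange(10):
    n_trn = len(glob(TRAIN_DIR + '/c' + str(i) + '/*.jpg'))
    n_val = len(glob(VAL_DIR + '/c' + str(i) + '/*.jpg'))
    print('c{}: train {}, valid {}'.format(i, n_trn, n_val))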
In [28]:
# dil = pd.read_csv(data_path + 'driver_imgs_list.csv')
# grouped_subjects = dil.groupby('subject')
In [35]:
# dil.keys()
Out[35]:
In [42]:
# len(grouped_subjects.groups) # <-- that's what I'm looking for
Out[42]:
In [68]:
reset_valid()
In [69]:
set_valid(number=3, verbose=1)
In [6]:
# some more setup
data_path = DATA_DIR + '/'
train_path = TRAIN_DIR + '/'
valid_path = VAL_DIR + '/'
test_path = TEST_DIR + '/'
results_path = DATA_DIR + '/results/'
In [7]:
# looks like batch size of 64 is just past what my GPU can handle
# would using bcolz to save precomputed arrays help?
batch_size=32
target_size=(224,224) # for gen.flow_from_directory(..)
In [8]:
# batch generator to feed data into the model
gen = image.ImageDataGenerator()
trn_batches = gen.flow_from_directory(train_path, target_size=target_size,
                                      class_mode='categorical', shuffle=True, batch_size=batch_size)
val_batches = gen.flow_from_directory(valid_path, target_size=target_size,
                                      class_mode='categorical', shuffle=False, batch_size=batch_size)
In [9]:
trn_batches.n
Out[9]:
NOTE: I'll want a way to clear GPU memory in the future. Right now all I know is restarting the kernel.
In [10]:
# load the VGG model, download its weights, and finetune it to the data
VGG = Vgg16()
VGG.model.pop()
for layer in VGG.model.layers: layer.trainable = False
VGG.model.add(Dense(10, activation='softmax'))
VGG.model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
In [11]:
# run the model until it overfits
VGG.model.optimizer.lr = 0.001
VGG.model.fit_generator(trn_batches, trn_batches.n, nb_epoch=1, verbose=1,
                        validation_data=val_batches, nb_val_samples=val_batches.n)
Out[11]:
In [13]:
def train_model(lr=0.001, epochs=1, verbose=0):
    VGG.model.optimizer.lr = lr
    VGG.model.fit_generator(trn_batches, trn_batches.n, nb_epoch=epochs, verbose=verbose,
                            validation_data=val_batches, nb_val_samples=val_batches.n)
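Worth noting: in Keras 1.x, assigning to VGG.model.optimizer.lr as above replaces the optimizer's learning-rate variable with a plain Python float, so once the training function has been compiled the change may not actually take effect. A safer pattern (a sketch, assuming optimizer.lr is still the backend variable) is to update it in place:
from keras import backend as K
# update the optimizer's learning-rate variable without replacing it
K.set_value(VGG.model.optimizer.lr, 0.001)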
In [14]:
train_model(lr=0.1, epochs=2, verbose=1)
In [15]:
VGG.model.save_weights(data_path + 'finetune01.h5')
In [16]:
train_model(lr=0.01, epochs=2, verbose=1)
VGG.model.save_weights(data_path + 'finetune02.h5')
train_model(lr=0.001, epochs=2, verbose=1)
VGG.model.save_weights(data_path + 'finetune03.h5')
In [17]:
train_model(lr=0.0001, epochs=4, verbose=1)
In [11]:
# saving weights
# VGG.model.save_weights(data_path + 'finetune01.h5')
In [18]:
VGG.model.load_weights(data_path + 'finetune03.h5')
In [11]:
def get_batches(dirname, gen=image.ImageDataGenerator(), shuffle=True, batch_size=4,
                class_mode='categorical', target_size=(224,224)):
    return gen.flow_from_directory(dirname, target_size=target_size,
                                   class_mode=class_mode, shuffle=shuffle, batch_size=batch_size)

def get_data(path, target_size=(224,224)):
    batches = get_batches(path, shuffle=False, batch_size=1, class_mode=None, target_size=target_size)
    return np.concatenate([batches.next() for i in xrange(batches.nb_sample)])
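These mirror the course utils: get_batches wraps flow_from_directory, and get_data pulls a whole directory into a single array, which pairs naturally with save_array/load_array above. For example (the cache filename is made up):
# val_data = get_data(valid_path)
# save_array(results_path + 'val_data.bc', val_data)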
In [19]:
# output test data predictions
gen = image.ImageDataGenerator()
tst_batches = gen.flow_from_directory(test_path, target_size=target_size,
                                      class_mode='categorical', shuffle=False, batch_size=batch_size*2)
# predictions = VGG.model.predict_on_batch(tst_batches)
# predictions = VGG.model.predict(tst_batches, batch_size=batch_size*2, verbose=1)
In [20]:
tst_batches.n
tst_batches.nb_sample
Out[20]:
enumerate is zero-indexed: the first Conv layer is the 3rd layer of the model, so it gets index 2. Therefore, when defining a model consisting of all the Conv layers, remember to slice to one past the last Conv index. Python's slice notation [x:y] means start at index x and stop just before index y.
That's why, if your model is VGG.model, VGG.model.layers[last_conv_idx] gives you the last convolutional layer itself, while VGG.model.layers[:last_conv_idx] gives you all layers up to, but not including, the last Conv layer; [:last_conv_idx + 1] includes it.
In [22]:
from keras.layers.convolutional import Convolution2D
last_conv_idx = [index for index, layer in enumerate(VGG.model.layers) if type(layer) is Convolution2D][-1]
conv_layers = VGG.model.layers[:last_conv_idx + 1]
# NOTE: enumerate is zero-indexed.
In [38]:
first_conv_idx = [idx for idx, layer in enumerate(VGG.model.layers) if type(layer) is Convolution2D][0]
first_conv_idx
Out[38]:
In [33]:
# print(last_conv_idx)
VGG.model.layers
Out[33]:
In [34]:
conv_layers
Out[34]:
In [46]:
# from keras.layers.pooling import MaxPooling2D
# ??MaxPooling2D
# ??Convolution2D
# conv_layers[-1].output_shape
# VGG.model.layers[last_conv_idx + 1]
Out[46]:
In [47]:
# getting some insight into Conv output_shape and MaxPool2D input_shapes
# Theano ordering: (samples, channels, rows, cols) <- Conv-out / MxP-in
#
print(VGG.model.layers[last_conv_idx].output_shape)
print(VGG.model.layers[last_conv_idx + 1].input_shape)
So I don't know yet whether, when building an FC model that takes input from a Vgg16/Vgg16BN Conv model, I'll have to specify the input shape of the first MaxPooling2D layer as everything but the first entry of the Conv layer's output_shape (a tensor?). We'll see. Does it matter, since that first entry is None? Will it be inferred automatically? Or does it need a 3-dim input -- but then why does it take a 4-dim one in the full model's case? Hmm.
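For reference, Keras's input_shape excludes the batch (samples) dimension, so a standalone model built on top of the conv layers would take conv_layers[-1].output_shape[1:] as its input shape. A rough sketch of the precompute-the-conv-features idea hinted at earlier (not from the original notebook; layer sizes and filenames are just placeholders):
from keras.models import Sequential
from keras.layers.core import Flatten, Dropout
from keras.layers.pooling import MaxPooling2D
from keras.utils.np_utils import to_categorical

# conv-only model reusing the VGG conv layers
conv_model = Sequential(conv_layers)

# precompute conv features once (batches must NOT be shuffled so the
# features stay lined up with their labels) and cache them with bcolz
trn_feat_batches = get_batches(train_path, shuffle=False, batch_size=batch_size)
val_feat_batches = get_batches(valid_path, shuffle=False, batch_size=batch_size)
trn_features = conv_model.predict_generator(trn_feat_batches, trn_feat_batches.nb_sample)
val_features = conv_model.predict_generator(val_feat_batches, val_feat_batches.nb_sample)
save_array(results_path + 'conv_trn_feat.bc', trn_features)
save_array(results_path + 'conv_val_feat.bc', val_features)
trn_labels = to_categorical(trn_feat_batches.classes)
val_labels = to_categorical(val_feat_batches.classes)

# small FC model trained on the cached features; input_shape is the
# per-sample shape, i.e. the conv output_shape minus the batch dimension
fc_model = Sequential([
    MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='softmax')])
fc_model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
fc_model.fit(trn_features, trn_labels, nb_epoch=3, batch_size=batch_size,
             validation_data=(val_features, val_labels))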
In [48]:
# Vgg16.test returns a (batches, preds) pair, so predictions[1] holds the class probabilities
predictions = VGG.test(test_path, batch_size=32)
In [49]:
save_array(results_path + 'raw_predictions01.bc', predictions[1])
In [50]:
len(predictions[1])
Out[50]:
In [51]:
predictions[1].shape
Out[51]:
In [52]:
preds = predictions[1]
# clipping & renorm to score better on logloss metric
preds = utils.do_clip(preds, mx=0.95)
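Log loss punishes confidently wrong predictions severely, so capping the maximum (and raising the minimum) probability usually improves the score. If I remember right, utils.do_clip is roughly equivalent to the sketch below (assuming 10 classes; check utils.py for the actual definition):
def do_clip(arr, mx):
    # keep every probability inside [(1-mx)/9, mx] so no class is
    # ever predicted with (near-)certainty
    return np.clip(arr, (1 - mx) / 9, mx)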
In [53]:
filenames = tst_batches.filenames
# f[8:] strips the leading subdirectory (e.g. 'unknown/') from each test filename
# ids = np.array([str(f[8:f.find('.')]) for f in filenames])
ids = np.array([str(f[8:]) for f in filenames])
In [54]:
print(ids.shape)
print(preds.shape)
In [55]:
import pandas as pd
In [56]:
# submissions = np.stack([ids, preds], axis=1)
# couldn't get the older method of using np.stack to work, so trying pandas
classes = sorted(trn_batches.class_indices, key=trn_batches.class_indices.get)
submission = pd.DataFrame(preds, columns=classes)
# submission.insert(0, 'img', [f[12:] for f in filenames])
submission.insert(0, 'img', [f[8:] for f in filenames])
submission.head()
submission.to_csv(results_path + 'submission.csv', index=False, compression=None)
In [57]:
# ??pd.DataFrame.to_csv
In [58]:
from IPython.display import FileLink
FileLink(results_path + 'submission.csv')
Out[58]:
In [1]:
??submission.insert
Results -- place: 658/1440; score: 1.50925
In [ ]: