Wayne Nixalo - 20 May 2017
FAI1 - Practical Deep Learning I - Week 3 HW: Kaggle StateFarm Distracted Driver Competition
In [1]:
import keras
import os, sys
import numpy as np
import pandas as pd
from glob import glob
from keras.optimizers import Adam
from keras.layers.core import Dense
from keras.preprocessing import image
In [2]:
# will need this to access any libraries in superdirectories
sys.path.insert(1, os.path.join(os.getcwd(), '../utils'))
import utils
from vgg16 import Vgg16
In [3]:
import bcolz
def save_array(fname, arr): c=bcolz.carray(arr, rootdir=fname, mode='w'); c.flush()
def load_array(fname): return bcolz.open(fname)[:]
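These helpers cache NumPy arrays on disk via bcolz so expensive results can be reloaded later without recomputing them. A minimal usage sketch (the filename is made up):
arr = np.arange(10)
save_array('example.bc', arr)                     # writes a bcolz carray directory to disk
assert (load_array('example.bc') == arr).all()    # reads it back as a NumPy array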
In [4]:
HOME_DIR = os.getcwd()
DATA_DIR = HOME_DIR + '/data'
TRAIN_DIR = DATA_DIR + '/train'
VAL_DIR = DATA_DIR + '/valid'
TEST_DIR = DATA_DIR + '/test'
In [5]:
# create the validation directories
# os.mkdir(VAL_DIR)
# for i in xrange(10):
#     os.mkdir(VAL_DIR + '/c' + str(i))
# # another way to do this:
# %mkdir $VAL_DIR
# for i in xrange(10):
#     %mkdir $VAL_DIR/c"$i"
Move a random permutation of the training data into the validation set. Redo the split until validation accuracy tracks test accuracy. Also see: http://stackoverflow.com/questions/2632205/how-to-count-the-number-of-files-in-a-directory-using-python
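(For reference, counting the files in a class directory is a one-liner; the path below is just an example.)
print(len(glob(TRAIN_DIR + '/c0/*.jpg')))   # no. of images currently in class c0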
In [5]:
# %cd $TRAIN_DIR
# VAL_PORTION = 0.2
# for i in xrange(10):
#     %cd c"$i"
#     g = glob('*.jpg')
#     number = len(g)
#     shuff = np.random.permutation(g)
#     for n in xrange(int(number * VAL_PORTION)):
#         os.rename(shuff[n], VAL_DIR + '/c' + str(i) + '/' + shuff[n])
#     %cd ..
def reset_valid(verbose=1):
    """Moves all images in the validation set back to
    their respective classes in the training set."""
    counter = 0
    %cd $valid_path
    for i in xrange(10):
        %cd c"$i"
        g = glob('*.jpg')
        for n in xrange(len(g)):
            os.rename(g[n], TRAIN_DIR + '/c' + str(i) + '/' + g[n])
            counter += 1
        %cd ..
    if verbose: print("Moved {} files.".format(counter))
    # alternatively: %mv $VAL_DIR/c"$i"/*.jpg $TRAIN_DIR/c"$i"/
# modified from: http://forums.fast.ai/t/statefarm-kaggle-comp/183/20
def set_valid(number=1, verbose=1):
    """Moves <number> randomly-chosen subjects from the training to the
    validation directories. Verbosity -- 0: silent; 1: print no. of files
    moved; 2: print each move operation."""
    counter = 0
    if number < 0: number = 0
    for n in xrange(number):
        # read the CSV file into a Pandas DataFrame
        dil = pd.read_csv(data_path + 'driver_imgs_list.csv')
        # group the frame by the subject in the image
        grouped_subjects = dil.groupby('subject')
        # pick a subject at random
        subject = grouped_subjects.groups.keys()[np.random.randint(0, high=len(grouped_subjects.groups))]
        # get the group associated with that subject
        group = grouped_subjects.get_group(subject)
        # loop over the group & move its images to the validation dir
        for (subject, clssnm, img) in group.values:
            source = '{}train/{}/{}'.format(data_path, clssnm, img)
            target = source.replace('train', 'valid')
            if verbose > 1: print('mv {} {}'.format(source, target))
            os.rename(source, target)
            counter += 1
    if verbose: print("Files moved: {}".format(counter))
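A quick way to check that the split worked (a small sketch, not part of the original workflow): count the images per class in train vs. valid after moving subjects.
for i in xrange(10):
    n_trn = len(glob(TRAIN_DIR + '/c' + str(i) + '/*.jpg'))
    n_val = len(glob(VAL_DIR + '/c' + str(i) + '/*.jpg'))
    print('c{}: train {}, valid {}'.format(i, n_trn, n_val))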
In [28]:
# dil = pd.read_csv(data_path + 'driver_imgs_list.csv')
# grouped_subjects = dil.groupby('subject')
In [35]:
# dil.keys()
Out[35]:
In [42]:
# len(grouped_subjects.groups) # <-- that's what I'm looking for
Out[42]:
In [68]:
reset_valid()
In [69]:
set_valid(number=3, verbose=1)
In [6]:
# some more setup
data_path = DATA_DIR + '/'
train_path = TRAIN_DIR + '/'
valid_path = VAL_DIR + '/'
test_path = TEST_DIR + '/'
results_path = DATA_DIR + '/results/'
In [7]:
# looks like batch size of 64 is just past what my GPU can handle
# would using bcolz to save precomputed arrays help?
batch_size=32
target_size=(224,224) # for gen.flow_from_directory(..)
In [8]:
# batch generator to feed data into the model
gen = image.ImageDataGenerator()
trn_batches = gen.flow_from_directory(train_path, target_size=target_size,
                                      class_mode='categorical', shuffle=True, batch_size=batch_size)
val_batches = gen.flow_from_directory(valid_path, target_size=target_size,
                                      class_mode='categorical', shuffle=False, batch_size=batch_size)
In [9]:
trn_batches.n
Out[9]:
NOTE: I'll want a way to clear GPU memory in the future. Right now all I know is restarting the kernel.
In [10]:
# load the VGG model, download its weights, and finetune it to the data
VGG = Vgg16()
VGG.model.pop()
for layer in VGG.model.layers: layer.trainable = False
VGG.model.add(Dense(10, activation='softmax'))
VGG.model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
In [11]:
# run the model until it overfits
VGG.model.optimizer.lr = 0.001
VGG.model.fit_generator(trn_batches, trn_batches.n, nb_epoch=1, verbose=1,
                        validation_data=val_batches, nb_val_samples=val_batches.n)
Out[11]:
In [13]:
def train_model(lr=0.001, epochs=1, verbose=0):
    VGG.model.optimizer.lr = lr
    VGG.model.fit_generator(trn_batches, trn_batches.n, nb_epoch=epochs, verbose=verbose,
                            validation_data=val_batches, nb_val_samples=val_batches.n)
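Worth noting: in Keras 1.x, assigning to VGG.model.optimizer.lr as above replaces the optimizer's learning-rate variable with a plain Python float, so once the training function has been compiled the change may not actually take effect. A safer pattern (a sketch, assuming optimizer.lr is still the backend variable) is to update it in place:
from keras import backend as K
# update the optimizer's learning-rate variable without replacing it
K.set_value(VGG.model.optimizer.lr, 0.001)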
In [14]:
train_model(lr=0.1, epochs=2, verbose=1)
In [15]:
VGG.model.save_weights(data_path + 'finetune01.h5')
In [16]:
train_model(lr=0.01, epochs=2, verbose=1)
VGG.model.save_weights(data_path + 'finetune02.h5')
train_model(lr=0.001, epochs=2, verbose=1)
VGG.model.save_weights(data_path + 'finetune03.h5')
In [17]:
train_model(lr=0.0001, epochs=4, verbose=1)
In [11]:
# saving weights
# VGG.model.save_weights(data_path + 'finetune01.h5')
In [18]:
VGG.model.load_weights(data_path + 'finetune03.h5')
In [11]:
def get_batches(dirname, gen=image.ImageDataGenerator(), shuffle=True, batch_size=4,
                class_mode='categorical', target_size=(224,224)):
    return gen.flow_from_directory(dirname, target_size=target_size,
                                   class_mode=class_mode, shuffle=shuffle, batch_size=batch_size)

def get_data(path, target_size=(224,224)):
    batches = get_batches(path, shuffle=False, batch_size=1, class_mode=None, target_size=target_size)
    return np.concatenate([batches.next() for i in xrange(batches.nb_sample)])
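These mirror the course utils: get_batches wraps flow_from_directory, and get_data pulls a whole directory into a single array, which pairs naturally with save_array/load_array above. For example (the cache filename is made up):
# val_data = get_data(valid_path)
# save_array(results_path + 'val_data.bc', val_data)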
In [19]:
# output test data predictions
gen = image.ImageDataGenerator()
tst_batches = gen.flow_from_directory(test_path, target_size=target_size,
                                      class_mode='categorical', shuffle=False, batch_size=batch_size*2)
# predictions = VGG.model.predict_on_batch(tst_batches)
# predictions = VGG.model.predict(tst_batches, batch_size=batch_size*2, verbose=1)
In [20]:
tst_batches.n
tst_batches.nb_sample
Out[20]:
enumerate is zero-indexed: the first Conv layer is the 3rd layer of the model, so it gets index 2. Therefore, when defining a model consisting of all the Conv layers, remember to slice to one past the last Conv index. Python's slice notation [x:y] means start at index x and stop just before index y.
That's why, if your model is VGG.model, VGG.model.layers[last_conv_idx] gives you the last convolutional layer itself, while VGG.model.layers[:last_conv_idx] gives you all layers up to, but not including, the last Conv layer; [:last_conv_idx + 1] includes it.
In [22]:
from keras.layers.convolutional import Convolution2D
last_conv_idx = [index for index, layer in enumerate(VGG.model.layers) if type(layer) is Convolution2D][-1]
conv_layers = VGG.model.layers[:last_conv_idx + 1]
# NOTE: enumerate is zero-indexed.
In [38]:
first_conv_idx = [idx for idx, layer in enumerate(VGG.model.layers) if type(layer) is Convolution2D][0]
first_conv_idx
Out[38]:
In [33]:
# print(last_conv_idx)
VGG.model.layers
Out[33]:
In [34]:
conv_layers
Out[34]:
In [46]:
# from keras.layers.pooling import MaxPooling2D
# ??MaxPooling2D
# ??Convolution2D
# conv_layers[-1].output_shape
# VGG.model.layers[last_conv_idx + 1]
Out[46]:
In [47]:
# getting some insight into Conv output_shape and MaxPool2D input_shapes
# Theano ordering: (samples, channels, rows, cols) <- Conv-out / MxP-in
#
print(VGG.model.layers[last_conv_idx].output_shape)
print(VGG.model.layers[last_conv_idx + 1].input_shape)
So I don't know yet whether, when building an FC model that takes input from a Vgg16/Vgg16BN Conv model, I'll have to specify the input shape of the first MaxPooling2D layer as everything but the first entry of the Conv layer's output_shape (a tensor?). We'll see. Does it matter, since that first entry is None? Will it be inferred automatically? Or does it need a 3-dim input -- but then why does it take a 4-dim one in the full model's case? Hmm.
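For reference, Keras's input_shape excludes the batch (samples) dimension, so a standalone model built on top of the conv layers would take conv_layers[-1].output_shape[1:] as its input shape. A rough sketch of the precompute-the-conv-features idea hinted at earlier (not from the original notebook; layer sizes and filenames are just placeholders):
from keras.models import Sequential
from keras.layers.core import Flatten, Dropout
from keras.layers.pooling import MaxPooling2D
from keras.utils.np_utils import to_categorical

# conv-only model reusing the VGG conv layers
conv_model = Sequential(conv_layers)

# precompute conv features once (batches must NOT be shuffled so the
# features stay lined up with their labels) and cache them with bcolz
trn_feat_batches = get_batches(train_path, shuffle=False, batch_size=batch_size)
val_feat_batches = get_batches(valid_path, shuffle=False, batch_size=batch_size)
trn_features = conv_model.predict_generator(trn_feat_batches, trn_feat_batches.nb_sample)
val_features = conv_model.predict_generator(val_feat_batches, val_feat_batches.nb_sample)
save_array(results_path + 'conv_trn_feat.bc', trn_features)
save_array(results_path + 'conv_val_feat.bc', val_features)
trn_labels = to_categorical(trn_feat_batches.classes)
val_labels = to_categorical(val_feat_batches.classes)

# small FC model trained on the cached features; input_shape is the
# per-sample shape, i.e. the conv output_shape minus the batch dimension
fc_model = Sequential([
    MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='softmax')])
fc_model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
fc_model.fit(trn_features, trn_labels, nb_epoch=3, batch_size=batch_size,
             validation_data=(val_features, val_labels))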
In [48]:
# Vgg16.test returns a (batches, preds) pair, so predictions[1] holds the class probabilities
predictions = VGG.test(test_path, batch_size=32)
In [49]:
save_array(results_path + 'raw_predictions01.bc', predictions[1])
In [50]:
len(predictions[1])
Out[50]:
In [51]:
predictions[1].shape
Out[51]:
In [52]:
preds = predictions[1]
# clipping & renorm to score better on logloss metric
preds = utils.do_clip(preds, mx=0.95)
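Log loss punishes confidently wrong predictions severely, so capping the maximum (and raising the minimum) probability usually improves the score. If I remember right, utils.do_clip is roughly equivalent to the sketch below (assuming 10 classes; check utils.py for the actual definition):
def do_clip(arr, mx):
    # keep every probability inside [(1-mx)/9, mx] so no class is
    # ever predicted with (near-)certainty
    return np.clip(arr, (1 - mx) / 9, mx)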
In [53]:
filenames = tst_batches.filenames
# f[8:] strips the leading subdirectory (e.g. 'unknown/') from each test filename
# ids = np.array([str(f[8:f.find('.')]) for f in filenames])
ids = np.array([str(f[8:]) for f in filenames])
In [54]:
print(ids.shape)
print(preds.shape)
In [55]:
import pandas as pd
In [56]:
# submissions = np.stack([ids, preds], axis=1)
# couldn't get the older method of using np.stack to work, so trying pandas
classes = sorted(trn_batches.class_indices, key=trn_batches.class_indices.get)
submission = pd.DataFrame(preds, columns=classes)
# submission.insert(0, 'img', [f[12:] for f in filenames])
submission.insert(0, 'img', [f[8:] for f in filenames])
submission.head()
submission.to_csv(results_path + 'submission.csv', index=False, compression=None)
In [57]:
# ??pd.DataFrame.to_csv
In [58]:
from IPython.display import FileLink
FileLink(results_path + 'submission.csv')
Out[58]:
In [1]:
??submission.insert
Results -- place: 658/1440; score: 1.50925
In [ ]: