State Farm Distracted Driver Detection: Full Dataset


In [1]:
%cd /home/ubuntu/kaggle/state-farm-distracted-driver-detection
# Make sure you are in the main directory (state-farm-distracted-driver-detection)
%pwd


/home/ubuntu/kaggle/state-farm-distracted-driver-detection
Out[1]:
u'/home/ubuntu/kaggle/state-farm-distracted-driver-detection'

In [2]:
# Create references to key directories
import os, sys
from glob import glob
from matplotlib import pyplot as plt
import numpy as np
import keras
np.set_printoptions(precision=4, linewidth=100)
current_dir = os.getcwd()
CHALLENGE_HOME_DIR = current_dir
DATA_HOME_DIR = current_dir+'/data'


Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.
  warnings.warn(warn)

In [3]:
# Allow imports from the parent directory (where the utils module lives)
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# Import the course's utility modules
import utils; reload(utils)  # reload picks up any local edits to utils.py
from utils import *
from utils.utils import *
from utils.vgg16 import Vgg16

#Instantiate plotting tool
%matplotlib inline

In [4]:
# Additional imports: bcolz backs save_array/load_array in utils.py
import bcolz
from numpy.random import random, permutation

In [5]:
%cd $DATA_HOME_DIR

path = DATA_HOME_DIR + '/'
test_path = path + 'test/' 
results_path= path + 'results/'
train_path=path + 'train/'
valid_path=path + 'valid/'


/home/ubuntu/kaggle/state-farm-distracted-driver-detection/data

In [6]:
#Set constants. You can experiment with no_of_epochs to improve the model
batch_size=64
no_of_epochs=3

Create Batches


In [7]:
batches = get_batches(train_path, batch_size=batch_size)
val_batches = get_batches(valid_path, batch_size=batch_size*2, shuffle=False)


Found 12424 images belonging to 10 classes.
Found 10000 images belonging to 10 classes.
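
get_batches comes from the course's utils.py; it is essentially a thin wrapper around Keras's flow_from_directory. A sketch of its behaviour for reference (assuming the standard fast.ai utils, so treat the details as approximate):

In [ ]:
# Approximate behaviour of utils.get_batches (fast.ai helper) - a reference sketch, not a redefinition
def get_batches_sketch(dirname, gen=image.ImageDataGenerator(), shuffle=True,
                       batch_size=8, class_mode='categorical'):
    # Streams (images, one-hot labels) batches of 224x224 images from dirname's class subfolders
    return gen.flow_from_directory(dirname, target_size=(224,224),
            class_mode=class_mode, shuffle=shuffle, batch_size=batch_size)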

In [8]:
(val_classes, trn_classes, val_labels, trn_labels, val_filenames, filenames,
    test_filenames) = get_classes(path)


Found 12424 images belonging to 10 classes.
Found 10000 images belonging to 10 classes.
Found 79726 images belonging to 1 classes.
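
Similarly, get_classes builds non-shuffled batches over the train, valid, and test folders and returns class indices, one-hot labels, and filenames in one go. Roughly (again a sketch of the fast.ai helper, using its onehot utility):

In [ ]:
# Approximate behaviour of utils.get_classes (fast.ai helper)
def get_classes_sketch(path):
    batches = get_batches(path+'train', shuffle=False, batch_size=1)
    val_batches = get_batches(path+'valid', shuffle=False, batch_size=1)
    test_batches = get_batches(path+'test', shuffle=False, batch_size=1)
    return (val_batches.classes, batches.classes,
            onehot(val_batches.classes), onehot(batches.classes),
            val_batches.filenames, batches.filenames, test_batches.filenames)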

Use the Previous Conv Model on the Full Dataset

The model we built on the sample data should work better with more data. Let's try it out.


In [11]:
def simple_conv(batches):
    # Two conv/BN/pool blocks, then a small dense head (uses val_batches from the enclosing scope)
    model = Sequential([
            BatchNormalization(axis=1, input_shape=(3,224,224)),
            Convolution2D(32,3,3, activation='relu'),
            BatchNormalization(axis=1),
            MaxPooling2D((3,3)),
            Convolution2D(64,3,3, activation='relu'),
            BatchNormalization(axis=1),
            MaxPooling2D((3,3)),
            Flatten(),
            Dense(200, activation='relu'),
            BatchNormalization(),
            Dense(10, activation='softmax')
        ])

    # Warm up with a low learning rate, then raise it for a few more epochs
    model.compile(Adam(lr=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit_generator(batches, batches.nb_sample, nb_epoch=2, validation_data=val_batches, 
                     nb_val_samples=val_batches.nb_sample)
    model.optimizer.lr = 0.001
    model.fit_generator(batches, batches.nb_sample, nb_epoch=4, validation_data=val_batches, 
                     nb_val_samples=val_batches.nb_sample)
    return model
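
One caveat about the model.optimizer.lr = 0.001 line (here and in later cells): in Keras 1 the learning rate is a backend variable that gets baked into the training function when it is first compiled, so plain attribute assignment can silently have no effect. If a learning-rate change doesn't seem to bite, setting the backend variable in place is the safer route; a minimal sketch:

In [ ]:
from keras import backend as K
# Update the optimizer's lr variable in place rather than rebinding the attribute
K.set_value(model.optimizer.lr, 0.001)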

In [13]:
model = simple_conv(batches)


Epoch 1/2
12424/12424 [==============================] - 318s - loss: 0.3666 - acc: 0.9045 - val_loss: 1.6560 - val_acc: 0.3318
Epoch 2/2
12424/12424 [==============================] - 277s - loss: 0.0259 - acc: 0.9967 - val_loss: 0.2006 - val_acc: 0.9642
Epoch 1/4
12424/12424 [==============================] - 286s - loss: 0.0076 - acc: 0.9998 - val_loss: 0.0348 - val_acc: 0.9928
Epoch 2/4
12424/12424 [==============================] - 279s - loss: 0.0042 - acc: 0.9999 - val_loss: 0.0371 - val_acc: 0.9908
Epoch 3/4
12424/12424 [==============================] - 278s - loss: 0.0032 - acc: 0.9998 - val_loss: 0.0160 - val_acc: 0.9965
Epoch 4/4
12424/12424 [==============================] - 280s - loss: 0.0016 - acc: 1.0000 - val_loss: 0.0152 - val_acc: 0.9966

In [14]:
model.save_weights(path+'models/simple_conv.h5')

Improve with Data Augmentation


In [9]:
# Modest augmentation: small rotations, shifts and shear, plus channel shifts (no flips - the camera position is fixed)
gen_t = image.ImageDataGenerator(rotation_range=15, height_shift_range=0.1, 
                shear_range=0.1, channel_shift_range=25, width_shift_range=0.1)
da_batches = get_batches(train_path, gen_t, batch_size=batch_size)


Found 12424 images belonging to 10 classes.
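
Before training on augmented data it's worth eyeballing a batch to check the transforms look sensible; a quick sketch using the plots() helper from utils.py:

In [ ]:
# Pull one augmented batch and display the first few images
imgs, labels = next(da_batches)
plots(imgs[:8])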

In [12]:
model = simple_conv(da_batches)


Epoch 1/2
12424/12424 [==============================] - 340s - loss: 1.7863 - acc: 0.4099 - val_loss: 2.2373 - val_acc: 0.1540
Epoch 2/2
12424/12424 [==============================] - 283s - loss: 1.1903 - acc: 0.6138 - val_loss: 1.0019 - val_acc: 0.6912
Epoch 1/4
12424/12424 [==============================] - 294s - loss: 0.9287 - acc: 0.7014 - val_loss: 0.5502 - val_acc: 0.8476
Epoch 2/4
12424/12424 [==============================] - 286s - loss: 0.7538 - acc: 0.7675 - val_loss: 0.4268 - val_acc: 0.8761
Epoch 3/4
12424/12424 [==============================] - 282s - loss: 0.6258 - acc: 0.8078 - val_loss: 0.3782 - val_acc: 0.8831
Epoch 4/4
12424/12424 [==============================] - 282s - loss: 0.5316 - acc: 0.8436 - val_loss: 0.3502 - val_acc: 0.8916

In [13]:
model.save_weights(path+'models/simple_conv_da_1.h5')

In [14]:
model.optimizer.lr = 0.0001
model.fit_generator(da_batches, da_batches.nb_sample, nb_epoch=4, validation_data=val_batches,
                nb_val_samples=val_batches.nb_sample)


Epoch 1/4
12424/12424 [==============================] - 291s - loss: 0.4807 - acc: 0.8578 - val_loss: 0.2249 - val_acc: 0.9435
Epoch 2/4
12424/12424 [==============================] - 282s - loss: 0.4156 - acc: 0.8793 - val_loss: 0.1827 - val_acc: 0.9531
Epoch 3/4
12424/12424 [==============================] - 289s - loss: 0.3721 - acc: 0.8942 - val_loss: 0.1860 - val_acc: 0.9507
Epoch 4/4
12424/12424 [==============================] - 280s - loss: 0.3536 - acc: 0.8991 - val_loss: 0.1818 - val_acc: 0.9464
Out[14]:
<keras.callbacks.History at 0x7f8fbeffc890>

In [15]:
model.save_weights(path+'models/simple_conv_da_2.h5')

Deeper Conv/Pooling pair model + Dropout

If the results are still unstable (the validation accuracy jumps around from epoch to epoch), a deeper model with dropout should help.

Create a deeper model with dropout


In [9]:
gen_t = image.ImageDataGenerator(rotation_range=15, height_shift_range=0.1, 
                shear_range=0.1, channel_shift_range=25, width_shift_range=0.1)
batches = get_batches(train_path, gen_t, batch_size=batch_size)


Found 12424 images belonging to 10 classes.

In [10]:
model = Sequential([
        BatchNormalization(axis=1, input_shape=(3,224,224)),
        Convolution2D(32,3,3, activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D(),
        Convolution2D(64,3,3, activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D(),
        Convolution2D(128,3,3, activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D(),
        Flatten(),
        Dense(200, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(200, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(10, activation='softmax')
    ])

In [11]:
model.compile(Adam(lr=10e-5), loss='categorical_crossentropy', metrics=['accuracy'])

In [19]:
model.fit_generator(batches, batches.nb_sample, nb_epoch=2, validation_data=val_batches, 
                 nb_val_samples=val_batches.nb_sample)


Epoch 1/2
12424/12424 [==============================] - 296s - loss: 2.9667 - acc: 0.1811 - val_loss: 2.3992 - val_acc: 0.1697
Epoch 2/2
12424/12424 [==============================] - 292s - loss: 2.3848 - acc: 0.2956 - val_loss: 1.5577 - val_acc: 0.4321
Out[19]:
<keras.callbacks.History at 0x7f8fc5616290>

In [20]:
model.save_weights(path+'models/deep_conv_da_1.h5')

In [12]:
model.load_weights(path+'models/deep_conv_da_1.h5')

The model is underfitting, so let's increase the learning rate.


In [13]:
model.optimizer.lr=0.001
model.fit_generator(batches, batches.nb_sample, nb_epoch=10, validation_data=val_batches, 
                 nb_val_samples=val_batches.nb_sample)


Epoch 1/10
12424/12424 [==============================] - 344s - loss: 2.2840 - acc: 0.2881 - val_loss: 1.2327 - val_acc: 0.5672
Epoch 2/10
12424/12424 [==============================] - 299s - loss: 1.7103 - acc: 0.4173 - val_loss: 0.9193 - val_acc: 0.7055
Epoch 3/10
12424/12424 [==============================] - 294s - loss: 1.2727 - acc: 0.5551 - val_loss: 0.6873 - val_acc: 0.7514
Epoch 4/10
12424/12424 [==============================] - 301s - loss: 1.0184 - acc: 0.6524 - val_loss: 0.4880 - val_acc: 0.8470
Epoch 5/10
12424/12424 [==============================] - 294s - loss: 0.7546 - acc: 0.7413 - val_loss: 0.3059 - val_acc: 0.9081
Epoch 6/10
12424/12424 [==============================] - 295s - loss: 0.7110 - acc: 0.7612 - val_loss: 0.4365 - val_acc: 0.8597
Epoch 7/10
12424/12424 [==============================] - 300s - loss: 0.5882 - acc: 0.8004 - val_loss: 0.2431 - val_acc: 0.9206
Epoch 8/10
12424/12424 [==============================] - 296s - loss: 0.4779 - acc: 0.8417 - val_loss: 0.1974 - val_acc: 0.9405
Epoch 9/10
12424/12424 [==============================] - 298s - loss: 0.4039 - acc: 0.8645 - val_loss: 0.1777 - val_acc: 0.9507
Epoch 10/10
12424/12424 [==============================] - 297s - loss: 0.4011 - acc: 0.8711 - val_loss: 0.1882 - val_acc: 0.9429
Out[13]:
<keras.callbacks.History at 0x7fc216a6bb10>

In [14]:
model.save_weights(path+'models/deep_conv_da_2.h5')

If the model were overfitting, we would need to decrease the learning rate instead.

Let's decrease the learning rate and see if we get better results.


In [15]:
model.optimizer.lr=0.00001
model.fit_generator(batches, batches.nb_sample, nb_epoch=5, validation_data=val_batches, 
                 nb_val_samples=val_batches.nb_sample)


Epoch 1/5
12424/12424 [==============================] - 299s - loss: 0.3193 - acc: 0.8970 - val_loss: 0.1791 - val_acc: 0.9440
Epoch 2/5
12424/12424 [==============================] - 298s - loss: 0.2818 - acc: 0.9074 - val_loss: 0.1448 - val_acc: 0.9560
Epoch 3/5
12424/12424 [==============================] - 300s - loss: 0.2729 - acc: 0.9140 - val_loss: 0.2174 - val_acc: 0.9310
Epoch 4/5
12424/12424 [==============================] - 297s - loss: 0.2591 - acc: 0.9168 - val_loss: 0.5392 - val_acc: 0.8720
Epoch 5/5
12424/12424 [==============================] - 295s - loss: 0.2364 - acc: 0.9237 - val_loss: 0.1355 - val_acc: 0.9557
Out[15]:
<keras.callbacks.History at 0x7fc210b27590>

In [16]:
model.save_weights(path+'models/deep_conv_da_3.h5')

The accuracy is similar and training is more stable. Next, let's try the VGG16 model.

Use ImageNet Conv Features

Since we have so little data, and it is similar to ImageNet images (full-color photos), using pre-trained VGG weights is likely to be helpful. In fact, it seems likely that we won't need to fine-tune the convolutional layer weights much, if at all. So we can pre-compute the output of the last convolutional layer, as we did in lesson 3 when we experimented with dropout. (However, this means that we can't use full data augmentation, since we can't pre-compute something that changes with every image.)


In [ ]:
vgg = Vgg16()
model=vgg.model
last_conv_idx = [i for i,l in enumerate(model.layers) if type(l) is Convolution2D][-1]
conv_layers = model.layers[:last_conv_idx+1]

In [ ]:
conv_model = Sequential(conv_layers)

In [ ]:
# We're pre-computing features, so shuffle must be False to keep them aligned with the labels
batches = get_batches(train_path, batch_size=batch_size, shuffle=False)
test_batches = get_batches(test_path, batch_size=batch_size, shuffle=False)

In [ ]:
(val_classes, trn_classes, val_labels, trn_labels, 
    val_filenames, filenames, test_filenames) = get_classes(path)

In [ ]:
# Compute conv-layer features for the training, validation, and test data
conv_feat = conv_model.predict_generator(batches, batches.nb_sample)
conv_val_feat = conv_model.predict_generator(val_batches, val_batches.nb_sample)
conv_test_feat = conv_model.predict_generator(test_batches, test_batches.nb_sample)

In [ ]:
# save the features for future use
save_array(path+'results/conv_val_feat.dat', conv_val_feat)
save_array(path+'results/conv_test_feat.dat', conv_test_feat)
save_array(path+'results/conv_feat.dat', conv_feat)
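
On a later run, the expensive predict_generator calls can be skipped by reloading the saved bcolz arrays with load_array, the counterpart of save_array in utils.py:

In [ ]:
conv_feat = load_array(path+'results/conv_feat.dat')
conv_val_feat = load_array(path+'results/conv_val_feat.dat')
conv_test_feat = load_array(path+'results/conv_test_feat.dat')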

In [ ]:
conv_val_feat.shape  # expect (10000, 512, 14, 14): VGG16's last conv layer on 224x224 inputs

Create BatchNorm dense layers on top of the conv layers

Create a network that sits on top of the pre-computed conv features and predicts the 10 classes. This is a simplified version of VGG's dense layers.


In [ ]:
def get_bn_layers(p):
    return [
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
        Dropout(p/2),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(p/2),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(10, activation='softmax')
        ]

In [ ]:
p=0.8

In [ ]:
bn_model = Sequential(get_bn_layers(p))
bn_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [ ]:
bn_model.fit(conv_feat, trn_labels, batch_size=batch_size, nb_epoch=1, 
             validation_data=(conv_val_feat, val_labels))

In [ ]:
bn_model.optimizer.lr = 0.01

In [ ]:
bn_model.fit(conv_feat, trn_labels, batch_size=batch_size, nb_epoch=2, 
             validation_data=(conv_val_feat, val_labels))

In [ ]:
bn_model.save_weights(path+'models/bn_dense.h5')

Pre-computed data augmentation + more dropout

Let's add the augmented data and larger dense layers, and therefore more dropout, on top of the pre-computed conv features.


In [ ]:
gen_t = image.ImageDataGenerator(rotation_range=15, height_shift_range=0.1, 
                shear_range=0.1, channel_shift_range=25, width_shift_range=0.1)
da_batches = get_batches(train_path, gen_t, batch_size=batch_size, shuffle=False)

Create a dataset of convolutional features that is 5x bigger than the original training set (five randomly augmented variants of each image from the ImageDataGenerator).


In [ ]:
da_conv_feat = conv_model.predict_generator(da_batches, da_batches.nb_sample*5)

In [ ]:
save_array(path+'results/da_conv_feat.dat', da_conv_feat)

Add the real training data in its non-augmented form


In [ ]:
da_conv_feat = np.concatenate([da_conv_feat, conv_feat])

In [ ]:
# Since we've now gotten a dataset 6x bigger than before, we'll need to copy our labels 6x too
da_trn_labels = np.concatenate([trn_labels]*6)
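
A quick sanity check that features and labels stayed aligned after the concatenations - cheap insurance before a long training run:

In [ ]:
# 5 augmented passes + 1 real pass = 6x the original training set
assert da_conv_feat.shape[0] == da_trn_labels.shape[0] == 6*conv_feat.shape[0]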

In [ ]:
def get_bn_da_layers(p):
    return [
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
        Dropout(p),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(10, activation='softmax')
        ]

In [ ]:
p=0.8

In [ ]:
bn_model = Sequential(get_bn_da_layers(p))
bn_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [ ]:
# Let's train the model on the larger set of pre-computed augmented data
bn_model.fit(da_conv_feat, da_trn_labels, batch_size=batch_size, nb_epoch=1, 
             validation_data=(conv_val_feat, val_labels))

In [ ]:
bn_model.optimizer.lr=0.01
bn_model.fit(da_conv_feat, da_trn_labels, batch_size=batch_size, nb_epoch=4, 
             validation_data=(conv_val_feat, val_labels))

In [ ]:
bn_model.optimizer.lr=0.0001
bn_model.fit(da_conv_feat, da_trn_labels, batch_size=batch_size, nb_epoch=4, 
             validation_data=(conv_val_feat, val_labels))

In [ ]:
bn_model.save_weights(path+'models/bn_da_dense.h5')

Pseudo Labeling

Try using a combination of pseudo-labeling and knowledge distillation so that we can use unlabeled data (i.e. do semi-supervised learning). For our initial experiment we'll use the validation set as the unlabeled data, so that we can see that it is working without touching the test set.


In [ ]:
val_pseudo = bn_model.predict(conv_val_feat, batch_size=batch_size)

In [ ]:
# Combine the (augmented) training labels with the pseudo-labels
comb_pseudo = np.concatenate([da_trn_labels, val_pseudo])

In [ ]:
comb_feat = np.concatenate([da_conv_feat, conv_val_feat])

In [ ]:
# fine-tune the model using this combined training set
bn_model.load_weights(path+'models/bn_da_dense.h5')

In [ ]:
bn_model.fit(comb_feat, comb_pseudo, batch_size=batch_size, nb_epoch=1, 
             validation_data=(conv_val_feat, val_labels))

In [ ]:
bn_model.fit(comb_feat, comb_pseudo, batch_size=batch_size, nb_epoch=4, 
             validation_data=(conv_val_feat, val_labels))

In [ ]:
bn_model.optimizer.lr=0.00001
bn_model.fit(comb_feat, comb_pseudo, batch_size=batch_size, nb_epoch=4, 
             validation_data=(conv_val_feat, val_labels))

In [ ]:
# There is a distinct improvement - although the validation set isn't large.
# A more significant improvement can be found when pseudo-labeling the (much larger) test data
bn_model.save_weights(path+'models/bn-ps8.h5')

Generate Predictions from Test data


In [15]:
test_batches = get_batches(test_path, shuffle=False, batch_size=batch_size)


Found 79726 images belonging to 1 classes.

In [16]:
preds = model.predict_generator(test_batches, test_batches.nb_sample)

In [17]:
preds[:2]


Out[17]:
array([[  1.2277e-03,   1.0727e-02,   1.3244e-04,   1.1853e-04,   1.1995e-04,   4.3428e-02,
          1.2036e-02,   6.8307e-01,   2.3336e-01,   1.5780e-02],
       [  5.6099e-02,   1.2775e-03,   8.4133e-06,   7.1406e-01,   3.1234e-04,   1.8992e-01,
          8.4862e-03,   9.0735e-03,   1.6331e-02,   4.4324e-03]], dtype=float32)

Submit to competition


In [18]:
# Clip predictions away from 0 and 1; Kaggle's log loss punishes confident mistakes heavily
def do_clip(arr, mx): return np.clip(arr, (1-mx)/9, mx)
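
The clipping matters because Kaggle scores this competition with multi-class log loss, which punishes confident wrong answers brutally; the lower bound (1-mx)/9 spreads the leftover probability across the other 9 classes. A back-of-the-envelope check of the asymmetry (hypothetical probabilities):

In [ ]:
# Log loss contribution of one example whose true class was assigned probability p
loss = lambda p: -np.log(p)
print loss(1e-6)    # ~13.8: a single confident mistake can dominate the score
print loss(0.0078)  # ~4.9:  clipping to (1-0.93)/9 caps the damage
print loss(0.93)    # ~0.07: capping a confident correct answer costs almost nothing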

In [19]:
# Generate validation predictions first (val_batches was built with shuffle=False, so order matches val_labels)
val_preds = model.predict_generator(val_batches, val_batches.nb_sample)
keras.metrics.categorical_crossentropy(val_labels, do_clip(val_preds, 0.93)).eval()

In [20]:
subm = do_clip(preds,0.93)

In [31]:
subm_name = path+'results/subm.csv'

In [22]:
classes = sorted(batches.class_indices, key=batches.class_indices.get)

In [34]:
submission = pd.DataFrame(subm, columns=classes)
# Test filenames look like 'unknown/img_81601.jpg'; strip the 8-character 'unknown/' prefix
submission.insert(0, 'img', [a[8:] for a in test_filenames])
submission.head()


Out[34]:
img c0 c1 c2 c3 c4 c5 c6 c7 c8 c9
0 img_81601.jpg 0.007778 0.010727 0.007778 0.007778 0.007778 0.043428 0.012036 0.683075 0.233357 0.015780
1 img_14887.jpg 0.056099 0.007778 0.007778 0.714062 0.007778 0.189917 0.008486 0.009074 0.016331 0.007778
2 img_62885.jpg 0.007778 0.007778 0.007778 0.393101 0.554579 0.008782 0.025158 0.007778 0.015522 0.007778
3 img_45125.jpg 0.043711 0.090787 0.007778 0.008041 0.126157 0.007778 0.345583 0.155502 0.223942 0.007778
4 img_22633.jpg 0.007778 0.930000 0.007778 0.007778 0.007778 0.007778 0.007778 0.007778 0.007778 0.007778

In [35]:
submission.tail()


Out[35]:
img c0 c1 c2 c3 c4 c5 c6 c7 c8 c9
79721 img_19465.jpg 0.007778 0.930000 0.007778 0.007778 0.007778 0.007778 0.007778 0.007778 0.066957 0.007778
79722 img_91995.jpg 0.007778 0.067422 0.007778 0.007778 0.014438 0.007778 0.857751 0.007778 0.007778 0.035709
79723 img_98750.jpg 0.007778 0.007778 0.007778 0.007778 0.007778 0.007778 0.007778 0.007778 0.930000 0.007778
79724 img_42858.jpg 0.291367 0.037793 0.071602 0.099231 0.080819 0.193121 0.040258 0.088981 0.060749 0.036080
79725 img_98905.jpg 0.038733 0.007778 0.031885 0.007778 0.811478 0.009088 0.007778 0.104785 0.007778 0.007778

In [36]:
submission.to_csv(subm_name, index=False, encoding='utf-8')
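
To pull the CSV down from the notebook server, IPython's FileLink gives a clickable download link (an optional convenience):

In [ ]:
from IPython.display import FileLink
FileLink(subm_name)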
