What I'll need to do: structure the data, fine-tune VGG16 with a new top, precompute convolutional features, train a small fully connected model on them, and write out a submission.
These are the general imports; always make sure to run these first.
In [1]:
import os
import zipfile
import shutil
import csv
import bcolz
os.environ["KERAS_BACKEND"] = "theano"
import keras
import numpy as np
from keras.utils.data_utils import get_file
from keras.models import load_model
from keras.layers.normalization import BatchNormalization
from keras.layers import Dense, Dropout, Flatten, Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical
model_url = "http://files.fast.ai/models/"
model_name = "vgg16.h5"
cache_dir = "models"
In [ ]:
raw_path = os.path.join(os.getcwd(), os.pardir, 'data', 'raw')
processed_path = os.path.join(os.getcwd(), os.pardir, 'data', 'processed')
# Make directories sample, valid, train, test, first check if this whole step is necessary
if os.path.exists(os.path.join(processed_path, 'sample')):
    print 'Sample directory already exists, no need to do data structuring!'
else:
    os.mkdir(os.path.join(processed_path, 'sample'))
    os.mkdir(os.path.join(processed_path, 'sample', 'train'))
    os.mkdir(os.path.join(processed_path, 'sample', 'valid'))
    os.mkdir(os.path.join(processed_path, 'valid'))
    # Extract Kaggle zipfiles to correct path
    print 'Extracting zips, this may take a while...'
    img_zip_handle = zipfile.ZipFile(os.path.join(raw_path, 'imgs.zip'), 'r')
    img_zip_handle.extractall(processed_path)
    img_zip_handle.close()
    csv_zip_handle = zipfile.ZipFile(os.path.join(raw_path, 'driver_imgs_list.csv.zip'), 'r')
    csv_zip_handle.extractall(processed_path)
    csv_zip_handle.close()
    print 'Done extracting zips!'
    # Set up sample directory structure
    for i in range(10):
        dirname = 'c' + str(i)
        os.mkdir(os.path.join(processed_path, 'sample', 'train', dirname))
        os.mkdir(os.path.join(processed_path, 'sample', 'valid', dirname))
        os.mkdir(os.path.join(processed_path, 'valid', dirname))
    # Move all test images into a single 'unknown' class folder
    os.mkdir(os.path.join(processed_path, 'test', 'unknown'))
    for filename in os.listdir(os.path.join(processed_path, 'test')):
        if filename.endswith('.jpg'):
            src = os.path.join(processed_path, 'test', filename)
            dest = os.path.join(processed_path, 'test', 'unknown', filename)
            shutil.move(src, dest)
    # Read the driver list; the csv module gives a plain 2D string array
    with open(os.path.join(processed_path, 'driver_imgs_list.csv'), 'r') as f:
        rows = list(csv.reader(f))
    data = np.array(rows[1:])  # drop the header row
    drivers = np.unique(data[:,0])
    num_drivers = drivers.shape[0]
    # Throw 15% of train data into sample folder
    sample_drivers_amount = int(np.floor(num_drivers*0.15))
    sample_drivers = np.random.choice(drivers, sample_drivers_amount, replace=False)
    # Throw 20% of train data into valid folder
    validation_drivers_amount = int(np.floor(num_drivers*0.2))
    validation_drivers = np.random.choice(drivers, validation_drivers_amount, replace=False)
    # Set up sample set
    for i in range(sample_drivers_amount):
        driver_name = sample_drivers[i]
        driver_columns = data[data[:,0] == driver_name]
        for j in range(10):
            driver_class = 'c' + str(j)
            dest = os.path.join(processed_path, 'sample', 'train', driver_class)
            class_columns = driver_columns[driver_columns[:,1] == driver_class]
            for filename in class_columns[:,2]:
                src = os.path.join(processed_path, 'train', driver_class, filename)
                shutil.copyfile(src, os.path.join(dest, filename))
    # Now move from sample_train to sample_validation a fraction of ~40%
    sample_drivers_validation_amount = int(np.floor(sample_drivers_amount*0.4))
    sample_drivers_validation = np.random.choice(sample_drivers,
                                                 sample_drivers_validation_amount,
                                                 replace=False)
    for i in range(sample_drivers_validation_amount):
        driver_name = sample_drivers_validation[i]
        driver_columns = data[data[:,0] == driver_name]
        for j in range(10):
            driver_class = 'c' + str(j)
            class_columns = driver_columns[driver_columns[:,1] == driver_class]
            for filename in class_columns[:,2]:
                dest = os.path.join(processed_path, 'sample', 'valid', driver_class, filename)
                src = os.path.join(processed_path, 'sample', 'train', driver_class, filename)
                shutil.move(src, dest)
    # Set up validation set
    for i in range(validation_drivers_amount):
        driver_name = validation_drivers[i]
        driver_columns = data[data[:,0] == driver_name]
        for j in range(10):
            driver_class = 'c' + str(j)
            class_columns = driver_columns[driver_columns[:,1] == driver_class]
            for filename in class_columns[:,2]:
                src = os.path.join(processed_path, 'train', driver_class, filename)
                dest = os.path.join(processed_path, 'valid', driver_class, filename)
                shutil.move(src, dest)
In [ ]:
def add_conv_block(model, layers, filters):
    for i in range(layers):
        model.add(ZeroPadding2D((1,1)))
        model.add(Convolution2D(filters, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))
    return model

def add_fc_block(model, dropout):
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(dropout))
    return model
In [ ]:
class vgg16():
    def __init__(self, dropout=0.5):
        self.vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32).reshape([3,1,1])
        self.create(dropout)

    def create(self, dropout):
        def vgg_preprocess(x, mean):
            # Subtract the ImageNet channel means, then flip RGB -> BGR on the channel axis
            mean = np.array(mean)
            x = x - mean
            return x[:, ::-1]
        model = self.model = Sequential()
        model.add(Lambda(vgg_preprocess,
                         input_shape=(3, 224, 224),
                         output_shape=(3, 224, 224),
                         arguments={'mean': self.vgg_mean.tolist()}
                         ))
        model = add_conv_block(model, 2, 64)
        model = add_conv_block(model, 2, 128)
        model = add_conv_block(model, 3, 256)
        model = add_conv_block(model, 3, 512)
        model = add_conv_block(model, 3, 512)
        model.add(Flatten())
        model = add_fc_block(model, dropout)
        model = add_fc_block(model, dropout)
        model.add(Dense(1000, activation='softmax'))
        # load_weights modifies the model in place and returns None
        model.load_weights(get_file(model_name, model_url + model_name, cache_subdir=cache_dir))
In [ ]:
DEBUG = True
data_dir = os.path.join(os.getcwd(), os.pardir, 'data')
model_dir = os.path.join(os.getcwd(), os.pardir, 'models')
if DEBUG:
    path = os.path.join(data_dir, 'processed', 'sample')
    batch_size = 4
    epochs = 2
else:
    path = os.path.join(data_dir, 'processed')
    batch_size = 64
    epochs = 5
train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')
train_batches = ImageDataGenerator().flow_from_directory(train_path,
                                                          target_size=(224,224),
                                                          batch_size=batch_size,
                                                          shuffle=True)
val_batches = ImageDataGenerator().flow_from_directory(val_path,
                                                        target_size=(224,224),
                                                        batch_size=batch_size,
                                                        shuffle=True)
In [ ]:
lr = 0.001
model = vgg16(dropout=0.5).model
model.pop()
for layer in model.layers: layer.trainable=False
model.add(Dense(10, activation='softmax'))
model.compile(optimizer=Adam(lr), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(train_batches,
samples_per_epoch=train_batches.nb_sample,
nb_epoch=epochs,
validation_data=val_batches,
nb_val_samples=val_batches.nb_sample)
model.save(os.path.join(model_dir, 'model_with_new_top.h5'))
First, load the model from the point where we saved it. Then:
In [6]:
old_model = load_model(os.path.join(os.getcwd(),
os.pardir,
'models',
'model_with_new_top.h5'))
Let's implement batch normalisation first. It will speed up our search for an adequate learning rate. From this link we know that BatchNormalization() needs to be applied after the activation.
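As a minimal sketch of that ordering (a toy model with an assumed 512-dimensional input, not the fc_model we build below): each dense layer applies its ReLU activation inside the layer, then BatchNormalization, then Dropout.
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers.normalization import BatchNormalization
sketch = Sequential()
sketch.add(Dense(128, activation='relu', input_dim=512))  # activation happens inside the Dense layer
sketch.add(BatchNormalization())                          # so BatchNorm sits after the activation
sketch.add(Dropout(0.5))
sketch.add(Dense(10, activation='softmax'))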
In [7]:
# Index of the Flatten layer marking the end of the convolutional part
flatten_index = [index for index, layer in enumerate(old_model.layers) if type(layer).__name__ == 'Flatten'][0]
# Keep the convolutional layers only (dropping the preprocessing Lambda at index 0
# and the last max-pooling layer, which fc_model below re-adds)
conv_model_layers = old_model.layers[1:flatten_index-1]
conv_model = Sequential(conv_model_layers)
In [4]:
def fc_model(dropout):
    model = Sequential()
    model.add(MaxPooling2D(input_shape=conv_model.layers[-1].output_shape[1:]))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    model.add(Dense(10, activation='softmax'))
    return model
Let's set up new batch generators, this time making use of augmented data. Remember, we only seek to augment our training input; there is no need to augment the validation input, since no learning takes place on it.
The train_batches generator has shuffle set to False because we are going to save its precomputed features, and we need the inputs to come out in a reproducible order so that they line up with the labels.
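As a minimal sketch of why the order matters (the directory path here is a placeholder, not one used below): with shuffle=False the generator walks the class folders in a fixed order, so the array returned by predict_generator lines up row-for-row with gen.classes, and the labels can be rebuilt afterwards.
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
gen = ImageDataGenerator().flow_from_directory('some/train/dir',   # placeholder path
                                               target_size=(224,224),
                                               batch_size=64,
                                               shuffle=False)      # fixed, reproducible order
features = conv_model.predict_generator(gen, gen.nb_sample)        # one row per image, in directory order
labels = to_categorical(gen.classes)                               # same order as the rows in features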
In [17]:
DEBUG = False
data_dir = os.path.join(os.getcwd(), os.pardir, 'data')
model_dir = os.path.join(os.getcwd(), os.pardir, 'models')
if DEBUG:
    path = os.path.join(data_dir, 'processed', 'sample')
    batch_size = 4
    epochs = 2
else:
    path = os.path.join(data_dir, 'processed')
    batch_size = 64
    epochs = 5
test_path = os.path.join(path, 'test')
In [ ]:
train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')
train_image_gen = ImageDataGenerator(rotation_range=15,
                                     height_shift_range=0.05,
                                     width_shift_range=0.1,
                                     shear_range=0.1,
                                     channel_shift_range=20)
aug_train_batches = train_image_gen.flow_from_directory(train_path,
                                                        target_size=(224,224),
                                                        batch_size=batch_size,
                                                        class_mode='categorical',
                                                        shuffle=False)
train_batches = ImageDataGenerator().flow_from_directory(train_path,
                                                         target_size=(224,224),
                                                         batch_size=batch_size,
                                                         class_mode='categorical',
                                                         shuffle=False)
val_batches = ImageDataGenerator().flow_from_directory(val_path,
                                                       target_size=(224,224),
                                                       batch_size=batch_size,
                                                       shuffle=False)
print 'Predicting, this may take a while...'
conv_model_predictions_augmented = conv_model.predict_generator(aug_train_batches,
aug_train_batches.nb_sample*2,
)
conv_model_predictions = conv_model.predict_generator(train_batches,
train_batches.nb_sample,
)
val_predictions = conv_model.predict_generator(val_batches,
val_batches.nb_sample,
)
print 'Done predicting!'
# Concatenate the two augmented passes with the single non-augmented pass
conv_model_predictions = np.concatenate([conv_model_predictions_augmented, conv_model_predictions])
# Labels repeat three times (two augmented passes plus one plain pass), all in the same unshuffled order
prediction_labels = to_categorical(train_batches.classes)
prediction_labels = np.concatenate([prediction_labels]*3)
In [ ]:
test_path = os.path.join(path, 'test')
test_generator = ImageDataGenerator().flow_from_directory(test_path,
target_size=(244,244),
batch_size=batch_size,
class_mode='categorical',
shuffle=False)
print 'Predicting test features, this might take a while...'
conv_model_test_inputs = conv_model.predict_generator(test_generator,
test_generator.nb_sample
)
print 'Done predicting!'
In [3]:
def save_array(location, array):
    # Persist a numpy array to disk as a compressed bcolz carray
    instance = bcolz.carray(array, rootdir=location, mode='w')
    instance.flush()

def load_array(location):
    # Read a bcolz carray back into memory as a plain numpy array
    return bcolz.open(location)[:]
In [ ]:
save_array(os.path.join(model_dir, 'test_inputs.bc'), conv_model_test_inputs)
In [ ]:
save_array(os.path.join(model_dir, 'conv_predictions.bc'), conv_model_predictions)
save_array(os.path.join(model_dir, 'conv_labels.bc'), prediction_labels)
save_array(os.path.join(model_dir, 'val_predictions.bc'), val_predictions)
save_array(os.path.join(model_dir, 'val_labels.bc'), to_categorical(val_batches.classes))
In [ ]:
conv_predictions = load_array(os.path.join(model_dir, 'conv_predictions.bc'))
conv_labels = load_array(os.path.join(model_dir, 'conv_labels.bc'))
conv_val_predictions = load_array(os.path.join(model_dir, 'val_predictions.bc'))
conv_val_labels = load_array(os.path.join(model_dir, 'val_labels.bc'))
In [ ]:
dropout = 0.8
model = fc_model(dropout)
epochs = 10
lr = 0.0001
model.compile(optimizer=Adam(lr),
loss='categorical_crossentropy',
metrics=['accuracy'])
model.optimizer.lr.set_value(lr)
model.fit(conv_predictions,
conv_labels,
batch_size=batch_size,
nb_epoch=epochs,
validation_data=(conv_val_predictions, conv_val_labels))
In [ ]:
lr = 0.00001
epochs = 2
model.optimizer.lr.set_value(lr)
model.fit(conv_predictions,
conv_labels,
batch_size=batch_size,
nb_epoch=epochs,
validation_data=(conv_val_predictions, conv_val_labels))
In [ ]:
model.save_weights(os.path.join(model_dir, 'final_predictor.h5'))
In [8]:
dropout = 0.8
model = fc_model(dropout)
lr = 0.0001
model.compile(optimizer=Adam(lr),
loss='categorical_crossentropy',
metrics=['accuracy'])
model.optimizer.lr.set_value(lr)
model.load_weights(os.path.join(model_dir, 'final_predictor.h5'))
In [ ]:
test_input = load_array(os.path.join(model_dir, 'test_inputs.bc'))
In [10]:
test_predictions = model.predict(test_input)
In [14]:
test_predictions[1:3,:]
Out[14]:
In [15]:
clipped_predictions = np.clip(test_predictions, 0.02, 0.98)
In [21]:
# Use the generator's own ordering so filenames line up with the rows of test_predictions
# (test_generator must still be in memory from the feature-extraction cell above)
filename_list = [os.path.basename(filename) for filename in test_generator.filenames]
In [31]:
filename_array = np.transpose(np.array(filename_list, ndmin=2))
In [38]:
csv_headless = np.concatenate([filename_array, clipped_predictions], axis=1)
In [46]:
header_list = [
'img',
'c0',
'c1',
'c2',
'c3',
'c4',
'c5',
'c6',
'c7',
'c8',
'c9',
]
header_line = np.array(header_list, ndmin=2)
In [53]:
ans_array = np.concatenate([header_line, csv_headless])
# ans_array = ans_array.astype('|S10')
In [54]:
np.savetxt(os.path.join(data_dir, "submission.csv"), ans_array, delimiter=',', fmt='%s')
In [55]:
data_dir
Out[55]:
In [ ]: