This notebook builds CNN models from scratch using Keras with the TensorFlow backend. First, some preparation work.
In [1]:
from keras.layers import Conv2D, MaxPooling2D, Input, Dense, Flatten, Activation, add
from keras.layers.core import Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import GlobalAveragePooling2D
from keras.optimizers import RMSprop
from keras.models import Model, Sequential, load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras import backend as ktf
from keras.preprocessing.image import ImageDataGenerator
from lib.data_utils import get_MNIST_data
import matplotlib.pyplot as plt
import numpy as np
%matplotlib notebook
Read the MNIST data. Note that we assume the file is 'kaggle-DigitRecognizer/data/train.csv', and we use a helper function to read it into a dictionary.
In [2]:
# by default there are 41000 training samples, 1000 test samples and 1000 validation samples (taken from the training set)
data = get_MNIST_data(num_validation=4000)
# see if we get the data correctly
print('image size: ', data['X_train'].shape)
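The helper itself lives in lib/data_utils.py and is not shown in this notebook. The sketch below is only an assumption of what it roughly does (reading the Kaggle train.csv, reshaping the pixels to (N, 28, 28, 1) and splitting off validation and test sets); the path, the 'label' column name and the split logic are illustrative, not the actual implementation.
# hypothetical sketch of get_MNIST_data; the real implementation is in lib/data_utils.py
import pandas as pd
import numpy as np

def get_MNIST_data_sketch(path='../data/train.csv', num_validation=1000, num_test=1000):
    raw = pd.read_csv(path)
    y = raw['label'].values  # digit labels 0-9
    X = raw.drop('label', axis=1).values.astype('float32').reshape(-1, 28, 28, 1)
    n = len(X)
    # hold out the last rows for validation and test; the rest is the training set
    X_train, y_train = X[:n - num_validation - num_test], y[:n - num_validation - num_test]
    X_val, y_val = X[n - num_validation - num_test:n - num_test], y[n - num_validation - num_test:n - num_test]
    X_test, y_test = X[n - num_test:], y[n - num_test:]
    return {'X_train': X_train, 'y_train': y_train,
            'X_val': X_val, 'y_val': y_val,
            'X_test': X_test, 'y_test': y_test}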
Build a simple CNN model using Keras and train it from scratch.
In [11]:
# model architecture:
# [batchnorm-Conv-Conv-maxpool-dropout]x2 - flatten - [dense-dropout] - [softmax]
# best random-search results so far (val_loss, val_acc, (lr, dropout, dense_dim, drop_conv, avgpool)):
#   new lowest: 1.01 0.79 (0.0121, 0.76, 1974, True, False)
#   new lowest: 1.23 0.73 (0.0044, 0.45, 1392, True, False)
simple_CNN = Sequential()
simple_CNN.add(BatchNormalization(input_shape=(28, 28, 1)))
simple_CNN.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
simple_CNN.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
simple_CNN.add(MaxPooling2D((2, 2))) # (14,14,32)
simple_CNN.add(Dropout(0.2))
simple_CNN.add(BatchNormalization())
simple_CNN.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
simple_CNN.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
simple_CNN.add(MaxPooling2D((2, 2))) # (7,7,64)
simple_CNN.add(Dropout(0.2))
simple_CNN.add(Flatten())
simple_CNN.add(Dense(1392, activation='relu'))
simple_CNN.add(Dropout(0.45))
simple_CNN.add(Dense(10, activation='softmax'))
# set loss and optimizer
rmsprop = RMSprop(lr=0.0044, decay=0.99)
simple_CNN.compile(loss='sparse_categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
# train the model
checkpoint = ModelCheckpoint('../models/simpleCNN_{epoch:02d}-{loss:.4f}.h5',
                             monitor='loss',
                             save_best_only=True)
earlystop = EarlyStopping(min_delta=0.0001, patience=3)
# use test data to monitor early stopping
simple_CNN.fit(data['X_train'], data['y_train'].reshape(-1, 1),
               batch_size=64,
               epochs=200,
               validation_data=(data['X_test'], data['y_test'].reshape(-1, 1)),
               callbacks=[checkpoint, earlystop],
               initial_epoch=0)
In [3]:
# resume training
model = load_model('../models/simpleCNN_86-0.0034.h5')
# set the loss and optimizer
rmsprop = RMSprop(lr=0.0000000044)
model.compile(optimizer=rmsprop, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# train the model
checkpoint = ModelCheckpoint('../models/simpleCNN_{epoch:02d}-{loss:.4f}.h5',
                             monitor='loss',
                             save_best_only=True)
earlystop = EarlyStopping(min_delta=0.0001, patience=5)
model.fit(data['X_train'], data['y_train'].reshape(-1, 1),
          batch_size=64,
          epochs=200,
          validation_data=(data['X_test'], data['y_test'].reshape(-1, 1)),
          callbacks=[checkpoint, earlystop],
          initial_epoch=87)
Build a small 22-layer ResNet using Keras and train it from scratch.
In [ ]:
# model architecture (22 conv layers in total)
# stem: [Conv-batchnorm-relu]x3 - Conv-batchnorm - maxpool                   (4 conv layers)
# body: [residual: [Conv-batchnorm-relu]x2 - Conv-batchnorm - add - relu]x6  (18 conv layers)
inputs = Input(shape=(28, 28, 1))
x = Conv2D(64, (7, 7), padding='same')(inputs)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Conv2D(64, (1, 1), padding='same')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Conv2D(64, (3, 3), padding='same')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Conv2D(256, (1, 1), padding='same')(x)
x = BatchNormalization()(x)
res = MaxPooling2D((2, 2))(x) # (14, 14, 256)
# repeated residual modules
for i in range(6):  # 6 residual blocks x 3 conv layers = 18 conv layers
    x = Conv2D(64, (1, 1), padding='same')(res)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(64, (3, 3), padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(256, (1, 1), padding='same')(x)
    x = BatchNormalization()(x)
    x = add([x, res])
    res = Activation('relu')(x)
x = GlobalAveragePooling2D(data_format='channels_last')(res) # (None, 256)
predictions = Dense(10, activation='softmax')(x)
# connect the model
mini_ResNet = Model(inputs=inputs, outputs=predictions)
# set loss and optimizer
rmsprop = RMSprop(lr=0.1, decay=0.9999)
mini_ResNet.compile(loss='sparse_categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
# train the model
checkpoint = ModelCheckpoint('miniResNet_{epoch:02d}-{acc:.2f}.h5',
                             monitor='acc',
                             save_best_only=True)
plateau = ReduceLROnPlateau(factor=0.1, patience=3, min_lr=0.0001)
mini_ResNet.fit(data['X_train'], data['y_train'].reshape(-1, 1),
                batch_size=32, epochs=10,
                callbacks=[checkpoint, plateau])
# test the model and see accuracy
score = mini_ResNet.evaluate(data['X_test'], data['y_test'].reshape(-1, 1), batch_size=32)
print(score)
In [ ]:
# save the model: 0.903
mini_ResNet.save('mini_ResNet.h5')
Inspired by ResNet, we add residual connections to the simple CNN model above and see whether they make a difference in performance.
In [17]:
# model architecture
# [Conv] - [batchnorm-Conv-Conv-add-maxpool]x2 - [global average pooling] - [softmax]
inputs = Input(shape=(28,28,1))
x = Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
res = BatchNormalization()(x) # (28, 28, 64)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(res)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = add([res, x])
x = MaxPooling2D((2, 2))(x)
res = BatchNormalization()(x) # (14, 14, 64)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(res)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = add([res, x])
x = MaxPooling2D((2, 2))(x)
x = GlobalAveragePooling2D(data_format='channels_last')(x)
predictions = Dense(10, activation='softmax')(x)
simple_resCNN = Model(inputs=inputs,outputs=predictions)
# set loss and optimizer
rmsprop = RMSprop(lr=0.01, decay=0.978)
simple_resCNN.compile(loss='sparse_categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
# train the model
checkpoint = ModelCheckpoint('../models/simpleResCNN_{epoch:02d}-{loss:.4f}.h5',
                             monitor='loss',
                             save_best_only=True)
earlystop = EarlyStopping(min_delta=0.0001, patience=5)
# use test data to monitor early stopping
simple_resCNN.fit(data['X_train'], data['y_train'].reshape(-1, 1),
                  batch_size=64,
                  epochs=200,
                  validation_data=(data['X_test'], data['y_test'].reshape(-1, 1)),
                  callbacks=[checkpoint, earlystop],
                  initial_epoch=0)
In [6]:
# resume training
model = load_model('../models/simpleCNN_29-0.4773.h5')
# set the loss and optimizer
rmsprop = RMSprop(lr=0.00001,decay=0.978)
model.compile(optimizer=rmsprop, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# train the model
checkpoint = ModelCheckpoint('../models/simpleCNN_{epoch:02d}-{loss:.4f}.h5',
                             monitor='loss',
                             save_best_only=True)
earlystop = EarlyStopping(min_delta=0.0001, patience=5)
model.fit(data['X_train'], data['y_train'].reshape(-1, 1),
          batch_size=64,
          epochs=200,
          validation_data=(data['X_test'], data['y_test'].reshape(-1, 1)),
          callbacks=[checkpoint, earlystop],
          initial_epoch=26)
In [14]:
# validate the structure
inputs = Input(shape=(28,28,1))
x = Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
res = BatchNormalization()(x) # (28, 28, 64)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(res)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = add([res, x])
x = MaxPooling2D((2, 2))(x)
res = BatchNormalization()(x) # (14, 14, 64)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(res)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = add([res, x])
x = MaxPooling2D((2, 2))(x)
x = GlobalAveragePooling2D(data_format='channels_last')(x)
predictions = Dense(10, activation='softmax')(x)
simple_resCNN = Model(inputs=inputs,outputs=predictions)
# set loss and optimizer
rmsprop = RMSprop(lr=0.001, decay=0.978)
simple_resCNN.compile(loss='sparse_categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
# quick sanity run: fit 1 epoch on the validation split and evaluate on the test split
simple_resCNN.fit(data['X_val'], data['y_val'].reshape(-1, 1),
                  batch_size=64,
                  epochs=1,
                  validation_data=(data['X_test'], data['y_test'].reshape(-1, 1)),
                  initial_epoch=0)
It's often helpful to look at the examples the model gets wrong. Here we randomly pick 10 misclassified images from the test set.
In [3]:
model = load_model('../models/simpleResCNN.h5')
pred = np.argmax(model.predict(data['X_test']), axis=1)
wrong_idx = [i for i in range(len(pred)) if pred[i] != data['y_test'][i]]
In [12]:
np.random.shuffle(wrong_idx)
fig = plt.figure(figsize=(4, 5))
for i in range(1, 6):
    for j in range(1, 3):
        idx = wrong_idx.pop()
        fig.add_subplot(2, 5, (j - 1) * 5 + i)  # 2x5 grid of misclassified digits
        plt.imshow(data['X_test'][idx].reshape((28, 28)))
        plt.axis('off')
        plt.title(pred[idx])
plt.show()
We fine-tune the hyperparameters and the structure of the simple CNN. Here we use random search over the following: the initial learning rate, the dropout rate, the hidden-layer size, and two structural 'switches' (whether to apply dropout after the conv blocks, and whether to use global average pooling instead of flattening).
In [4]:
# build the model, train for 1 epoch on the validation split, and return the test loss and accuracy
def simpleCNN_model(lr=0.001, dropout=0.5, dense_dim=1024, drop_conv=True, avgpool=True):
    simple_CNN = Sequential()
    simple_CNN.add(BatchNormalization(input_shape=(28, 28, 1)))
    simple_CNN.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
    simple_CNN.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
    simple_CNN.add(MaxPooling2D((2, 2)))  # (14,14,32)
    if drop_conv:
        simple_CNN.add(Dropout(0.2))
    simple_CNN.add(BatchNormalization())
    simple_CNN.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    simple_CNN.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    simple_CNN.add(MaxPooling2D((2, 2)))  # (7,7,64)
    if drop_conv:
        simple_CNN.add(Dropout(0.2))
    if avgpool:
        simple_CNN.add(GlobalAveragePooling2D())
    else:
        simple_CNN.add(Flatten())
    simple_CNN.add(Dense(dense_dim, activation='relu'))
    simple_CNN.add(Dropout(dropout))
    simple_CNN.add(Dense(10, activation='softmax'))
    # set loss and optimizer
    rmsprop = RMSprop(lr=lr, decay=0.999)
    simple_CNN.compile(loss='sparse_categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
    # train on the validation split for one epoch; use the test data to measure performance
    history = simple_CNN.fit(data['X_val'], data['y_val'].reshape(-1, 1),
                             batch_size=64,
                             epochs=1,
                             validation_data=(data['X_test'], data['y_test'].reshape(-1, 1)),
                             initial_epoch=0,
                             verbose=False)
    return history.history['val_loss'][0], history.history['val_acc'][0]
First, random search. Each run trains for only 1 epoch on the 4000 validation samples. Hyperparameters are sampled from uniform distributions over different ranges (the hidden unit size is rounded down to an integer), and the structural 'switches' are sampled from a binomial distribution (p=0.5).
In [7]:
# validation: 4000 samples; 1 epoch per run
# finetune list: initial learning rate, dropout rate, hidden unit size (plus the two structural switches)
best_parameters = {'lr': 0.001, 'dropout': 0.5, 'dense_dim': 1024}
lowest_err = 1000
lr_range = (0.0001, 0.1); dropout_range = (0.3, 0.8); dense_range = (512, 2048)
# keep sampling until interrupted manually
while True:
    lr = np.random.uniform(lr_range[0], lr_range[1])
    dropout = np.random.uniform(dropout_range[0], dropout_range[1])
    dense_dim = int(np.random.uniform(dense_range[0], dense_range[1]))
    drop_conv, avgpool = np.random.binomial(1, 0.5, 2)
    ktf.clear_session()
    test_err, test_acc = simpleCNN_model(lr, dropout, dense_dim, drop_conv, avgpool)
    if test_err < lowest_err:
        print('new lowest: ', round(test_err, 2), round(test_acc, 2),
              (round(lr, 4), round(dropout, 2), dense_dim, bool(drop_conv), bool(avgpool)))
        lowest_err = test_err
        best_parameters['lr'] = lr
        best_parameters['dropout'] = dropout
        best_parameters['dense_dim'] = dense_dim
Here we try to reduce the error from a different angle: the data. In theory, the classifier learns better with more data, and data augmentation is a way to enlarge the training set. It does so by applying small transformations to the original images and training the model on both the original and the transformed data.
In [11]:
# set the data generator to transform the data
idg = ImageDataGenerator(width_shift_range=0.05,
                         fill_mode='constant')
# build the model
simple_CNN = Sequential()
simple_CNN.add(BatchNormalization(input_shape=(28, 28, 1)))
simple_CNN.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
simple_CNN.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
simple_CNN.add(MaxPooling2D((2, 2))) # (14,14,32)
simple_CNN.add(Dropout(0.2))
simple_CNN.add(BatchNormalization())
simple_CNN.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
simple_CNN.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
simple_CNN.add(MaxPooling2D((2, 2))) # (7,7,64)
simple_CNN.add(Dropout(0.2))
simple_CNN.add(Flatten())
simple_CNN.add(Dense(1392, activation='relu'))
simple_CNN.add(Dropout(0.45))
simple_CNN.add(Dense(10, activation='softmax'))
# set loss and optimizer
rmsprop = RMSprop(lr=0.0044, decay=0.99)
simple_CNN.compile(loss='sparse_categorical_crossentropy',
                   optimizer=rmsprop,
                   metrics=['accuracy'])
# train the model on augmented batches generated on the fly
checkpoint = ModelCheckpoint('../models/simpleCNN_aug_{epoch:02d}-{loss:.4f}.h5',
                             monitor='loss',
                             save_best_only=True)
earlystop = EarlyStopping(min_delta=0.0001, patience=3)
simple_CNN.fit_generator(idg.flow(data['X_train'],
                                  data['y_train'].reshape(-1, 1),
                                  batch_size=64),
                         steps_per_epoch=len(data['X_train']) // 64,
                         initial_epoch=0,
                         epochs=100,
                         callbacks=[checkpoint, earlystop],
                         validation_data=(data['X_test'], data['y_test'].reshape(-1, 1)))
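Before running long augmented training it can also help to eyeball what the generator actually produces. The following snippet is only an illustrative sanity check, assuming the idg and data objects defined above; it is not part of the original runs.
# draw one augmented batch and display the images; purely a visual sanity check
x_batch, y_batch = next(idg.flow(data['X_train'], data['y_train'].reshape(-1, 1), batch_size=8))
fig = plt.figure(figsize=(8, 2))
for i in range(8):
    ax = fig.add_subplot(1, 8, i + 1)
    ax.imshow(x_batch[i].reshape(28, 28), cmap='gray')
    ax.set_title(int(y_batch[i][0]))
    ax.axis('off')
plt.show()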
In [6]:
# resume training
model = load_model('../models/simpleCNN_aug_44-0.9536.h5')
# set the loss and optimizer
rmsprop = RMSprop(lr=0.00044,decay=0.99)
model.compile(optimizer=rmsprop, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# train the model
checkpoint = ModelCheckpoint('../models/simpleCNN_aug_{epoch:02d}-{loss:.4f}.h5',
                             monitor='loss',
                             save_best_only=True)
earlystop = EarlyStopping(min_delta=0.0001, patience=5)
model.fit_generator(idg.flow(data['X_train'],
                             data['y_train'].reshape(-1, 1),
                             batch_size=64),
                    steps_per_epoch=len(data['X_train']) // 64,
                    initial_epoch=45,
                    epochs=100,
                    callbacks=[checkpoint, earlystop],
                    validation_data=(data['X_test'], data['y_test'].reshape(-1, 1)))
Load the saved trained models and produce predictions for submission to Kaggle.
In [4]:
from lib.data_utils import create_submission
from keras.models import load_model
# for simple CNN model
model = load_model('../models/simpleCNN_86-0.0034.h5')
print('Load model successfully.')
create_submission(model, '../data/test.csv', '../submission/submission_simpleCNN_tuned_87.csv', 128)
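create_submission is the other helper from lib/data_utils.py and is not shown here either. Below is a minimal sketch of what it is assumed to do, matching the call above (read the Kaggle test.csv, predict in batches, write an ImageId/Label CSV); the real implementation may differ.
# hypothetical sketch of create_submission; the real implementation is in lib/data_utils.py
import pandas as pd
import numpy as np

def create_submission_sketch(model, test_csv, out_csv, batch_size=128):
    X = pd.read_csv(test_csv).values.astype('float32').reshape(-1, 28, 28, 1)
    labels = np.argmax(model.predict(X, batch_size=batch_size), axis=1)
    submission = pd.DataFrame({'ImageId': np.arange(1, len(labels) + 1), 'Label': labels})
    submission.to_csv(out_csv, index=False)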
In [8]:
val_loss, val_acc = simpleCNN_model()
In [10]:
print(val_loss, val_acc)
Sample output from the random-search cell above:
new lowest: 1.85 0.76 (0.0009, 0.58, 1892, False, False)
new lowest: 1.78 0.52 (0.0077, 0.34, 867, True, False)