In [1]:
from __future__ import print_function
import time
import numpy as np
import theano
import theano.tensor as T
import lasagne

from cifar10_data import load_cifar10
import lasagne_trainer


Using gpu device 0: GRID K520 (CNMeM is enabled)

In [3]:
# verify that cuDNN is enabled; version() returns the
# (compile-time, runtime) cuDNN version pair, and 3002 means cuDNN v3
theano.sandbox.cuda.dnn.version()


Out[3]:
(3002, 3002)

In [4]:
# set up plots

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

# reload external libs during development
%load_ext autoreload
%autoreload 2

load data


In [13]:
# get data
X_train, y_train, X_val, y_val, X_test, y_test = load_cifar10()
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)


Train data shape:  (49000, 3, 32, 32)
Train labels shape:  (49000,)
Validation data shape:  (1000, 3, 32, 32)
Validation labels shape:  (1000,)
Test data shape:  (1000, 32, 32, 3)
Test labels shape:  (1000,)
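
Note that the test split comes back channels-last, (1000, 32, 32, 3), while
train and val are channels-first, (N, 3, 32, 32). The Theano graph below
expects (N, C, H, W), so X_test needs a transpose before evaluation; a minimal
fix (an aside, not part of the original run):

# bring the test split to the same (N, C, H, W) layout as train/val
if X_test.shape[1:] == (32, 32, 3):
    X_test = np.ascontiguousarray(X_test.transpose(0, 3, 1, 2))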

theano input_var


In [7]:
input_var = T.tensor4('inputs')

two-layer network


In [4]:
def create_twolayer(input_var, input_shape=(3, 32, 32),
                    num_hidden_units=100, num_classes=10,
                    **junk):
    # input layer
    network = lasagne.layers.InputLayer(shape=(None,) + input_shape,
                                        input_var=input_var)
    # fc-relu
    network = lasagne.layers.DenseLayer(
        network, num_units=num_hidden_units,
        nonlinearity=lasagne.nonlinearities.rectify)
    # output layer
    network = lasagne.layers.DenseLayer(
        lasagne.layers.dropout(network, p=.5),
        num_units=num_classes,
        nonlinearity=lasagne.nonlinearities.softmax)
    return network
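
The lasagne_trainer.train helper used throughout is not shown in this notebook.
Roughly, it has to compile a stochastic training function and a deterministic
evaluation function against input_var; a minimal sketch of that setup, with
illustrative names (target_var, train_fn, val_fn are assumptions, not the
helper's actual API):

target_var = T.ivector('targets')
network = create_twolayer(input_var)

# training: forward pass with dropout active, cross-entropy loss, momentum updates
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var).mean()
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.momentum(loss, params, learning_rate=1e-4, momentum=0.9)
train_fn = theano.function([input_var, target_var], loss, updates=updates)

# evaluation: deterministic forward pass (dropout off), mean accuracy
test_prediction = lasagne.layers.get_output(network, deterministic=True)
acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
             dtype=theano.config.floatX)
val_fn = theano.function([input_var, target_var], acc)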

In [ ]:
# defaults
param = dict(reg=1e-1,
             update='momentum', momentum=0.9,
             learning_rate=1e-4, learning_rate_decay=0.95,
             num_epochs=5)

In [ ]:
# experiment log: [min loss] train_acc val_acc ('?????' = loss not recorded)

# 1.049 0.650 0.551
# 1.106 0.667 0.561
# => test accuracy: 0.548
param.update(reg=1e0, hidden_size=200, num_epochs=40, learning_rate=1e-4)

# 1.649 0.424 0.426
param.update(reg=1e1, hidden_size=200, num_epochs=40, learning_rate=1e-4)

# ????? 0.868 0.555
# 0.477 0.860 0.546
param.update(reg=1e-1, hidden_size=200, num_epochs=40, learning_rate=1e-4)

# ????? 0.578 0.419
param.update(reg=1e-1, hidden_size=200, num_epochs=40, learning_rate=1e-3)

# 1.054 0.566 0.505
param.update(reg=1e-1, hidden_size=200, num_epochs=40, learning_rate=1e-5)

# 0.724 0.537
param.update(reg=1e-1, hidden_size=200, num_epochs=20, learning_rate=1e-4)

# 0.606 0.530
param.update(reg=1.0, hidden_size=200, num_epochs=20, learning_rate=1e-4)

# 0.534 0.474
param.update(reg=1e-1, hidden_size=200, num_epochs=20, learning_rate=1e-5)

# 0.504 0.469
param.update(reg=1.0, hidden_size=200, num_epochs=20, learning_rate=1e-5)

In [ ]:
# two-layer

network = create_twolayer(input_var, num_hidden_units=128, num_classes=10)
# sanity check: the net should be able to overfit a tiny 50-sample training set
xxx, loss_history, train_acc_history, val_acc_history = lasagne_trainer.train(
          network, input_var, X_train[:50], y_train[:50], X_val, y_val,
          learning_rate=1e-4, batch_size=10, num_epochs=10)

In [57]:
model, loss_history, train_acc_history, val_acc_history = lasagne_trainer.train(
    network, input_var, X_train, y_train, X_val, y_val,
    learning_rate=1e-4, num_epochs=10, batch_size=50)
print('%.3f' % min(loss_history), max(train_acc_history), max(val_acc_history),
      ' '.join('%s=%s' % (k, param[k]) for k in param))


Compiling...
Training...
epoch 1 / 10 in 0.9s: cost 19.949498, train: 0.262000, val 0.229000, lr 1.000000e-04
epoch 2 / 10 in 0.9s: cost 3.706224, train: 0.279000, val 0.283000, lr 1.000000e-04
epoch 3 / 10 in 0.9s: cost 3.098231, train: 0.287000, val 0.298000, lr 1.000000e-04
epoch 4 / 10 in 0.9s: cost 2.809862, train: 0.321000, val 0.312000, lr 1.000000e-04
epoch 5 / 10 in 0.9s: cost 2.680390, train: 0.340000, val 0.306000, lr 1.000000e-04
epoch 6 / 10 in 0.9s: cost 2.550387, train: 0.336000, val 0.315000, lr 1.000000e-04
epoch 7 / 10 in 0.9s: cost 2.479211, train: 0.313000, val 0.321000, lr 1.000000e-04
epoch 8 / 10 in 0.9s: cost 2.420109, train: 0.372000, val 0.332000, lr 1.000000e-04
epoch 9 / 10 in 0.9s: cost 2.367520, train: 0.359000, val 0.355000, lr 1.000000e-04
epoch 10 / 10 in 0.9s: cost 2.320339, train: 0.364000, val 0.345000, lr 1.000000e-04
('1.633', 0.37199999690055846, 0.35499999970197677, 'crp_filter_size=7 num_fc=1 learning_rate=0.0001 batch_size=100 fc_num_units=32 num_epochs=5 crp_num_filters=32 reg=0.001 num_crp=1')

v1: [conv-relu-pool]xN - conv - relu - [affine]xM - [softmax or SVM]

v2: [conv-relu-pool]xN - [affine]xM - [softmax or SVM]


In [5]:
def create_v1(input_var, input_shape=(3, 32, 32),
              num_crp=1, crp_num_filters=32, crp_filter_size=5,
              num_cr=1,
              num_fc=1, fc_num_units=64,
              output_type='softmax', num_classes=10,
              **junk):
    # input layer
    network = lasagne.layers.InputLayer(shape=(None,) + input_shape,
                                        input_var=input_var)
    # conv-relu-pool layers
    for i in range(num_crp):
        network = lasagne.layers.Conv2DLayer(
            network, num_filters=crp_num_filters,
            filter_size=(crp_filter_size, crp_filter_size),
            pad='same',
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.GlorotUniform(gain='relu'))
        network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))
    # conv-relu (reuses the crp_* filter settings)
    for i in range(num_cr):
        network = lasagne.layers.Conv2DLayer(
            network, num_filters=crp_num_filters,
            filter_size=(crp_filter_size, crp_filter_size),
            pad='same',
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.GlorotUniform(gain='relu'))
    # fc-relu (num_fc counts the output affine layer below, hence num_fc - 1)
    for i in range(num_fc - 1):
        network = lasagne.layers.DenseLayer(
            lasagne.layers.dropout(network, p=.5),
            num_units=fc_num_units,
            nonlinearity=lasagne.nonlinearities.rectify)
    # output layer
    assert output_type == 'softmax'
    network = lasagne.layers.DenseLayer(
        lasagne.layers.dropout(network, p=.5),
        num_units=num_classes,
        nonlinearity=lasagne.nonlinearities.softmax)
    return network
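
A quick way to sanity-check what create_v1 builds (an illustrative aside, not
part of the original notebook) is to walk the layer stack and print each
layer's output shape:

net = create_v1(input_var, num_crp=3, crp_filter_size=3, crp_num_filters=128,
                num_fc=3, fc_num_units=256)
for layer in lasagne.layers.get_all_layers(net):
    print(type(layer).__name__, lasagne.layers.get_output_shape(layer))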

In [113]:
# default params
param = dict(num_crp=1, crp_filter_size=7, crp_num_filters=32,
             num_fc=1, fc_num_units=32,
             reg=1e-3,
             learning_rate=1e-4, learning_rate_decay=0.95,
             momentum=0.9, momentum_decay=0.9,
             batch_size=100, num_epochs=20)
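
The per-epoch schedule visible in the training logs below (the learning rate is
multiplied by learning_rate_decay after every epoch, and the momentum is
annealed toward a 0.9999 cap by momentum_decay) works out to roughly this;
an inferred sketch, since the actual logic lives in lasagne_trainer:

lr, mom = param['learning_rate'], param['momentum']
for epoch in range(param['num_epochs']):
    # ... one epoch of minibatch updates at (lr, mom) runs here ...
    lr *= param['learning_rate_decay']
    mom = min(1 - (1 - mom) * param['momentum_decay'], 0.9999)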

In [114]:
param.update(num_crp=3, crp_filter_size=3, crp_num_filters=128,
             num_fc=3, fc_num_units=256, num_epochs=100,
             learning_rate=1e-4, batch_size=128)

In [116]:
network = create_v1(input_var, **param)
# sanity check: overfit a tiny 50-sample training set
xxx, loss_history, train_acc_history, val_acc_history = lasagne_trainer.train(
          network, input_var, X_train[:50], y_train[:50], X_val, y_val,
          learning_rate=1e-4, batch_size=10, num_epochs=20)


Compiling...
Training...
epoch 1 / 20 in 0.9s: loss 71.401446, train: 0.140, val 0.116, lr 9.500000e-05 mom 9.100000e-01
epoch 2 / 20 in 0.9s: loss 33.159449, train: 0.100, val 0.112, lr 9.025000e-05 mom 9.190000e-01
epoch 3 / 20 in 0.9s: loss 14.697625, train: 0.220, val 0.128, lr 8.573750e-05 mom 9.270999e-01
epoch 4 / 20 in 0.9s: loss 7.475414, train: 0.220, val 0.130, lr 8.145062e-05 mom 9.343899e-01
epoch 5 / 20 in 0.9s: loss 4.678493, train: 0.260, val 0.148, lr 7.737809e-05 mom 9.409509e-01
epoch 6 / 20 in 0.9s: loss 3.727562, train: 0.200, val 0.127, lr 7.350919e-05 mom 9.468558e-01
epoch 7 / 20 in 0.9s: loss 3.771379, train: 0.220, val 0.108, lr 6.983373e-05 mom 9.521703e-01
epoch 8 / 20 in 0.9s: loss 3.426452, train: 0.260, val 0.112, lr 6.634204e-05 mom 9.569532e-01
epoch 9 / 20 in 0.9s: loss 3.173775, train: 0.240, val 0.111, lr 6.302494e-05 mom 9.612579e-01
epoch 10 / 20 in 0.9s: loss 2.937779, train: 0.200, val 0.113, lr 5.987370e-05 mom 9.651321e-01
epoch 11 / 20 in 0.9s: loss 2.691136, train: 0.260, val 0.111, lr 5.688001e-05 mom 9.686189e-01
epoch 12 / 20 in 0.9s: loss 2.438592, train: 0.300, val 0.111, lr 5.403601e-05 mom 9.717571e-01
epoch 13 / 20 in 0.9s: loss 2.427982, train: 0.240, val 0.113, lr 5.133421e-05 mom 9.745814e-01
epoch 14 / 20 in 0.9s: loss 2.301744, train: 0.300, val 0.120, lr 4.876750e-05 mom 9.771232e-01
epoch 15 / 20 in 0.9s: loss 2.302263, train: 0.320, val 0.124, lr 4.632913e-05 mom 9.794109e-01
epoch 16 / 20 in 0.9s: loss 2.260264, train: 0.300, val 0.117, lr 4.401267e-05 mom 9.814698e-01
epoch 17 / 20 in 0.9s: loss 2.365086, train: 0.260, val 0.118, lr 4.181203e-05 mom 9.833229e-01
epoch 18 / 20 in 0.9s: loss 2.178240, train: 0.320, val 0.123, lr 3.972143e-05 mom 9.849906e-01
epoch 19 / 20 in 0.9s: loss 2.342404, train: 0.340, val 0.119, lr 3.773536e-05 mom 9.864916e-01
epoch 20 / 20 in 0.9s: loss 2.191207, train: 0.400, val 0.110, lr 3.584859e-05 mom 9.878424e-01

In [117]:
model, loss_history, train_acc_history, val_acc_history = lasagne_trainer.train(
    network, input_var, X_train, y_train, X_val, y_val,
    learning_rate=param['learning_rate'], num_epochs=param['num_epochs'],
    batch_size=param['batch_size'])
print('%.3f' % min(loss_history), max(train_acc_history), max(val_acc_history),
      ' '.join('%s=%s' % (k, param[k]) for k in param))


Compiling...
Training...
epoch 1 / 100 in 119.0s: loss 2.318504, train: 0.134, val 0.157, lr 9.500000e-05 mom 9.100000e-01
epoch 2 / 100 in 118.9s: loss 2.300581, train: 0.166, val 0.146, lr 9.025000e-05 mom 9.190000e-01
epoch 3 / 100 in 119.0s: loss 2.292756, train: 0.206, val 0.191, lr 8.573750e-05 mom 9.270999e-01
epoch 4 / 100 in 119.0s: loss 2.275991, train: 0.248, val 0.232, lr 8.145062e-05 mom 9.343899e-01
epoch 5 / 100 in 119.0s: loss 2.255572, train: 0.265, val 0.253, lr 7.737809e-05 mom 9.409509e-01
epoch 6 / 100 in 119.0s: loss 2.229092, train: 0.292, val 0.258, lr 7.350919e-05 mom 9.468558e-01
epoch 7 / 100 in 119.0s: loss 2.200565, train: 0.289, val 0.285, lr 6.983373e-05 mom 9.521703e-01
epoch 8 / 100 in 119.0s: loss 2.173981, train: 0.307, val 0.299, lr 6.634204e-05 mom 9.569532e-01
epoch 9 / 100 in 119.0s: loss 2.142996, train: 0.318, val 0.311, lr 6.302494e-05 mom 9.612579e-01
epoch 10 / 100 in 118.9s: loss 2.109364, train: 0.335, val 0.338, lr 5.987370e-05 mom 9.651321e-01
epoch 11 / 100 in 118.9s: loss 2.075525, train: 0.367, val 0.372, lr 5.688001e-05 mom 9.686189e-01
epoch 12 / 100 in 118.9s: loss 2.042083, train: 0.369, val 0.379, lr 5.403601e-05 mom 9.717571e-01
epoch 13 / 100 in 119.0s: loss 2.003143, train: 0.355, val 0.374, lr 5.133421e-05 mom 9.745814e-01
epoch 14 / 100 in 119.0s: loss 1.968326, train: 0.382, val 0.401, lr 4.876750e-05 mom 9.771232e-01
epoch 15 / 100 in 118.9s: loss 1.935849, train: 0.400, val 0.416, lr 4.632913e-05 mom 9.794109e-01
epoch 16 / 100 in 119.0s: loss 1.896845, train: 0.394, val 0.410, lr 4.401267e-05 mom 9.814698e-01
epoch 17 / 100 in 118.9s: loss 1.870250, train: 0.405, val 0.429, lr 4.181203e-05 mom 9.833229e-01
epoch 18 / 100 in 118.9s: loss 1.837316, train: 0.415, val 0.439, lr 3.972143e-05 mom 9.849906e-01
epoch 19 / 100 in 118.9s: loss 1.807514, train: 0.432, val 0.434, lr 3.773536e-05 mom 9.864916e-01
epoch 20 / 100 in 118.9s: loss 1.771687, train: 0.423, val 0.464, lr 3.584859e-05 mom 9.878424e-01
epoch 21 / 100 in 118.9s: loss 1.737611, train: 0.438, val 0.467, lr 3.405616e-05 mom 9.890581e-01
epoch 22 / 100 in 118.9s: loss 1.711730, train: 0.448, val 0.477, lr 3.235336e-05 mom 9.901523e-01
epoch 23 / 100 in 119.0s: loss 1.673019, train: 0.477, val 0.499, lr 3.073569e-05 mom 9.911371e-01
epoch 24 / 100 in 118.9s: loss 1.644735, train: 0.460, val 0.488, lr 2.919891e-05 mom 9.920233e-01
epoch 25 / 100 in 119.0s: loss 1.610350, train: 0.485, val 0.493, lr 2.773896e-05 mom 9.928210e-01
epoch 26 / 100 in 119.0s: loss 1.587085, train: 0.500, val 0.503, lr 2.635201e-05 mom 9.935389e-01
epoch 27 / 100 in 118.9s: loss 1.553481, train: 0.500, val 0.513, lr 2.503441e-05 mom 9.941850e-01
epoch 28 / 100 in 118.9s: loss 1.533502, train: 0.517, val 0.536, lr 2.378269e-05 mom 9.947665e-01
epoch 29 / 100 in 118.9s: loss 1.506634, train: 0.499, val 0.532, lr 2.259355e-05 mom 9.952899e-01
epoch 30 / 100 in 118.9s: loss 1.482524, train: 0.516, val 0.526, lr 2.146388e-05 mom 9.957609e-01
epoch 31 / 100 in 118.9s: loss 1.463194, train: 0.539, val 0.535, lr 2.039068e-05 mom 9.961848e-01
epoch 32 / 100 in 119.0s: loss 1.448789, train: 0.546, val 0.535, lr 1.937115e-05 mom 9.965663e-01
epoch 33 / 100 in 118.9s: loss 1.430224, train: 0.544, val 0.549, lr 1.840259e-05 mom 9.969097e-01
epoch 34 / 100 in 119.0s: loss 1.404319, train: 0.567, val 0.552, lr 1.748246e-05 mom 9.972187e-01
epoch 35 / 100 in 118.9s: loss 1.385945, train: 0.540, val 0.545, lr 1.660834e-05 mom 9.974968e-01
epoch 36 / 100 in 119.0s: loss 1.367607, train: 0.561, val 0.569, lr 1.577792e-05 mom 9.977472e-01
epoch 37 / 100 in 118.9s: loss 1.341838, train: 0.583, val 0.562, lr 1.498903e-05 mom 9.979725e-01
epoch 38 / 100 in 118.9s: loss 1.329648, train: 0.580, val 0.552, lr 1.423957e-05 mom 9.981753e-01
epoch 39 / 100 in 119.0s: loss 1.310204, train: 0.592, val 0.573, lr 1.352760e-05 mom 9.983577e-01
epoch 40 / 100 in 118.9s: loss 1.289379, train: 0.593, val 0.589, lr 1.285122e-05 mom 9.985219e-01
epoch 41 / 100 in 118.9s: loss 1.267939, train: 0.597, val 0.587, lr 1.220866e-05 mom 9.986697e-01
epoch 42 / 100 in 119.0s: loss 1.255105, train: 0.622, val 0.590, lr 1.159822e-05 mom 9.988028e-01
epoch 43 / 100 in 119.0s: loss 1.239694, train: 0.602, val 0.590, lr 1.101831e-05 mom 9.989225e-01
epoch 44 / 100 in 119.0s: loss 1.210926, train: 0.612, val 0.600, lr 1.046740e-05 mom 9.990303e-01
epoch 45 / 100 in 118.9s: loss 1.196522, train: 0.637, val 0.637, lr 9.944027e-06 mom 9.991273e-01
epoch 46 / 100 in 118.9s: loss 1.179403, train: 0.641, val 0.621, lr 9.446825e-06 mom 9.992145e-01
epoch 47 / 100 in 118.9s: loss 1.173354, train: 0.625, val 0.617, lr 8.974484e-06 mom 9.992931e-01
epoch 48 / 100 in 118.9s: loss 1.148145, train: 0.671, val 0.635, lr 8.525760e-06 mom 9.993638e-01
epoch 49 / 100 in 119.0s: loss 1.130034, train: 0.665, val 0.654, lr 8.099471e-06 mom 9.994274e-01
epoch 50 / 100 in 118.9s: loss 1.127878, train: 0.662, val 0.651, lr 7.694498e-06 mom 9.994847e-01
epoch 51 / 100 in 118.9s: loss 1.110297, train: 0.696, val 0.661, lr 7.309773e-06 mom 9.995362e-01
epoch 52 / 100 in 119.0s: loss 1.080579, train: 0.682, val 0.647, lr 6.944284e-06 mom 9.995826e-01
epoch 53 / 100 in 118.9s: loss 1.067140, train: 0.696, val 0.660, lr 6.597070e-06 mom 9.996243e-01
epoch 54 / 100 in 119.0s: loss 1.057995, train: 0.698, val 0.663, lr 6.267217e-06 mom 9.996619e-01
epoch 55 / 100 in 119.0s: loss 1.042966, train: 0.722, val 0.667, lr 5.953856e-06 mom 9.996957e-01
epoch 56 / 100 in 118.9s: loss 1.027454, train: 0.711, val 0.677, lr 5.656163e-06 mom 9.997261e-01
epoch 57 / 100 in 119.0s: loss 1.011541, train: 0.715, val 0.686, lr 5.373355e-06 mom 9.997535e-01
epoch 58 / 100 in 119.0s: loss 0.995829, train: 0.734, val 0.679, lr 5.104687e-06 mom 9.997782e-01
epoch 59 / 100 in 118.9s: loss 0.993285, train: 0.708, val 0.690, lr 4.849453e-06 mom 9.998003e-01
epoch 60 / 100 in 118.9s: loss 0.984133, train: 0.732, val 0.686, lr 4.606980e-06 mom 9.998203e-01
epoch 61 / 100 in 118.9s: loss 0.974277, train: 0.720, val 0.651, lr 4.376631e-06 mom 9.998382e-01
epoch 62 / 100 in 118.9s: loss 0.967135, train: 0.715, val 0.675, lr 4.157799e-06 mom 9.998544e-01
epoch 63 / 100 in 119.0s: loss 0.959195, train: 0.709, val 0.710, lr 3.949909e-06 mom 9.998689e-01
epoch 64 / 100 in 119.0s: loss 0.958395, train: 0.717, val 0.645, lr 3.752414e-06 mom 9.998820e-01
epoch 65 / 100 in 118.9s: loss 0.961230, train: 0.749, val 0.667, lr 3.564793e-06 mom 9.998938e-01
epoch 66 / 100 in 119.0s: loss 0.956396, train: 0.728, val 0.682, lr 3.386554e-06 mom 9.999000e-01
epoch 67 / 100 in 118.9s: loss 0.943995, train: 0.748, val 0.684, lr 3.217226e-06 mom 9.999000e-01
epoch 68 / 100 in 119.0s: loss 0.928266, train: 0.702, val 0.675, lr 3.056365e-06 mom 9.999000e-01
epoch 69 / 100 in 119.0s: loss 0.921137, train: 0.727, val 0.706, lr 2.903546e-06 mom 9.999000e-01
epoch 70 / 100 in 119.0s: loss 0.916508, train: 0.750, val 0.692, lr 2.758369e-06 mom 9.999000e-01
epoch 71 / 100 in 119.0s: loss 0.915388, train: 0.710, val 0.688, lr 2.620450e-06 mom 9.999000e-01
epoch 72 / 100 in 119.0s: loss 0.929153, train: 0.728, val 0.682, lr 2.489428e-06 mom 9.999000e-01
epoch 73 / 100 in 119.0s: loss 0.922188, train: 0.733, val 0.665, lr 2.364957e-06 mom 9.999000e-01
epoch 74 / 100 in 119.0s: loss 0.906357, train: 0.729, val 0.658, lr 2.246709e-06 mom 9.999000e-01
epoch 75 / 100 in 119.0s: loss 0.911330, train: 0.729, val 0.658, lr 2.134374e-06 mom 9.999000e-01
epoch 76 / 100 in 119.0s: loss 0.903888, train: 0.731, val 0.673, lr 2.027655e-06 mom 9.999000e-01
epoch 77 / 100 in 118.9s: loss 0.911090, train: 0.721, val 0.674, lr 1.926272e-06 mom 9.999000e-01
epoch 78 / 100 in 119.0s: loss 0.898858, train: 0.728, val 0.665, lr 1.829958e-06 mom 9.999000e-01
epoch 79 / 100 in 119.0s: loss 0.903461, train: 0.723, val 0.684, lr 1.738461e-06 mom 9.999000e-01
epoch 80 / 100 in 119.0s: loss 0.899644, train: 0.738, val 0.685, lr 1.651538e-06 mom 9.999000e-01
epoch 81 / 100 in 119.0s: loss 0.892099, train: 0.741, val 0.670, lr 1.568961e-06 mom 9.999000e-01
epoch 82 / 100 in 119.0s: loss 0.888836, train: 0.753, val 0.693, lr 1.490513e-06 mom 9.999000e-01
epoch 83 / 100 in 119.0s: loss 0.887223, train: 0.717, val 0.652, lr 1.415987e-06 mom 9.999000e-01
epoch 84 / 100 in 119.0s: loss 0.878957, train: 0.739, val 0.667, lr 1.345188e-06 mom 9.999000e-01
epoch 85 / 100 in 118.9s: loss 0.872763, train: 0.772, val 0.703, lr 1.277928e-06 mom 9.999000e-01
epoch 86 / 100 in 119.0s: loss 0.861406, train: 0.748, val 0.669, lr 1.214032e-06 mom 9.999000e-01
epoch 87 / 100 in 119.0s: loss 0.864802, train: 0.741, val 0.677, lr 1.153330e-06 mom 9.999000e-01
epoch 88 / 100 in 119.0s: loss 0.859530, train: 0.752, val 0.700, lr 1.095664e-06 mom 9.999000e-01
epoch 89 / 100 in 119.0s: loss 0.857563, train: 0.746, val 0.694, lr 1.040881e-06 mom 9.999000e-01
epoch 90 / 100 in 119.0s: loss 0.844187, train: 0.750, val 0.699, lr 9.888365e-07 mom 9.999000e-01
epoch 91 / 100 in 119.0s: loss 0.840757, train: 0.747, val 0.679, lr 9.393947e-07 mom 9.999000e-01
epoch 92 / 100 in 119.0s: loss 0.832068, train: 0.754, val 0.688, lr 8.924250e-07 mom 9.999000e-01
epoch 93 / 100 in 119.0s: loss 0.827147, train: 0.749, val 0.696, lr 8.478037e-07 mom 9.999000e-01
epoch 94 / 100 in 119.0s: loss 0.817345, train: 0.748, val 0.683, lr 8.054135e-07 mom 9.999000e-01
epoch 95 / 100 in 119.0s: loss 0.820755, train: 0.754, val 0.684, lr 7.651429e-07 mom 9.999000e-01
epoch 96 / 100 in 119.0s: loss 0.816936, train: 0.775, val 0.698, lr 7.268857e-07 mom 9.999000e-01
epoch 97 / 100 in 119.0s: loss 0.815445, train: 0.759, val 0.701, lr 6.905415e-07 mom 9.999000e-01
epoch 98 / 100 in 119.0s: loss 0.795261, train: 0.779, val 0.708, lr 6.560144e-07 mom 9.999000e-01
epoch 99 / 100 in 119.0s: loss 0.803294, train: 0.765, val 0.701, lr 6.232137e-07 mom 9.999000e-01
epoch 100 / 100 in 119.0s: loss 0.804475, train: 0.767, val 0.703, lr 5.920530e-07 mom 9.999000e-01
0.470 0.779017857143 0.709821428571 crp_filter_size=3 momentum_decay=0.9 num_fc=3 learning_rate=0.0001 batch_size=128 fc_num_units=256 learning_rate_decay=0.95 num_epochs=100 num_crp=3 reg=0.001 momentum=0.9 crp_num_filters=128

In [118]:
plt.subplot(2, 1, 1)
plt.plot(np.array(loss_history).clip(max=3))
plt.xlabel('iteration')
plt.ylabel('loss')
plt.subplot(2, 1, 2)
plt.plot(train_acc_history)
plt.plot(val_acc_history)
plt.legend(['train', 'val'], loc='upper left')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()

[figure: loss per iteration, clipped at 3 (top); train/val accuracy per epoch (bottom)]
In [121]:
import pickle
with open('v1.pickle', 'wb') as f:
    pickle.dump(model, f, -1)

param


Out[121]:
{'batch_size': 128,
 'crp_filter_size': 3,
 'crp_num_filters': 128,
 'fc_num_units': 256,
 'learning_rate': 0.0001,
 'learning_rate_decay': 0.95,
 'momentum': 0.9,
 'momentum_decay': 0.9,
 'num_crp': 3,
 'num_epochs': 100,
 'num_fc': 3,
 'reg': 0.001}
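
Restoring the snapshot later would look roughly like this (a sketch; it assumes
model holds the network's parameter values, e.g. from
lasagne.layers.get_all_param_values(network) inside the trainer, which is not
shown here; if train returned the network object itself, pickle.load alone
would restore it):

with open('v1.pickle', 'rb') as f:
    saved_params = pickle.load(f)
network = create_v1(input_var, **param)
lasagne.layers.set_all_param_values(network, saved_params)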

v3: [conv-relu-conv-relu-pool]xN - [affine]xM - [softmax or SVM]

VGG-ish

  • input: 32x32x3
  • CONV3-64: 32x32x64
  • CONV3-64: 32x32x64
  • POOL2: 16x16x64
  • CONV3-128: 16x16x128
  • CONV3-128: 16x16x128
  • POOL2: 8x8x128
  • FC: 1x1x512
  • FC: 1x1x512
  • FC: 1x1x10

In [14]:
def create_v3(input_var, input_shape=(3, 32, 32),
              ccp_num_filters=[64, 128], ccp_filter_size=3,
              fc_num_units=[128, 128], num_classes=10,
              **junk):
    # input layer
    network = lasagne.layers.InputLayer(shape=(None,) + input_shape,
                                        input_var=input_var)
    # conv-relu-conv-relu-pool layers
    for num_filters in ccp_num_filters:
        network = lasagne.layers.Conv2DLayer(
            network, num_filters=num_filters,
            filter_size=(ccp_filter_size, ccp_filter_size),
            pad='same',
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.GlorotUniform(gain='relu'))
        network = lasagne.layers.Conv2DLayer(
            network, num_filters=num_filters,
            filter_size=(ccp_filter_size, ccp_filter_size),
            pad='same',
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.GlorotUniform(gain='relu'))
        network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))
    # fc-relu
    for num_units in fc_num_units:
        network = lasagne.layers.DenseLayer(
            lasagne.layers.dropout(network, p=.5),
            num_units=num_units,
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.GlorotUniform(gain='relu'))
    # output layer
    network = lasagne.layers.DenseLayer(
        lasagne.layers.dropout(network, p=.5),
        num_units=num_classes,
        nonlinearity=lasagne.nonlinearities.softmax)
    return network
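
To check the VGG-ish sketch above against what create_v3 actually builds (an
illustrative aside), one can count the learnable parameters of a freshly
constructed network:

net = create_v3(input_var, ccp_num_filters=[64, 128], fc_num_units=[256, 256])
print('learnable parameters:', lasagne.layers.count_params(net, trainable=True))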

In [21]:
param = dict(ccp_num_filters=[64, 128], ccp_filter_size=3,
             fc_num_units=[256, 256], num_classes=10,
             learning_rate=1e-2, learning_rate_decay=0.5,
             momentum=0.9, momentum_decay=0.5,
             decay_after_epochs=10,
             batch_size=128, num_epochs=50)

In [23]:
network = create_v3(input_var, **param)
# sanity check: overfit a tiny 50-sample training set
xxx, loss_history, train_acc_history, val_acc_history = lasagne_trainer.train(
          network, input_var, X_train[:50], y_train[:50], X_val, y_val,
          learning_rate=1e-2, batch_size=10, num_epochs=30, decay_after_epochs=3)


Compiling...
Training...
epoch 1 / 30 in 0.8s: loss 3.247790, train: 0.160, val 0.110, lr 1.000000e-02 mom 9.000000e-01
epoch 2 / 30 in 0.8s: loss 2.458449, train: 0.160, val 0.119, lr 1.000000e-02 mom 9.000000e-01
epoch 3 / 30 in 0.8s: loss 2.399592, train: 0.180, val 0.088, lr 1.000000e-02 mom 9.000000e-01
epoch 4 / 30 in 0.8s: loss 2.211803, train: 0.160, val 0.079, lr 9.500000e-03 mom 9.050000e-01
epoch 5 / 30 in 0.8s: loss 2.236917, train: 0.200, val 0.122, lr 9.500000e-03 mom 9.050000e-01
epoch 6 / 30 in 0.8s: loss 2.406256, train: 0.300, val 0.156, lr 9.500000e-03 mom 9.050000e-01
epoch 7 / 30 in 0.8s: loss 2.243320, train: 0.360, val 0.162, lr 9.025000e-03 mom 9.097500e-01
epoch 8 / 30 in 0.8s: loss 2.254551, train: 0.320, val 0.158, lr 9.025000e-03 mom 9.097500e-01
epoch 9 / 30 in 0.8s: loss 2.170130, train: 0.320, val 0.145, lr 9.025000e-03 mom 9.097500e-01
epoch 10 / 30 in 0.8s: loss 2.164701, train: 0.360, val 0.126, lr 8.573750e-03 mom 9.142625e-01
epoch 11 / 30 in 0.8s: loss 2.091385, train: 0.340, val 0.153, lr 8.573750e-03 mom 9.142625e-01
epoch 12 / 30 in 0.8s: loss 2.147110, train: 0.280, val 0.151, lr 8.573750e-03 mom 9.142625e-01
epoch 13 / 30 in 0.8s: loss 2.098304, train: 0.380, val 0.156, lr 8.145062e-03 mom 9.185494e-01
epoch 14 / 30 in 0.8s: loss 1.919416, train: 0.460, val 0.159, lr 8.145062e-03 mom 9.185494e-01
epoch 15 / 30 in 0.8s: loss 1.965349, train: 0.420, val 0.151, lr 8.145062e-03 mom 9.185494e-01
epoch 16 / 30 in 0.8s: loss 2.081868, train: 0.480, val 0.167, lr 7.737809e-03 mom 9.226219e-01
epoch 17 / 30 in 0.8s: loss 1.955984, train: 0.560, val 0.183, lr 7.737809e-03 mom 9.226219e-01
epoch 18 / 30 in 0.8s: loss 1.939489, train: 0.580, val 0.161, lr 7.737809e-03 mom 9.226219e-01
epoch 19 / 30 in 0.8s: loss 2.001198, train: 0.560, val 0.157, lr 7.350919e-03 mom 9.264908e-01
epoch 20 / 30 in 0.8s: loss 1.742004, train: 0.540, val 0.179, lr 7.350919e-03 mom 9.264908e-01
epoch 21 / 30 in 0.8s: loss 1.729675, train: 0.640, val 0.173, lr 7.350919e-03 mom 9.264908e-01
epoch 22 / 30 in 0.8s: loss 1.593313, train: 0.560, val 0.197, lr 6.983373e-03 mom 9.301662e-01
epoch 23 / 30 in 0.8s: loss 1.687712, train: 0.700, val 0.185, lr 6.983373e-03 mom 9.301662e-01
epoch 24 / 30 in 0.8s: loss 1.387340, train: 0.640, val 0.172, lr 6.983373e-03 mom 9.301662e-01
epoch 25 / 30 in 0.8s: loss 1.491013, train: 0.780, val 0.188, lr 6.634204e-03 mom 9.336579e-01
epoch 26 / 30 in 0.8s: loss 1.406746, train: 0.820, val 0.204, lr 6.634204e-03 mom 9.336579e-01
epoch 27 / 30 in 0.8s: loss 1.489491, train: 0.740, val 0.183, lr 6.634204e-03 mom 9.336579e-01
epoch 28 / 30 in 0.8s: loss 1.459224, train: 0.720, val 0.154, lr 6.302494e-03 mom 9.369751e-01
epoch 29 / 30 in 0.8s: loss 1.486630, train: 0.540, val 0.148, lr 6.302494e-03 mom 9.369751e-01
epoch 30 / 30 in 0.8s: loss 1.628676, train: 0.680, val 0.191, lr 6.302494e-03 mom 9.369751e-01

In [24]:
model, loss_history, train_acc_history, val_acc_history = lasagne_trainer.train(
    network, input_var, X_train, y_train, X_val, y_val,
    learning_rate=param['learning_rate'], learning_rate_decay=param['learning_rate_decay'],
    momentum=param['momentum'], momentum_decay=param['momentum_decay'],
    decay_after_epochs=param['decay_after_epochs'],
    batch_size=param['batch_size'], num_epochs=param['num_epochs'],
    save_path='net_v3')
print('%.3f' % min(loss_history), max(train_acc_history), max(val_acc_history),
      ' '.join('%s=%s' % (k, param[k]) for k in param))


Compiling...
Training...
epoch 1 / 50 in 88.7s: loss 1.820408, train: 0.460, val 0.493, lr 1.000000e-02 mom 9.000000e-01
epoch 2 / 50 in 88.7s: loss 1.497272, train: 0.546, val 0.562, lr 1.000000e-02 mom 9.000000e-01
epoch 3 / 50 in 88.7s: loss 1.332127, train: 0.622, val 0.623, lr 1.000000e-02 mom 9.000000e-01
epoch 4 / 50 in 88.7s: loss 1.191845, train: 0.634, val 0.666, lr 1.000000e-02 mom 9.000000e-01
epoch 5 / 50 in 88.7s: loss 1.086699, train: 0.684, val 0.703, lr 1.000000e-02 mom 9.000000e-01
epoch 6 / 50 in 88.7s: loss 1.004664, train: 0.710, val 0.721, lr 1.000000e-02 mom 9.000000e-01
epoch 7 / 50 in 88.7s: loss 0.951341, train: 0.718, val 0.728, lr 1.000000e-02 mom 9.000000e-01
epoch 8 / 50 in 88.7s: loss 0.906197, train: 0.734, val 0.735, lr 1.000000e-02 mom 9.000000e-01
epoch 9 / 50 in 88.7s: loss 0.861574, train: 0.771, val 0.748, lr 1.000000e-02 mom 9.000000e-01
epoch 10 / 50 in 88.7s: loss 0.823561, train: 0.772, val 0.748, lr 1.000000e-02 mom 9.000000e-01
epoch 11 / 50 in 88.7s: loss 0.794733, train: 0.795, val 0.767, lr 5.000000e-03 mom 9.500000e-01
epoch 12 / 50 in 88.7s: loss 0.759413, train: 0.791, val 0.776, lr 5.000000e-03 mom 9.500000e-01
epoch 13 / 50 in 88.7s: loss 0.734678, train: 0.815, val 0.788, lr 5.000000e-03 mom 9.500000e-01
epoch 14 / 50 in 88.7s: loss 0.707664, train: 0.817, val 0.795, lr 5.000000e-03 mom 9.500000e-01
epoch 15 / 50 in 88.7s: loss 0.685139, train: 0.828, val 0.782, lr 5.000000e-03 mom 9.500000e-01
epoch 16 / 50 in 88.7s: loss 0.667930, train: 0.846, val 0.791, lr 5.000000e-03 mom 9.500000e-01
epoch 17 / 50 in 88.7s: loss 0.644620, train: 0.855, val 0.801, lr 5.000000e-03 mom 9.500000e-01
epoch 18 / 50 in 88.7s: loss 0.620334, train: 0.857, val 0.795, lr 5.000000e-03 mom 9.500000e-01
epoch 19 / 50 in 88.7s: loss 0.605781, train: 0.855, val 0.797, lr 5.000000e-03 mom 9.500000e-01
epoch 20 / 50 in 88.8s: loss 0.592860, train: 0.864, val 0.811, lr 5.000000e-03 mom 9.500000e-01
epoch 21 / 50 in 88.7s: loss 0.579383, train: 0.863, val 0.809, lr 2.500000e-03 mom 9.750000e-01
epoch 22 / 50 in 88.7s: loss 0.553778, train: 0.864, val 0.802, lr 2.500000e-03 mom 9.750000e-01
epoch 23 / 50 in 88.7s: loss 0.540664, train: 0.886, val 0.814, lr 2.500000e-03 mom 9.750000e-01
epoch 24 / 50 in 88.8s: loss 0.520104, train: 0.894, val 0.820, lr 2.500000e-03 mom 9.750000e-01
epoch 25 / 50 in 88.7s: loss 0.510712, train: 0.897, val 0.808, lr 2.500000e-03 mom 9.750000e-01
epoch 26 / 50 in 88.7s: loss 0.498141, train: 0.894, val 0.809, lr 2.500000e-03 mom 9.750000e-01
epoch 27 / 50 in 88.7s: loss 0.490399, train: 0.901, val 0.819, lr 2.500000e-03 mom 9.750000e-01
epoch 28 / 50 in 88.8s: loss 0.468590, train: 0.900, val 0.824, lr 2.500000e-03 mom 9.750000e-01
epoch 29 / 50 in 88.7s: loss 0.462874, train: 0.920, val 0.816, lr 2.500000e-03 mom 9.750000e-01
epoch 30 / 50 in 88.8s: loss 0.445270, train: 0.925, val 0.827, lr 2.500000e-03 mom 9.750000e-01
epoch 31 / 50 in 88.8s: loss 0.446132, train: 0.930, val 0.828, lr 1.250000e-03 mom 9.875000e-01
epoch 32 / 50 in 88.8s: loss 0.423588, train: 0.919, val 0.811, lr 1.250000e-03 mom 9.875000e-01
epoch 33 / 50 in 88.7s: loss 0.412946, train: 0.930, val 0.829, lr 1.250000e-03 mom 9.875000e-01
epoch 34 / 50 in 88.7s: loss 0.401293, train: 0.939, val 0.821, lr 1.250000e-03 mom 9.875000e-01
epoch 35 / 50 in 88.7s: loss 0.389699, train: 0.942, val 0.828, lr 1.250000e-03 mom 9.875000e-01
epoch 36 / 50 in 88.8s: loss 0.386479, train: 0.943, val 0.821, lr 1.250000e-03 mom 9.875000e-01
epoch 37 / 50 in 88.8s: loss 0.381396, train: 0.940, val 0.845, lr 1.250000e-03 mom 9.875000e-01
epoch 38 / 50 in 88.7s: loss 0.368441, train: 0.952, val 0.838, lr 1.250000e-03 mom 9.875000e-01
epoch 39 / 50 in 88.7s: loss 0.358264, train: 0.953, val 0.831, lr 1.250000e-03 mom 9.875000e-01
epoch 40 / 50 in 88.7s: loss 0.358964, train: 0.961, val 0.830, lr 1.250000e-03 mom 9.875000e-01
epoch 41 / 50 in 88.8s: loss 0.366442, train: 0.965, val 0.842, lr 6.250000e-04 mom 9.937500e-01
epoch 42 / 50 in 88.8s: loss 0.337541, train: 0.956, val 0.830, lr 6.250000e-04 mom 9.937500e-01
epoch 43 / 50 in 88.8s: loss 0.333747, train: 0.967, val 0.825, lr 6.250000e-04 mom 9.937500e-01
epoch 44 / 50 in 88.8s: loss 0.329299, train: 0.965, val 0.835, lr 6.250000e-04 mom 9.937500e-01
epoch 45 / 50 in 88.8s: loss 0.319019, train: 0.964, val 0.831, lr 6.250000e-04 mom 9.937500e-01
epoch 46 / 50 in 88.7s: loss 0.307978, train: 0.970, val 0.848, lr 6.250000e-04 mom 9.937500e-01
epoch 47 / 50 in 88.8s: loss 0.305475, train: 0.974, val 0.829, lr 6.250000e-04 mom 9.937500e-01
epoch 48 / 50 in 88.7s: loss 0.299597, train: 0.979, val 0.840, lr 6.250000e-04 mom 9.937500e-01
epoch 49 / 50 in 88.8s: loss 0.288366, train: 0.975, val 0.839, lr 6.250000e-04 mom 9.937500e-01
epoch 50 / 50 in 88.7s: loss 0.278476, train: 0.980, val 0.836, lr 6.250000e-04 mom 9.937500e-01
0.105 0.979910714286 0.848214285714 learning_rate=0.01 ccp_num_filters=[64, 128] batch_size=128 decay_after_epochs=10 fc_num_units=[256, 256] momentum=0.9 ccp_filter_size=3 num_epochs=50 momentum_decay=0.5 num_classes=10 learning_rate_decay=0.5

In [25]:
plt.subplot(2, 1, 1)
plt.plot(np.array(loss_history).clip(max=3))
plt.xlabel('iteration')
plt.ylabel('loss')
plt.subplot(2, 1, 2)
plt.plot(train_acc_history)
plt.plot(val_acc_history)
plt.legend(['train', 'val'], loc='upper left')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()

[figure: loss per iteration, clipped at 3 (top); train/val accuracy per epoch (bottom)]

In [ ]: