Very deep networks with repeating elements (VGG)


In [1]:
import mxnet as mx
from mxnet import gluon
import numpy as np
from tqdm import tqdm_notebook as tqdm
mx.random.seed(1)

Context


In [2]:
ctx = mx.gpu()
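
If no GPU is available, the first GPU operation will fail. A minimal fallback sketch (assuming mx.base.MXNetError is what your MXNet build raises for a missing device):

try:
    ctx = mx.gpu()
    _ = mx.nd.zeros((1,), ctx=ctx)  # force the device check; mx.gpu() alone never fails
except mx.base.MXNetError:
    ctx = mx.cpu()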

MNIST


In [3]:
batch_size = 64

In [4]:
def transform(data, label):
    # Convert HWC uint8 pixels to CHW float32 scaled to [0, 1].
    return mx.nd.transpose(data.astype(np.float32), (2, 0, 1)) / 255, label.astype(np.float32)
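
As a quick sanity check, the transform should turn a 28x28x1 uint8 image into a 1x28x28 float32 array in [0, 1]. A sketch with a dummy all-ones image and a hypothetical label of 7:

img = mx.nd.ones((28, 28, 1), dtype=np.uint8)
out, lbl = transform(img, np.float32(7))
print(out.shape, out.max().asscalar(), lbl)  # expect (1, 28, 28) ~0.0039 7.0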

In [5]:
train_data = mx.gluon.data.DataLoader(dataset=mx.gluon.data.vision.MNIST(train=True, transform=transform),
                                      batch_size=batch_size,
                                      shuffle=True)
test_data = mx.gluon.data.DataLoader(dataset=mx.gluon.data.vision.MNIST(train=False, transform=transform),
                                     batch_size=batch_size,
                                     shuffle=False)
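
Before building the network it is worth confirming the batch layout; one batch from the loader should be (batch_size, 1, 28, 28) images with (batch_size,) labels:

for d, l in train_data:
    print(d.shape, l.shape)  # expect (64, 1, 28, 28) (64,)
    break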

VGG


In [6]:
from mxnet.gluon import nn

def vgg_block(num_convs, channels):
    # One VGG block: num_convs 3x3 convolutions (padding 1 preserves the
    # spatial size), then a 2x2 max-pool that halves height and width.
    out = nn.Sequential()
    for _ in range(num_convs):
        out.add(nn.Conv2D(channels=channels,
                          kernel_size=[3, 3],
                          padding=1,
                          activation='relu'))
    out.add(nn.MaxPool2D(pool_size=2,
                         strides=2))
    return out

def vgg_stack(architecture):
    # Chain one vgg_block per (num_convs, channels) pair.
    out = nn.Sequential()
    for (num_convs, channels) in architecture:
        out.add(vgg_block(num_convs, channels))
    return out

num_outputs = 10
# A scaled-down VGG: four blocks rather than VGG-11's five, and 512-unit
# dense layers rather than 4096, since 28x28 MNIST inputs shrink to 1x1
# after four rounds of pooling (a fifth would reduce them to nothing).
architecture = ((1, 64), (1, 128), (2, 256), (2, 512))
net = nn.Sequential()
with net.name_scope():
    net.add(vgg_stack(architecture))
    net.add(nn.Flatten())
    net.add(nn.Dense(units=512,
                     activation="relu"))
    net.add(nn.Dropout(rate=0.5))
    net.add(nn.Dense(units=512,
                     activation="relu"))
    net.add(nn.Dropout(rate=0.5))
    net.add(nn.Dense(units=num_outputs))

In [7]:
print(net)


Sequential(
  (0): Sequential(
    (0): Sequential(
      (0): Conv2D(None -> 64, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
      (1): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv2D(None -> 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
      (1): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False)
    )
    (2): Sequential(
      (0): Conv2D(None -> 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
      (1): Conv2D(None -> 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
      (2): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False)
    )
    (3): Sequential(
      (0): Conv2D(None -> 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
      (1): Conv2D(None -> 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
      (2): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False)
    )
  )
  (1): Flatten
  (2): Dense(None -> 512, Activation(relu))
  (3): Dropout(p = 0.5, axes=())
  (4): Dense(None -> 512, Activation(relu))
  (5): Dropout(p = 0.5, axes=())
  (6): Dense(None -> 10, linear)
)
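
The printout shows the layer types but not the feature-map sizes. A throwaway CPU copy of the convolutional stack makes them easy to trace (a sketch; the shapes follow from pad-1 3x3 convolutions and 2x2 pooling on 28x28 inputs):

blk = vgg_stack(architecture)
blk.collect_params().initialize(mx.init.Xavier())
x = mx.nd.ones((1, 1, 28, 28))
for i in range(len(blk)):
    x = blk[i](x)
    print(x.shape)
# (1, 64, 14, 14)
# (1, 128, 7, 7)
# (1, 256, 3, 3)
# (1, 512, 1, 1)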

Parameter initialization


In [8]:
net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
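
Gluon defers shape inference until the first forward pass, so none of the weights have concrete shapes yet. One dummy batch on the target context materializes them (a sketch):

_ = net(mx.nd.ones((1, 1, 28, 28), ctx=ctx))
for name, param in net.collect_params().items():
    print(name, param.shape)  # e.g. the first conv's weight, (64, 1, 3, 3)
    break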

Optimizer


In [9]:
trainer = gluon.Trainer(params=net.collect_params(),
                        optimizer='sgd',
                        optimizer_params={'learning_rate': .05})
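
Note that trainer.step(batch_size) in the training loop below rescales the gradient by 1/batch_size, so the learning rate here is effectively per-example. If you want a simple step decay between epochs, recent Gluon versions expose the rate directly (a hedged sketch; check that your MXNet version has set_learning_rate):

# hypothetical step decay: halve the learning rate after each epoch
trainer.set_learning_rate(trainer.learning_rate * 0.5)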

Softmax cross-entropy loss


In [10]:
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
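
A quick check on hypothetical logits: a confident correct prediction should score near zero, a confident wrong one should score roughly the logit gap:

logits = mx.nd.array([[10., 0., 0.], [10., 0., 0.]])
labels = mx.nd.array([0, 1])
print(softmax_cross_entropy(logits, labels))  # expect roughly [0.0001, 10.0]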

Evaluation


In [11]:
def evaluate_accuracy(data_iterator, net):
    # Run the whole iterator through the net and accumulate top-1 accuracy.
    acc = mx.metric.Accuracy()
    for d, l in data_iterator:
        data = d.as_in_context(ctx)
        label = l.as_in_context(ctx)
        output = net(data)
        predictions = mx.nd.argmax(output, axis=1)
        acc.update(preds=predictions, labels=label)
    return acc.get()[1]
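
The metric object accumulates across update calls; a standalone check with hypothetical predictions (two of three correct):

acc = mx.metric.Accuracy()
acc.update(preds=mx.nd.array([0, 1, 1]), labels=mx.nd.array([0, 1, 0]))
print(acc.get())  # expect ('accuracy', 0.666...)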

Training


In [12]:
epochs = 1
smoothing_constant = .01

for e in range(epochs):
    for i, (d, l) in tqdm(enumerate(train_data), total=len(train_data)):
        data = d.as_in_context(ctx)
        label = l.as_in_context(ctx)
        with mx.autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        trainer.step(data.shape[0])

        ##########################
        #  Keep a moving average of the losses
        ##########################
        curr_loss = mx.nd.mean(loss).asscalar()
        moving_loss = (curr_loss if ((i == 0) and (e == 0))
                       else (1 - smoothing_constant) * moving_loss + smoothing_constant * curr_loss)

        if i > 0 and i % 200 == 0:
            print('Batch %d. Loss: %f' % (i, moving_loss))

    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (e, moving_loss, train_accuracy, test_accuracy))


Batch 200. Loss: 2.298344
Batch 400. Loss: 2.179981
Batch 600. Loss: 0.993611
Batch 800. Loss: 0.392861

Epoch 0. Loss: 0.25404232464152704, Train_acc 0.94845, Test_acc 0.9511
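
To reuse the trained network later, save the weights to disk (a sketch; the method name depends on the MXNet version):

net.save_params('vgg_mnist.params')         # Gluon API up to MXNet 1.2
# net.save_parameters('vgg_mnist.params')  # MXNet 1.3 and later
# reload later with net.load_params('vgg_mnist.params', ctx=ctx)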