02 - Multilayer perceptrons in gluon


In [1]:
import mxnet as mx
import numpy as np
from mxnet import gluon
from tqdm import tqdm

Context


In [2]:
data_ctx = mx.cpu()
model_ctx = mx.cpu()
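
Both the data and the model live on the CPU here. If a GPU is available, model_ctx can point at it instead; a minimal sketch, assuming a recent MXNet build that exposes mx.context.num_gpus():

# Sketch (not part of the original notebook): fall back to CPU when no GPU is found.
model_ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()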

MNIST Dataset


In [3]:
batch_size = 64
num_inputs = 784
num_outputs = 10
num_examples = 60000

In [4]:
def transform(data, label):
    return data.astype(np.float32) / 255, label.astype(np.float32)

In [5]:
train_data = gluon.data.DataLoader(dataset=gluon.data.vision.MNIST(train=True, transform=transform),
                                   batch_size=batch_size,
                                   shuffle=True)
test_data = gluon.data.DataLoader(dataset=gluon.data.vision.MNIST(train=False, transform=transform),
                                  batch_size=batch_size,
                                  shuffle=False)
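
A quick sanity check (not in the original notebook) is to pull a single batch and inspect its shape; after the transform, each MNIST image arrives as a 28x28x1 float32 array scaled to [0, 1]:

for data, label in train_data:
    # Expect roughly: (64, 28, 28, 1) (64,)
    print(data.shape, label.shape)
    break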

Define the MLP model with gluon.Block


In [6]:
class MLP(gluon.Block):
    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        with self.name_scope():
            self.dense0 = gluon.nn.Dense(64)
            self.dense1 = gluon.nn.Dense(64)
            self.dense2 = gluon.nn.Dense(10)

    def forward(self, x):
        x = mx.nd.relu(self.dense0(x))
        x = mx.nd.relu(self.dense1(x))
        x = self.dense2(x)
        return x

In [7]:
net = MLP()
net.collect_params().initialize(mx.init.Normal(sigma=.01), 
                                ctx=model_ctx)
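
Gluon uses deferred initialization: the input dimension is never declared, so the weight shapes stay unknown until the first forward pass. A small sketch to observe this (the exact printout depends on the MXNet version):

# Before any data has flowed through the network, the in_units of each Dense
# layer is still undetermined and shows up as 0 in the parameter shapes.
print(net.collect_params())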

Example of a single forward pass

A dummy input is created and the MLP is redefined with the ReLU activations folded into the Dense layers and with print statements that expose each hidden representation and the final output.


In [8]:
data = mx.nd.ones(shape=[1, 784])

In [9]:
class MLP(gluon.Block):
    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        with self.name_scope():
            self.dense0 = gluon.nn.Dense(units=64, activation="relu")
            self.dense1 = gluon.nn.Dense(units=64, activation="relu")
            self.dense2 = gluon.nn.Dense(units=10)

    def forward(self, x):
        x = self.dense0(x)
        print("-" * 70)
        print("Hidden Representation 1: %s" % x)
        x = self.dense1(x)
        print("-" * 70)
        print("Hidden Representation 2: %s" % x)
        x = self.dense2(x)
        print("-" * 70)
        print("Network output: %s" % x)
        print("-" * 70)
        return x

net = MLP()
net.collect_params().initialize(mx.init.Normal(sigma=.01), ctx=model_ctx)
net(data.as_in_context(model_ctx))


----------------------------------------------------------------------
Hidden Representation 1: 
[[0.         0.25953296 0.5081844  0.47407073 0.5739144  0.04646487
  0.3490802  0.         0.         0.         0.         0.
  0.09897906 0.         0.44429356 0.5806929  0.         0.
  0.07937321 0.13445261 0.17002776 0.         0.59629107 0.
  0.51476306 0.2620116  0.07252947 0.         0.44609177 0.
  0.10297956 0.12023637 0.01070242 0.14927042 0.         0.11931495
  0.06247869 0.34996682 0.23720959 0.33213574 0.         0.
  0.35576025 0.02980644 0.         0.         0.3602543  0.01930529
  0.5578985  0.         0.         0.22368181 0.3668564  0.0344954
  0.16685106 0.         0.07805604 0.04645126 0.46009526 0.
  0.         0.         0.         0.4059968 ]]
<NDArray 1x64 @cpu(0)>
----------------------------------------------------------------------
Hidden Representation 2: 
[[0.         0.         0.00471901 0.00809325 0.00563266 0.00358269
  0.01304015 0.         0.         0.0179144  0.00409093 0.01971137
  0.01811438 0.         0.         0.03330275 0.03080758 0.
  0.01005297 0.         0.         0.         0.         0.
  0.         0.         0.         0.01851467 0.         0.00467824
  0.         0.00476716 0.00890849 0.         0.01493133 0.
  0.01890475 0.         0.01004198 0.         0.         0.
  0.         0.         0.0218619  0.         0.01256697 0.
  0.00875257 0.01837254 0.         0.012395   0.         0.
  0.         0.         0.03347883 0.         0.00547096 0.0096815
  0.03013829 0.         0.02648943 0.        ]]
<NDArray 1x64 @cpu(0)>
----------------------------------------------------------------------
Network output: 
[[ 0.0010479  -0.00023263  0.00024665 -0.00137001 -0.00089217 -0.00043491
   0.0017453  -0.00114445  0.00024293 -0.0004818 ]]
<NDArray 1x10 @cpu(0)>
----------------------------------------------------------------------
Out[9]:
[[ 0.0010479  -0.00023263  0.00024665 -0.00137001 -0.00089217 -0.00043491
   0.0017453  -0.00114445  0.00024293 -0.0004818 ]]
<NDArray 1x10 @cpu(0)>

Faster modeling with gluon.nn.Sequential


In [10]:
num_hidden = 64

In [11]:
# Defining a sequential model
net = gluon.nn.Sequential()
with net.name_scope():
    net.add(gluon.nn.Dense(units=num_hidden, 
                           activation="relu"))
    net.add(gluon.nn.Dense(units=num_hidden, 
                           activation="relu"))
    net.add(gluon.nn.Dense(units=num_outputs))
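
A possible variant, not used in this notebook: the same architecture built as a gluon.nn.HybridSequential, which can be compiled with hybridize() for faster execution.

net_hybrid = gluon.nn.HybridSequential()
with net_hybrid.name_scope():
    net_hybrid.add(gluon.nn.Dense(units=num_hidden, activation="relu"))
    net_hybrid.add(gluon.nn.Dense(units=num_hidden, activation="relu"))
    net_hybrid.add(gluon.nn.Dense(units=num_outputs))
net_hybrid.hybridize()  # switch from imperative to symbolic execution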

In [12]:
# Parameter initialization
net.collect_params().initialize(mx.init.Normal(sigma=.1), 
                                ctx=model_ctx)

In [13]:
# Softmax cross-entropy
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
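
SoftmaxCrossEntropyLoss expects raw, unnormalized scores: it applies log-softmax internally and picks out the log-probability of the true class. A small equivalence check (a sketch, not from the original notebook):

logits = mx.nd.array([[2.0, 0.5, -1.0]])   # unnormalized scores for one example
target = mx.nd.array([0])                  # true class index
manual = -mx.nd.log_softmax(logits)[0, 0]  # negative log-probability of class 0
print(softmax_cross_entropy(logits, target), manual)  # the two values should match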

In [14]:
# Optimizer
trainer = gluon.Trainer(params=net.collect_params(),
                        optimizer='sgd',
                        optimizer_params={'learning_rate': 0.01})
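
trainer.step() later normalizes the accumulated gradients by the batch size it is given. A possible variant (an assumption, not used below) adds momentum and shows that the learning rate can be adjusted without rebuilding the Trainer:

trainer_momentum = gluon.Trainer(params=net.collect_params(),
                                 optimizer='sgd',
                                 optimizer_params={'learning_rate': 0.01,
                                                   'momentum': 0.9})
trainer_momentum.set_learning_rate(0.005)  # change the rate on the fly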

Evaluation


In [15]:
def evaluate_accuracy(data_iterator, net):
    acc = mx.metric.Accuracy()
    for i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(model_ctx).reshape((-1, 784))
        label = label.as_in_context(model_ctx)
        output = net(data)
        predictions = mx.nd.argmax(data=output,
                                   axis=1)
        # Updating accuracy metric
        acc.update(preds=predictions, 
                   labels=label)
    return acc.get()[1]
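
As a usage sketch (not in the original notebook), running the metric on the freshly initialized network should give an accuracy close to chance, roughly 0.1 for ten classes:

print(evaluate_accuracy(test_data, net))  # expect a value near 0.1 before training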

Training


In [16]:
epochs = 10
smoothing_constant = .01  # defined for an optional smoothed loss; not used in the loop below

In [17]:
for e in tqdm(range(epochs)):
    cumulative_loss = 0
    for i, (data, label) in enumerate(train_data):
        # Flatten each 28x28 image into a 784-dimensional vector
        data = data.as_in_context(model_ctx).reshape((-1, 784))
        label = label.as_in_context(model_ctx)
        # Record the forward pass so gradients can be computed
        with mx.autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        # step() normalizes the gradient by the batch size before the update
        trainer.step(data.shape[0])
        cumulative_loss += mx.nd.sum(loss).asscalar()


    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" %
          (e, cumulative_loss/num_examples, train_accuracy, test_accuracy))


 10%|█         | 1/10 [00:31<04:41, 31.26s/it]
Epoch 0. Loss: 1.2724453049023947, Train_acc 0.8347, Test_acc 0.8434
 20%|██        | 2/10 [01:05<04:20, 32.56s/it]
Epoch 1. Loss: 0.48276078701019287, Train_acc 0.87955, Test_acc 0.8843
 30%|███       | 3/10 [01:35<03:43, 31.87s/it]
Epoch 2. Loss: 0.37886862614949546, Train_acc 0.8980166666666667, Test_acc 0.9013
 40%|████      | 4/10 [02:05<03:08, 31.34s/it]
Epoch 3. Loss: 0.33094510640303293, Train_acc 0.9094666666666666, Test_acc 0.9133
 50%|█████     | 5/10 [02:34<02:34, 30.99s/it]
Epoch 4. Loss: 0.30067180269559224, Train_acc 0.9174833333333333, Test_acc 0.9204
 60%|██████    | 6/10 [03:04<02:03, 30.76s/it]
Epoch 5. Loss: 0.2779835115830104, Train_acc 0.9238333333333333, Test_acc 0.9227
 70%|███████   | 7/10 [03:34<01:31, 30.63s/it]
Epoch 6. Loss: 0.2602314037243525, Train_acc 0.9279833333333334, Test_acc 0.9272
 80%|████████  | 8/10 [04:03<01:00, 30.50s/it]
Epoch 7. Loss: 0.24440940081278484, Train_acc 0.93445, Test_acc 0.9343
 90%|█████████ | 9/10 [04:33<00:30, 30.38s/it]
Epoch 8. Loss: 0.23077674449682237, Train_acc 0.9357666666666666, Test_acc 0.9345
100%|██████████| 10/10 [05:03<00:00, 30.31s/it]
Epoch 9. Loss: 0.21937609051863352, Train_acc 0.9384166666666667, Test_acc 0.9363


In [18]:
train_accuracy


Out[18]:
0.9384166666666667

In [19]:
test_accuracy


Out[19]:
0.9363
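
A possible follow-up, not part of the original notebook: persist the trained weights and restore them into a freshly built network. save_parameters/load_parameters are available in MXNet 1.2.1 and later; older releases use save_params/load_params.

net.save_parameters("mlp_mnist.params")

net_restored = gluon.nn.Sequential()
with net_restored.name_scope():
    net_restored.add(gluon.nn.Dense(units=num_hidden, activation="relu"))
    net_restored.add(gluon.nn.Dense(units=num_hidden, activation="relu"))
    net_restored.add(gluon.nn.Dense(units=num_outputs))
net_restored.load_parameters("mlp_mnist.params", ctx=model_ctx)
print(evaluate_accuracy(test_data, net_restored))  # should match test_accuracy above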