02 - Multilayer perceptrons in gluon


In [1]:
import mxnet as mx
import numpy as np
from mxnet import gluon
from tqdm import tqdm

Context


In [2]:
data_ctx = mx.cpu()
model_ctx = mx.cpu()
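
Both the data and the model live on the CPU here. If a GPU is available, model_ctx can point at it instead; a minimal sketch, assuming a recent MXNet build that exposes mx.context.num_gpus():

# Sketch (not part of the original notebook): fall back to CPU when no GPU is found.
model_ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()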

MNIST Dataset


In [3]:
batch_size = 64
num_inputs = 784
num_outputs = 10
num_examples = 60000

In [4]:
def transform(data, label):
    return data.astype(np.float32) / 255, label.astype(np.float32)

In [5]:
train_data = gluon.data.DataLoader(dataset=gluon.data.vision.MNIST(train=True, transform=transform),
                                   batch_size=batch_size,
                                   shuffle=True)
test_data = gluon.data.DataLoader(dataset=gluon.data.vision.MNIST(train=False, transform=transform),
                                  batch_size=batch_size,
                                  shuffle=False)
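
A quick sanity check (not in the original notebook) is to pull a single batch and inspect its shape; after the transform, each MNIST image arrives as a 28x28x1 float32 array scaled to [0, 1]:

for data, label in train_data:
    # Expect roughly: (64, 28, 28, 1) (64,)
    print(data.shape, label.shape)
    break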

Define the MLP model with gluon.Block


In [6]:
class MLP(gluon.Block):
    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        with self.name_scope():
            self.dense0 = gluon.nn.Dense(64)
            self.dense1 = gluon.nn.Dense(64)
            self.dense2 = gluon.nn.Dense(10)

    def forward(self, x):
        x = mx.nd.relu(self.dense0(x))
        x = mx.nd.relu(self.dense1(x))
        x = self.dense2(x)
        return x

In [7]:
net = MLP()
net.collect_params().initialize(mx.init.Normal(sigma=.01), 
                                ctx=model_ctx)
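
Gluon uses deferred initialization: the input dimension is never declared, so the weight shapes stay unknown until the first forward pass. A small sketch to observe this (the exact printout depends on the MXNet version):

# Before any data has flowed through the network, the in_units of each Dense
# layer is still undetermined and shows up as 0 in the parameter shapes.
print(net.collect_params())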

Example of a single forward pass

A dummy input is created and the MLP is redefined with the ReLU activations folded into the Dense layers and with print statements that expose each hidden representation and the final output.


In [8]:
data = mx.nd.ones(shape=[1, 784])

In [9]:
class MLP(gluon.Block):
    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        with self.name_scope():
            self.dense0 = gluon.nn.Dense(units=64, activation="relu")
            self.dense1 = gluon.nn.Dense(units=64, activation="relu")
            self.dense2 = gluon.nn.Dense(units=10)

    def forward(self, x):
        x = self.dense0(x)
        print("-" * 70)
        print("Hidden Representation 1: %s" % x)
        x = self.dense1(x)
        print("-" * 70)
        print("Hidden Representation 2: %s" % x)
        x = self.dense2(x)
        print("-" * 70)
        print("Network output: %s" % x)
        print("-" * 70)
        return x

net = MLP()
net.collect_params().initialize(mx.init.Normal(sigma=.01), ctx=model_ctx)
net(data.as_in_context(model_ctx))


----------------------------------------------------------------------
Hidden Representation 1: 
[[0.         0.25953296 0.5081844  0.47407073 0.5739144  0.04646487
  0.3490802  0.         0.         0.         0.         0.
  0.09897906 0.         0.44429356 0.5806929  0.         0.
  0.07937321 0.13445261 0.17002776 0.         0.59629107 0.
  0.51476306 0.2620116  0.07252947 0.         0.44609177 0.
  0.10297956 0.12023637 0.01070242 0.14927042 0.         0.11931495
  0.06247869 0.34996682 0.23720959 0.33213574 0.         0.
  0.35576025 0.02980644 0.         0.         0.3602543  0.01930529
  0.5578985  0.         0.         0.22368181 0.3668564  0.0344954
  0.16685106 0.         0.07805604 0.04645126 0.46009526 0.
  0.         0.         0.         0.4059968 ]]
<NDArray 1x64 @cpu(0)>
----------------------------------------------------------------------
Hidden Representation 2: 
[[0.         0.         0.00471901 0.00809325 0.00563266 0.00358269
  0.01304015 0.         0.         0.0179144  0.00409093 0.01971137
  0.01811438 0.         0.         0.03330275 0.03080758 0.
  0.01005297 0.         0.         0.         0.         0.
  0.         0.         0.         0.01851467 0.         0.00467824
  0.         0.00476716 0.00890849 0.         0.01493133 0.
  0.01890475 0.         0.01004198 0.         0.         0.
  0.         0.         0.0218619  0.         0.01256697 0.
  0.00875257 0.01837254 0.         0.012395   0.         0.
  0.         0.         0.03347883 0.         0.00547096 0.0096815
  0.03013829 0.         0.02648943 0.        ]]
<NDArray 1x64 @cpu(0)>
----------------------------------------------------------------------
Network output: 
[[ 0.0010479  -0.00023263  0.00024665 -0.00137001 -0.00089217 -0.00043491
   0.0017453  -0.00114445  0.00024293 -0.0004818 ]]
<NDArray 1x10 @cpu(0)>
----------------------------------------------------------------------
Out[9]:
[[ 0.0010479  -0.00023263  0.00024665 -0.00137001 -0.00089217 -0.00043491
   0.0017453  -0.00114445  0.00024293 -0.0004818 ]]
<NDArray 1x10 @cpu(0)>

Faster modeling with gluon.nn.Sequential


In [10]:
num_hidden = 64

In [11]:
# Defining a sequential model
net = gluon.nn.Sequential()
with net.name_scope():
    net.add(gluon.nn.Dense(units=num_hidden, 
                           activation="relu"))
    net.add(gluon.nn.Dense(units=num_hidden, 
                           activation="relu"))
    net.add(gluon.nn.Dense(units=num_outputs))
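
A possible variant, not used in this notebook: the same architecture built as a gluon.nn.HybridSequential, which can be compiled with hybridize() for faster execution.

net_hybrid = gluon.nn.HybridSequential()
with net_hybrid.name_scope():
    net_hybrid.add(gluon.nn.Dense(units=num_hidden, activation="relu"))
    net_hybrid.add(gluon.nn.Dense(units=num_hidden, activation="relu"))
    net_hybrid.add(gluon.nn.Dense(units=num_outputs))
net_hybrid.hybridize()  # switch from imperative to symbolic execution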

In [12]:
# Parameter initialization
net.collect_params().initialize(mx.init.Normal(sigma=.1), 
                                ctx=model_ctx)

In [13]:
# Softmax cross-entropy
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
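
SoftmaxCrossEntropyLoss expects raw, unnormalized scores: it applies log-softmax internally and picks out the log-probability of the true class. A small equivalence check (a sketch, not from the original notebook):

logits = mx.nd.array([[2.0, 0.5, -1.0]])   # unnormalized scores for one example
target = mx.nd.array([0])                  # true class index
manual = -mx.nd.log_softmax(logits)[0, 0]  # negative log-probability of class 0
print(softmax_cross_entropy(logits, target), manual)  # the two values should match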

In [14]:
# Optimizer
trainer = gluon.Trainer(params=net.collect_params(),
                        optimizer='sgd',
                        optimizer_params={'learning_rate': 0.01})
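
trainer.step() later normalizes the accumulated gradients by the batch size it is given. A possible variant (an assumption, not used below) adds momentum and shows that the learning rate can be adjusted without rebuilding the Trainer:

trainer_momentum = gluon.Trainer(params=net.collect_params(),
                                 optimizer='sgd',
                                 optimizer_params={'learning_rate': 0.01,
                                                   'momentum': 0.9})
trainer_momentum.set_learning_rate(0.005)  # change the rate on the fly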

Evaluation


In [15]:
def evaluate_accuracy(data_iterator, net):
    acc = mx.metric.Accuracy()
    for i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(model_ctx).reshape((-1, 784))
        label = label.as_in_context(model_ctx)
        output = net(data)
        predictions = mx.nd.argmax(data=output,
                                   axis=1)
        # Updating accuracy metric
        acc.update(preds=predictions, 
                   labels=label)
    return acc.get()[1]
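
As a usage sketch (not in the original notebook), running the metric on the freshly initialized network should give an accuracy close to chance, roughly 0.1 for ten classes:

print(evaluate_accuracy(test_data, net))  # expect a value near 0.1 before training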

Training


In [16]:
epochs = 10
smoothing_constant = .01  # defined for an optional smoothed loss; not used in the loop below

In [17]:
for e in tqdm(range(epochs)):
    cumulative_loss = 0
    for i, (data, label) in enumerate(train_data):
        # Flatten each 28x28 image into a 784-dimensional vector
        data = data.as_in_context(model_ctx).reshape((-1, 784))
        label = label.as_in_context(model_ctx)
        # Record the forward pass so gradients can be computed
        with mx.autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        # step() normalizes the gradient by the batch size before the update
        trainer.step(data.shape[0])
        cumulative_loss += mx.nd.sum(loss).asscalar()


    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" %
          (e, cumulative_loss/num_examples, train_accuracy, test_accuracy))


 10%|█         | 1/10 [00:31<04:41, 31.26s/it]
Epoch 0. Loss: 1.2724453049023947, Train_acc 0.8347, Test_acc 0.8434
 20%|██        | 2/10 [01:05<04:20, 32.56s/it]
Epoch 1. Loss: 0.48276078701019287, Train_acc 0.87955, Test_acc 0.8843
 30%|███       | 3/10 [01:35<03:43, 31.87s/it]
Epoch 2. Loss: 0.37886862614949546, Train_acc 0.8980166666666667, Test_acc 0.9013
 40%|████      | 4/10 [02:05<03:08, 31.34s/it]
Epoch 3. Loss: 0.33094510640303293, Train_acc 0.9094666666666666, Test_acc 0.9133
 50%|█████     | 5/10 [02:34<02:34, 30.99s/it]
Epoch 4. Loss: 0.30067180269559224, Train_acc 0.9174833333333333, Test_acc 0.9204
 60%|██████    | 6/10 [03:04<02:03, 30.76s/it]
Epoch 5. Loss: 0.2779835115830104, Train_acc 0.9238333333333333, Test_acc 0.9227
 70%|███████   | 7/10 [03:34<01:31, 30.63s/it]
Epoch 6. Loss: 0.2602314037243525, Train_acc 0.9279833333333334, Test_acc 0.9272
 80%|████████  | 8/10 [04:03<01:00, 30.50s/it]
Epoch 7. Loss: 0.24440940081278484, Train_acc 0.93445, Test_acc 0.9343
 90%|█████████ | 9/10 [04:33<00:30, 30.38s/it]
Epoch 8. Loss: 0.23077674449682237, Train_acc 0.9357666666666666, Test_acc 0.9345
100%|██████████| 10/10 [05:03<00:00, 30.31s/it]
Epoch 9. Loss: 0.21937609051863352, Train_acc 0.9384166666666667, Test_acc 0.9363


In [18]:
train_accuracy


Out[18]:
0.9384166666666667

In [19]:
test_accuracy


Out[19]:
0.9363
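
A possible follow-up, not part of the original notebook: persist the trained weights and restore them into a freshly built network. save_parameters/load_parameters are available in MXNet 1.2.1 and later; older releases use save_params/load_params.

net.save_parameters("mlp_mnist.params")

net_restored = gluon.nn.Sequential()
with net_restored.name_scope():
    net_restored.add(gluon.nn.Dense(units=num_hidden, activation="relu"))
    net_restored.add(gluon.nn.Dense(units=num_hidden, activation="relu"))
    net_restored.add(gluon.nn.Dense(units=num_outputs))
net_restored.load_parameters("mlp_mnist.params", ctx=model_ctx)
print(evaluate_accuracy(test_data, net_restored))  # should match test_accuracy above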