MNIST with PyTorch

Based on the MNIST example in the pytorch/examples repository.


In [60]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.autograd import Variable # Variable wrapper from the pre-0.4 autograd API, used throughout this notebook.
from collections import OrderedDict
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['image.cmap'] = 'gray'
%matplotlib inline

In [61]:
# input batch size for training (default: 64)
batch_size = 64

# input batch size for testing (default: 1000)
test_batch_size = 1000

# number of epochs to train (default: 10)
epochs = 10

# learning rate (default: 0.01)
lr = 0.01

# SGD momentum (default: 0.5)
momentum = 0.5

# disables CUDA training
no_cuda = True

# random seed (default: 1)
seed = 1

# how many batches to wait before logging training status
log_interval = 10

# Setting seed for reproducibility.
torch.manual_seed(seed)

cuda = not no_cuda and torch.cuda.is_available()
print("CUDA: {}".format(cuda))


CUDA: False

Setting up the data loaders


In [62]:
if cuda:
    torch.cuda.manual_seed(seed)
cudakwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}

mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)) # Precalculated mean and std of the MNIST training images.
])

train_set = datasets.MNIST(
    root='data',
    train=True,
    transform=mnist_transform,
    download=True,
)

test_set = datasets.MNIST(
    root='data',
    train=False,
    transform=mnist_transform,
    download=True,
)

train_loader = torch.utils.data.DataLoader(
    dataset=train_set,
    batch_size=batch_size,
    shuffle=True,
    **cudakwargs
)

test_loader = torch.utils.data.DataLoader(
    dataset=test_set,
    batch_size=test_batch_size,
    shuffle=True,
    **cudakwargs
)
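
As a quick sanity check of the data pipeline, we can pull a single batch from train_loader and look at its shape and value range; a minimal sketch, assuming only the loaders defined above:


In [ ]:
images, labels = next(iter(train_loader))
print(images.size())    # torch.Size([64, 1, 28, 28])
print(labels.size())    # torch.Size([64])
print(images.min(), images.max())  # normalized values, no longer confined to [0, 1]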

Encapsulate training and testing in functions

The function train implements one epoch of training. It loops over the training data, fetches batches, and computes the model output. The predicted output is compared to the target using the negative log-likelihood loss torch.nn.functional.nll_loss(); the gradients of this loss are then backpropagated and the optimizer updates the weights.


In [63]:
def train(model, loader, optimizer, epoch, log_interval=100):
    model.train() # Set model to training mode.
    for batch_idx, (data, target) in enumerate(loader): # Getting the next batch.
        if cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target) 
        optimizer.zero_grad() # Setting gradients to zero, to avoid accumulation.
        output = model(data) # Forward pass; calling the model (rather than model.forward) also runs any registered hooks.
        loss = F.nll_loss(output, target) # Calculating the loss.
        loss.backward() # Compute gradients.
        optimizer.step() # Update weights.
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(loader.dataset),
                100. * batch_idx / len(loader), loss.data[0]))
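
F.nll_loss expects log-probabilities, which is why the networks below end in log_softmax; combining the two is equivalent to applying F.cross_entropy to raw scores. A quick sketch of this equivalence:


In [ ]:
scores = Variable(torch.randn(5, 10))
targets = Variable(torch.LongTensor([0, 1, 2, 3, 4]))
print(F.nll_loss(F.log_softmax(scores), targets))
print(F.cross_entropy(scores, targets))  # same value as the line above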

Implementing the test procedure


In [64]:
def test(model, loader):
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in loader:
        if cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)
        test_loss += F.nll_loss(output, target).data[0] # The model outputs log-probabilities, so use NLL (same loss as in training).
        pred = output.data.max(1)[1] # get the index of the max log-probability
        correct += pred.eq(target.data).cpu().sum()

    test_loss /= len(loader) # The loss function already averages over each batch, so divide by the number of batches.

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(loader.dataset),
        100. * correct / len(loader.dataset)))

nn.Module API


In [65]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv2d_1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2d_2 = nn.Conv2d(32, 32, kernel_size=3)
        self.dense_1 = nn.Linear(3872, 64)
        self.dense_2 = nn.Linear(64, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv2d_1(x), kernel_size=2))
        x = F.relu(self.conv2d_2(x))
        x = F.dropout(x, training=self.training)
        x = x.view(-1, 3872)
        x = F.relu(self.dense_1(x))
        x = F.dropout(x, training=self.training)
        x = self.dense_2(x)
        return F.log_softmax(x)
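
The flattened size 3872 used in dense_1 and in the view() call comes from the layer geometry: a 28x28 input becomes 26x26 after the first 3x3 convolution, 13x13 after the 2x2 max pooling, and 11x11 after the second 3x3 convolution, so the flattened feature map has 32 * 11 * 11 = 3872 entries. A quick way to double-check the wiring is to push a dummy batch through an untrained instance; a minimal sketch:


In [ ]:
dummy = Variable(torch.randn(1, 1, 28, 28))
print(Net()(dummy).size())  # expected: torch.Size([1, 10])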

Creation of the model

The model is created by instantiating the model class. Note that we have to explicitly move the model to the GPU with .cuda() if we want CUDA to be used:


In [66]:
model = Net()
if cuda:
    model.cuda()

In [67]:
for p in model.parameters():
    print(p.data.shape)


torch.Size([32, 1, 3, 3])
torch.Size([32])
torch.Size([32, 32, 3, 3])
torch.Size([32])
torch.Size([64, 3872])
torch.Size([64])
torch.Size([10, 64])
torch.Size([10])
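
These shapes correspond to the weights and biases of the two convolutional and two linear layers. Summing the element counts gives the total number of trainable parameters; a quick sketch:


In [ ]:
print(sum(p.data.numel() for p in model.parameters()))  # 258090 for this architecture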

In [68]:
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
for epoch in range(1, 3):
    train(model, train_loader, optimizer, epoch)


Train Epoch: 1 [0/60000 (0%)]	Loss: 2.307804
Train Epoch: 1 [6400/60000 (11%)]	Loss: 0.821783
Train Epoch: 1 [12800/60000 (21%)]	Loss: 0.844431
Train Epoch: 1 [19200/60000 (32%)]	Loss: 0.607474
Train Epoch: 1 [25600/60000 (43%)]	Loss: 0.433042
Train Epoch: 1 [32000/60000 (53%)]	Loss: 0.457261
Train Epoch: 1 [38400/60000 (64%)]	Loss: 0.405718
Train Epoch: 1 [44800/60000 (75%)]	Loss: 0.417186
Train Epoch: 1 [51200/60000 (85%)]	Loss: 0.270134
Train Epoch: 1 [57600/60000 (96%)]	Loss: 0.280051
Train Epoch: 2 [0/60000 (0%)]	Loss: 0.435494
Train Epoch: 2 [6400/60000 (11%)]	Loss: 0.199564
Train Epoch: 2 [12800/60000 (21%)]	Loss: 0.255928
Train Epoch: 2 [19200/60000 (32%)]	Loss: 0.269778
Train Epoch: 2 [25600/60000 (43%)]	Loss: 0.120274
Train Epoch: 2 [32000/60000 (53%)]	Loss: 0.283706
Train Epoch: 2 [38400/60000 (64%)]	Loss: 0.161244
Train Epoch: 2 [44800/60000 (75%)]	Loss: 0.110074
Train Epoch: 2 [51200/60000 (85%)]	Loss: 0.260236
Train Epoch: 2 [57600/60000 (96%)]	Loss: 0.056930

In [69]:
test(model, test_loader)


Test set: Average loss: 0.0856, Accuracy: 9732/10000 (97%)


In [70]:
class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.size(0), -1)

model_seq = nn.Sequential(OrderedDict([
    ('conv2d_1', nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)),
    ('relu_1', nn.ReLU()),
    ('max_pooling2d_1', nn.MaxPool2d(kernel_size=2)),
    ('conv2d_2', nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3)),
    ('relu_2', nn.ReLU()),
    ('dropout_1', nn.Dropout(p=0.25)),
    ('flatten_1', Flatten()),
    ('dense_1', nn.Linear(3872, 64)),
    ('relu_3', nn.ReLU()),
    ('dropout_2', nn.Dropout(p=0.5)),
    ('dense_2', nn.Linear(64, 10)),
    ('readout', nn.LogSoftmax())
]))

if cuda:
    model_seq.cuda()

In [71]:
model.__call__


Out[71]:
<bound method Module.__call__ of Net (
  (conv2d_1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2d_2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (dense_1): Linear (3872 -> 64)
  (dense_2): Linear (64 -> 10)
)>

In [72]:
model.forward


Out[72]:
<bound method Net.forward of Net (
  (conv2d_1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2d_2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (dense_1): Linear (3872 -> 64)
  (dense_2): Linear (64 -> 10)
)>
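
Both attributes print the same module summary, but calling model(data) goes through nn.Module.__call__, which dispatches to forward and additionally runs any registered hooks, so it is the preferred way to run the network. A minimal sketch with a forward hook (print_shape_hook is just for illustration, assuming the CPU setup used here):


In [ ]:
def print_shape_hook(module, input, output):
    print(type(module).__name__, output.size())

handle = model.conv2d_1.register_forward_hook(print_shape_hook)
_ = model(Variable(torch.randn(1, 1, 28, 28)))  # hook prints: Conv2d torch.Size([1, 32, 26, 26])
handle.remove()  # detach the hook again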

In [73]:
for p in model_seq.parameters():
    print(p.data.shape)


torch.Size([32, 1, 3, 3])
torch.Size([32])
torch.Size([32, 32, 3, 3])
torch.Size([32])
torch.Size([64, 3872])
torch.Size([64])
torch.Size([10, 64])
torch.Size([10])

In [102]:
for m in model_seq.modules():
    print(m)


Sequential (
  (conv2d_1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (relu_1): ReLU ()
  (max_pooling2d_1): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
  (conv2d_2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (relu_2): ReLU ()
  (dropout_1): Dropout (p = 0.25)
  (flatten_1): Flatten (
  )
  (dense_1): Linear (3872 -> 64)
  (relu_3): ReLU ()
  (dropout_2): Dropout (p = 0.5)
  (dense_2): Linear (64 -> 10)
  (readout): LogSoftmax ()
)
Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
ReLU ()
MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
ReLU ()
Dropout (p = 0.25)
Flatten (
)
Linear (3872 -> 64)
ReLU ()
Dropout (p = 0.5)
Linear (64 -> 10)
LogSoftmax ()

In [107]:
for m in model_seq.children():
    print(m)


Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
ReLU ()
MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
ReLU ()
Dropout (p = 0.25)
Flatten (
)
Linear (3872 -> 64)
ReLU ()
Dropout (p = 0.5)
Linear (64 -> 10)
LogSoftmax ()

In [115]:
model_seq.named_modules


Out[115]:
<bound method Module.named_modules of Sequential (
  (conv2d_1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (relu_1): ReLU ()
  (max_pooling2d_1): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
  (conv2d_2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (relu_2): ReLU ()
  (dropout_1): Dropout (p = 0.25)
  (flatten_1): Flatten (
  )
  (dense_1): Linear (3872 -> 64)
  (relu_3): ReLU ()
  (dropout_2): Dropout (p = 0.5)
  (dense_2): Linear (64 -> 10)
  (readout): LogSoftmax ()
)>
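
The cell above only displays the bound method itself. Calling named_modules() yields (name, module) pairs, starting with the top-level Sequential under the empty name; a brief sketch:


In [ ]:
for name, m in model_seq.named_modules():
    print(repr(name), type(m).__name__)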

In [118]:
for l in model_seq:
    print(l)


Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
ReLU ()
MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
ReLU ()
Dropout (p = 0.25)
Flatten (
)
Linear (3872 -> 64)
ReLU ()
Dropout (p = 0.5)
Linear (64 -> 10)
LogSoftmax ()

In [117]:
model_seq[0]


Out[117]:
Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))

In [103]:
model_seq._modules


Out[103]:
OrderedDict([('conv2d_1', Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))),
             ('relu_1', ReLU ()),
             ('max_pooling2d_1',
              MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))),
             ('conv2d_2', Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))),
             ('relu_2', ReLU ()),
             ('dropout_1', Dropout (p = 0.25)),
             ('flatten_1', Flatten (
              )),
             ('dense_1', Linear (3872 -> 64)),
             ('relu_3', ReLU ()),
             ('dropout_2', Dropout (p = 0.5)),
             ('dense_2', Linear (64 -> 10)),
             ('readout', LogSoftmax ())])
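
_modules is an internal attribute, so the named submodules are usually reached as attributes (which resolve through _modules) or by integer index on the Sequential; a small sketch:


In [ ]:
print(model_seq.dense_1)                    # attribute access by layer name
print(model_seq[7] is model_seq.dense_1)    # True: index 7 is 'dense_1' in this Sequential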

In [74]:
optimizer = torch.optim.SGD(model_seq.parameters(), lr=lr)

Training loop


In [75]:
for epoch in range(1, 3):
    train(model_seq, train_loader, optimizer, epoch)


Train Epoch: 1 [0/60000 (0%)]	Loss: 2.320623
Train Epoch: 1 [6400/60000 (11%)]	Loss: 1.501908
Train Epoch: 1 [12800/60000 (21%)]	Loss: 0.895033
Train Epoch: 1 [19200/60000 (32%)]	Loss: 0.508193
Train Epoch: 1 [25600/60000 (43%)]	Loss: 0.546440
Train Epoch: 1 [32000/60000 (53%)]	Loss: 0.461239
Train Epoch: 1 [38400/60000 (64%)]	Loss: 0.339877
Train Epoch: 1 [44800/60000 (75%)]	Loss: 0.372804
Train Epoch: 1 [51200/60000 (85%)]	Loss: 0.633542
Train Epoch: 1 [57600/60000 (96%)]	Loss: 0.338278
Train Epoch: 2 [0/60000 (0%)]	Loss: 0.258300
Train Epoch: 2 [6400/60000 (11%)]	Loss: 0.240900
Train Epoch: 2 [12800/60000 (21%)]	Loss: 0.247753
Train Epoch: 2 [19200/60000 (32%)]	Loss: 0.171940
Train Epoch: 2 [25600/60000 (43%)]	Loss: 0.391493
Train Epoch: 2 [32000/60000 (53%)]	Loss: 0.212401
Train Epoch: 2 [38400/60000 (64%)]	Loss: 0.193295
Train Epoch: 2 [44800/60000 (75%)]	Loss: 0.310381
Train Epoch: 2 [51200/60000 (85%)]	Loss: 0.316394
Train Epoch: 2 [57600/60000 (96%)]	Loss: 0.275944

In [76]:
test(model_seq, test_loader)


Test set: Average loss: 0.1460, Accuracy: 9571/10000 (96%)

Saving and loading the model

PyTorch provides two ways to save a model:

  • Saving model parameters
  • Saving the entire model (may be less robust to code changes)



In [119]:
model_file = 'example_torch_mnist_model'
if cuda: model_file += '_gpu'

Method 1: saving the model parameters


In [120]:
name = model_file + '.pth'

In [121]:
torch.save(model_seq.state_dict(), name)

In [90]:
model2 = Net()
if cuda:
    model2.cuda()
model2.load_state_dict(torch.load(name))
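
Note that load_state_dict requires the keys of the saved dictionary to match the parameter names of the receiving module; Net and the Sequential model happen to use the same layer names (conv2d_1, conv2d_2, dense_1, dense_2), so a state dict saved from either one fits the other. When a checkpoint was written on a GPU machine and has to be loaded on a CPU-only one, torch.load accepts a map_location argument to remap the storages; a minimal sketch:


In [ ]:
state = torch.load(name, map_location=lambda storage, loc: storage)  # keep everything on the CPU
model2.load_state_dict(state)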

In [91]:
test(model2, test_loader)


Test set: Average loss: 0.0856, Accuracy: 9732/10000 (97%)

Method 2: saving the entire model


In [92]:
name = model_file + '.pth.tar'

In [93]:
model.eval()
torch.save(model, name)


/work/miniconda/envs/dnnviz/lib/python3.6/site-packages/torch/serialization.py:147: UserWarning: Couldn't retrieve source code for container of type Net. It won't be checked for correctness upon loading.
  "type " + obj.__name__ + ". It won't be checked "

In [94]:
model3 = torch.load(name)

In [95]:
test(model3, test_loader)


Test set: Average loss: 0.0856, Accuracy: 9732/10000 (97%)