Fisheries competition

In this notebook we investigate a range of different architectures for the Kaggle fisheries competition. We use VGG with batch normalization throughout.


In [ ]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision.utils import make_grid
from PIL import Image
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch.utils.trainer as trainer
import torch.utils.trainer.plugins
from torch.autograd import Variable
import numpy as np
import pandas as pd
import os

from torchsample.modules import ModuleTrainer
from torchsample.metrics import CategoricalAccuracy

import glob
import PIL
import matplotlib.pyplot as plt

import scipy.misc

%load_ext autoreload
%autoreload 2

%matplotlib inline

In [ ]:
def denorm(tensor):
    # Undo the image normalization + clamp between 0 and 1 to avoid image artifacts
    for t, m, s in zip(tensor, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]): 
        t.mul_(s).add_(m).clamp_(0, 1)
    return tensor

def get_images_to_plot(images_tensor):
    denormalize = transforms.Compose([
        transforms.Lambda(denorm)
    ])
    return denormalize(images_tensor)

def show(img):
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1,2,0)), interpolation='nearest')

In [ ]:
data_path = "data/fish/"
# data_path = "data/fish/sample/"
use_cuda = True
batch_size = 64
print('Using CUDA:', use_cuda)

In [ ]:
cuda_device = -1
if use_cuda:
    cuda_device = 0

In [ ]:
# Change to True to create & populate the validation directory
if False:
    # Create validation directory
    %cd data/fish/train
    %mkdir -p ../valid
    
    # Create a folder for each category of fish
    for d in glob.glob('*'): os.mkdir('../valid/' + d)

    # Move some random images from each class into the validation set
    shuf = np.random.permutation(glob.glob('*/*.jpg'))
    for i in range(500): os.rename(shuf[i], '../valid/' + shuf[i])
    
    %cd ../../../

# Change to True to create the sample dir      
# Manually inspect all classes have at least one fish
if False:
    %cd data/fish/train
    %mkdir -p ../sample
    %mkdir -p ../sample/train
    %mkdir -p ../sample/valid
    
    from shutil import copyfile

    # Create a folder for each category of fish
    for d in glob.glob('*'): 
        os.mkdir('../sample/train/' + d)
        os.mkdir('../sample/valid/' + d)
        
    # Copy a few samples per fish
    shuf = np.random.permutation(glob.glob('*/*.jpg'))
    for i in range(60): copyfile(shuf[i], '../sample/train/' + shuf[i])

    %cd ../valid
    shuf = np.random.permutation(glob.glob('*/*.jpg'))
    for i in range(50): copyfile(shuf[i], '../sample/valid/' + shuf[i])
    %cd ../../../

In [ ]:
# This class is required so we can easily extract the labels of the training dataset
class ShuffleOnceSampler(torch.utils.data.sampler.Sampler):
    """Randomly shuffles the data source on creation, without replacement.
    Returns the same sequential order on every epoch.
    Arguments:
        data_source (Dataset): dataset to sample from
    """

    def __init__(self, data_source):
        self.shuffled_order = torch.randperm(len(data_source)).long()

    def __iter__(self):
        return iter(self.shuffled_order)

    def __len__(self):
        return len(self.shuffled_order)

In [ ]:
# Data loading code
traindir = os.path.join(data_path, 'train')
valdir = os.path.join(data_path, 'valid') 
testdir = os.path.join(data_path, 'test')

# PyTorch version of fastai's get_batches (utils.py)
def get_data_loader(dirname, batch_size=64, shuffle_once=False, image_size=(224, 224)):
    # pytorch's VGG requires images to be 224x224 and normalized using https://github.com/pytorch/vision#models
    normalize = transforms.Compose([
        transforms.Lambda(lambda img: img.resize(image_size, Image.BILINEAR)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                             std=[0.229, 0.224, 0.225]),
    ])
    
    image_folder = datasets.ImageFolder(dirname, normalize)
    sampler = None
    if shuffle_once:
        sampler = ShuffleOnceSampler(image_folder)
    return torch.utils.data.DataLoader(image_folder, batch_size=batch_size, 
                                       shuffle=False, pin_memory=use_cuda, sampler=sampler), image_folder

train_loader, train_folder = get_data_loader(traindir, batch_size=batch_size, shuffle_once=True)
val_loader, val_folder = get_data_loader(valdir, batch_size=batch_size)
test_loader, test_folder = get_data_loader(testdir, batch_size=batch_size)

print('Images in train folder:', len(train_folder.imgs))
print('Images in val folder:', len(val_folder.imgs))
print('Images in test folder:', len(test_folder.imgs))
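
Before going further, it's worth eyeballing a few training images. Below is a quick sketch using the make_grid import and the show/denorm helpers defined above.


In [ ]:
# Display a handful of denormalized training images in a grid, with their class names.
images, labels_batch = next(iter(train_loader))
grid = make_grid(torch.stack([get_images_to_plot(img) for img in images[:8]]), nrow=4)
show(grid)
print([train_folder.classes[int(l)] for l in labels_batch[:8]])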

Basic VGG

We start with our usual VGG approach. We will be using VGG with batch normalization. For more information about batch normalization please see this notebook.

Initial model

First we create a simple fine-tuned VGG model to be our starting point.


In [ ]:
# Monkey patch the parameters() to return trainable weights only
import types

def parameters(self):
    p = filter(lambda p: p.requires_grad, nn.Module.parameters(self))
    return p

# TODO create a utility class that inits models correctly
# Keras inits the model with sensible defaults, PyTorch does not
def init_model(model):
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.xavier_uniform(m.weight)
            if m.bias is not None:
                nn.init.constant(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant(m.weight, 1)
            nn.init.constant(m.bias, 0)
        elif isinstance(m, nn.BatchNorm1d):
            nn.init.constant(m.weight, 1)
            nn.init.constant(m.bias, 0)
        elif isinstance(m, nn.Linear):
            nn.init.normal(m.weight, mean=0, std=0.01)
            nn.init.constant(m.bias, 0)

In [ ]:
# Load the model
model = models.vgg16_bn(pretrained=True)

# Finetune by replacing the last fully connected layer and freezing all network parameters
for param in model.parameters():
    param.requires_grad = False
model.parameters = types.MethodType(parameters, model)

# Replace the classifier so its final layer matches the new class count
classes = train_loader.dataset.classes
num_classes = len(classes)
print('Using {:d} classes: {}'.format(num_classes, classes))

model.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )
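
As a quick sanity check (just a sketch), we can confirm that the monkey-patched parameters() now only exposes the new classifier's weights, while the frozen convolutional weights are excluded.


In [ ]:
# Count trainable parameters (patched parameters()) vs. all parameters (nn.Module.parameters).
trainable = sum(p.data.numel() for p in model.parameters())
total = sum(p.data.numel() for p in nn.Module.parameters(model))
print('Trainable parameters: {:,} of {:,}'.format(trainable, total))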

In [ ]:
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss()
# enable cuda if available
if(use_cuda):
    model.cuda()
    criterion.cuda()
    
def getTrainer(model):
    trainer = ModuleTrainer(model)
    trainer.compile(optimizer='adam', loss=criterion, metrics=[CategoricalAccuracy()])
    
    return trainer

In [ ]:
trainer = getTrainer(model)

In [ ]:
# TODO fix this: 'ImageFolder' object has no attribute 'num_inputs', module_trainer.py (318)
# trainer.fit_loader(train_loader, val_loader=val_loader, num_epoch=1, cuda_device=cuda_device)
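
Until that is fixed, a minimal hand-rolled training loop over train_loader can stand in for fit_loader. The cell below is only a workaround sketch (guarded with if False, matching the cells above); it updates just the unfrozen classifier weights.


In [ ]:
# Workaround sketch for the fit_loader issue above (change to True to run one epoch).
if False:
    optimizer = optim.Adam(model.parameters())  # patched parameters(): trainable weights only
    model.train()
    for batch, labels in train_loader:
        inputs, targets = Variable(batch), Variable(labels)
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()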

Precompute convolutional output

We pre-compute the output of the last convolution layer of VGG, since we're unlikely to need to fine-tune those layers. (All following analysis will be done on just the pre-computed convolutional features.)


In [ ]:
class VggNoClassifier(nn.Module):
    def __init__(self, vgg):
        super(VggNoClassifier, self).__init__()
        # The last feature is a Max Pooling layer, remove it
        num_features = len(vgg.features._modules)
        print(num_features, type(vgg.features[num_features - 2]), type(vgg.features[num_features - 1]))
        self.features = nn.Sequential(*[vgg.features[idx] for idx in range(num_features - 1)])
        
    def forward(self, x):
        x = self.features(x)
        return x

In [ ]:
vgg = VggNoClassifier(model)
if(use_cuda):
    vgg.cuda()
    
trainer = ModuleTrainer(vgg)

In [ ]:
if False:
    %time conv_train = trainer.predict_loader(train_loader, cuda_device=cuda_device).data # Extract Tensor from Variable
    %time conv_val = trainer.predict_loader(val_loader, cuda_device=cuda_device).data
    %time conv_test = trainer.predict_loader(test_loader, cuda_device=cuda_device).data

    labels_train = torch.cat([labels for (batch, labels) in train_loader])
    labels_val = torch.cat([labels for (batch, labels) in val_loader])

    %mkdir -p data/fish/results
    torch.save(conv_train, data_path + 'results/conv_train_224.pth')
    torch.save(conv_val,   data_path + 'results/conv_val_224.pth')
    torch.save(conv_test,  data_path + 'results/conv_test_224.pth')

    torch.save(labels_train, data_path + 'results/labels_train_224.pth')
    torch.save(labels_val,   data_path + 'results/labels_val_224.pth')
else:
    conv_train = torch.load(data_path + 'results/conv_train_224.pth')
    conv_val   = torch.load(data_path + 'results/conv_val_224.pth')
    conv_test  = torch.load(data_path + 'results/conv_test_224.pth')

    labels_train = torch.load(data_path + 'results/labels_train_224.pth')
    labels_val   = torch.load(data_path + 'results/labels_val_224.pth')

In [ ]:
conv_train.size(), labels_train.size()

Train model

We can now create our first baseline model - a simple 3-layer FC net.


In [ ]:
class FCNet3LayerClassifer(nn.Module):
    
    def __init__(self, p):
        super(FCNet3LayerClassifer, self).__init__()
        size_after_pool = 512 * 7 * 7 # 7 = 14 / 2
        feature_size = 512
        
        self.maxPool = nn.Sequential(nn.MaxPool2d((2, 2)),
                                     nn.BatchNorm2d(feature_size),
                                     nn.Dropout2d(p / 4))
        
        self.linear = nn.Sequential(nn.Linear(size_after_pool, feature_size), 
                                    nn.ReLU(inplace=True), 
                                    nn.BatchNorm1d(feature_size), 
                                    nn.Dropout(p),
                                    
                                    nn.Linear(feature_size, feature_size), 
                                    nn.ReLU(inplace=True), 
                                    nn.BatchNorm1d(feature_size), 
                                    nn.Dropout(p / 2))
                                    
        self.classifier = nn.Linear(feature_size, num_classes)
        init_model(self)
        
    def forward(self, x):
        x = self.maxPool(x)
        x = x.view(x.size(0), -1)
        x = self.linear(x)
        x = self.classifier(x)
        return x

In [ ]:
model = FCNet3LayerClassifer(0.6)
if(use_cuda):
    model.cuda()

trainer = getTrainer(model)

In [ ]:
trainer.fit(conv_train, labels_train, val_data=(conv_val, labels_val), num_epoch=3, batch_size=batch_size, cuda_device=cuda_device)

In [ ]:
trainer.adjust_learning_rate(1e-4)

In [ ]:
trainer.fit(conv_train, labels_train, val_data=(conv_val, labels_val), num_epoch=7, batch_size=batch_size, cuda_device=cuda_device)

Multi-input

The images come in a handful of different sizes, which likely indicate which boat they came from (since different boats use different cameras). Perhaps this creates some data leakage that we can take advantage of to get a better Kaggle leaderboard position? To find out, we first build an array of the image dimensions for each training file:


In [ ]:
filenames_train = [filename for filename, _ in train_loader.dataset.imgs]
sizes_tuples_train = [PIL.Image.open(f).size for f in filenames_train]
unique_sizes = list(set(sizes_tuples_train))
size_to_idx = {size : idx for idx, size in enumerate(unique_sizes)}
size_to_idx[(1334, 750)] = 8 # Add any other sizes not present in training set, but present in val or test sets
image_sizes_count = len(size_to_idx)

In [ ]:
import collections
collections.Counter(sizes_tuples_train)

In [ ]:
def one_hot_encode_normalize(list_indexes, index_size):
    hot_encoded = torch.FloatTensor(len(list_indexes), index_size).zero_()
    idx = torch.LongTensor(list_indexes).view(-1, 1)
    hot_encoded.scatter_(1, idx, 1.0)
    return (hot_encoded - hot_encoded.mean()) / hot_encoded.std()

Then we one-hot encode them (since we want to treat them as categorical) and normalize the data.


In [ ]:
sizes_train = one_hot_encode_normalize(list(map(size_to_idx.__getitem__, sizes_tuples_train)), image_sizes_count)
sizes_train.size()

In [ ]:
filenames_val = [filename for filename, _ in val_loader.dataset.imgs]
sizes_tuples_val = [PIL.Image.open(f).size for f in filenames_val]
sizes_val = one_hot_encode_normalize(list(map(size_to_idx.__getitem__, sizes_tuples_val)), image_sizes_count)
sizes_val.size()

In [ ]:
class MultiInput3LayerFCNetClassifer(FCNet3LayerClassifer):
    def __init__(self, p):
        super(MultiInput3LayerFCNetClassifer, self).__init__(p)
        image_size_feature = image_sizes_count
        feature_size = 512 + image_size_feature
        
        self.batchnorm = nn.BatchNorm1d(image_sizes_count)
        self.classifier = nn.Linear(feature_size, num_classes)
        init_model(self)
        
    def forward(self, x, x_image_sizes):
        x_bn = self.batchnorm(x_image_sizes)
        
        x = self.maxPool(x)
        x = x.view(x.size(0), -1)
        x = self.linear(x)
        
        x = torch.cat([x, x_bn], dim=1)
        
        x = self.classifier(x)
        return x

In [ ]:
model = MultiInput3LayerFCNetClassifer(0.6)
if(use_cuda):
    model.cuda()

trainer = getTrainer(model)

In [ ]:
trainer.fit((conv_train, sizes_train), labels_train, val_data=((conv_val, sizes_val), labels_val), num_epoch=3, batch_size=batch_size, cuda_device=cuda_device)

In [ ]:
trainer.adjust_learning_rate(1e-4)

In [ ]:
trainer.fit((conv_train, sizes_train), labels_train, val_data=((conv_val, sizes_val), labels_val), num_epoch=7, batch_size=batch_size, cuda_device=cuda_device)

Using the leakage did not improve the model, except in the early epochs. Most likely the boat a picture came from can already be inferred from the image itself, so the size meta-data adds no new information.

Bounding boxes & multi output

Import / view bounding boxes

A Kaggle user has created bounding box annotations for each fish in each training set image. You can download them from here. We will see if we can make use of this additional information. First, we load the data and keep just the largest bounding box for each image.


In [ ]:
import json

In [ ]:
anno_classes = ['alb', 'bet', 'dol', 'lag', 'other', 'shark', 'yft']

In [ ]:
bb_json = {}
for c in anno_classes:
    j = json.load(open('data/fish/annotations/{}_labels.json'.format(c), 'r'))
    for l in j:
        if 'annotations' in l.keys() and len(l['annotations'])>0:
            bb_json[l['filename'].split('/')[-1]] = sorted(
                l['annotations'], key=lambda x: x['height']*x['width'])[-1]

In [ ]:
bb_json['img_04908.jpg']

For any images that have no annotations, we'll create an empty bounding box.


In [ ]:
empty_bbox = {'height': 0., 'width': 0., 'x': 0., 'y': 0.}

Finally, we convert the dictionary into an array, and convert the coordinates to our resized 224x224 images.


In [ ]:
bb_params = ['height', 'width', 'x', 'y']
def convert_bb(filename, size):
    bb = bb_json.get(no_folders(filename), empty_bbox)
    
    bb = [bb[p] for p in bb_params]
    conv_x = (224. / size[0])
    conv_y = (224. / size[1])
    bb[0] = bb[0] * conv_y
    bb[1] = bb[1] * conv_x
    bb[2] = max(bb[2] * conv_x, 0)
    bb[3] = max(bb[3] * conv_y, 0)
    
    return torch.FloatTensor(bb)

def no_folders(filename):
    return filename.split('/')[-1]

In [ ]:
bbox_train = torch.stack([convert_bb(filename, size) for filename, size in zip(filenames_train, sizes_tuples_train)])
bbox_val = torch.stack([convert_bb(filename, size) for filename, size in zip(filenames_val, sizes_tuples_val)])

Now we can check our work by drawing one of the annotations.


In [ ]:
def create_rect(bb, color='red'):
    return plt.Rectangle((bb[2], bb[3]), bb[1], bb[0], color=color, fill=False, lw=3)

def show_bb(i):
    bb = bbox_val[i]
    show(get_images_to_plot(val_loader.dataset[i][0]))
    plt.gca().add_patch(create_rect(bb))

In [ ]:
show_bb(0)

Create & train model

Since we're not allowed (by the Kaggle rules) to manually annotate the test set, we need a model that predicts the location of the bounding box on each image. To do so, we create a model with multiple outputs: it predicts both the type of fish (the 'class') and the 4 bounding box coordinates. We prefer this to predicting only the bounding box coordinates, since we hope that giving the model more context about what it's looking for will help it with both tasks.


In [ ]:
class MultiOutput3LayerFCNetClassifer(FCNet3LayerClassifer):
    def __init__(self, p):
        super(MultiOutput3LayerFCNetClassifer, self).__init__(p)
        feature_size = 512
        bbox_corners_count = 4
        self.bbox_regressor = nn.Linear(feature_size, bbox_corners_count)
        init_model(self)
        
    def forward(self, x):
        x = self.maxPool(x)
        x = x.view(x.size(0), -1)
        x = self.linear(x)
        
        x_bb = self.bbox_regressor(x)
        x = self.classifier(x)
        return x, x_bb

Since the model has multiple outputs, we need to provide a loss function for each output when compiling the trainer. We also weight the bounding box loss down by 1000x, since the scales of the cross-entropy loss and the MSE are very different.


In [ ]:
# TODO how to pass a weight to each loss function?
model = MultiOutput3LayerFCNetClassifer(0.6) 
mse_loss = nn.MSELoss()
if(use_cuda):
    model.cuda()
    mse_loss.cuda()

trainer = getTrainer(model)
trainer.compile(optimizer='adam', loss=[criterion, mse_loss], metrics=[CategoricalAccuracy(), None], loss_weights=[1., 0.001])
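
If the ModuleTrainer version in use doesn't actually honour loss_weights (see the TODO above), the same 1000x down-weighting can be expressed directly as a plain PyTorch training step. The cell below is only an illustrative sketch (guarded with if False), not the trainer used for the results that follow.


In [ ]:
# Sketch of a manual multi-output step: weighted sum of classification and bbox losses.
if False:
    optimizer = optim.Adam(model.parameters())
    x = Variable(conv_train[:batch_size])
    y_class = Variable(labels_train[:batch_size])
    y_bb = Variable(bbox_train[:batch_size])
    if use_cuda:
        x, y_class, y_bb = x.cuda(), y_class.cuda(), y_bb.cuda()
    out_class, out_bb = model(x)
    loss = criterion(out_class, y_class) + 0.001 * mse_loss(out_bb, y_bb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()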

In [ ]:
trainer.fit(conv_train, (labels_train, bbox_train), val_data=(conv_val, (labels_val, bbox_val)), num_epoch=3, batch_size=batch_size, cuda_device=cuda_device)

In [ ]:
trainer.adjust_learning_rate(1e-5)

In [ ]:
trainer.fit(conv_train, (labels_train, bbox_train), val_data=(conv_val, (labels_val, bbox_val)), num_epoch=10, batch_size=batch_size, cuda_device=cuda_device)

In [ ]:
# TODO This model does not seem to converge on a solution that accurately finds the bounding boxes.
# (It tends to get stuck and always gives the same or similar result)

Excitingly, it turned out that the classification model is much improved by giving it this additional task. Let's see how well the bounding box model did by taking a look at its output.


In [ ]:
# Predict on the validation set, so the outputs line up with bbox_val and the images shown below
predictions = trainer.predict(conv_val, batch_size=batch_size, cuda_device=cuda_device)

In [ ]:
def show_bb_pred(i):
    bb = bbox_val[i]
    bb_pred = predictions[1][i].data
    plt.figure(figsize=(6,6))
    show(get_images_to_plot(val_loader.dataset[i][0]))
    ax = plt.gca()
    ax.add_patch(create_rect(bb_pred, 'yellow'))
    ax.add_patch(create_rect(bb))
    
    _, class_id = predictions[0][i].max(0)
    class_id_number = torch.max(class_id.data) # From Tensor to Number
    print(classes[class_id_number])
    print(bb_pred, bb)

In [ ]:
show_bb_pred(6)

Larger size

Set up data

Let's see if we get better results if we use larger images. We'll use 640x360, since it has the same aspect ratio as the most common size we saw earlier (1280x720), without being too big.


In [ ]:
train_loader, train_folder = get_data_loader(traindir, batch_size=32, shuffle_once=True, image_size=(640, 360))
val_loader, val_folder = get_data_loader(valdir, batch_size=32, image_size=(640, 360))
test_loader, test_folder = get_data_loader(testdir, batch_size=32, image_size=(640, 360))

print('Images in train folder:', len(train_folder.imgs))
print('Images in val folder:', len(val_folder.imgs))
print('Images in test folder:', len(test_folder.imgs))

The image shows that things are much clearer at this size.


In [ ]:
show(get_images_to_plot(train_loader.dataset[0][0]))

We can now create our VGG model. Since we're no longer using the standard 224x224 input, the pretrained fully connected layers can't be reused (their sizes only make sense for the default input), so we keep just the convolutional part. We also remove the last max pooling layer, since we don't want to throw away information yet.


In [ ]:
# Load the model
model = models.vgg16_bn(pretrained=True)
vgg = VggNoClassifier(model)
if(use_cuda):
    vgg.cuda()
    
trainer = ModuleTrainer(vgg)

In [ ]:
if False:
    %time conv_train = trainer.predict_loader(train_loader, cuda_device=cuda_device).data # Extract Tensor from Variable
    %time conv_val = trainer.predict_loader(val_loader, cuda_device=cuda_device).data
    %time conv_test = trainer.predict_loader(test_loader, cuda_device=cuda_device).data

    labels_train = torch.cat([labels for (batch, labels) in train_loader])
    labels_val = torch.cat([labels for (batch, labels) in val_loader])

    %mkdir -p data/fish/results
    torch.save(conv_train, data_path + 'results/conv_train_640.pth')
    torch.save(conv_val,   data_path + 'results/conv_val_640.pth')
    torch.save(conv_test,  data_path + 'results/conv_test_640.pth')

    torch.save(labels_train, data_path + 'results/labels_train_640.pth')
    torch.save(labels_val,   data_path + 'results/labels_val_640.pth')
else:
    conv_train = torch.load(data_path + 'results/conv_train_640.pth')
    conv_val   = torch.load(data_path + 'results/conv_val_640.pth')
    conv_test  = torch.load(data_path + 'results/conv_test_640.pth')

    labels_train = torch.load(data_path + 'results/labels_train_640.pth')
    labels_val   = torch.load(data_path + 'results/labels_val_640.pth')

In [ ]:
conv_train.size(), labels_train.size(), conv_test.size()

Fully convolutional net (FCN)

Since we're using a larger input, the output of the final convolutional layer is also larger. So we probably don't want to put a dense layer there - that would be a lot of parameters! Instead, let's use a fully convolutional net (FCN); these have the added benefit that they tend to generalize well, and the approach seems like a good fit for our problem (since the fish are a small part of the image).
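
As a rough illustration of the parameter count (a sketch, assuming conv_train holds the 640x360 convolutional features computed above):


In [ ]:
# Count the weights a single 512-unit dense layer on these conv features would need.
num_conv_features = int(np.prod(conv_train.size()[1:]))  # channels * height * width
print('Flattened conv features per image:', num_conv_features)
print('Weights in one 512-unit dense layer: {:,}'.format(num_conv_features * 512))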


In [ ]:
import torch.nn.functional as F

class FCNClassifer(nn.Module):
    
    def __init__(self, p):
        super(FCNClassifer, self).__init__()
        feature_size = 512
        feature_size_conv = 128
        kernel_size = (3, 3)
        padding = (1, 1)
        
        self.fcn = nn.Sequential(nn.BatchNorm2d(feature_size),
                                 nn.Conv2d(feature_size, feature_size_conv, kernel_size, padding=padding),
                                 nn.ReLU(inplace=True),
                                 nn.BatchNorm2d(feature_size_conv),
                                 nn.MaxPool2d((2, 2)),
                                 
                                 nn.Conv2d(feature_size_conv, feature_size_conv, kernel_size, padding=padding),
                                 nn.ReLU(inplace=True),
                                 nn.BatchNorm2d(feature_size_conv),
                                 nn.MaxPool2d((2, 2)),

                                 nn.Conv2d(feature_size_conv, feature_size_conv, kernel_size, padding=padding),
                                 nn.ReLU(inplace=True),
                                 nn.BatchNorm2d(feature_size_conv),
                                 nn.MaxPool2d((1, 2)),
                                 
                                 nn.Conv2d(feature_size_conv, num_classes, kernel_size, padding=padding),
                                 )
        self.dropout = nn.Dropout2d(p)
        init_model(self)
        
    def forward(self, x):
        x = self.fcn(x)
        x = self.dropout(x)
        h_x_w = x.size()[2:] # h x w = 5x5
        x = F.avg_pool2d(x, kernel_size=h_x_w)
        x = x.view(-1, num_classes)
        return x

I'm not using any dropout, since I found I got better results without it.


In [ ]:
model_fc = FCNClassifer(0.0)
if(use_cuda):
    model_fc.cuda()
trainer_fc = getTrainer(model_fc)

In [ ]:
trainer_fc.fit(conv_train, labels_train, val_data=(conv_val, labels_val), num_epoch=3, batch_size=batch_size, cuda_device=cuda_device)

In [ ]:
trainer_fc.adjust_learning_rate(1e-5)

In [ ]:
trainer_fc.fit(conv_train, labels_train, val_data=(conv_val, labels_val), num_epoch=4, batch_size=batch_size, cuda_device=cuda_device)

Another benefit of this kind of model is that the last convolutional layer has to learn to classify each part of the image (since there's only an average pooling layer after it). Let's create a function that grabs the output of this layer (the output of the fcn block, just before the dropout and average pooling).

We have to add an extra dimension to our input, since the CNN expects a batch (even if it's just a batch of one).


In [ ]:
def get_convolution_image(model, image_index, channel):
    image = Variable(conv_val[image_index], volatile=True)
    x = image.unsqueeze(0) # Add the extra dimension
    if(use_cuda):
        x = x.cuda()
    print(x.size())
    conv = model.fcn(x)
    print(conv.size())
    # Get first result of batch, then grab one of the filters out of the 8 prediction ones
    print('Predicted class:', torch.max(model.forward(x), 1)[1])
    conv = conv.data[0][channel].cpu().numpy()
    return scipy.misc.imresize(conv, (360,640), interp='nearest')

In [ ]:
image_index = 88
predicted_class = val_loader.dataset[image_index][1]
print('Class =', predicted_class)
show(get_images_to_plot(val_loader.dataset[image_index][0]))

The heatmap shows that (at very low resolution) the model is finding the fish!


In [ ]:
plt.imshow(get_convolution_image(model_fc, image_index, channel=predicted_class), cmap='cool')

All convolutional net heatmap

To create a higher resolution heatmap, we'll remove all the max pooling layers, and repeat the previous steps.


In [ ]:
import torch.nn.functional as F

class FCNClassiferNoMaxPooling(nn.Module):
    
    def __init__(self, p):
        super(FCNClassiferNoMaxPooling, self).__init__()
        self.fcn_module = FCNClassifer(p)
        self.fcn_module.fcn = nn.Sequential(* list(filter(lambda module: not isinstance(module, nn.MaxPool2d), self.fcn_module.fcn)))
        init_model(self)
    
    def forward(self, x):
        return self.fcn_module.forward(x)

In [ ]:
model_heatmap = FCNClassiferNoMaxPooling(0)
if(use_cuda):
    model_heatmap.cuda()
trainer_heatmap = getTrainer(model_heatmap)

In [ ]:
trainer_heatmap.fit(conv_train, labels_train, val_data=(conv_val, labels_val), num_epoch=2, batch_size=batch_size, cuda_device=cuda_device)

In [ ]:
trainer_heatmap.adjust_learning_rate(1e-5)

In [ ]:
trainer_heatmap.fit(conv_train, labels_train, val_data=(conv_val, labels_val), num_epoch=6, batch_size=batch_size, cuda_device=cuda_device)

Create heatmap


In [ ]:
image_index = 88
predicted_class = val_loader.dataset[image_index][1]
print('Class =', predicted_class)
show(get_images_to_plot(val_loader.dataset[image_index][0]))

In [ ]:
convolution_map = get_convolution_image(model_heatmap.fcn_module, image_index, channel=predicted_class)
plt.imshow(convolution_map, cmap='cool')

In [ ]:
plt.figure(figsize=(10,10))
show(get_images_to_plot(val_loader.dataset[image_index][0]))
plt.imshow(convolution_map, cmap="cool", alpha=0.5)

Inception mini-net

Here's an example of how to create and use "inception blocks" - as you see, they use multiple different convolution filter sizes and concatenate the results together. We'll talk more about these next year.


In [ ]:
class BasicConv2d(nn.Module):

    def __init__(self, in_channels, out_channels, **kwargs):
        super(BasicConv2d, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return F.relu(x, inplace=True)

In [ ]:
class InceptionBlock(nn.Module):
    
    def __init__(self, in_channels, **kwargs):
        super(InceptionBlock, self).__init__()
        self.conv2d = BasicConv2d(in_channels, 16, kernel_size=1)
        self.branch1x1 = BasicConv2d(in_channels, 32, kernel_size=1, stride=2)
        self.branch5x5 = nn.Sequential(
            BasicConv2d(in_channels, 24, kernel_size=1),
            BasicConv2d(24, 32, kernel_size=5, stride=2, padding=2))
        self.branch3x3dbl = nn.Sequential(
            BasicConv2d(in_channels, 31, kernel_size=1),
            BasicConv2d(31, 48, kernel_size=3),
            BasicConv2d(48, 48, kernel_size=3, stride=2, padding=2))

    def forward(self, x):
        branch1x1 = self.branch1x1(x)
        branch5x5 = self.branch5x5(x)
        branch3x3dbl = self.branch3x3dbl(x)

        branch_pool = F.avg_pool2d(x, kernel_size=3, stride=2, padding=1)
        branch_pool = self.conv2d(branch_pool)

        outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
        # print(list(map(lambda x: x.size(), outputs)))
        return torch.cat(outputs, 1)

In [ ]:
class InceptionModule(nn.Module):
    
    def __init__(self, p, **kwargs):
        super(InceptionModule, self).__init__()
        in_channels = 512
        in_channels_inception = 128
        self.batchnorm = nn.BatchNorm2d(in_channels)
        self.inception = nn.Sequential(
            InceptionBlock(in_channels),
            InceptionBlock(in_channels_inception),
            InceptionBlock(in_channels_inception))
        self.dropout = nn.Dropout(p)
        self.classifier = nn.Conv2d(in_channels_inception, num_classes, (3, 3), padding=(1, 1))
        

    def forward(self, x):
        x = self.batchnorm(x)
        x = self.inception(x)
        x = self.dropout(x)
        x = self.classifier(x)
        h_x_w = x.size()[2:] # spatial size remaining after the inception blocks
        x = F.avg_pool2d(x, kernel_size=h_x_w)
        x = x.view(-1, num_classes)
        return x

In [ ]:
model_inception = InceptionModule(0.08)
if(use_cuda):
    model_inception.cuda()
trainer_inception = getTrainer(model_inception)

In [ ]:
trainer_inception.fit(conv_train, labels_train, val_data=(conv_val, labels_val), num_epoch=2, batch_size=batch_size, cuda_device=cuda_device)

In [ ]:
trainer_inception.adjust_learning_rate(1e-5)

In [ ]:
trainer_inception.fit(conv_train, labels_train, val_data=(conv_val, labels_val), num_epoch=6, batch_size=batch_size, cuda_device=cuda_device)

Pseudo-labeling

To squeeze out a little more performance, we use the best model so far to predict labels for the validation images and part of the test images, append these pseudo-labelled samples to the training data, and continue training.

In [ ]:
kaggle_trainer = trainer_fc

In [ ]:
conv_val_test = torch.cat([conv_val, conv_test[:2000]]) # The ~13K test samples don't all fit in RAM, so only use the first 2000 :(
predictions_val_test_float = kaggle_trainer.predict(conv_val_test, batch_size=batch_size, cuda_device=cuda_device)
_, predictions_val_test = torch.max(predictions_val_test_float.data, 1)
predictions_val_test = predictions_val_test.view(-1)

In [ ]:
conv_train_val_test = torch.cat([conv_train, conv_val_test])
labels_train_val_test = torch.cat([labels_train, predictions_val_test])
print(conv_train_val_test.size(), labels_train_val_test.size())

# Need to create a Dataset and DataLoader as using kaggle_trainer.fit() runs out of memory
train = torch.utils.data.TensorDataset(conv_train_val_test, labels_train_val_test)
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)

val = torch.utils.data.TensorDataset(conv_val, labels_val)
val_loader = torch.utils.data.DataLoader(val, batch_size=batch_size, shuffle=False)

In [ ]:
kaggle_trainer.fit_loader(train_loader, val_loader=val_loader, num_epoch=8, cuda_device=cuda_device)

Submit


In [ ]:
predictions_kaggle = kaggle_trainer.predict(conv_test, batch_size=batch_size, cuda_device=cuda_device)
predictions_kaggle = F.softmax(predictions_kaggle).data
len(predictions_kaggle)

In [ ]:
def get_csv_filename(filename):
    file = filename.split('/')[-1]
    if 'test_stg2' in filename:
        return 'test_stg2/' + file
    else:
        return file
filenames_test = [ get_csv_filename(filename) for filename, _ in test_loader.dataset.imgs]
print(len(filenames_test))
classes

In [ ]:
max_value = 0.85
min_value = (1. - max_value) / 8.
predictions_csv = torch.clamp(predictions_kaggle, min_value, max_value).numpy()
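
The clipping above guards against the competition's log loss metric, which penalizes confident mistakes extremely hard. A quick sketch of the worst-case per-image loss with and without the clip:


In [ ]:
# Worst-case per-image log loss for a confidently wrong prediction, before and after clipping.
print('Unclipped (p ~ 1e-15):', -np.log(1e-15))
print('Clipped (p = min_value):', -np.log(min_value))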

In [ ]:
submission = pd.DataFrame(predictions_csv, columns=classes)
submission.insert(0, 'image', filenames_test)
submission.head()

In [ ]:
submission_name = data_path + 'results/submission_fc.gz'
submission.to_csv(submission_name, index=False, compression='gzip')

In [ ]:
from IPython.display import FileLink
FileLink(submission_name)

This model would have ranked around 350th on the private leaderboard (~231st on the public leaderboard), with a public score of 1.12329 and a private score of 2.60793.