Lesson 9 Code Along

Lesson 9 Wiki | Notebook: pascal-multi.ipynb



In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.conv_learner import *
from fastai.dataset import *

import json, pdb
from PIL import ImageDraw, ImageFont
from matplotlib import patches, patheffects

In [3]:
torch.backends.cudnn.benchmark=True

1. Setup


In [4]:
PATH = Path('data/pascal')
trn_j = json.load((PATH / 'pascal_train2007.json').open())
IMAGES, ANNOTATIONS, CATEGORIES = ['images', 'annotations', 'categories']
FILE_NAME,ID,IMG_ID,CAT_ID,BBOX = 'file_name','id','image_id','category_id','bbox'

cats = dict((o[ID], o['name']) for o in trn_j[CATEGORIES])
trn_fns = dict((o[ID], o[FILE_NAME]) for o in trn_j[IMAGES])
trn_ids = [o[ID] for o in trn_j[IMAGES]]

JPEGS = 'VOCdevkit/VOC2007/JPEGImages'
IMG_PATH = PATH/JPEGS

In [5]:
def get_trn_anno():
    trn_anno = collections.defaultdict(lambda:[])
    for o in trn_j[ANNOTATIONS]:
        if not o['ignore']:
            bb = o[BBOX]
            bb = np.array([bb[1], bb[0], bb[3]+bb[1]-1, bb[2]+bb[0]-1])
            trn_anno[o[IMG_ID]].append((bb, o[CAT_ID]))
    return trn_anno

trn_anno = get_trn_anno()

In [6]:
def show_img(im, figsize=None, ax=None):
    if not ax: fig,ax = plt.subplots(figsize=figsize)
    ax.imshow(im)
    ax.set_xticks(np.linspace(0, 224, 8))
    ax.set_yticks(np.linspace(0, 224, 8))
    ax.grid()
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    return ax

def draw_outline(o, lw):
    o.set_path_effects([patheffects.Stroke(
        linewidth=lw, foreground='black'), patheffects.Normal()])
    
def draw_rect(ax, b, color='white'):
    patch = ax.add_patch(patches.Rectangle(b[:2], *b[-2:], fill=False, edgecolor=color, lw=2))
    
def draw_text(ax, xy, txt, sz=14, color='white'):
    text = ax.text(*xy, txt, 
        verticalalignment='top', color=color, fontsize=sz, weight='bold')
    draw_outline(text, 1)

In [7]:
def bb_hw(a): return np.array([a[1],a[0],a[3]-a[1]+1,a[2]-a[0]+1])

def draw_im(im, ann):
    ax = show_img(im, figsize=(16,8))
    for b,c in ann:
        b = bb_hw(b)
        draw_rect(ax, b)
        draw_text(ax, b[:2], cats[c], sz=16)
        
def draw_idx(i):
    im_a = trn_anno[i]
    im = open_image(IMG_PATH/trn_fns[i])
    draw_im(im, im_a)

2. Multi Class

Also check out a quick pandas data-processing pipeline (sketched just below) instead of using defaultdicts.
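A minimal sketch of my own (not a cell from the notebook) of what that pandas version could look like, rebuilding the same image-id → [(bbox, class), ...] mapping that get_trn_anno produced, assuming trn_j is loaded as in section 1:

# Hypothetical pandas version of get_trn_anno (a sketch, not the lesson's code).
anno_df = pd.DataFrame(trn_j[ANNOTATIONS])
anno_df = anno_df[anno_df['ignore'] == 0].copy()                 # drop 'ignore' boxes
anno_df[BBOX] = anno_df[BBOX].map(
    lambda bb: np.array([bb[1], bb[0], bb[3]+bb[1]-1, bb[2]+bb[0]-1]))  # x,y,w,h -> corners
trn_anno_pd = (anno_df.groupby(IMG_ID)
                      .apply(lambda g: list(zip(g[BBOX], g[CAT_ID])))
                      .to_dict())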


In [8]:
MC_CSV = PATH/'tmp/mc.csv'

In [9]:
trn_anno[12]


Out[9]:
[(array([ 96, 155, 269, 350]), 7)]

In [10]:
mc = [set([cats[p[1]] for p in trn_anno[o]]) for o in trn_ids]
mcs = [' '.join(str(p) for p in o) for o in mc]

In [11]:
df = pd.DataFrame({'fn': [trn_fns[o] for o in trn_ids], 'clas': mcs}, columns=['fn','clas'])
df.to_csv(MC_CSV, index=False)

In [12]:
f_model=resnet34
sz=224
bs=64

In [13]:
tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO)
md   = ImageClassifierData.from_csv(PATH, JPEGS, MC_CSV, tfms=tfms)

In [14]:
learn = ConvLearner.pretrained(f_model, md)
learn.opt_fn = optim.Adam

In [15]:
lrf = learn.lr_find(1e-5, 100)


epoch      trn_loss   val_loss   <lambda>                  
    0      1.398041   4.164995   0.849264  


In [16]:
learn.sched.plot(0)



In [17]:
lr = 2e-2
learn.fit(lr, 1, cycle_len=3, use_clr=(32,5))


epoch      trn_loss   val_loss   <lambda>                  
    0      0.323653   0.129685   0.957106  
    1      0.172651   0.078002   0.973122                  
    2      0.11537    0.075069   0.974707                  

Out[17]:
[0.0750692025758326, 0.9747070446610451]

In [18]:
lrs = np.array([lr/100, lr/10, lr])
learn.freeze_to(-2)
learn.lr_find(lrs/1000)
learn.sched.plot(0)


 84%|████████▍ | 27/32 [00:14<00:02,  1.84it/s, loss=0.51]  

In [19]:
learn.fit(lrs/10, 1, cycle_len=5, use_clr=(32,5))


 19%|█▉        | 6/32 [00:04<00:20,  1.27it/s, loss=0.076] 
epoch      trn_loss   val_loss   <lambda>                   
    0      0.072928   0.076492   0.974414  
    1      0.053138   0.078053   0.973881                   
    2      0.039226   0.075312   0.975947                   
    3      0.027348   0.076436   0.976187                   
    4      0.019412   0.075988   0.976164                   

Out[19]:
[0.07598836394026875, 0.9761643558740616]

In [20]:
learn.save('mclas')

In [21]:
learn.load('mclas')

In [22]:
y = learn.predict()
x,_ = next(iter(md.val_dl))
x = to_np(x)

In [23]:
fig,axes = plt.subplots(3, 4, figsize=(12, 8))
for i,ax in enumerate(axes.flat):
    ima=md.val_ds.denorm(x)[i]
    ya = np.nonzero(y[i] > 0.4)[0]
    b  = '\n'.join(md.classes[o] for o in ya)
    ax = show_img(ima, ax=ax)
    draw_text(ax, (0,0), b)
plt.tight_layout()


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).

3. Bbox per Cell

3.1 Set up Data


In [8]:
CLAS_CSV = PATH/'tmp/clas.csv'
MBB_CSV  = PATH/'tmp/mbb.csv'

f_model = resnet34
sz = 224
bs = 64

In [9]:
mc = [[cats[p[1]] for p in trn_anno[o]] for o in trn_ids]
id2cat = list(cats.values())
cat2id = {v:k for k,v in enumerate(id2cat)}
mcs = np.array([np.array([cat2id[p] for p in o]) for o in mc]); mcs


Out[9]:
array([array([6]), array([14, 12]), array([ 1,  1, 14, 14, 14]), ..., array([17,  8, 14, 14, 14]),
       array([6]), array([11])], dtype=object)

In [10]:
val_idxs = get_cv_idxs(len(trn_fns))
((val_mcs, trn_mcs),) = split_by_idx(val_idxs, mcs)

In [11]:
mbb  = [np.concatenate([p[0] for p in trn_anno[o]]) for o in trn_ids]
mbbs = [' '.join(str(p) for p in o) for o in mbb]

df = pd.DataFrame({'fn': [trn_fns[o] for o in trn_ids], 'bbox': mbbs}, columns=['fn', 'bbox'])
df.to_csv(MBB_CSV, index=False)

In [12]:
df.head()


Out[12]:
fn bbox
0 000012.jpg 96 155 269 350
1 000017.jpg 61 184 198 278 77 89 335 402
2 000023.jpg 229 8 499 244 219 229 499 333 0 1 368 116 1 2 ...
3 000026.jpg 124 89 211 336
4 000032.jpg 77 103 182 374 87 132 122 196 179 194 228 212 ...

In [13]:
aug_tfms = [RandomRotate(3, p=0.5, tfm_y=TfmType.COORD), 
            RandomLighting(0.05, 0.05, tfm_y=TfmType.COORD), 
            RandomFlip(tfm_y=TfmType.COORD)]
tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO, tfm_y=TfmType.COORD, aug_tfms=aug_tfms)
md = ImageClassifierData.from_csv(PATH, JPEGS, MBB_CSV, tfms=tfms, continuous=True, num_workers=4)

In [14]:
import matplotlib.cm as cmx
import matplotlib.colors as mcolors
from cycler import cycler

def get_cmap(N):
    color_norm = mcolors.Normalize(vmin=0, vmax=N-1)
    return cmx.ScalarMappable(norm=color_norm, cmap='Set3').to_rgba

num_colr = 12
cmap = get_cmap(num_colr)
colr_list = [cmap(float(x)) for x in range(num_colr)]

In [15]:
def show_ground_truth(ax, im, bbox, clas=None, prs=None, thresh=0.3):
    bb = [bb_hw(o) for o in bbox.reshape(-1,4)]
    if prs is None:  prs  = [None]*len(bb)
    if clas is None: clas = [None]*len(bb)
    ax = show_img(im, ax=ax)
    for i,(b,c,pr) in enumerate(zip(bb, clas, prs)):
        if ((b[2] > 0) and (pr is None or pr > thresh)):
            draw_rect(ax, b, color=colr_list[i % num_colr])
            txt = f'{i}: '
            if c is not None:  txt += ('bg' if c==len(id2cat) else id2cat[c])
            if pr is not None: txt += f' {pr:.2f}'
            draw_text(ax, b[:2], txt, color=colr_list[i % num_colr])

In [16]:
class ConcatLblDataset(Dataset):
    def __init__(self, ds, y2):
        self.ds,self.y2 = ds,y2
        self.sz = ds.sz
    def __len__(self): return len(self.ds)
    
    def __getitem__(self, i):
        x,y = self.ds[i]
        return (x, (y,self.y2[i]))

In [17]:
trn_ds2 = ConcatLblDataset(md.trn_ds, trn_mcs)
val_ds2 = ConcatLblDataset(md.val_ds, val_mcs)
md.trn_dl.dataset = trn_ds2
md.val_dl.dataset = val_ds2

In [18]:
x,y = to_np(next(iter(md.val_dl)))
x = md.val_ds.ds.denorm(x)

In [19]:
x,y = to_np(next(iter(md.trn_dl)))
x = md.trn_ds.ds.denorm(x)

In [20]:
fig,axes = plt.subplots(3, 4, figsize=(16,12))
for i,ax in enumerate(axes.flat):
    show_ground_truth(ax, x[i], y[0][i], y[1][i])
plt.tight_layout()


3.2 Set up Model

We're going to start with a simple model that predicts what object is located in each cell of a 4x4 grid. Later on we can try to improve this.


In [20]:
anc_grid = 4
k = 1

anc_offset = 1/(anc_grid*2)
anc_x = np.repeat(np.linspace(anc_offset, 1-anc_offset, anc_grid), anc_grid)
anc_y = np.tile(np.linspace(anc_offset, 1-anc_offset, anc_grid), anc_grid)

anc_ctrs =  np.tile(np.stack([anc_x,anc_y], axis=1), (k,1))
anc_sizes = np.array([[1/anc_grid,1/anc_grid] for i in range(anc_grid*anc_grid)])
anchors = V(np.concatenate([anc_ctrs, anc_sizes], axis=1), requires_grad=False).float()

In [21]:
grid_sizes = V(np.array([1/anc_grid]), requires_grad=False).unsqueeze(1)

In [22]:
plt.scatter(anc_x, anc_y)
plt.xlim(0, 1)
plt.ylim(0, 1);



In [23]:
anchors


Out[23]:
Variable containing:
 0.1250  0.1250  0.2500  0.2500
 0.1250  0.3750  0.2500  0.2500
 0.1250  0.6250  0.2500  0.2500
 0.1250  0.8750  0.2500  0.2500
 0.3750  0.1250  0.2500  0.2500
 0.3750  0.3750  0.2500  0.2500
 0.3750  0.6250  0.2500  0.2500
 0.3750  0.8750  0.2500  0.2500
 0.6250  0.1250  0.2500  0.2500
 0.6250  0.3750  0.2500  0.2500
 0.6250  0.6250  0.2500  0.2500
 0.6250  0.8750  0.2500  0.2500
 0.8750  0.1250  0.2500  0.2500
 0.8750  0.3750  0.2500  0.2500
 0.8750  0.6250  0.2500  0.2500
 0.8750  0.8750  0.2500  0.2500
[torch.cuda.FloatTensor of size 16x4 (GPU 0)]

In [24]:
def hw2corners(ctr, hw): return torch.cat([ctr - hw/2, ctr + hw/2], dim=1)

In [25]:
anchor_cnr = hw2corners(anchors[:,:2], anchors[:,2:])
anchor_cnr


Out[25]:
Variable containing:
 0.0000  0.0000  0.2500  0.2500
 0.0000  0.2500  0.2500  0.5000
 0.0000  0.5000  0.2500  0.7500
 0.0000  0.7500  0.2500  1.0000
 0.2500  0.0000  0.5000  0.2500
 0.2500  0.2500  0.5000  0.5000
 0.2500  0.5000  0.5000  0.7500
 0.2500  0.7500  0.5000  1.0000
 0.5000  0.0000  0.7500  0.2500
 0.5000  0.2500  0.7500  0.5000
 0.5000  0.5000  0.7500  0.7500
 0.5000  0.7500  0.7500  1.0000
 0.7500  0.0000  1.0000  0.2500
 0.7500  0.2500  1.0000  0.5000
 0.7500  0.5000  1.0000  0.7500
 0.7500  0.7500  1.0000  1.0000
[torch.cuda.FloatTensor of size 16x4 (GPU 0)]

In [26]:
n_clas = len(id2cat) + 1
n_act = k*(4 + n_clas)

In [27]:
class StdConv(nn.Module):
    def __init__(self, nin, nout, stride=2, drop=0.1):
        super().__init__()
        self.conv = nn.Conv2d(nin, nout, 3, stride=stride, padding=1)
        self.bn = nn.BatchNorm2d(nout)
        self.drop = nn.Dropout(drop)
        
    def forward(self, x): return self.drop(self.bn(F.relu(self.conv(x))))
    
def flatten_conv(x,k):
    bs,nf,gx,gy = x.size()
    x = x.permute(0,2,3,1).contiguous()
    return x.view(bs, -1, nf//k)

In [28]:
class OutConv(nn.Module):
    def __init__(self, k, nin, bias):
        super().__init__()
        self.k = k
        self.oconv1 = nn.Conv2d(nin, (len(id2cat) + 1)*k, 3, padding=1) # for classifier
        self.oconv2 = nn.Conv2d(nin, 4*k, 3, padding=1) # for bounding box regression
        self.oconv1.bias.data.zero_().add_(bias)
    
    def forward(self, x):
        return [flatten_conv(self.oconv1(x), self.k), 
                flatten_conv(self.oconv2(x), self.k)]

In [29]:
class SSD_Head(nn.Module):
    def __init__(self, k, bias):
        super().__init__()
        self.drop = nn.Dropout(0.25)
        self.sconv0 = StdConv(512, 256, stride=1)
        # self.sconv1 = StdConv(256, 256)
        self.sconv2 = StdConv(256, 256)
        self.out = OutConv(k, 256, bias)
        
    def forward(self, x):
        x = self.drop(F.relu(x))
        x = self.sconv0(x)
        # x = self.sconv1(x)
        x = self.sconv2(x)
        return self.out(x)

In [30]:
head_reg4 = SSD_Head(k, -3.)
models = ConvnetBuilder(f_model, 0, 0, 0, custom_head=head_reg4)
learn  = ConvLearner(md, models)
learn.opt_fn = optim.Adam
k


Out[30]:
1

We start with a stride-1 convolution because it doesn't change the geometry at all - it just lets us add an extra layer of convolutions. We have a mini NN in our custom head.

We have 2 conv layers, stride 1 then stride 2. The output of self.sconv2 = StdConv(256, 256) will be 4x4.

OutConv has 2 separate Conv layers, each of which is stride 1, so the geometry of the input is unchanged. One outputs the class activations, the other outputs the 4 bounding box coordinates (k = 1 for now). We return them as a list of two items: two separate Tensors of activations (classes and bbox coords).

That's nearly the same thing as having a single Conv layer that outputs 4+C; but it lets these layers specialize just a bit. Related tasks don't have to share all their layers.

We also add 1 to the classes Conv layer for background.


So again: we have our Data, we have our Architecture, now we just need our Loss Function.

The Loss Fn needs to look at each of these 16 sets of activations (each of which has 4 bbox coords and C+1 class probabilities) and decide: are those activations close to, or far from, the object nearest to this grid cell in the image?
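Before writing the loss, a quick shape check (my own sketch, not a cell from the notebook; it assumes the resnet34 backbone emits a 512-channel 7x7 feature map for a 224px input) confirms there really are 16 sets of activations coming out of the head:

from torch.autograd import Variable

head = SSD_Head(k, -3.)                            # a fresh CPU copy, just for the shape check
fake_feats = Variable(torch.randn(2, 512, 7, 7))   # fake batch of 2 backbone feature maps
clas_out, bbox_out = head(fake_feats)
clas_out.size(), bbox_out.size()                   # -> (2, 16, 21) and (2, 16, 4): 16 grid cells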

3.3 Train

NOTE: we don't actually use Cross Entropy here. We use Binary Cross Entropy Loss for Classification. We usually use BCE for multi-label classification (eg: Planet); Softmax can't be used for multi-label. In our case each Anchor Box can have at most 1 object associated with it, so it's not multi-label in that sense -- the reason we avoid Softmax is that it's possible for an Anchor Box to have nothing associated with it at all.

You could treat 'background' as just another class and use Softmax; lots of people have done this. But it's a very hard thing to ask a NN to do: "does this grid cell not have any of the 20 objects I'm interested in with a Jaccard overlap of 0.5?"

Instead you could ask the NN to go through each object and check "is it this? no.. is it this? no.." and if it's 'No' to all, then it's background.


To achieve this, in the forward method:

We take our target targ and do a One-Hot Embedding with the number of classes + 1: one_hot_embedding(targ, self.num_classes + 1). At this stage we do have a class for background, but we remove that last column on the next line: t = V(t[:,:-1].contiguous()).

So now our vector t is either all zeros or there's 1 one.

Finally we use BCE to compare our predictions with that target: F.binary_cross_entropy_with_logits(x, t, w, size_average=False)/self.num_classes

This is a minor tweak, but it makes a major difference to training. (When new ML papers are published, it's often for some small tweak like this.)


In [31]:
def one_hot_embedding(labels, num_classes):
    return torch.eye(num_classes)[labels.data.cpu()]

class BCE_Loss(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes
        
    def forward(self, pred, targ):
        t = one_hot_embedding(targ, self.num_classes + 1)
        t = V(t[:,:-1].contiguous())#.cpu()
        x = pred[:,:-1]
        w = self.get_weight(x,t)
        return F.binary_cross_entropy_with_logits(x, t, w, size_average=False)/self.num_classes
    
    def get_weight(self, x, t): return None

In [32]:
loss_f = BCE_Loss(len(id2cat))
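To make the "all zeros means background" target concrete, here is a tiny made-up example of my own (the labels are arbitrary, not from the lesson):

from torch.autograd import Variable

# 3 anchor boxes: one matched to class 3, two matched to background (index len(id2cat) == 20).
targ = Variable(torch.LongTensor([3, 20, 20]))
t = one_hot_embedding(targ, len(id2cat) + 1)   # 3 x 21 one-hot, including a background column
t[:, :-1]                                      # 3 x 20 target for BCE: the background rows are all zeros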

Here we calculate the Jaccard Index, the Intersection Over Union:


In [33]:
def intersect(box_a, box_b):
    max_xy = torch.min(box_a[:, None, 2:], box_b[None, :, 2:])
    min_xy = torch.max(box_a[:, None, :2], box_b[None, :, :2])
    inter = torch.clamp((max_xy - min_xy), min=0)
    return inter[:, :, 0] * inter[:, :, 1]

def box_sz(b): return ((b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]))

def jaccard(box_a, box_b):
    inter = intersect(box_a, box_b)
    union = box_sz(box_a).unsqueeze(1) + box_sz(box_b).unsqueeze(0) - inter
    return inter / union
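A tiny worked example with made-up boxes (not from the notebook): two boxes of area 1, where the second covers the right half of the first:

box_a = torch.FloatTensor([[0., 0., 1., 1.]])    # (top, left, bottom, right), area 1
box_b = torch.FloatTensor([[0., .5, 1., 1.5]])   # shifted right by 0.5, area 1
jaccard(box_a, box_b)   # intersection 0.5, union 1 + 1 - 0.5 = 1.5, so IoU = 0.333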

The way we interpret the activations is important, and we do that with actn_to_bb. We grab the activations and stick them through tanh (which is like sigmoid, but scaled to lie between -1 and 1), forcing them into our range; then we take the actual position of the anchor boxes (+ anchors[:,:2]) and move them around according to the value of the activations divided by 2: (actn_bbs[:,:2]/2 * grid_sizes) -- so each predicted bounding box can be moved by up to 50% of a grid size from its default position. Likewise for its height & width: each can be scaled between 0.5x and 1.5x of the default size.


In [34]:
def get_y(bbox, clas):
    bbox = bbox.view(-1, 4)/sz
    bb_keep = ((bbox[:, 2] - bbox[:, 0]) > 0).nonzero()[:,0]
    return bbox[bb_keep], clas[bb_keep]

def actn_to_bb(actn, anchors):
    actn_bbs = torch.tanh(actn)
    actn_centers = (actn_bbs[:,:2]/2 * grid_sizes) + anchors[:,:2]
    actn_hw = (actn_bbs[:,2:]/2 + 1) * anchors[:,2:]
    return hw2corners(actn_centers, actn_hw)

def map_to_ground_truth(overlaps, print_it=False):
    prior_overlap, prior_idx = overlaps.max(1)
    if print_it: print(prior_overlap)
    #pdb.set_trace()
    gt_overlap, gt_idx = overlaps.max(0)
    gt_overlap[prior_idx] = 1.99
    for i,o in enumerate(prior_idx): gt_idx[o] = i
    return gt_overlap, gt_idx

def ssd_1_loss(b_c, b_bb, bbox, clas, print_it=False):
    bbox,clas = get_y(bbox,clas)
    a_ic = actn_to_bb(b_bb, anchors)
    overlaps = jaccard(bbox.data, anchor_cnr.data)
    gt_overlap,gt_idx = map_to_ground_truth(overlaps,print_it)
    gt_clas = clas[gt_idx]
    pos = gt_overlap > 0.4
    pos_idx = torch.nonzero(pos)[:,0]
    gt_clas[1-pos] = len(id2cat)
    gt_bbox = bbox[gt_idx]
    loc_loss = ((a_ic[pos_idx] - gt_bbox[pos_idx]).abs()).mean()
    clas_loss = loss_f(b_c, gt_clas)
    return loc_loss, clas_loss

def ssd_loss(pred, targ, print_it=False):
    lcs,lls = 0.,0.
    for b_c,b_bb,bbox,clas in zip(*pred,*targ):
        loc_loss,clas_loss = ssd_1_loss(b_c,b_bb,bbox,clas,print_it)
        lls += loc_loss
        lcs += clas_loss
    if print_it: print(f'loc: {lls.data[0]}, clas: {lcs.data[0]}')
    return lls+lcs
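To get a feel for what actn_to_bb does, here's a small sketch of my own (not a cell from the notebook) feeding it saturating activations: tanh caps the center offset at half a grid cell and the height/width between 0.5x and 1.5x of the anchor, and the result comes back in corner form:

extreme = V(torch.FloatTensor([[ 10.,  10.,  10.,  10.],     # tanh ~ +1: max shift, 1.5x size
                               [-10., -10., -10., -10.]]))   # tanh ~ -1: max shift the other way, 0.5x size
actn_to_bb(extreme, anchors[:2])    # one row of activations per anchor passed in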

We have our custom loss fn (BCE), the Jaccard Index calculation, the activations-to-bbox converter, and the ground-truth mapper; the last thing left is our SSD Loss Function. This is what we set as our criterion below (learn.crit = ssd_loss).

ssd_loss loops through each image in the minibatch and calls ssd_1_loss on it (the SSD loss for 1 image).

ssd_1_loss first destructures the bounding boxes & classes, & removes their padding. The fast.ai library automatically pads the input data with zeros (because there can be different numbers of ground-truth objects in each image, but a Tensor must strictly be of a rectangular shape). This allows for parallel batched processing, but you must remember to strip off the padding zeros, and that's what get_y does on the line bb_keep = ((bbox[:,2] - bbox[:,0]) > 0).nonzero()[:,0] (it removes all padding bounding boxes).
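A tiny made-up example of that padding removal (the numbers are mine, not the lesson's):

padded_bb  = V(torch.FloatTensor([[96, 155, 269, 350], [0, 0, 0, 0], [0, 0, 0, 0]]))  # 1 real box + 2 pad rows
padded_cls = V(torch.LongTensor([7, 0, 0]))
get_y(padded_bb, padded_cls)   # -> the single real box (scaled by 1/sz) and its class, padding gone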

ssd_1_loss then converts activations to bounding boxes via a_ic = actn_to_bb(..); calculates the Jaccard overlaps and maps them to the ground truth; checks for overlap above some threshold (0.4 here); finds what matches (pos_idx = ..); assigns the background class to those that don't; and finally computes the L1 loss for the Localization part and the BCE Loss for the Classification part, returning those 2 pieces.

Then ssd_loss takes the two pieces and adds them together: return lls+lcs $\longleftarrow$ Loss:Localizations + Loss:Classifications.


In [35]:
x,y = next(iter(md.val_dl))
# x,y = V(x).cpu(),V(y)
x,y = V(x),V(y)

In [36]:
# for i,o in enumerate(y): y[i] = o.cpu()
# learn.model.cpu()

In [37]:
batch = learn.model(x)

In [251]:
# anchors = anchors.cpu()
# grid_sizes = grid_sizes.cpu()
# anchor_cnr = anchor_cnr.cpu()

In [252]:
ssd_loss(batch, y, True)


 0.1947
 0.1168
 0.2652
[torch.cuda.FloatTensor of size 3 (GPU 0)]


 0.2885
 0.0888
[torch.cuda.FloatTensor of size 2 (GPU 0)]

... (one prior-overlap tensor is printed per image; the printouts for the rest of the 64-image batch are omitted here) ...

loc: 10.125288963317871, clas: 73.1052017211914
Out[252]:
Variable containing:
 83.2305
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Now with our Data, Architecture, and Loss Function we have the 3 things we need to train:


In [40]:
learn.crit = ssd_loss
lr = 3e-3
lrs = np.array([lr/100,lr/10,lr])

In [254]:
learn.lr_find(lrs/1000,1.)
learn.sched.plot(1)


epoch      trn_loss   val_loss                            
    0      135.3587   319618.251953


In [255]:
learn.fit(lr, 1, cycle_len=5, use_clr=(20,10))


epoch      trn_loss   val_loss                            
    0      42.802824  31.616288 
    1      33.634386  28.359744                           
    2      29.369969  26.998438                           
    3      26.563668  26.304111                           
    4      24.498     25.733856                           

Out[255]:
[25.733855962753296]

In [256]:
learn.save('0')

In [41]:
learn.load('0')

3.4 Testing

Grab our validation set data loader; grab a batch from it; turn them into Variables so we can put them into the model; set our model to evaluation mode; and stick that data into our model to grab a batch of activations.

The final output Convolution returned 2 items: the classes and the bounding boxes -- so we can do destructuring assignment to grab the 2 pieces (the batch of classes outputs, and the batch of bounding box outputs).


In [42]:
x,y = next(iter(md.val_dl))
x,y = V(x),V(y)
learn.model.eval()
batch = learn.model(x)
b_clas,b_bb = batch

As expected the batch of class outputs is batch size 64 by 16 grid cells by 21 classes; and 64x16x4 for the bounding box coordinates.


In [43]:
b_clas.size(),b_bb.size()


Out[43]:
(torch.Size([64, 16, 21]), torch.Size([64, 16, 4]))

Going back to the ground truth in the y variable: grab the bounding box and class parts and put them into 2 Python variables to print them:


In [44]:
idx = 7
b_clasi = b_clas[idx]
b_bboxi = b_bb[idx]
ima = md.val_ds.ds.denorm(to_np(x))[idx]
bbox,clas = get_y(y[0][idx], y[1][idx])
bbox,clas


Out[44]:
(Variable containing:
  0.6786  0.4866  0.9911  0.6250
  0.7098  0.0848  0.9911  0.5491
  0.5134  0.8304  0.6696  0.9063
 [torch.cuda.FloatTensor of size 3x4 (GPU 0)], Variable containing:
   8
  10
  17
 [torch.cuda.LongTensor of size 3 (GPU 0)])

We see our ground-truth bounding boxes and ground-truth classes -- this image apparently has 3 objects in it. So let's draw the image with these 3 objects:


In [45]:
def torch_gt(ax, ima, bbox, clas, prs=None, thresh=0.4):
    return show_ground_truth(ax, ima, to_np((bbox*224).long()),
                to_np(clas), to_np(prs) if prs is not None else None, thresh)

In [264]:
fig, ax = plt.subplots(figsize=(7,7))
torch_gt(ax, ima, bbox, clas)


Here are our 4x4 grid cells from our final Conv layer. The boxes are called Anchor/Prior/Default boxes.

What we're going to do for this Loss Fn is go through a matching problem, where we take each one of these 16 boxes and see which one of the 3 ground-truth objects has the highest amount of overlap with it.

So we need some way of measuring overlap. The Jaccard Index is the standard fn used for this. It's the area of the Intersection of the Anchor Box (AB) and Ground Truth Box/Object (GTO) divided by the area of their Union. $\longrightarrow$ IOU: Intersection Over Union.


In [265]:
fig, ax = plt.subplots(figsize=(7,7))
torch_gt(ax, ima, anchor_cnr, b_clasi.max(1)[1])



In [266]:
grid_sizes


Out[266]:
Variable containing:
 0.2500
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

So we'll find the Jaccard overlap for each of our objects vs each of the 16 ABs, giving us a 3 x 16 matrix.

Here are the coordinates of each of our ABs, printed as Center (X,Y), Height, Width:


In [267]:
anchors


Out[267]:
Variable containing:
 0.1250  0.1250  0.2500  0.2500
 0.1250  0.3750  0.2500  0.2500
 0.1250  0.6250  0.2500  0.2500
 0.1250  0.8750  0.2500  0.2500
 0.3750  0.1250  0.2500  0.2500
 0.3750  0.3750  0.2500  0.2500
 0.3750  0.6250  0.2500  0.2500
 0.3750  0.8750  0.2500  0.2500
 0.6250  0.1250  0.2500  0.2500
 0.6250  0.3750  0.2500  0.2500
 0.6250  0.6250  0.2500  0.2500
 0.6250  0.8750  0.2500  0.2500
 0.8750  0.1250  0.2500  0.2500
 0.8750  0.3750  0.2500  0.2500
 0.8750  0.6250  0.2500  0.2500
 0.8750  0.8750  0.2500  0.2500
[torch.cuda.FloatTensor of size 16x4 (GPU 0)]

In [46]:
a_ic = actn_to_bb(b_bboxi, anchors)

In [269]:
fig, ax = plt.subplots(figsize=(7,7))
torch_gt(ax, ima, a_ic, b_clasi.max(1)[1], 
         b_clasi.max(1)[0].sigmoid(), thresh=0.0)


Below is the 3x16 matrix of IoU (Jaccard) overlaps between each GTO and each AB. You can see, for example, that the 8th AB overlaps a little bit with the 2nd GTO.


In [270]:
overlaps = jaccard(bbox.data, anchor_cnr.data)
overlaps


Out[270]:

Columns 0 to 9 
 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0091
 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0356  0.0549
 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000

Columns 10 to 15 
 0.0922  0.0000  0.0000  0.0315  0.3985  0.0000
 0.0103  0.0000  0.2598  0.4538  0.0653  0.0000
 0.0000  0.1897  0.0000  0.0000  0.0000  0.0000
[torch.cuda.FloatTensor of size 3x16 (GPU 0)]

What we can do now is take the max of each row (max over dimension 1), which tells us, for each GTO, the maximum amount of overlap with any grid cell.

PyTorch returns both the max and its index when you call .max, so we also get the indices.

This gives us a pretty good way of assigning each of these GTOs to a grid cell -- matching to the highest overlap.


In [271]:
overlaps.max(1)


Out[271]:
(
  0.3985
  0.4538
  0.1897
 [torch.cuda.FloatTensor of size 3 (GPU 0)], 
  14
  13
  11
 [torch.cuda.LongTensor of size 3 (GPU 0)])

We also look at the max over dim 0, which tells us, for each grid cell, the maximum amount of overlap across the GTOs.

The indices tell us, for every grid cell (16 of them), the index of the GTO that overlaps with it the most.

'0' is overloaded here: it could mean either no overlap, or that object index 0 overlaps -- but it turns out not to matter, because any cell whose overlap is below the threshold gets assigned to background later anyway.


In [272]:
overlaps.max(0)


Out[272]:
(
  0.0000
  0.0000
  0.0000
  0.0000
  0.0000
  0.0000
  0.0000
  0.0000
  0.0356
  0.0549
  0.0922
  0.1897
  0.2598
  0.4538
  0.3985
  0.0000
 [torch.cuda.FloatTensor of size 16 (GPU 0)], 
  0
  0
  0
  0
  0
  0
  0
  0
  1
  1
  0
  2
  1
  1
  0
  0
 [torch.cuda.LongTensor of size 16 (GPU 0)])

map_to_ground_truth combines the 2 sets of overlaps (along GTOs, along grid cells) in the way described by the SSD paper to assign each AB to a GTO. Each GTO is assigned to the AB it overlaps most (a 'forced' match). Every other AB is assigned to the GTO it overlaps most, provided that overlap is above a threshold. Everything left over is considered to be a cell containing background.

Below, the 2nd list shows the assignments; in the 1st, any number below the threshold means background, and the 'forced' assignments are given an artificially high overlap (1.99) to make sure they are kept.


In [273]:
gt_overlap,gt_idx = map_to_ground_truth(overlaps)
gt_overlap,gt_idx


Out[273]:
(
  0.0000
  0.0000
  0.0000
  0.0000
  0.0000
  0.0000
  0.0000
  0.0000
  0.0356
  0.0549
  0.0922
  1.9900
  0.2598
  1.9900
  1.9900
  0.0000
 [torch.cuda.FloatTensor of size 16 (GPU 0)], 
  0
  0
  0
  0
  0
  0
  0
  0
  1
  1
  0
  2
  1
  1
  0
  0
 [torch.cuda.LongTensor of size 16 (GPU 0)])

So now we can convert them to classes:


In [274]:
gt_clas = clas[gt_idx]; gt_clas


Out[274]:
Variable containing:
  8
  8
  8
  8
  8
  8
  8
  8
 10
 10
  8
 17
 10
 10
  8
  8
[torch.cuda.LongTensor of size 16 (GPU 0)]

And applying a threshold of 0.5 to the overlaps lets us see the 3 anchor boxes that were matched to an object:


In [275]:
thresh = 0.5
pos = gt_overlap > thresh
pos_idx = torch.nonzero(pos)[:,0]
neg_idx = torch.nonzero(1-pos)[:,0]
pos_idx


Out[275]:
 11
 13
 14
[torch.cuda.LongTensor of size 3 (GPU 0)]

And we can use that to see what class each AB is meant to be predicting.


In [276]:
gt_clas[1-pos] = len(id2cat)
[id2cat[o] if o < len(id2cat) else 'bg' for o in gt_clas.data]


Out[276]:
['bg',
 'bg',
 'bg',
 'bg',
 'bg',
 'bg',
 'bg',
 'bg',
 'bg',
 'bg',
 'bg',
 'sofa',
 'bg',
 'diningtable',
 'chair',
 'bg']

So that's the matching stage.


Once we're done with matching, we're pretty much finished. We take the activations which matched something, subtract the GT bounding boxes from them, take the absolute value of the difference, and take the mean of that: that's the L1 Loss.

For the classifications we can do Cross Entropy. (We ultimately add the two together for our full Loss Function.)


In [277]:
gt_bbox = bbox[gt_idx]
loc_loss = ((a_ic[pos_idx] - gt_bbox[pos_idx]).abs()).mean()
clas_loss = F.cross_entropy(b_clasi, gt_clas)
loc_loss,clas_loss


Out[277]:
(Variable containing:
 1.00000e-02 *
   7.5432
 [torch.cuda.FloatTensor of size 1 (GPU 0)], Variable containing:
  1.1404
 [torch.cuda.FloatTensor of size 1 (GPU 0)])

In [279]:
fig, axes = plt.subplots(3, 4, figsize=(16, 12))
for idx,ax in enumerate(axes.flat):
    ima = md.val_ds.ds.denorm(to_np(x))[idx]
    bbox,clas = get_y(y[0][idx], y[1][idx])
    ima = md.val_ds.ds.denorm(to_np(x))[idx]
    bbox,clas = get_y(bbox,clas); bbox,clas
    a_ic = actn_to_bb(b_bb[idx], anchors)
    torch_gt(ax, ima, a_ic, b_clas[idx].max(1)[1], b_clas[idx].max(1)[0].sigmoid(), 0.01)
plt.tight_layout()


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).

In practice we'd remove the background predictions, but you can see it's kind of working in the right direction. It sees the bird in the middle with a 0.86, the car, etc. It doesn't pick up the motorcycle (it calls it a bicycle) and it barely sees the potted plant; that's because our ABs are too few: we just have a 4x4 grid. We'll solve this by having many more of them.

4. More Anchors

There are 3 ways to make more anchor boxes:

  1. Create ABs at different zoom levels (scales).

  2. Create ABs with different aspect ratios.

  3. Use more Conv layers as sources of bounding boxes (i.e. grids of different sizes).

4.1 Create Anchors

The only important number we need to choose is k. The different Conv layers each output activations at a certain resolution (a grid), so those resolutions are determined by the architecture; k is the number of zooms $\times$ the number of aspect ratios.

The grids (anc_grids) we're going to get for free through our architecture.


In [47]:
anc_grids = [4,2,1]  # anchor grid sizes (4x4, 2x2, 1x1 cells)
# anc_grids = [2]
# anc_grids = [1]
anc_zooms = [0.7, 1., 1.3]  # anchor box zoom levels
# anc_zooms = [0.5, 0.7, 1.0]
# anc_zooms = [1.]
anc_ratios = [(1.,1.), (1.,0.5), (0.5,1.)]  # anchor box aspect ratios
# anc_ratios = [(1.,1.)]
anchor_scales = [(anz*i, anz*j) for anz in anc_zooms for (i,j) in anc_ratios]
k = len(anchor_scales)
anc_offsets = [1/(o*2) for o in anc_grids]
k


Out[47]:
9

In [48]:
anc_x = np.concatenate([np.repeat(np.linspace(ao, 1-ao, ag), ag) 
                       for ao,ag in zip(anc_offsets,anc_grids)])
anc_y = np.concatenate([np.tile(np.linspace(ao, 1-ao, ag), ag) 
                       for ao,ag in zip(anc_offsets,anc_grids)])
anc_ctrs = np.repeat(np.stack([anc_x,anc_y], axis=1), k, axis=0)

In [49]:
anc_sizes  =  np.concatenate([np.array([[o/ag,p/ag] for i in range(ag*ag) 
                                        for o,p in anchor_scales])
                              for ag in anc_grids])
grid_sizes = V(np.concatenate([np.array([1/ag       for i in range(ag*ag) 
                                        for o,p in anchor_scales])
                              for ag in anc_grids]), requires_grad=False).unsqueeze(1)
anchors    = V(np.concatenate([anc_ctrs, anc_sizes], axis=1), requires_grad=False).float()
anchor_cnr = hw2corners(anchors[:,:2], anchors[:,2:])
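A quick sanity check on the numbers (my own arithmetic, not a cell from the notebook): with k = 9 scale/ratio combinations per grid cell and grids of 4x4, 2x2 and 1x1, we get (16 + 4 + 1) x 9 = 189 anchor boxes -- exactly the 189 that shows up in the model's output shapes further down.

sum(ag*ag for ag in anc_grids) * k     # -> 189
anchors.size(), anchor_cnr.size()      # -> both torch.Size([189, 4])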

In [50]:
# anchors

In [51]:
x,y = to_np(next(iter(md.val_dl)))
x   = md.val_ds.ds.denorm(x)

In [52]:
a = np.reshape((to_np(anchor_cnr) + to_np(torch.randn(*anchor_cnr.size()))*0.01)*224, -1)

Using:

anc_grids = [2]
anc_zooms = [1.]
anc_ratios = [(1.,1.)]

In [80]:
fig,ax = plt.subplots(figsize=(7,7))
show_ground_truth(ax, x[0], a)


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).

Using:

anc_grids = [2]
anc_zooms = [1.]
anc_ratios = [(1.,1.), (1.,0.5), (0.5,1.)]

In [73]:
fig,ax = plt.subplots(figsize=(7,7))
show_ground_truth(ax, x[0], a)


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).

Using:

anc_grids = [4,2,1]
anc_zooms = [1.]
anc_ratios = [(1.,1.)]

In [39]:
fig,ax = plt.subplots(figsize=(7,7))
show_ground_truth(ax, x[0], a)


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).

Using:

anc_grids = [1]
anc_zooms = [0.5, 0.7, 1.0]
anc_ratios = [(1.,1.), (1.,0.5), (0.5,1.)]

In [65]:
fig,ax = plt.subplots(figsize=(7,7))
show_ground_truth(ax, x[0], a)


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).

We use the SSD approach to activations, where we produce a convolutional output and want to match up the set of activations whose receptive field most closely reflects (has the maximum density at) where the real object is.

We need a consistent rule to decide which set of activations (out of the m*(4+C), where m is the number of anchor boxes / grid cells and C the number of classes) corresponds to which bounding box and object.

The Neural Net's Loss Function needs some consistent task -- in this case: try to make these activations reflect the bounding box in this general area.

The 4x4 output Conv gives us activations whose receptive field corresponds to those locations in the input image.

4.2 Model

TEMP


In [116]:
anc_grids = [4,2,1]  # anchor grid sizes (4x4, 2x2, 1x1 cells)
# anc_grids = [2,1]
anc_zooms = [0.7, 1., 1.3]  # anchor box zoom levels
# anc_zooms = [1.]
anc_ratios = [(1.,1.), (1.,0.5), (0.5,1.)]  # anchor box aspect ratios
anchor_scales = [(anz*i, anz*j) for anz in anc_zooms for (i,j) in anc_ratios]
k = len(anchor_scales)
anc_offsets = [1/(o*2) for o in anc_grids]


anc_x = np.concatenate([np.repeat(np.linspace(ao, 1-ao, ag), ag) 
                       for ao,ag in zip(anc_offsets,anc_grids)])
anc_y = np.concatenate([np.tile(np.linspace(ao, 1-ao, ag), ag) 
                       for ao,ag in zip(anc_offsets,anc_grids)])
anc_ctrs = np.repeat(np.stack([anc_x,anc_y], axis=1), k, axis=0)


anc_sizes  =  np.concatenate([np.array([[o/ag,p/ag] for i in range(ag*ag) 
                                        for o,p in anchor_scales])
                              for ag in anc_grids])
grid_sizes = V(np.concatenate([np.array([1/ag       for i in range(ag*ag) 
                                        for o,p in anchor_scales])
                              for ag in anc_grids]), requires_grad=False).unsqueeze(1)
anchors    = V(np.concatenate([anc_ctrs, anc_sizes], axis=1), requires_grad=False).float()
anchor_cnr = hw2corners(anchors[:,:2], anchors[:,2:])


a = np.reshape((to_np(anchor_cnr) + to_np(torch.randn(*anchor_cnr.size()))*0.01)*224, -1)

In [117]:
class SSD_MultiHead(nn.Module):
    def __init__(self, k, bias):
        super().__init__()
        self.drop = nn.Dropout(drop)
        self.sconv1 = StdConv(512,256, drop=drop)
        self.sconv2 = StdConv(256,256, drop=drop)
        self.sconv3 = StdConv(256,256, drop=drop)
        self.out0 = OutConv(k, 256, bias)
        self.out1 = OutConv(k, 256, bias)
        self.out2 = OutConv(k, 256, bias)
        self.out3 = OutConv(k, 256, bias)

    def forward(self, x):
        x = self.drop(F.relu(x))
        x = self.sconv1(x)
        x = F.adaptive_max_pool2d(x, anc_grids[0]) # adaptive maxpool for 1st size of anchors
        o1c,o1l = self.out1(x)
        x = self.sconv2(x)
        x = F.adaptive_max_pool2d(x, anc_grids[1]) # adaptive maxpool for 2nd size of anchors
        o2c,o2l = self.out2(x) 
        x = self.sconv3(x)
        x = F.adaptive_max_pool2d(x, anc_grids[2]) # adaptive maxpool for 3rd size of anchors
        o3c,o3l = self.out3(x)
#         return [o1c, o1l]
        return [torch.cat([o1c,o2c,o3c], dim=1),
                torch.cat([o1l,o2l,o3l], dim=1)]

In [118]:
learn.crit = ssd_loss
lr = 1e-2
lrs = np.array([lr/100,lr/10,lr])

x,y = next(iter(md.val_dl))
x,y = V(x),V(y)
batch = learn.model(V(x))

In [119]:
batch[0].size(),batch[1].size()


Out[119]:
(torch.Size([64, 189, 21]), torch.Size([64, 189, 4]))

In [120]:
ssd_loss(batch, y, False)


Out[120]:
Variable containing:
 332.0184
[torch.cuda.FloatTensor of size 1 (GPU 0)]

In [121]:
learn.lr_find(lrs/1000,1.)
learn.sched.plot(n_skip_end=2)


epoch      trn_loss   val_loss                           
    0      306.218304 7777654.25

END TEMP


In [143]:
drop = 0.4

class SSD_MultiHead(nn.Module):
    def __init__(self, k, bias):
        super().__init__()
        self.drop = nn.Dropout(drop)
        self.sconv0 = StdConv(512,256, stride=1, drop=drop)
        self.sconv1 = StdConv(256,256, drop=drop)
        self.sconv2 = StdConv(256,256, drop=drop)
        self.sconv3 = StdConv(256,256, drop=drop)
        self.out0 = OutConv(k, 256, bias)
        self.out1 = OutConv(k, 256, bias)
        self.out2 = OutConv(k, 256, bias)
        self.out3 = OutConv(k, 256, bias)

    def forward(self, x):
        x = self.drop(F.relu(x))
        x = self.sconv0(x)
        x = self.sconv1(x) # 4x4
        o1c,o1λ = self.out1(x) # grab set of outputs from conv1
        x = self.sconv2(x) # 2x2
        o2c,o2λ = self.out2(x) # grab set of outputs from conv2
        x = self.sconv3(x) # 1x1
        o3c,o3λ = self.out3(x) # grab set of outputs from conv3
        return [torch.cat([o1c,o2c,o3c], dim=1),
                torch.cat([o1λ,o2λ,o3λ], dim=1)]

head_reg4   = SSD_MultiHead(k, -4.)
models = ConvnetBuilder(f_model, 0, 0, 0, custom_head=head_reg4)
learn  = ConvLearner(md, models)
learn.opt_fn = optim.Adam

In [144]:
learn.crit = ssd_loss
lr = 1e-2
lrs = np.array([lr/100,lr/10,lr])

In [145]:
x,y = next(iter(md.val_dl))
x,y = V(x),V(y)
batch = learn.model(V(x))

In [146]:
batch[0].size(),batch[1].size()


Out[146]:
(torch.Size([64, 189, 21]), torch.Size([64, 189, 4]))

In [147]:
ssd_loss(batch, y, False)


Out[147]:
Variable containing:
 334.0669
[torch.cuda.FloatTensor of size 1 (GPU 0)]

In [148]:
learn.lr_find(lrs/1000,1.)
learn.sched.plot(n_skip_end=2)


epoch      trn_loss   val_loss                           
    0      347.288369 1441133.71875


In [149]:
learn.fit(lrs, 1, cycle_len=4, use_clr=(20,8))


epoch      trn_loss   val_loss                           
    0      158.423973 130.715827
    1      125.522619 100.800133                         
    2      107.470922 91.400873                          
    3      95.279034  86.717551                           

Out[149]:
[86.71755123138428]

In [150]:
learn.save('tmp')
# learn.load('tmp')

In [151]:
learn.freeze_to(-2)
learn.fit(lrs/2, 1, cycle_len=4, use_clr=(20,8))


epoch      trn_loss   val_loss                            
    0      90.263748  99.257344 
    1      84.897002  90.537347                           
    2      77.97744   81.182782                           
    3      70.282173  77.368475                           

Out[151]:
[77.36847496032715]

In [152]:
learn.save('prefocal')

In [155]:
x,y = next(iter(md.val_dl))
y = V(y)
batch = learn.model(V(x))
b_clas,b_bb = batch
x = to_np(x)

fig, axes = plt.subplots(3, 4, figsize=(16, 12))
for idx,ax in enumerate(axes.flat):
    ima = md.val_ds.ds.denorm(x)[idx]
    bbox,clas = get_y(y[0][idx], y[1][idx])
    a_ic = actn_to_bb(b_bb[idx], anchors)
    torch_gt(ax, ima, a_ic, b_clas[idx].max(1)[1], b_clas[idx].max(1)[0].sigmoid(), 0.1) # 0.21 used when loss ~11
plt.tight_layout()


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).

The 5 papers that are the key steps in the recent modern history of Object Detection:

  1. Scalable Object Detection using DNNs (Multibox) - (2013)
  2. Faster R-CNN: Towards Real-Time Object Detection w/ Region Proposal Networks
  3. You Only Look Once: Unified, Real-Time Object Detection
  4. SSD: Single Shot MultiBox Detector
  5. Focal Loss for Dense Object Detection (RetinaNet)

The model above is generally able to detect large objects that dominate the image, but struggles or totally fails with smaller ones. The reason is that BCE loss has a strong penalty against being 'wrong', and the model has to choose the correct class out of 21 categories, so it's a safer bet to just label a grid cell as 'background'. The detections we do see, for example the large boxes, come from the 1x1 grid's anchor boxes. (Although this run seems a bit more accurate than usual -- I've run it a few times.)

We need a way to tune the Loss Function so the network can identify smaller objects without getting overwhelmed by the background -- and that's the 'Focal Loss' (which basically just multiplies the cross entropy by a down-weighting factor):

FL: $-(1 - p_t)^\gamma \log{p_t}$ instead of CE: $-\log{p_t}$

5. Focal Loss


In [156]:
def plot_results(thresh):
    x,y = next(iter(md.val_dl))
    y   = V(y)
    batch = learn.model(V(x))
    b_clas,b_bb = batch
    
    x = to_np(x)
    fig,axes = plt.subplots(3, 4, figsize=(16, 12))
    for idx, ax in enumerate(axes.flat):
        ima = md.val_ds.ds.denorm(x)[idx]
        bbox,clas = get_y(y[0][idx], y[1][idx])
        a_ic = actn_to_bb(b_bb[idx], anchors)
        clas_pr, clas_ids = b_clas[idx].max(1)
        clas_pr = clas_pr.sigmoid()
        torch_gt(ax, ima, a_ic, clas_ids, clas_pr, clas_pr.max().data[0]*thresh)
    plt.tight_layout()

This is the entirety of Focal Loss -- this is the thing that suddenly made Object Detection make sense. It comes from the BCE_Loss class way up above (§3), and just adds a weight to it.


In [157]:
class FocalLoss(BCE_Loss):
    def get_weight(self,x,t):
        alpha,gamma = 0.25,1
        p = x.sigmoid()
        pt = p*t + (1-p)*(1-t)
        w = alpha*t + (1-alpha)*(1-t)
        return w * (1-pt).pow(gamma)
    
loss_f = FocalLoss(len(id2cat))
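A small numeric sketch of my own (using the paper's gamma = 2 rather than the gamma = 1 and alpha weighting in the class above) of how that weight plays out:

import math

def ce_vs_fl(p_t, gamma=2):
    ce = -math.log(p_t)                   # plain cross entropy for the true class
    return ce, (1 - p_t)**gamma * ce      # focal loss just scales it by (1 - p_t)^gamma

ce_vs_fl(0.9)   # easy example: (0.105, 0.00105) -- down-weighted ~100x
ce_vs_fl(0.1)   # hard example: (2.303, 1.865)   -- keeps most of its weight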

In [158]:
x,y = next(iter(md.val_dl))
x,y = V(x),V(y)
batch = learn.model(x)
ssd_loss(batch, y, False) # False to prevent a big printout


Out[158]:
Variable containing:
 21.8347
[torch.cuda.FloatTensor of size 1 (GPU 0)]

In [159]:
learn.lr_find(lrs/1000,1.)
learn.sched.plot(n_skip_end=1)


 91%|█████████ | 29/32 [00:43<00:04,  1.51s/it, loss=120] 

In [161]:
learn.fit(lrs, 1, cycle_len=10, use_clr=(20,10))


  0%|          | 0/32 [00:00<?, ?it/s]                   
epoch      trn_loss   val_loss                            
    0      18.769772  42.212212 
    1      20.778641  25.455376                           
    2      20.116982  20.299201                           
    3      18.65252   18.827707                           
    4      17.212789  18.137818                           
    5      15.906149  17.244967                           
    6      14.787977  17.333682                           
    7      13.841894  16.740166                           
    8      12.9954    16.299893                           
    9      12.275578  16.178764                           

Out[161]:
[16.17876362800598]

In [162]:
learn.save('fλ0')

In [ ]:
learn.load('fλ0')

In [163]:
learn.freeze_to(-2)
learn.fit(lrs/4, 1, cycle_len=10, use_clr=(20,10))


epoch      trn_loss   val_loss                            
    0      11.405517  16.684548 
    1      11.497326  17.253577                           
    2      11.317247  16.545794                           
    3      11.007198  16.741605                           
    4      10.702677  16.584066                           
    5      10.321765  16.34311                            
    6      9.940913   16.37835                            
    7      9.66214    16.255918                           
    8      9.399031   16.195716                           
    9      9.130429   16.231855                           

Out[163]:
[16.231855034828186]

In [164]:
learn.save('drop4')

In [ ]:
learn.load('drop4')

In [165]:
plot_results(0.75)


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).

So it's working a lot better now -- it's doing better at finding the actual object and its correct class. Note also the higher confidence values (the threshold used here is much higher than earlier above). Now we need to figure out how to pull out the best anchor box. The way to do that is very simple: go through each pair of bounding boxes, and if they overlap by more than some amount and predict the same class, assume they're the same thing -- and pick the one with the higher probability.

6. NMS - Non-Maximum Suppression


In [166]:
def nms(boxes, scores, overlap=0.5, top_k=100):
    keep = scores.new(scores.size(0)).zero_().long()
    if boxes.numel() == 0: return keep
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)
    v, idx = scores.sort(0)  # sort in ascending order
    idx = idx[-top_k:]  # indices of top-k largest vales
    xx1 = boxes.new()
    yy1 = boxes.new()
    xx2 = boxes.new()
    yy2 = boxes.new()
    w = boxes.new()
    h = boxes.new()
    
    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of current larget val
        keep[count] = i
        count += 1
        if idx.size(0) == 1: break
        idx = idx[:-1]  # remove kept element from view
        # load bboxes of next highest vals
        torch.index_select(x1, 0, idx, out=xx1)
        torch.index_select(y1, 0, idx, out=yy1)
        torch.index_select(x2, 0, idx, out=xx2)
        torch.index_select(y2, 0, idx, out=yy2)
        # store element-wise max with next highest score
        xx1 = torch.clamp(xx1, min=x1[i])
        yy1 = torch.clamp(yy1, min=y1[i])
        xx2 = torch.clamp(xx2, max=x2[i])
        yy2 = torch.clamp(yy2, max=y2[i])
        w.resize_as_(xx2)
        h.resize_as_(yy2)
        w = xx2 - xx1
        h = yy2 - yy1
        # check sizes of xx1 and xx2.. after each iteration
        w = torch.clamp(w, min=0.0)
        h = torch.clamp(h, min=0.0)
        inter = w*h
        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)  # load remaining areas
        union = (rem_areas - inter) + area[i]
        IoU = inter/union  # store results in IoU
        # keep only elements w/ an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count
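A tiny usage sketch with made-up boxes (not from the notebook): two heavily-overlapping boxes plus one separate box; NMS keeps the higher-scoring box of the overlapping pair and drops the other.

boxes  = torch.FloatTensor([[10,10,100,100], [12,12,102,102], [150,150,220,220]])
scores = torch.FloatTensor([0.9, 0.8, 0.7])
keep, count = nms(boxes, scores, overlap=0.5, top_k=10)
keep[:count]    # -> indices [0, 2]: box 1 is suppressed by box 0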

In [167]:
x,y = next(iter(md.val_dl))
y = V(y)
batch = learn.model(V(x))
b_clas,b_bb = batch
x = to_np(x)

NOTE: I edited this to skip images where no object detection breaches the confidence threshold. The original code throws a ValueError if this occurs (np.concatenate doesn't like empty arrays: it wants something to concatenate). I originally 'fixed' that with a try/except block. Rerunning the Focal-Loss model, or training further, should solve the issue and let all images be displayed.


In [179]:
def show_nmf(idx):
    ima = md.val_ds.ds.denorm(x)[idx]
    bbox,clas = get_y(y[0][idx], y[1][idx])
    a_ic = actn_to_bb(b_bb[idx], anchors)
    clas_pr,clas_ids = b_clas[idx].max(1)
    clas_pr = clas_pr.sigmoid()
    
    conf_scores = b_clas[idx].sigmoid().t().data
    
    out1,out2,cc = [], [], []
    for cl in range(0, len(conf_scores) - 1):  # loop over the real classes, skipping background
        c_mask = conf_scores[cl] > 0.25
        if c_mask.sum() == 0: continue
        scores = conf_scores[cl][c_mask]
        λ_mask = c_mask.unsqueeze(1).expand_as(a_ic)
        boxes = a_ic[λ_mask].view(-1, 4)
        ids,count = nms(boxes.data, scores, 0.4, 50)
        ids = ids[:count]
        out1.append(scores[ids])
        out2.append(boxes.data[ids])
        cc.append([cl]*count)
    if len(cc) == 0:  # no detections above the confidence threshold for this image
        return
    cc = T(np.concatenate(cc))
    out1 = torch.cat(out1)
    out2 = torch.cat(out2)

    fig,ax = plt.subplots(figsize=(8,8))
    torch_gt(ax, ima, out2, cc, out1, 0.1)

In [180]:
for i in range(12): show_nmf(i)


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).

End