YOLO V2

First, here's the config from their darknet/yolo repo:

cnf
[net]
# Testing
batch=1
subdivisions=1
# Training
# batch=64
# subdivisions=8
width=608
height=608
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1

learning_rate=0.001
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky


#######

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[route]
layers=-9

[convolutional]
batch_normalize=1
size=1
stride=1
pad=1
filters=64
activation=leaky

[reorg]
stride=2

[route]
layers=-1,-4

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=425
activation=linear


[region]
anchors =  0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
bias_match=1
classes=80
coords=4
num=5
softmax=1
jitter=.3
rescore=1

object_scale=5
noobject_scale=1
class_scale=1
coord_scale=1

absolute=1
thresh = .6
random=1

this can be found here: https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolo.cfg


This is the model we'll be building, we'll be referencing this later, but first let's import and prepare the data


In [1]:
# Imports, helpers, and constants

# numpy/tensorflow
import numpy as np
import tensorflow as tf

# verious keras bits
from keras import backend as K
from keras.models import Model
from keras.regularizers import l2
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

# keras layers
from keras.layers import (Conv2D,
                          GlobalAveragePooling2D,
                          Input,
                          Lambda,
                          MaxPooling2D,
                          LeakyReLU,
                          concatenate,
                          BatchNormalization)

# Images
import PIL
from PIL import Image, ImageDraw, ImageFont

# System
import os
import sys
import random
from functools import reduce

# data
import python_voc_parser as voc

# tools
import functools
from functools import partial

# ---

IMAGE_SET_PATH_2012 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2012/Training/VOC2012/ImageSets/Main/trainval.txt'
ANNOTATIONS_PATH_2012 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2012/Training/VOC2012/Annotations'
IMAGE_PATH_2012 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2012/Training/VOC2012/JPEGImages'

IMAGE_SET_PATH_2007 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2007/Training/VOC2007/ImageSets/Main/trainval.txt'
ANNOTATIONS_PATH_2007 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2007/Training/VOC2007/Annotations'
IMAGE_PATH_2007 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2007/Training/VOC2007/JPEGImages'

# # temp for macbook
# IMAGE_SET_PATH_2012 = '/Users/adammenges/Desktop/VOCdevkit/VOC2012/ImageSets/Main/trainval.txt'
# ANNOTATIONS_PATH_2012 = '/Users/adammenges/Desktop/VOCdevkit/VOC2012/Annotations'
# IMAGE_PATH_2012 = '/Users/adammenges/Desktop/VOCdevkit/VOC2012/JPEGImages'

IMAGE_SIZE = (608, 608)
CONV_MAP_SIZE = (19, 19)
WEIGHT_DECAY = 0.0005


Using TensorFlow backend.

Prepare the data


In [2]:
parser2012 = voc.VocAnnotationsParser(IMAGE_PATH_2012, IMAGE_SET_PATH_2012, ANNOTATIONS_PATH_2012)

In [3]:
df2012 = parser2012.get_annotation_dataframe()

In [4]:
df2012


Out[4]:
class_name filename height img_full_path width xmax xmin ymax ymin
0 tvmonitor 2008_000002 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 448 34 293 11
1 train 2008_000003 333.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 500 46 333 11
2 person 2008_000003 333.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 83 62 243 190
3 boat 2008_000007 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 428 1 293 230
4 horse 2008_000008 442.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 471 53 420 87
5 person 2008_000008 442.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 289 158 167 44
6 cow 2008_000009 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 294 217 221 161
7 cow 2008_000009 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 500 465 218 167
8 bottle 2008_000015 327.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 378 270 176 1
9 bottle 2008_000015 327.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 164 57 150 1
10 tvmonitor 2008_000016 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 392 91 353 15
11 dog 2008_000019 272.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 480.0 372 139 197 2
12 dog 2008_000019 272.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 480.0 318 165 236 66
13 dog 2008_000019 272.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 480.0 480 361 112 1
14 aeroplane 2008_000021 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 475 14 288 148
15 tvmonitor 2008_000023 500.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 333.0 314 6 262 1
16 bottle 2008_000023 500.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 333.0 121 40 411 97
17 person 2008_000023 500.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 333.0 169 137 109 36
18 person 2008_000023 500.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 333.0 216 180 104 36
19 person 2008_000023 500.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 333.0 123 96 103 39
20 person 2008_000026 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 372 122 375 7
21 dog 2008_000026 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 325 211 255 147
22 car 2008_000027 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 500 9 375 32
23 car 2008_000028 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 231 158 240 199
24 car 2008_000028 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 94 47 238 202
25 bus 2008_000032 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 489 6 274 118
26 person 2008_000032 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 364 336 224 173
27 aeroplane 2008_000033 333.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 476 104 257 61
28 bottle 2008_000034 500.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 338.0 45 6 362 234
29 person 2008_000034 500.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 338.0 103 1 336 156
... ... ... ... ... ... ... ... ... ...
31531 train 2011_003260 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 349 1 314 140
31532 train 2011_003260 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 67 1 208 165
31533 person 2011_003260 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 330 310 226 182
31534 person 2011_003260 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 343 328 228 182
31535 person 2011_003260 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 367 336 266 192
31536 person 2011_003260 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 487 458 274 190
31537 person 2011_003260 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 428 410 243 192
31538 person 2011_003260 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 29 14 218 182
31539 person 2011_003260 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 47 35 208 184
31540 bicycle 2011_003261 500.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 375.0 186 132 332 245
31541 person 2011_003261 500.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 375.0 186 130 292 210
31542 bottle 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 307 283 194 106
31543 bottle 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 248 223 173 97
31544 bottle 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 253 215 272 154
31545 diningtable 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 386 80 359 116
31546 person 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 190 2 359 45
31547 person 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 219 98 222 49
31548 person 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 226 116 192 24
31549 person 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 266 178 139 31
31550 person 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 397 341 153 39
31551 person 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 430 364 172 54
31552 person 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 448 336 221 71
31553 person 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 500 285 359 77
31554 person 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 200 111 60 1
31555 person 2011_003262 359.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 246 221 66 1
31556 car 2011_003269 500.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 334.0 309 19 461 347
31557 bottle 2011_003271 500.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 499 98 405 1
31558 train 2011_003274 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 384 1 215 113
31559 aeroplane 2011_003275 403.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 448 42 282 50
31560 bird 2011_003276 400.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 292 264 217 142

31561 rows × 9 columns


In [5]:
df2012.index.values


Out[5]:
array([    0,     1,     2, ..., 31558, 31559, 31560])

In [6]:
df2012.iloc[[0]]


Out[6]:
class_name filename height img_full_path width xmax xmin ymax ymin
0 tvmonitor 2008_000002 375.0 /Users/adammenges/Dropbox/Datasets/VOC/VOC 201... 500.0 448 34 293 11

In [7]:
df2012.iloc[[0]].img_full_path.values[0]


Out[7]:
'/Users/adammenges/Dropbox/Datasets/VOC/VOC 2012/Training/VOC2012/JPEGImages/2008_000002.jpg'

In [8]:
from collections import defaultdict

annotations = defaultdict(list)

for i in df2012.index.values:
    filename = df2012.iloc[[i]].img_full_path.values[0]
    xmax = float(df2012.iloc[[i]].xmax.values[0])
    xmin = float(df2012.iloc[[i]].xmin.values[0])
    ymax = float(df2012.iloc[[i]].ymax.values[0])
    ymin = float(df2012.iloc[[i]].ymin.values[0])
    c = df2012.iloc[[i]].class_name.values[0]
    annotations[filename].append([xmin,xmax,ymin,ymax,c])

all_boxes = list(annotations.values())
len_boxes = [len(x) for x in all_boxes]
max_number_of_boxes = max(len_boxes)
max_loc = len_boxes.index(max_number_of_boxes)

print("this dataset has a max number of {} boxes in one image, will use this for padding later".format(max_number_of_boxes))


this dataset has a max number of 56 boxes in one image, will use this for padding later

In [9]:
voc2012_classes = list(set(parser2012.get_annotation_dataframe().class_name))

For reference while I'm doing this:



In [10]:
def convert_box(size, box):
    dw = 1.0/size[0]
    dh = 1.0/size[1]
    x = (box[0] + box[1])/2.0
    y = (box[2] + box[3])/2.0
    w = box[1] - box[0]
    h = box[3] - box[2]
    x = x*dw
    w = w*dw
    y = y*dh
    h = h*dh
    return (x,y,w,h)

# Keras voc data generator, use later with new loss
def yolo_voc_generator(batch_size):
    while True:
        # I prob shouldn't be doing it this way...
        random_images = random.sample(annotations.keys(), batch_size)

        # Mmmk you know the drill...
        processed_images = [Image.open(image_path).resize((608, 608), PIL.Image.BICUBIC)\
                            for image_path in random_images]
        processed_images = [np.array(image, dtype=np.float) for image in processed_images]
        processed_images = [image/255.0 for image in processed_images]
        
        boxes = [] # all boxes, for all images
        
        for k in random_images:
            print(k)
            new_image_boxes = []
            for box in annotations[k]:
                print(box)
                origanal_image = Image.open(k) # move up there so I'm not doing this twice
                new_box = list(map(float, box[:4]))
                new_box = convert_box(origanal_image.size, new_box)
                label_idx = voc2012_classes.index(box[4])
                new_image_box = np.array([new_box[0], new_box[1], new_box[2], new_box[3], label_idx])
                new_image_boxes.append(new_image_box)
            if len(new_image_boxes) < max_number_of_boxes:
                for z in range(max_number_of_boxes - len(new_image_boxes)):
                    new_image_boxes.append(np.zeros(5))
            boxes.append(np.array(new_image_boxes))
        yield([boxes, processed_images])

# For educational proposes and ease, this method can just do it on all the data
def process_data(amount=None):
    if amount:
        all_images = list(annotations.keys())[:amount] # Only a subset for now
    else:
        all_images = list(annotations.keys())

    # Mmmk you know the drill...
    processed_images = [Image.open(image_path).resize((IMAGE_SIZE[0], IMAGE_SIZE[0]), PIL.Image.BICUBIC) \
                        for image_path in all_images]
    processed_images = [np.array(image, dtype=np.float) for image in processed_images]
    processed_images = [image/255.0 for image in processed_images]

    processed_boxes = [] # all boxes, for all images

    for k in all_images:
        new_image_boxes = []
        for box in annotations[k]:
            origanal_image = Image.open(k) # move up there so I'm not doing this twice
            new_box = list(map(float, box[:4]))
            new_box = convert_box(origanal_image.size, new_box)
            label_idx = voc2012_classes.index(box[4])
            new_image_box = np.array([new_box[0], new_box[1], new_box[2], new_box[3], label_idx])
            new_image_boxes.append(new_image_box)
        if len(new_image_boxes) < max_number_of_boxes:
            for z in range(max_number_of_boxes - len(new_image_boxes)):
                new_image_boxes.append(np.zeros(5))
        processed_boxes.append(np.array(new_image_boxes))

    processed_boxes = np.array(processed_boxes)
    processed_images = np.array(processed_images)
    processed_images = np.uint8(processed_images)
    
    return processed_images, processed_boxes

In [11]:
processed_images, processed_boxes = process_data(100)

In [12]:
processed_images.shape


Out[12]:
(100, 608, 608, 3)

In [13]:
processed_boxes.shape


Out[13]:
(100, 56, 5)

VOC Classes and Anchors


In [14]:
voc_anchors = np.array(
    [[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]]
)

# voc_classes = [
#     "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
#     "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
#     "pottedplant", "sheep", "sofa", "train", "tvmonitor"
# ]

voc_classes = voc2012_classes

Create our model

Now that the data's been munged correctly, time to create our model.

(tried this on a whim, didn't realize you could have gifs in a notebook, I'll be using this liberally in the future)


In [15]:
input_image_tensor = Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))

# time for conv
conv_layer_1 = Conv2D(
    filters=32,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(input_image_tensor)
conv_layer_1 = BatchNormalization()(conv_layer_1)
conv_layer_1 = LeakyReLU(alpha=0.1)(conv_layer_1)

# time for max pool
maxpool_layer_1 = MaxPooling2D(
    pool_size=(2, 2),
    strides=(2, 2),
    padding='same'
)(conv_layer_1)

# next conv
conv_layer_2 = Conv2D(
    filters=64,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(maxpool_layer_1)
conv_layer_2 = BatchNormalization()(conv_layer_2)
conv_layer_2 = LeakyReLU(alpha=0.1)(conv_layer_2)

# now for a max pool
maxpool_layer_2 = MaxPooling2D(
    pool_size=(2, 2),
    strides=(2, 2),
    padding='same'
)(conv_layer_2)

# next conv
conv_layer_3 = Conv2D(
    filters=128,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(maxpool_layer_2)
conv_layer_3 = BatchNormalization()(conv_layer_3)
conv_layer_3 = LeakyReLU(alpha=0.1)(conv_layer_3)

# next conv
conv_layer_4 = Conv2D(
    filters=64,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_3)
conv_layer_4 = BatchNormalization()(conv_layer_4)
conv_layer_4 = LeakyReLU(alpha=0.1)(conv_layer_4)


# next conv
conv_layer_5 = Conv2D(
    filters=128,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_4)
conv_layer_5 = BatchNormalization()(conv_layer_5)
conv_layer_5 = LeakyReLU(alpha=0.1)(conv_layer_5)

# next max pool
maxpool_layer_3 = MaxPooling2D(
    pool_size=(2, 2),
    strides=(2, 2),
    padding='same'
)
maxpool_layer_3 = maxpool_layer_3(conv_layer_5)

# next conv
conv_layer_6 = Conv2D(
    filters=256,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(maxpool_layer_3)
conv_layer_6 = BatchNormalization()(conv_layer_6)
conv_layer_6 = LeakyReLU(alpha=0.1)(conv_layer_6)

# next conv
conv_layer_7 = Conv2D(
    filters=128,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_6)
conv_layer_7 = BatchNormalization()(conv_layer_7)
conv_layer_7 = LeakyReLU(alpha=0.1)(conv_layer_7)


# next conv
conv_layer_8 = Conv2D(
    filters=256,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_7)
conv_layer_8 = BatchNormalization()(conv_layer_8)
conv_layer_8 = LeakyReLU(alpha=0.1)(conv_layer_8)

# now for a max pool
maxpool_layer_4 = MaxPooling2D(
    pool_size=(2, 2),
    strides=(2, 2),
    padding='same'
)(conv_layer_8)

# next conv
conv_layer_9 = Conv2D(
    filters=512,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(maxpool_layer_4)
conv_layer_9 = BatchNormalization()(conv_layer_9)
conv_layer_9 = LeakyReLU(alpha=0.1)(conv_layer_9)

# next conv
conv_layer_10 = Conv2D(
    filters=256,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_9)
conv_layer_10 = BatchNormalization()(conv_layer_10)
conv_layer_10 = LeakyReLU(alpha=0.1)(conv_layer_10)


# next conv
conv_layer_11 = Conv2D(
    filters=512,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_10)
conv_layer_11 = BatchNormalization()(conv_layer_11)
conv_layer_11 = LeakyReLU(alpha=0.1)(conv_layer_11)

# next conv
conv_layer_12 = Conv2D(
    filters=256,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_11)
conv_layer_12 = BatchNormalization()(conv_layer_12)
conv_layer_12 = LeakyReLU(alpha=0.1)(conv_layer_12)


# next conv
conv_layer_13 = Conv2D(
    filters=512,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_12)
conv_layer_13 = BatchNormalization()(conv_layer_13)
conv_layer_13 = LeakyReLU(alpha=0.1)(conv_layer_13)


# now for a max pool
maxpool_layer_5 = MaxPooling2D(
    pool_size=(2, 2),
    strides=(2, 2),
    padding='same'
)(conv_layer_13)


# next conv
conv_layer_14 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(maxpool_layer_5)
conv_layer_14 = BatchNormalization()(conv_layer_14)
conv_layer_14 = LeakyReLU(alpha=0.1)(conv_layer_14)


# next conv
conv_layer_15 = Conv2D(
    filters=512,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_14)
conv_layer_15 = BatchNormalization()(conv_layer_15)
conv_layer_15 = LeakyReLU(alpha=0.1)(conv_layer_15)


# next conv
conv_layer_16 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_15)
conv_layer_16 = BatchNormalization()(conv_layer_16)
conv_layer_16 = LeakyReLU(alpha=0.1)(conv_layer_16)


# next conv
conv_layer_17 = Conv2D(
    filters=512,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_16)
conv_layer_17 = BatchNormalization()(conv_layer_17)
conv_layer_17 = LeakyReLU(alpha=0.1)(conv_layer_17)


# next conv
conv_layer_18 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_17)
conv_layer_18 = BatchNormalization()(conv_layer_18)
conv_layer_18 = LeakyReLU(alpha=0.1)(conv_layer_18)

# next conv
conv_layer_19 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_18)
conv_layer_19 = BatchNormalization()(conv_layer_19)
conv_layer_19 = LeakyReLU(alpha=0.1)(conv_layer_19)


# next conv
conv_layer_20 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_19)
conv_layer_20 = BatchNormalization()(conv_layer_20)
conv_layer_20 = LeakyReLU(alpha=0.1)(conv_layer_20)


# next conv
conv_layer_21 = Conv2D(
    filters=64,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_13) # skiping back to layer 13 for a higher res representation
conv_layer_21 = BatchNormalization()(conv_layer_21)
conv_layer_21 = LeakyReLU(alpha=0.1)(conv_layer_21)

###
# Reorg Helpers
###

def space_to_depth_helper(x):
    import tensorflow as tf
    return tf.space_to_depth(x, block_size=2)


def space_to_depth_output_shape_helper(input_shape):
    return (input_shape[0], input_shape[1] // 2, input_shape[2] // 2, 4 *
            input_shape[3]) if input_shape[1] else (input_shape[0], None, None,
                                                    4 * input_shape[3])

# reorg time
space_to_depth_helper_func = Lambda(
        space_to_depth_helper,
        output_shape=space_to_depth_output_shape_helper,
        name='space_to_depth_helper'
)(conv_layer_21)


# routing time
route_layers = [space_to_depth_helper_func, conv_layer_20]
concatenate_layer = concatenate(route_layers)


# next conv
conv_layer_22 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(concatenate_layer) # use concat layer
conv_layer_22 = BatchNormalization()(conv_layer_22)
conv_layer_22 = LeakyReLU(alpha=0.1)(conv_layer_22)


# final conv!
final_conv_filters = len(voc_anchors)*(5+len(voc_classes)) # see paper
conv_layer_23 = Conv2D(
    filters=final_conv_filters,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=True, # because we're NOT using batch norm
    activation='linear',
    padding='same'
)(conv_layer_22)

YOLO Loss


In [19]:
def yolo_loss(args,
              anchors,
              num_classes):
    """
    Loss for yolov2, given the features from the last conv, compute loss and return
    """
    (yolo_output, true_boxes, detectors_mask, matching_true_boxes) = args

    num_anchors = len(anchors)
    object_scale = 5
    no_object_scale = 1
    class_scale = 1
    coordinates_scale = 1
    pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_convert_boxes(
        yolo_output, anchors, num_classes
    )

    yolo_output_shape = K.shape(yolo_output)
    feats = K.reshape(yolo_output, [
        -1, yolo_output_shape[1], yolo_output_shape[2], num_anchors,
        num_classes + 5
    ])

    pred_boxes = K.concatenate(
        (K.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1)

    # Expand pred x,y,w,h to allow comparison with ground truth.
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    pred_xy = K.expand_dims(pred_xy, 4)
    pred_wh = K.expand_dims(pred_wh, 4)

    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half

    true_boxes_shape = K.shape(true_boxes)

    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    true_boxes = K.reshape(true_boxes, [
        true_boxes_shape[0], 1, 1, 1, true_boxes_shape[1], true_boxes_shape[2]
    ])
    true_xy = true_boxes[..., 0:2]
    true_wh = true_boxes[..., 2:4]

    # Find IOU of each predicted box with each ground truth box.
    true_wh_half = true_wh / 2.
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half

    intersect_mins = K.maximum(pred_mins, true_mins)
    intersect_maxes = K.minimum(pred_maxes, true_maxes)
    intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
    true_areas = true_wh[..., 0] * true_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas

    # Best IOUs for each location.
    best_ious = K.max(iou_scores, axis=4)  # Best IOU scores.
    best_ious = K.expand_dims(best_ious)

    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))


    # Losses!
    no_object_weights = (no_object_scale * (1 - object_detections) *
                         (1 - detectors_mask))
    no_objects_loss = no_object_weights * K.square(-pred_confidence)

    # Object loss
    objects_loss = (object_scale * detectors_mask *
                        K.square(1 - pred_confidence))

    # Confidence loss
    confidence_loss = objects_loss + no_objects_loss

    matching_classes = K.cast(matching_true_boxes[..., 4], 'int32')
    matching_classes = K.one_hot(matching_classes, num_classes)

    # Classification loss
    classification_loss = (class_scale * detectors_mask *
                           K.square(matching_classes - pred_class_prob))

    matching_boxes = matching_true_boxes[..., 0:4]

    # Coord loss
    coordinates_loss = (coordinates_scale * detectors_mask *
                        K.square(matching_boxes - pred_boxes))

    
    # print(detectors_mask.shape)
    # print(matching_boxes.shape)
    # print(pred_boxes.shape)

    
    # Sum them together
    confidence_loss_sum = K.sum(confidence_loss)
    classification_loss_sum = K.sum(classification_loss)
    coordinates_loss_sum = K.sum(coordinates_loss)

    # Calculate final loss
    return 0.5 * (confidence_loss_sum + classification_loss_sum + coordinates_loss_sum)

def yolo_convert_boxes(feats, anchors, num_classes):
    """
    Convert features into boxes
    """

    num_anchors = len(anchors)

    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2])

    # Dynamic implementation of conv dims for fully convolutional model.
    conv_dims = K.shape(feats)[1:3]  # assuming channels last

    # In YOLO the height index is the inner most iteration.
    conv_height_index = K.arange(0, stop=conv_dims[0])
    conv_width_index = K.arange(0, stop=conv_dims[1])
    conv_height_index = K.tile(conv_height_index, [conv_dims[1]])

    conv_width_index = K.tile(
        K.expand_dims(conv_width_index, 0), [conv_dims[0], 1])
    conv_width_index = K.flatten(K.transpose(conv_width_index))
    conv_index = K.transpose(K.stack([conv_height_index, conv_width_index]))
    conv_index = K.reshape(conv_index, [1, conv_dims[0], conv_dims[1], 1, 2])
    conv_index = K.cast(conv_index, K.dtype(feats))

    feats = K.reshape(
        feats, [-1, conv_dims[0], conv_dims[1], num_anchors, num_classes + 5])
    conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]), K.dtype(feats))

    box_xy = K.sigmoid(feats[..., :2])
    box_wh = K.exp(feats[..., 2:4])
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.softmax(feats[..., 5:])

    # Adjust preditions to each spatial grid point and anchor size.
    box_xy = (box_xy + conv_index) / conv_dims
    box_wh = box_wh * anchors_tensor / conv_dims

    return box_xy, box_wh, box_confidence, box_class_probs

def preprocess_true_boxes(true_boxes, anchors, image_size):
    """
    Process our true boxes and masks
    """
    height, width = image_size
    num_anchors = len(anchors)

    # 19 in our case
    conv_height = height // 32
    conv_width = width // 32 

    num_box_params = true_boxes.shape[1]
    
    detectors_mask = np.zeros(
        (conv_height, conv_width, num_anchors, 1), dtype=np.float32)
    
    matching_true_boxes = np.zeros(
        (conv_height, conv_width, num_anchors, num_box_params),
        dtype=np.float32)

    for box in true_boxes:
        # scale box to convolutional feature spatial dimensions
        box_class = box[4:5]
        box = box[0:4] * np.array(
            [conv_width, conv_height, conv_width, conv_height])
        i = np.floor(box[1]).astype('int')
        j = np.floor(box[0]).astype('int')
        best_iou = 0
        best_anchor = 0
        for k, anchor in enumerate(anchors):
            # Find IOU between box shifted to origin and anchor box.
            box_maxes = box[2:4] / 2.
            box_mins = -box_maxes
            anchor_maxes = (anchor / 2.)
            anchor_mins = -anchor_maxes

            intersect_mins = np.maximum(box_mins, anchor_mins)
            intersect_maxes = np.minimum(box_maxes, anchor_maxes)
            intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
            intersect_area = intersect_wh[0] * intersect_wh[1]

            box_area = box[2] * box[3]
            anchor_area = anchor[0] * anchor[1]
            iou = intersect_area / (box_area + anchor_area - intersect_area)

            if iou > best_iou:
                best_iou = iou
                best_anchor = k

        if best_iou > 0:
            detectors_mask[i, j, best_anchor] = 1

            # adjust final box my ancor
            adjusted_box = np.array(
                [
                    box[0] - j, box[1] - i,
                    np.log(box[2] / anchors[best_anchor][0]),
                    np.log(box[3] / anchors[best_anchor][1]), box_class
                ],
                dtype=np.float32
            )

            matching_true_boxes[i, j, best_anchor] = adjusted_box

    return detectors_mask, matching_true_boxes

def get_detector_mask(boxes, anchors):
    """
    Process each box
    """
    detectors_mask = [0 for i in range(len(boxes))]
    matching_true_boxes = [0 for i in range(len(boxes))]
    for i, box in enumerate(boxes):
        detectors_mask[i], matching_true_boxes[i] = preprocess_true_boxes(box, anchors,\
                                                                          [IMAGE_SIZE[0], IMAGE_SIZE[1]])

    return [np.array(detectors_mask), np.array(matching_true_boxes)]

# TODO: do something about this ugly hack (!!!)
# Should be done as pre processing, but in my case we can't so we have to do
# it for each mini batch
def hack_py_func(x):
    # Just make my own copy
    voc_anchors = np.array([[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])
    return get_detector_mask(x, voc_anchors)

def lambda_hack_box(x):
    return tf.py_func(hack_py_func, x, [tf.float32, tf.float32])[1]

def lambda_hack_mask(x):
    return tf.py_func(hack_py_func, x, [tf.float32, tf.float32])[0]

Model Crafting


In [20]:
def craft_model(anchors, class_names):
    """
    Now a function to create our model
    """
    
    # Used for the loss
    detectors_mask_shape = (CONV_MAP_SIZE[0], CONV_MAP_SIZE[1], 5, 1)
    matching_boxes_shape = (CONV_MAP_SIZE[0], CONV_MAP_SIZE[1], 5, 5)
    detectors_mask_input = Input(shape=detectors_mask_shape)
    matching_boxes_input = Input(shape=matching_boxes_shape)

    # Create model input layers, besides image.
    boxes_input = Input(shape=(56, 5))

    # Process our mask and true boxes
    matching_true_boxes = Lambda(
                lambda_hack_box,
                output_shape=(19, 19, 5, 5),
                name='yolo_loss_hack_box')([boxes_input])
        
    detectors_mask = Lambda(
                lambda_hack_mask,
                output_shape=(19, 19, 5, 1),
                name='yolo_loss_hack_mask')([boxes_input])

    # Set the shape becuase Keras doesn't use it's own output shape here
    # later, :/ ==== Thanks @markus
    detectors_mask.set_shape((None, 19, 19, 5, 1))
    matching_true_boxes.set_shape((None, 19, 19, 5, 5))

    # Final layer that computes loss, hack around limitations in Keras loss formation
    model_loss = Lambda(
        yolo_loss,
        output_shape=(1, ),
        name='yolo_loss',
        arguments={'anchors': anchors,
                   'num_classes': len(class_names)})([
                       conv_layer_23, boxes_input,
                       detectors_mask, matching_true_boxes
                   ])

    # Create model
    model = Model([input_image_tensor, boxes_input], model_loss)

    return model

In [21]:
model = craft_model(voc_anchors, voc_classes)

For reference, model should be:

Total params: 50,676,061
Trainable params: 50,655,389
Non-trainable params: 20,672

In [22]:
model.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
input_1 (InputLayer)             (None, 608, 608, 3)   0                                            
____________________________________________________________________________________________________
conv2d_1 (Conv2D)                (None, 608, 608, 32)  864         input_1[0][0]                    
____________________________________________________________________________________________________
batch_normalization_1 (BatchNorm (None, 608, 608, 32)  128         conv2d_1[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_1 (LeakyReLU)        (None, 608, 608, 32)  0           batch_normalization_1[0][0]      
____________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)   (None, 304, 304, 32)  0           leaky_re_lu_1[0][0]              
____________________________________________________________________________________________________
conv2d_2 (Conv2D)                (None, 304, 304, 64)  18432       max_pooling2d_1[0][0]            
____________________________________________________________________________________________________
batch_normalization_2 (BatchNorm (None, 304, 304, 64)  256         conv2d_2[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_2 (LeakyReLU)        (None, 304, 304, 64)  0           batch_normalization_2[0][0]      
____________________________________________________________________________________________________
max_pooling2d_2 (MaxPooling2D)   (None, 152, 152, 64)  0           leaky_re_lu_2[0][0]              
____________________________________________________________________________________________________
conv2d_3 (Conv2D)                (None, 152, 152, 128) 73728       max_pooling2d_2[0][0]            
____________________________________________________________________________________________________
batch_normalization_3 (BatchNorm (None, 152, 152, 128) 512         conv2d_3[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_3 (LeakyReLU)        (None, 152, 152, 128) 0           batch_normalization_3[0][0]      
____________________________________________________________________________________________________
conv2d_4 (Conv2D)                (None, 152, 152, 64)  8192        leaky_re_lu_3[0][0]              
____________________________________________________________________________________________________
batch_normalization_4 (BatchNorm (None, 152, 152, 64)  256         conv2d_4[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_4 (LeakyReLU)        (None, 152, 152, 64)  0           batch_normalization_4[0][0]      
____________________________________________________________________________________________________
conv2d_5 (Conv2D)                (None, 152, 152, 128) 73728       leaky_re_lu_4[0][0]              
____________________________________________________________________________________________________
batch_normalization_5 (BatchNorm (None, 152, 152, 128) 512         conv2d_5[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_5 (LeakyReLU)        (None, 152, 152, 128) 0           batch_normalization_5[0][0]      
____________________________________________________________________________________________________
max_pooling2d_3 (MaxPooling2D)   (None, 76, 76, 128)   0           leaky_re_lu_5[0][0]              
____________________________________________________________________________________________________
conv2d_6 (Conv2D)                (None, 76, 76, 256)   294912      max_pooling2d_3[0][0]            
____________________________________________________________________________________________________
batch_normalization_6 (BatchNorm (None, 76, 76, 256)   1024        conv2d_6[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_6 (LeakyReLU)        (None, 76, 76, 256)   0           batch_normalization_6[0][0]      
____________________________________________________________________________________________________
conv2d_7 (Conv2D)                (None, 76, 76, 128)   32768       leaky_re_lu_6[0][0]              
____________________________________________________________________________________________________
batch_normalization_7 (BatchNorm (None, 76, 76, 128)   512         conv2d_7[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_7 (LeakyReLU)        (None, 76, 76, 128)   0           batch_normalization_7[0][0]      
____________________________________________________________________________________________________
conv2d_8 (Conv2D)                (None, 76, 76, 256)   294912      leaky_re_lu_7[0][0]              
____________________________________________________________________________________________________
batch_normalization_8 (BatchNorm (None, 76, 76, 256)   1024        conv2d_8[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_8 (LeakyReLU)        (None, 76, 76, 256)   0           batch_normalization_8[0][0]      
____________________________________________________________________________________________________
max_pooling2d_4 (MaxPooling2D)   (None, 38, 38, 256)   0           leaky_re_lu_8[0][0]              
____________________________________________________________________________________________________
conv2d_9 (Conv2D)                (None, 38, 38, 512)   1179648     max_pooling2d_4[0][0]            
____________________________________________________________________________________________________
batch_normalization_9 (BatchNorm (None, 38, 38, 512)   2048        conv2d_9[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_9 (LeakyReLU)        (None, 38, 38, 512)   0           batch_normalization_9[0][0]      
____________________________________________________________________________________________________
conv2d_10 (Conv2D)               (None, 38, 38, 256)   131072      leaky_re_lu_9[0][0]              
____________________________________________________________________________________________________
batch_normalization_10 (BatchNor (None, 38, 38, 256)   1024        conv2d_10[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_10 (LeakyReLU)       (None, 38, 38, 256)   0           batch_normalization_10[0][0]     
____________________________________________________________________________________________________
conv2d_11 (Conv2D)               (None, 38, 38, 512)   1179648     leaky_re_lu_10[0][0]             
____________________________________________________________________________________________________
batch_normalization_11 (BatchNor (None, 38, 38, 512)   2048        conv2d_11[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_11 (LeakyReLU)       (None, 38, 38, 512)   0           batch_normalization_11[0][0]     
____________________________________________________________________________________________________
conv2d_12 (Conv2D)               (None, 38, 38, 256)   131072      leaky_re_lu_11[0][0]             
____________________________________________________________________________________________________
batch_normalization_12 (BatchNor (None, 38, 38, 256)   1024        conv2d_12[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_12 (LeakyReLU)       (None, 38, 38, 256)   0           batch_normalization_12[0][0]     
____________________________________________________________________________________________________
conv2d_13 (Conv2D)               (None, 38, 38, 512)   1179648     leaky_re_lu_12[0][0]             
____________________________________________________________________________________________________
batch_normalization_13 (BatchNor (None, 38, 38, 512)   2048        conv2d_13[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_13 (LeakyReLU)       (None, 38, 38, 512)   0           batch_normalization_13[0][0]     
____________________________________________________________________________________________________
max_pooling2d_5 (MaxPooling2D)   (None, 19, 19, 512)   0           leaky_re_lu_13[0][0]             
____________________________________________________________________________________________________
conv2d_14 (Conv2D)               (None, 19, 19, 1024)  4718592     max_pooling2d_5[0][0]            
____________________________________________________________________________________________________
batch_normalization_14 (BatchNor (None, 19, 19, 1024)  4096        conv2d_14[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_14 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_14[0][0]     
____________________________________________________________________________________________________
conv2d_15 (Conv2D)               (None, 19, 19, 512)   524288      leaky_re_lu_14[0][0]             
____________________________________________________________________________________________________
batch_normalization_15 (BatchNor (None, 19, 19, 512)   2048        conv2d_15[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_15 (LeakyReLU)       (None, 19, 19, 512)   0           batch_normalization_15[0][0]     
____________________________________________________________________________________________________
conv2d_16 (Conv2D)               (None, 19, 19, 1024)  4718592     leaky_re_lu_15[0][0]             
____________________________________________________________________________________________________
batch_normalization_16 (BatchNor (None, 19, 19, 1024)  4096        conv2d_16[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_16 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_16[0][0]     
____________________________________________________________________________________________________
conv2d_17 (Conv2D)               (None, 19, 19, 512)   524288      leaky_re_lu_16[0][0]             
____________________________________________________________________________________________________
batch_normalization_17 (BatchNor (None, 19, 19, 512)   2048        conv2d_17[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_17 (LeakyReLU)       (None, 19, 19, 512)   0           batch_normalization_17[0][0]     
____________________________________________________________________________________________________
conv2d_18 (Conv2D)               (None, 19, 19, 1024)  4718592     leaky_re_lu_17[0][0]             
____________________________________________________________________________________________________
batch_normalization_18 (BatchNor (None, 19, 19, 1024)  4096        conv2d_18[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_18 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_18[0][0]     
____________________________________________________________________________________________________
conv2d_19 (Conv2D)               (None, 19, 19, 1024)  9437184     leaky_re_lu_18[0][0]             
____________________________________________________________________________________________________
batch_normalization_19 (BatchNor (None, 19, 19, 1024)  4096        conv2d_19[0][0]                  
____________________________________________________________________________________________________
conv2d_21 (Conv2D)               (None, 38, 38, 64)    32768       leaky_re_lu_13[0][0]             
____________________________________________________________________________________________________
leaky_re_lu_19 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_19[0][0]     
____________________________________________________________________________________________________
batch_normalization_21 (BatchNor (None, 38, 38, 64)    256         conv2d_21[0][0]                  
____________________________________________________________________________________________________
conv2d_20 (Conv2D)               (None, 19, 19, 1024)  9437184     leaky_re_lu_19[0][0]             
____________________________________________________________________________________________________
leaky_re_lu_21 (LeakyReLU)       (None, 38, 38, 64)    0           batch_normalization_21[0][0]     
____________________________________________________________________________________________________
batch_normalization_20 (BatchNor (None, 19, 19, 1024)  4096        conv2d_20[0][0]                  
____________________________________________________________________________________________________
space_to_depth_helper (Lambda)   (None, 19, 19, 256)   0           leaky_re_lu_21[0][0]             
____________________________________________________________________________________________________
leaky_re_lu_20 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_20[0][0]     
____________________________________________________________________________________________________
concatenate_1 (Concatenate)      (None, 19, 19, 1280)  0           space_to_depth_helper[0][0]      
                                                                   leaky_re_lu_20[0][0]             
____________________________________________________________________________________________________
conv2d_22 (Conv2D)               (None, 19, 19, 1024)  11796480    concatenate_1[0][0]              
____________________________________________________________________________________________________
batch_normalization_22 (BatchNor (None, 19, 19, 1024)  4096        conv2d_22[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_22 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_22[0][0]     
____________________________________________________________________________________________________
input_7 (InputLayer)             (None, 56, 5)         0                                            
____________________________________________________________________________________________________
conv2d_23 (Conv2D)               (None, 19, 19, 125)   128125      leaky_re_lu_22[0][0]             
____________________________________________________________________________________________________
yolo_loss_hack_mask (Lambda)     (None, 19, 19, 5, 1)  0           input_7[0][0]                    
____________________________________________________________________________________________________
yolo_loss_hack_box (Lambda)      (None, 19, 19, 5, 5)  0           input_7[0][0]                    
____________________________________________________________________________________________________
yolo_loss (Lambda)               (None, 1)             0           conv2d_23[0][0]                  
                                                                   input_7[0][0]                    
                                                                   yolo_loss_hack_mask[0][0]        
                                                                   yolo_loss_hack_box[0][0]         
====================================================================================================
Total params: 50,676,061
Trainable params: 50,655,389
Non-trainable params: 20,672
____________________________________________________________________________________________________

Training Model


In [25]:
def train(model, class_names, anchors, image_data, boxes, validation_split=0.1):
    """
    Trains a yolo model
    """
    def yolo_loss_helper(y_true, y_pred):
        # print(y_true)
        # print(y_pred)
        return y_pred

    model.compile(
        optimizer='adam',
        loss={
            'yolo_loss': yolo_loss_helper
        }
    )
    
    model.fit([image_data, boxes],
              np.zeros(len(image_data)),
              validation_split=validation_split,
              batch_size=30, # 90 total
              epochs=5
    )

In [26]:
train(
    model,
    voc2012_classes,
    voc_anchors,
    processed_images,
    processed_boxes
)
print('Done')


Train on 90 samples, validate on 10 samples
Epoch 1/5
90/90 [==============================] - 410s - loss: 2645.4650 - val_loss: 99196232.0000
Epoch 2/5
90/90 [==============================] - 381s - loss: 1503.3178 - val_loss: 97022232.0000
Epoch 3/5
90/90 [==============================] - 378s - loss: 868.8492 - val_loss: 3002830.0000
Epoch 4/5
90/90 [==============================] - 378s - loss: 572.7653 - val_loss: 2900123.0000
Epoch 5/5
90/90 [==============================] - 372s - loss: 443.7727 - val_loss: 2781025.0000
Done

In [ ]: