YOLO V2

First, here's the config from their darknet/yolo repo:

cnf
[net]
# Testing
batch=1
subdivisions=1
# Training
# batch=64
# subdivisions=8
width=608
height=608
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1

learning_rate=0.001
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky


#######

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[route]
layers=-9

[convolutional]
batch_normalize=1
size=1
stride=1
pad=1
filters=64
activation=leaky

[reorg]
stride=2

[route]
layers=-1,-4

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=425
activation=linear


[region]
anchors =  0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
bias_match=1
classes=80
coords=4
num=5
softmax=1
jitter=.3
rescore=1

object_scale=5
noobject_scale=1
class_scale=1
coord_scale=1

absolute=1
thresh = .6
random=1

this can be found here: https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolo.cfg

This is the model we'll be building, we'll be referencing this later, but first let's import and prepare the data



In [1]:

    
# Imports, helpers, and constants

# numpy/tensorflow
import numpy as np
import tensorflow as tf

# verious keras bits
from keras import backend as K
from keras.models import Model
from keras.regularizers import l2
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

# keras layers
from keras.layers import (Conv2D,
                          GlobalAveragePooling2D,
                          Input,
                          Lambda,
                          MaxPooling2D,
                          LeakyReLU,
                          concatenate,
                          BatchNormalization)

# Images
import PIL
from PIL import Image, ImageDraw, ImageFont

# System
import os
import sys
import random
from functools import reduce

# data
import python_voc_parser as voc

# tools
import functools
from functools import partial

# ---

IMAGE_SET_PATH_2012 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2012/Training/VOC2012/ImageSets/Main/trainval.txt'
ANNOTATIONS_PATH_2012 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2012/Training/VOC2012/Annotations'
IMAGE_PATH_2012 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2012/Training/VOC2012/JPEGImages'

IMAGE_SET_PATH_2007 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2007/Training/VOC2007/ImageSets/Main/trainval.txt'
ANNOTATIONS_PATH_2007 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2007/Training/VOC2007/Annotations'
IMAGE_PATH_2007 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2007/Training/VOC2007/JPEGImages'

# # temp for macbook
# IMAGE_SET_PATH_2012 = '/Users/adammenges/Desktop/VOCdevkit/VOC2012/ImageSets/Main/trainval.txt'
# ANNOTATIONS_PATH_2012 = '/Users/adammenges/Desktop/VOCdevkit/VOC2012/Annotations'
# IMAGE_PATH_2012 = '/Users/adammenges/Desktop/VOCdevkit/VOC2012/JPEGImages'

IMAGE_SIZE = (608, 608)
CONV_MAP_SIZE = (19, 19)
WEIGHT_DECAY = 0.0005









    



Using TensorFlow backend.

Prepare the data



In [2]:

    
parser2012 = voc.VocAnnotationsParser(IMAGE_PATH_2012, IMAGE_SET_PATH_2012, ANNOTATIONS_PATH_2012)



In [3]:

    
df2012 = parser2012.get_annotation_dataframe()



In [4]:

    
df2012









    Out[4]:







  
    
      
      class_name
      filename
      height
      img_full_path
      width
      xmax
      xmin
      ymax
      ymin
    
  
  
    
      0
      tvmonitor
      2008_000002
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      448
      34
      293
      11
    
    
      1
      train
      2008_000003
      333.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      500
      46
      333
      11
    
    
      2
      person
      2008_000003
      333.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      83
      62
      243
      190
    
    
      3
      boat
      2008_000007
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      428
      1
      293
      230
    
    
      4
      horse
      2008_000008
      442.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      471
      53
      420
      87
    
    
      5
      person
      2008_000008
      442.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      289
      158
      167
      44
    
    
      6
      cow
      2008_000009
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      294
      217
      221
      161
    
    
      7
      cow
      2008_000009
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      500
      465
      218
      167
    
    
      8
      bottle
      2008_000015
      327.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      378
      270
      176
      1
    
    
      9
      bottle
      2008_000015
      327.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      164
      57
      150
      1
    
    
      10
      tvmonitor
      2008_000016
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      392
      91
      353
      15
    
    
      11
      dog
      2008_000019
      272.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      480.0
      372
      139
      197
      2
    
    
      12
      dog
      2008_000019
      272.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      480.0
      318
      165
      236
      66
    
    
      13
      dog
      2008_000019
      272.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      480.0
      480
      361
      112
      1
    
    
      14
      aeroplane
      2008_000021
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      475
      14
      288
      148
    
    
      15
      tvmonitor
      2008_000023
      500.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      333.0
      314
      6
      262
      1
    
    
      16
      bottle
      2008_000023
      500.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      333.0
      121
      40
      411
      97
    
    
      17
      person
      2008_000023
      500.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      333.0
      169
      137
      109
      36
    
    
      18
      person
      2008_000023
      500.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      333.0
      216
      180
      104
      36
    
    
      19
      person
      2008_000023
      500.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      333.0
      123
      96
      103
      39
    
    
      20
      person
      2008_000026
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      372
      122
      375
      7
    
    
      21
      dog
      2008_000026
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      325
      211
      255
      147
    
    
      22
      car
      2008_000027
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      500
      9
      375
      32
    
    
      23
      car
      2008_000028
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      231
      158
      240
      199
    
    
      24
      car
      2008_000028
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      94
      47
      238
      202
    
    
      25
      bus
      2008_000032
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      489
      6
      274
      118
    
    
      26
      person
      2008_000032
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      364
      336
      224
      173
    
    
      27
      aeroplane
      2008_000033
      333.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      476
      104
      257
      61
    
    
      28
      bottle
      2008_000034
      500.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      338.0
      45
      6
      362
      234
    
    
      29
      person
      2008_000034
      500.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      338.0
      103
      1
      336
      156
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      31531
      train
      2011_003260
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      349
      1
      314
      140
    
    
      31532
      train
      2011_003260
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      67
      1
      208
      165
    
    
      31533
      person
      2011_003260
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      330
      310
      226
      182
    
    
      31534
      person
      2011_003260
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      343
      328
      228
      182
    
    
      31535
      person
      2011_003260
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      367
      336
      266
      192
    
    
      31536
      person
      2011_003260
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      487
      458
      274
      190
    
    
      31537
      person
      2011_003260
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      428
      410
      243
      192
    
    
      31538
      person
      2011_003260
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      29
      14
      218
      182
    
    
      31539
      person
      2011_003260
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      47
      35
      208
      184
    
    
      31540
      bicycle
      2011_003261
      500.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      375.0
      186
      132
      332
      245
    
    
      31541
      person
      2011_003261
      500.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      375.0
      186
      130
      292
      210
    
    
      31542
      bottle
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      307
      283
      194
      106
    
    
      31543
      bottle
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      248
      223
      173
      97
    
    
      31544
      bottle
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      253
      215
      272
      154
    
    
      31545
      diningtable
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      386
      80
      359
      116
    
    
      31546
      person
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      190
      2
      359
      45
    
    
      31547
      person
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      219
      98
      222
      49
    
    
      31548
      person
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      226
      116
      192
      24
    
    
      31549
      person
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      266
      178
      139
      31
    
    
      31550
      person
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      397
      341
      153
      39
    
    
      31551
      person
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      430
      364
      172
      54
    
    
      31552
      person
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      448
      336
      221
      71
    
    
      31553
      person
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      500
      285
      359
      77
    
    
      31554
      person
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      200
      111
      60
      1
    
    
      31555
      person
      2011_003262
      359.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      246
      221
      66
      1
    
    
      31556
      car
      2011_003269
      500.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      334.0
      309
      19
      461
      347
    
    
      31557
      bottle
      2011_003271
      500.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      499
      98
      405
      1
    
    
      31558
      train
      2011_003274
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      384
      1
      215
      113
    
    
      31559
      aeroplane
      2011_003275
      403.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      448
      42
      282
      50
    
    
      31560
      bird
      2011_003276
      400.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      292
      264
      217
      142
    
  

31561 rows × 9 columns



In [5]:

    
df2012.index.values









    Out[5]:





array([    0,     1,     2, ..., 31558, 31559, 31560])



In [6]:

    
df2012.iloc[[0]]









    Out[6]:







  
    
      
      class_name
      filename
      height
      img_full_path
      width
      xmax
      xmin
      ymax
      ymin
    
  
  
    
      0
      tvmonitor
      2008_000002
      375.0
      /Users/adammenges/Dropbox/Datasets/VOC/VOC 201...
      500.0
      448
      34
      293
      11



In [7]:

    
df2012.iloc[[0]].img_full_path.values[0]









    Out[7]:





'/Users/adammenges/Dropbox/Datasets/VOC/VOC 2012/Training/VOC2012/JPEGImages/2008_000002.jpg'



In [8]:

    
from collections import defaultdict

annotations = defaultdict(list)

for i in df2012.index.values:
    filename = df2012.iloc[[i]].img_full_path.values[0]
    xmax = float(df2012.iloc[[i]].xmax.values[0])
    xmin = float(df2012.iloc[[i]].xmin.values[0])
    ymax = float(df2012.iloc[[i]].ymax.values[0])
    ymin = float(df2012.iloc[[i]].ymin.values[0])
    c = df2012.iloc[[i]].class_name.values[0]
    annotations[filename].append([xmin,xmax,ymin,ymax,c])

all_boxes = list(annotations.values())
len_boxes = [len(x) for x in all_boxes]
max_number_of_boxes = max(len_boxes)
max_loc = len_boxes.index(max_number_of_boxes)

print("this dataset has a max number of {} boxes in one image, will use this for padding later".format(max_number_of_boxes))









    



this dataset has a max number of 56 boxes in one image, will use this for padding later



In [9]:

    
voc2012_classes = list(set(parser2012.get_annotation_dataframe().class_name))

For reference while I'm doing this:



In [10]:

    
def convert_box(size, box):
    dw = 1.0/size[0]
    dh = 1.0/size[1]
    x = (box[0] + box[1])/2.0
    y = (box[2] + box[3])/2.0
    w = box[1] - box[0]
    h = box[3] - box[2]
    x = x*dw
    w = w*dw
    y = y*dh
    h = h*dh
    return (x,y,w,h)

# Keras voc data generator, use later with new loss
def yolo_voc_generator(batch_size):
    while True:
        # I prob shouldn't be doing it this way...
        random_images = random.sample(annotations.keys(), batch_size)

        # Mmmk you know the drill...
        processed_images = [Image.open(image_path).resize((608, 608), PIL.Image.BICUBIC)\
                            for image_path in random_images]
        processed_images = [np.array(image, dtype=np.float) for image in processed_images]
        processed_images = [image/255.0 for image in processed_images]
        
        boxes = [] # all boxes, for all images
        
        for k in random_images:
            print(k)
            new_image_boxes = []
            for box in annotations[k]:
                print(box)
                origanal_image = Image.open(k) # move up there so I'm not doing this twice
                new_box = list(map(float, box[:4]))
                new_box = convert_box(origanal_image.size, new_box)
                label_idx = voc2012_classes.index(box[4])
                new_image_box = np.array([new_box[0], new_box[1], new_box[2], new_box[3], label_idx])
                new_image_boxes.append(new_image_box)
            if len(new_image_boxes) < max_number_of_boxes:
                for z in range(max_number_of_boxes - len(new_image_boxes)):
                    new_image_boxes.append(np.zeros(5))
            boxes.append(np.array(new_image_boxes))
        yield([boxes, processed_images])

# For educational proposes and ease, this method can just do it on all the data
def process_data(amount=None):
    if amount:
        all_images = list(annotations.keys())[:amount] # Only a subset for now
    else:
        all_images = list(annotations.keys())

    # Mmmk you know the drill...
    processed_images = [Image.open(image_path).resize((IMAGE_SIZE[0], IMAGE_SIZE[0]), PIL.Image.BICUBIC) \
                        for image_path in all_images]
    processed_images = [np.array(image, dtype=np.float) for image in processed_images]
    processed_images = [image/255.0 for image in processed_images]

    processed_boxes = [] # all boxes, for all images

    for k in all_images:
        new_image_boxes = []
        for box in annotations[k]:
            origanal_image = Image.open(k) # move up there so I'm not doing this twice
            new_box = list(map(float, box[:4]))
            new_box = convert_box(origanal_image.size, new_box)
            label_idx = voc2012_classes.index(box[4])
            new_image_box = np.array([new_box[0], new_box[1], new_box[2], new_box[3], label_idx])
            new_image_boxes.append(new_image_box)
        if len(new_image_boxes) < max_number_of_boxes:
            for z in range(max_number_of_boxes - len(new_image_boxes)):
                new_image_boxes.append(np.zeros(5))
        processed_boxes.append(np.array(new_image_boxes))

    processed_boxes = np.array(processed_boxes)
    processed_images = np.array(processed_images)
    processed_images = np.uint8(processed_images)
    
    return processed_images, processed_boxes



In [11]:

    
processed_images, processed_boxes = process_data(100)



In [12]:

    
processed_images.shape









    Out[12]:





(100, 608, 608, 3)



In [13]:

    
processed_boxes.shape









    Out[13]:





(100, 56, 5)

VOC Classes and Anchors



In [14]:

    
voc_anchors = np.array(
    [[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]]
)

# voc_classes = [
#     "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
#     "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
#     "pottedplant", "sheep", "sofa", "train", "tvmonitor"
# ]

voc_classes = voc2012_classes

Create our model

Now that the data's been munged correctly, time to create our model.

(tried this on a whim, didn't realize you could have gifs in a notebook, I'll be using this liberally in the future)



In [15]:

    
input_image_tensor = Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))

# time for conv
conv_layer_1 = Conv2D(
    filters=32,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(input_image_tensor)
conv_layer_1 = BatchNormalization()(conv_layer_1)
conv_layer_1 = LeakyReLU(alpha=0.1)(conv_layer_1)

# time for max pool
maxpool_layer_1 = MaxPooling2D(
    pool_size=(2, 2),
    strides=(2, 2),
    padding='same'
)(conv_layer_1)

# next conv
conv_layer_2 = Conv2D(
    filters=64,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(maxpool_layer_1)
conv_layer_2 = BatchNormalization()(conv_layer_2)
conv_layer_2 = LeakyReLU(alpha=0.1)(conv_layer_2)

# now for a max pool
maxpool_layer_2 = MaxPooling2D(
    pool_size=(2, 2),
    strides=(2, 2),
    padding='same'
)(conv_layer_2)

# next conv
conv_layer_3 = Conv2D(
    filters=128,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(maxpool_layer_2)
conv_layer_3 = BatchNormalization()(conv_layer_3)
conv_layer_3 = LeakyReLU(alpha=0.1)(conv_layer_3)

# next conv
conv_layer_4 = Conv2D(
    filters=64,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_3)
conv_layer_4 = BatchNormalization()(conv_layer_4)
conv_layer_4 = LeakyReLU(alpha=0.1)(conv_layer_4)


# next conv
conv_layer_5 = Conv2D(
    filters=128,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_4)
conv_layer_5 = BatchNormalization()(conv_layer_5)
conv_layer_5 = LeakyReLU(alpha=0.1)(conv_layer_5)

# next max pool
maxpool_layer_3 = MaxPooling2D(
    pool_size=(2, 2),
    strides=(2, 2),
    padding='same'
)
maxpool_layer_3 = maxpool_layer_3(conv_layer_5)

# next conv
conv_layer_6 = Conv2D(
    filters=256,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(maxpool_layer_3)
conv_layer_6 = BatchNormalization()(conv_layer_6)
conv_layer_6 = LeakyReLU(alpha=0.1)(conv_layer_6)

# next conv
conv_layer_7 = Conv2D(
    filters=128,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_6)
conv_layer_7 = BatchNormalization()(conv_layer_7)
conv_layer_7 = LeakyReLU(alpha=0.1)(conv_layer_7)


# next conv
conv_layer_8 = Conv2D(
    filters=256,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_7)
conv_layer_8 = BatchNormalization()(conv_layer_8)
conv_layer_8 = LeakyReLU(alpha=0.1)(conv_layer_8)

# now for a max pool
maxpool_layer_4 = MaxPooling2D(
    pool_size=(2, 2),
    strides=(2, 2),
    padding='same'
)(conv_layer_8)

# next conv
conv_layer_9 = Conv2D(
    filters=512,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(maxpool_layer_4)
conv_layer_9 = BatchNormalization()(conv_layer_9)
conv_layer_9 = LeakyReLU(alpha=0.1)(conv_layer_9)

# next conv
conv_layer_10 = Conv2D(
    filters=256,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_9)
conv_layer_10 = BatchNormalization()(conv_layer_10)
conv_layer_10 = LeakyReLU(alpha=0.1)(conv_layer_10)


# next conv
conv_layer_11 = Conv2D(
    filters=512,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_10)
conv_layer_11 = BatchNormalization()(conv_layer_11)
conv_layer_11 = LeakyReLU(alpha=0.1)(conv_layer_11)

# next conv
conv_layer_12 = Conv2D(
    filters=256,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_11)
conv_layer_12 = BatchNormalization()(conv_layer_12)
conv_layer_12 = LeakyReLU(alpha=0.1)(conv_layer_12)


# next conv
conv_layer_13 = Conv2D(
    filters=512,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_12)
conv_layer_13 = BatchNormalization()(conv_layer_13)
conv_layer_13 = LeakyReLU(alpha=0.1)(conv_layer_13)


# now for a max pool
maxpool_layer_5 = MaxPooling2D(
    pool_size=(2, 2),
    strides=(2, 2),
    padding='same'
)(conv_layer_13)


# next conv
conv_layer_14 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(maxpool_layer_5)
conv_layer_14 = BatchNormalization()(conv_layer_14)
conv_layer_14 = LeakyReLU(alpha=0.1)(conv_layer_14)


# next conv
conv_layer_15 = Conv2D(
    filters=512,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_14)
conv_layer_15 = BatchNormalization()(conv_layer_15)
conv_layer_15 = LeakyReLU(alpha=0.1)(conv_layer_15)


# next conv
conv_layer_16 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_15)
conv_layer_16 = BatchNormalization()(conv_layer_16)
conv_layer_16 = LeakyReLU(alpha=0.1)(conv_layer_16)


# next conv
conv_layer_17 = Conv2D(
    filters=512,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_16)
conv_layer_17 = BatchNormalization()(conv_layer_17)
conv_layer_17 = LeakyReLU(alpha=0.1)(conv_layer_17)


# next conv
conv_layer_18 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_17)
conv_layer_18 = BatchNormalization()(conv_layer_18)
conv_layer_18 = LeakyReLU(alpha=0.1)(conv_layer_18)

# next conv
conv_layer_19 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_18)
conv_layer_19 = BatchNormalization()(conv_layer_19)
conv_layer_19 = LeakyReLU(alpha=0.1)(conv_layer_19)


# next conv
conv_layer_20 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_19)
conv_layer_20 = BatchNormalization()(conv_layer_20)
conv_layer_20 = LeakyReLU(alpha=0.1)(conv_layer_20)


# next conv
conv_layer_21 = Conv2D(
    filters=64,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(conv_layer_13) # skiping back to layer 13 for a higher res representation
conv_layer_21 = BatchNormalization()(conv_layer_21)
conv_layer_21 = LeakyReLU(alpha=0.1)(conv_layer_21)

###
# Reorg Helpers
###

def space_to_depth_helper(x):
    import tensorflow as tf
    return tf.space_to_depth(x, block_size=2)


def space_to_depth_output_shape_helper(input_shape):
    return (input_shape[0], input_shape[1] // 2, input_shape[2] // 2, 4 *
            input_shape[3]) if input_shape[1] else (input_shape[0], None, None,
                                                    4 * input_shape[3])

# reorg time
space_to_depth_helper_func = Lambda(
        space_to_depth_helper,
        output_shape=space_to_depth_output_shape_helper,
        name='space_to_depth_helper'
)(conv_layer_21)


# routing time
route_layers = [space_to_depth_helper_func, conv_layer_20]
concatenate_layer = concatenate(route_layers)


# next conv
conv_layer_22 = Conv2D(
    filters=1024,
    kernel_size=(3, 3),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=False, # because we're using batch norm
    activation='linear',
    padding='same'
)(concatenate_layer) # use concat layer
conv_layer_22 = BatchNormalization()(conv_layer_22)
conv_layer_22 = LeakyReLU(alpha=0.1)(conv_layer_22)


# final conv!
final_conv_filters = len(voc_anchors)*(5+len(voc_classes)) # see paper
conv_layer_23 = Conv2D(
    filters=final_conv_filters,
    kernel_size=(1, 1),
    strides=(1, 1),
    kernel_regularizer=l2(WEIGHT_DECAY),
    use_bias=True, # because we're NOT using batch norm
    activation='linear',
    padding='same'
)(conv_layer_22)

YOLO Loss



In [19]:

    
def yolo_loss(args,
              anchors,
              num_classes):
    """
    Loss for yolov2, given the features from the last conv, compute loss and return
    """
    (yolo_output, true_boxes, detectors_mask, matching_true_boxes) = args

    num_anchors = len(anchors)
    object_scale = 5
    no_object_scale = 1
    class_scale = 1
    coordinates_scale = 1
    pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_convert_boxes(
        yolo_output, anchors, num_classes
    )

    yolo_output_shape = K.shape(yolo_output)
    feats = K.reshape(yolo_output, [
        -1, yolo_output_shape[1], yolo_output_shape[2], num_anchors,
        num_classes + 5
    ])

    pred_boxes = K.concatenate(
        (K.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1)

    # Expand pred x,y,w,h to allow comparison with ground truth.
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    pred_xy = K.expand_dims(pred_xy, 4)
    pred_wh = K.expand_dims(pred_wh, 4)

    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half

    true_boxes_shape = K.shape(true_boxes)

    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    true_boxes = K.reshape(true_boxes, [
        true_boxes_shape[0], 1, 1, 1, true_boxes_shape[1], true_boxes_shape[2]
    ])
    true_xy = true_boxes[..., 0:2]
    true_wh = true_boxes[..., 2:4]

    # Find IOU of each predicted box with each ground truth box.
    true_wh_half = true_wh / 2.
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half

    intersect_mins = K.maximum(pred_mins, true_mins)
    intersect_maxes = K.minimum(pred_maxes, true_maxes)
    intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
    true_areas = true_wh[..., 0] * true_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas

    # Best IOUs for each location.
    best_ious = K.max(iou_scores, axis=4)  # Best IOU scores.
    best_ious = K.expand_dims(best_ious)

    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))


    # Losses!
    no_object_weights = (no_object_scale * (1 - object_detections) *
                         (1 - detectors_mask))
    no_objects_loss = no_object_weights * K.square(-pred_confidence)

    # Object loss
    objects_loss = (object_scale * detectors_mask *
                        K.square(1 - pred_confidence))

    # Confidence loss
    confidence_loss = objects_loss + no_objects_loss

    matching_classes = K.cast(matching_true_boxes[..., 4], 'int32')
    matching_classes = K.one_hot(matching_classes, num_classes)

    # Classification loss
    classification_loss = (class_scale * detectors_mask *
                           K.square(matching_classes - pred_class_prob))

    matching_boxes = matching_true_boxes[..., 0:4]

    # Coord loss
    coordinates_loss = (coordinates_scale * detectors_mask *
                        K.square(matching_boxes - pred_boxes))

    
    # print(detectors_mask.shape)
    # print(matching_boxes.shape)
    # print(pred_boxes.shape)

    
    # Sum them together
    confidence_loss_sum = K.sum(confidence_loss)
    classification_loss_sum = K.sum(classification_loss)
    coordinates_loss_sum = K.sum(coordinates_loss)

    # Calculate final loss
    return 0.5 * (confidence_loss_sum + classification_loss_sum + coordinates_loss_sum)

def yolo_convert_boxes(feats, anchors, num_classes):
    """
    Convert features into boxes
    """

    num_anchors = len(anchors)

    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2])

    # Dynamic implementation of conv dims for fully convolutional model.
    conv_dims = K.shape(feats)[1:3]  # assuming channels last

    # In YOLO the height index is the inner most iteration.
    conv_height_index = K.arange(0, stop=conv_dims[0])
    conv_width_index = K.arange(0, stop=conv_dims[1])
    conv_height_index = K.tile(conv_height_index, [conv_dims[1]])

    conv_width_index = K.tile(
        K.expand_dims(conv_width_index, 0), [conv_dims[0], 1])
    conv_width_index = K.flatten(K.transpose(conv_width_index))
    conv_index = K.transpose(K.stack([conv_height_index, conv_width_index]))
    conv_index = K.reshape(conv_index, [1, conv_dims[0], conv_dims[1], 1, 2])
    conv_index = K.cast(conv_index, K.dtype(feats))

    feats = K.reshape(
        feats, [-1, conv_dims[0], conv_dims[1], num_anchors, num_classes + 5])
    conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]), K.dtype(feats))

    box_xy = K.sigmoid(feats[..., :2])
    box_wh = K.exp(feats[..., 2:4])
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.softmax(feats[..., 5:])

    # Adjust preditions to each spatial grid point and anchor size.
    box_xy = (box_xy + conv_index) / conv_dims
    box_wh = box_wh * anchors_tensor / conv_dims

    return box_xy, box_wh, box_confidence, box_class_probs

def preprocess_true_boxes(true_boxes, anchors, image_size):
    """
    Process our true boxes and masks
    """
    height, width = image_size
    num_anchors = len(anchors)

    # 19 in our case
    conv_height = height // 32
    conv_width = width // 32 

    num_box_params = true_boxes.shape[1]
    
    detectors_mask = np.zeros(
        (conv_height, conv_width, num_anchors, 1), dtype=np.float32)
    
    matching_true_boxes = np.zeros(
        (conv_height, conv_width, num_anchors, num_box_params),
        dtype=np.float32)

    for box in true_boxes:
        # scale box to convolutional feature spatial dimensions
        box_class = box[4:5]
        box = box[0:4] * np.array(
            [conv_width, conv_height, conv_width, conv_height])
        i = np.floor(box[1]).astype('int')
        j = np.floor(box[0]).astype('int')
        best_iou = 0
        best_anchor = 0
        for k, anchor in enumerate(anchors):
            # Find IOU between box shifted to origin and anchor box.
            box_maxes = box[2:4] / 2.
            box_mins = -box_maxes
            anchor_maxes = (anchor / 2.)
            anchor_mins = -anchor_maxes

            intersect_mins = np.maximum(box_mins, anchor_mins)
            intersect_maxes = np.minimum(box_maxes, anchor_maxes)
            intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
            intersect_area = intersect_wh[0] * intersect_wh[1]

            box_area = box[2] * box[3]
            anchor_area = anchor[0] * anchor[1]
            iou = intersect_area / (box_area + anchor_area - intersect_area)

            if iou > best_iou:
                best_iou = iou
                best_anchor = k

        if best_iou > 0:
            detectors_mask[i, j, best_anchor] = 1

            # adjust final box my ancor
            adjusted_box = np.array(
                [
                    box[0] - j, box[1] - i,
                    np.log(box[2] / anchors[best_anchor][0]),
                    np.log(box[3] / anchors[best_anchor][1]), box_class
                ],
                dtype=np.float32
            )

            matching_true_boxes[i, j, best_anchor] = adjusted_box

    return detectors_mask, matching_true_boxes

def get_detector_mask(boxes, anchors):
    """
    Process each box
    """
    detectors_mask = [0 for i in range(len(boxes))]
    matching_true_boxes = [0 for i in range(len(boxes))]
    for i, box in enumerate(boxes):
        detectors_mask[i], matching_true_boxes[i] = preprocess_true_boxes(box, anchors,\
                                                                          [IMAGE_SIZE[0], IMAGE_SIZE[1]])

    return [np.array(detectors_mask), np.array(matching_true_boxes)]

# TODO: do something about this ugly hack (!!!)
# Should be done as pre processing, but in my case we can't so we have to do
# it for each mini batch
def hack_py_func(x):
    # Just make my own copy
    voc_anchors = np.array([[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])
    return get_detector_mask(x, voc_anchors)

def lambda_hack_box(x):
    return tf.py_func(hack_py_func, x, [tf.float32, tf.float32])[1]

def lambda_hack_mask(x):
    return tf.py_func(hack_py_func, x, [tf.float32, tf.float32])[0]

Model Crafting



In [20]:

    
def craft_model(anchors, class_names):
    """
    Now a function to create our model
    """
    
    # Used for the loss
    detectors_mask_shape = (CONV_MAP_SIZE[0], CONV_MAP_SIZE[1], 5, 1)
    matching_boxes_shape = (CONV_MAP_SIZE[0], CONV_MAP_SIZE[1], 5, 5)
    detectors_mask_input = Input(shape=detectors_mask_shape)
    matching_boxes_input = Input(shape=matching_boxes_shape)

    # Create model input layers, besides image.
    boxes_input = Input(shape=(56, 5))

    # Process our mask and true boxes
    matching_true_boxes = Lambda(
                lambda_hack_box,
                output_shape=(19, 19, 5, 5),
                name='yolo_loss_hack_box')([boxes_input])
        
    detectors_mask = Lambda(
                lambda_hack_mask,
                output_shape=(19, 19, 5, 1),
                name='yolo_loss_hack_mask')([boxes_input])

    # Set the shape becuase Keras doesn't use it's own output shape here
    # later, :/ ==== Thanks @markus
    detectors_mask.set_shape((None, 19, 19, 5, 1))
    matching_true_boxes.set_shape((None, 19, 19, 5, 5))

    # Final layer that computes loss, hack around limitations in Keras loss formation
    model_loss = Lambda(
        yolo_loss,
        output_shape=(1, ),
        name='yolo_loss',
        arguments={'anchors': anchors,
                   'num_classes': len(class_names)})([
                       conv_layer_23, boxes_input,
                       detectors_mask, matching_true_boxes
                   ])

    # Create model
    model = Model([input_image_tensor, boxes_input], model_loss)

    return model



In [21]:

    
model = craft_model(voc_anchors, voc_classes)

For reference, model should be:

Total params: 50,676,061
Trainable params: 50,655,389
Non-trainable params: 20,672



In [22]:

    
model.summary()









    



____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
input_1 (InputLayer)             (None, 608, 608, 3)   0                                            
____________________________________________________________________________________________________
conv2d_1 (Conv2D)                (None, 608, 608, 32)  864         input_1[0][0]                    
____________________________________________________________________________________________________
batch_normalization_1 (BatchNorm (None, 608, 608, 32)  128         conv2d_1[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_1 (LeakyReLU)        (None, 608, 608, 32)  0           batch_normalization_1[0][0]      
____________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)   (None, 304, 304, 32)  0           leaky_re_lu_1[0][0]              
____________________________________________________________________________________________________
conv2d_2 (Conv2D)                (None, 304, 304, 64)  18432       max_pooling2d_1[0][0]            
____________________________________________________________________________________________________
batch_normalization_2 (BatchNorm (None, 304, 304, 64)  256         conv2d_2[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_2 (LeakyReLU)        (None, 304, 304, 64)  0           batch_normalization_2[0][0]      
____________________________________________________________________________________________________
max_pooling2d_2 (MaxPooling2D)   (None, 152, 152, 64)  0           leaky_re_lu_2[0][0]              
____________________________________________________________________________________________________
conv2d_3 (Conv2D)                (None, 152, 152, 128) 73728       max_pooling2d_2[0][0]            
____________________________________________________________________________________________________
batch_normalization_3 (BatchNorm (None, 152, 152, 128) 512         conv2d_3[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_3 (LeakyReLU)        (None, 152, 152, 128) 0           batch_normalization_3[0][0]      
____________________________________________________________________________________________________
conv2d_4 (Conv2D)                (None, 152, 152, 64)  8192        leaky_re_lu_3[0][0]              
____________________________________________________________________________________________________
batch_normalization_4 (BatchNorm (None, 152, 152, 64)  256         conv2d_4[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_4 (LeakyReLU)        (None, 152, 152, 64)  0           batch_normalization_4[0][0]      
____________________________________________________________________________________________________
conv2d_5 (Conv2D)                (None, 152, 152, 128) 73728       leaky_re_lu_4[0][0]              
____________________________________________________________________________________________________
batch_normalization_5 (BatchNorm (None, 152, 152, 128) 512         conv2d_5[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_5 (LeakyReLU)        (None, 152, 152, 128) 0           batch_normalization_5[0][0]      
____________________________________________________________________________________________________
max_pooling2d_3 (MaxPooling2D)   (None, 76, 76, 128)   0           leaky_re_lu_5[0][0]              
____________________________________________________________________________________________________
conv2d_6 (Conv2D)                (None, 76, 76, 256)   294912      max_pooling2d_3[0][0]            
____________________________________________________________________________________________________
batch_normalization_6 (BatchNorm (None, 76, 76, 256)   1024        conv2d_6[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_6 (LeakyReLU)        (None, 76, 76, 256)   0           batch_normalization_6[0][0]      
____________________________________________________________________________________________________
conv2d_7 (Conv2D)                (None, 76, 76, 128)   32768       leaky_re_lu_6[0][0]              
____________________________________________________________________________________________________
batch_normalization_7 (BatchNorm (None, 76, 76, 128)   512         conv2d_7[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_7 (LeakyReLU)        (None, 76, 76, 128)   0           batch_normalization_7[0][0]      
____________________________________________________________________________________________________
conv2d_8 (Conv2D)                (None, 76, 76, 256)   294912      leaky_re_lu_7[0][0]              
____________________________________________________________________________________________________
batch_normalization_8 (BatchNorm (None, 76, 76, 256)   1024        conv2d_8[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_8 (LeakyReLU)        (None, 76, 76, 256)   0           batch_normalization_8[0][0]      
____________________________________________________________________________________________________
max_pooling2d_4 (MaxPooling2D)   (None, 38, 38, 256)   0           leaky_re_lu_8[0][0]              
____________________________________________________________________________________________________
conv2d_9 (Conv2D)                (None, 38, 38, 512)   1179648     max_pooling2d_4[0][0]            
____________________________________________________________________________________________________
batch_normalization_9 (BatchNorm (None, 38, 38, 512)   2048        conv2d_9[0][0]                   
____________________________________________________________________________________________________
leaky_re_lu_9 (LeakyReLU)        (None, 38, 38, 512)   0           batch_normalization_9[0][0]      
____________________________________________________________________________________________________
conv2d_10 (Conv2D)               (None, 38, 38, 256)   131072      leaky_re_lu_9[0][0]              
____________________________________________________________________________________________________
batch_normalization_10 (BatchNor (None, 38, 38, 256)   1024        conv2d_10[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_10 (LeakyReLU)       (None, 38, 38, 256)   0           batch_normalization_10[0][0]     
____________________________________________________________________________________________________
conv2d_11 (Conv2D)               (None, 38, 38, 512)   1179648     leaky_re_lu_10[0][0]             
____________________________________________________________________________________________________
batch_normalization_11 (BatchNor (None, 38, 38, 512)   2048        conv2d_11[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_11 (LeakyReLU)       (None, 38, 38, 512)   0           batch_normalization_11[0][0]     
____________________________________________________________________________________________________
conv2d_12 (Conv2D)               (None, 38, 38, 256)   131072      leaky_re_lu_11[0][0]             
____________________________________________________________________________________________________
batch_normalization_12 (BatchNor (None, 38, 38, 256)   1024        conv2d_12[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_12 (LeakyReLU)       (None, 38, 38, 256)   0           batch_normalization_12[0][0]     
____________________________________________________________________________________________________
conv2d_13 (Conv2D)               (None, 38, 38, 512)   1179648     leaky_re_lu_12[0][0]             
____________________________________________________________________________________________________
batch_normalization_13 (BatchNor (None, 38, 38, 512)   2048        conv2d_13[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_13 (LeakyReLU)       (None, 38, 38, 512)   0           batch_normalization_13[0][0]     
____________________________________________________________________________________________________
max_pooling2d_5 (MaxPooling2D)   (None, 19, 19, 512)   0           leaky_re_lu_13[0][0]             
____________________________________________________________________________________________________
conv2d_14 (Conv2D)               (None, 19, 19, 1024)  4718592     max_pooling2d_5[0][0]            
____________________________________________________________________________________________________
batch_normalization_14 (BatchNor (None, 19, 19, 1024)  4096        conv2d_14[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_14 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_14[0][0]     
____________________________________________________________________________________________________
conv2d_15 (Conv2D)               (None, 19, 19, 512)   524288      leaky_re_lu_14[0][0]             
____________________________________________________________________________________________________
batch_normalization_15 (BatchNor (None, 19, 19, 512)   2048        conv2d_15[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_15 (LeakyReLU)       (None, 19, 19, 512)   0           batch_normalization_15[0][0]     
____________________________________________________________________________________________________
conv2d_16 (Conv2D)               (None, 19, 19, 1024)  4718592     leaky_re_lu_15[0][0]             
____________________________________________________________________________________________________
batch_normalization_16 (BatchNor (None, 19, 19, 1024)  4096        conv2d_16[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_16 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_16[0][0]     
____________________________________________________________________________________________________
conv2d_17 (Conv2D)               (None, 19, 19, 512)   524288      leaky_re_lu_16[0][0]             
____________________________________________________________________________________________________
batch_normalization_17 (BatchNor (None, 19, 19, 512)   2048        conv2d_17[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_17 (LeakyReLU)       (None, 19, 19, 512)   0           batch_normalization_17[0][0]     
____________________________________________________________________________________________________
conv2d_18 (Conv2D)               (None, 19, 19, 1024)  4718592     leaky_re_lu_17[0][0]             
____________________________________________________________________________________________________
batch_normalization_18 (BatchNor (None, 19, 19, 1024)  4096        conv2d_18[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_18 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_18[0][0]     
____________________________________________________________________________________________________
conv2d_19 (Conv2D)               (None, 19, 19, 1024)  9437184     leaky_re_lu_18[0][0]             
____________________________________________________________________________________________________
batch_normalization_19 (BatchNor (None, 19, 19, 1024)  4096        conv2d_19[0][0]                  
____________________________________________________________________________________________________
conv2d_21 (Conv2D)               (None, 38, 38, 64)    32768       leaky_re_lu_13[0][0]             
____________________________________________________________________________________________________
leaky_re_lu_19 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_19[0][0]     
____________________________________________________________________________________________________
batch_normalization_21 (BatchNor (None, 38, 38, 64)    256         conv2d_21[0][0]                  
____________________________________________________________________________________________________
conv2d_20 (Conv2D)               (None, 19, 19, 1024)  9437184     leaky_re_lu_19[0][0]             
____________________________________________________________________________________________________
leaky_re_lu_21 (LeakyReLU)       (None, 38, 38, 64)    0           batch_normalization_21[0][0]     
____________________________________________________________________________________________________
batch_normalization_20 (BatchNor (None, 19, 19, 1024)  4096        conv2d_20[0][0]                  
____________________________________________________________________________________________________
space_to_depth_helper (Lambda)   (None, 19, 19, 256)   0           leaky_re_lu_21[0][0]             
____________________________________________________________________________________________________
leaky_re_lu_20 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_20[0][0]     
____________________________________________________________________________________________________
concatenate_1 (Concatenate)      (None, 19, 19, 1280)  0           space_to_depth_helper[0][0]      
                                                                   leaky_re_lu_20[0][0]             
____________________________________________________________________________________________________
conv2d_22 (Conv2D)               (None, 19, 19, 1024)  11796480    concatenate_1[0][0]              
____________________________________________________________________________________________________
batch_normalization_22 (BatchNor (None, 19, 19, 1024)  4096        conv2d_22[0][0]                  
____________________________________________________________________________________________________
leaky_re_lu_22 (LeakyReLU)       (None, 19, 19, 1024)  0           batch_normalization_22[0][0]     
____________________________________________________________________________________________________
input_7 (InputLayer)             (None, 56, 5)         0                                            
____________________________________________________________________________________________________
conv2d_23 (Conv2D)               (None, 19, 19, 125)   128125      leaky_re_lu_22[0][0]             
____________________________________________________________________________________________________
yolo_loss_hack_mask (Lambda)     (None, 19, 19, 5, 1)  0           input_7[0][0]                    
____________________________________________________________________________________________________
yolo_loss_hack_box (Lambda)      (None, 19, 19, 5, 5)  0           input_7[0][0]                    
____________________________________________________________________________________________________
yolo_loss (Lambda)               (None, 1)             0           conv2d_23[0][0]                  
                                                                   input_7[0][0]                    
                                                                   yolo_loss_hack_mask[0][0]        
                                                                   yolo_loss_hack_box[0][0]         
====================================================================================================
Total params: 50,676,061
Trainable params: 50,655,389
Non-trainable params: 20,672
____________________________________________________________________________________________________

Training Model



In [25]:

    
def train(model, class_names, anchors, image_data, boxes, validation_split=0.1):
    """
    Trains a yolo model
    """
    def yolo_loss_helper(y_true, y_pred):
        # print(y_true)
        # print(y_pred)
        return y_pred

    model.compile(
        optimizer='adam',
        loss={
            'yolo_loss': yolo_loss_helper
        }
    )
    
    model.fit([image_data, boxes],
              np.zeros(len(image_data)),
              validation_split=validation_split,
              batch_size=30, # 90 total
              epochs=5
    )



In [26]:

    
train(
    model,
    voc2012_classes,
    voc_anchors,
    processed_images,
    processed_boxes
)
print('Done')









    



Train on 90 samples, validate on 10 samples
Epoch 1/5
90/90 [==============================] - 410s - loss: 2645.4650 - val_loss: 99196232.0000
Epoch 2/5
90/90 [==============================] - 381s - loss: 1503.3178 - val_loss: 97022232.0000
Epoch 3/5
90/90 [==============================] - 378s - loss: 868.8492 - val_loss: 3002830.0000
Epoch 4/5
90/90 [==============================] - 378s - loss: 572.7653 - val_loss: 2900123.0000
Epoch 5/5
90/90 [==============================] - 372s - loss: 443.7727 - val_loss: 2781025.0000
Done



In [ ]:

	class_name	filename	height	img_full_path	width	xmax	xmin	ymax	ymin
0	tvmonitor	2008_000002	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	448	34	293	11
1	train	2008_000003	333.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	500	46	333	11
2	person	2008_000003	333.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	83	62	243	190
3	boat	2008_000007	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	428	1	293	230
4	horse	2008_000008	442.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	471	53	420	87
5	person	2008_000008	442.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	289	158	167	44
6	cow	2008_000009	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	294	217	221	161
7	cow	2008_000009	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	500	465	218	167
8	bottle	2008_000015	327.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	378	270	176	1
9	bottle	2008_000015	327.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	164	57	150	1
10	tvmonitor	2008_000016	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	392	91	353	15
11	dog	2008_000019	272.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	480.0	372	139	197	2
12	dog	2008_000019	272.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	480.0	318	165	236	66
13	dog	2008_000019	272.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	480.0	480	361	112	1
14	aeroplane	2008_000021	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	475	14	288	148
15	tvmonitor	2008_000023	500.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	333.0	314	6	262	1
16	bottle	2008_000023	500.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	333.0	121	40	411	97
17	person	2008_000023	500.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	333.0	169	137	109	36
18	person	2008_000023	500.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	333.0	216	180	104	36
19	person	2008_000023	500.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	333.0	123	96	103	39
20	person	2008_000026	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	372	122	375	7
21	dog	2008_000026	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	325	211	255	147
22	car	2008_000027	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	500	9	375	32
23	car	2008_000028	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	231	158	240	199
24	car	2008_000028	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	94	47	238	202
25	bus	2008_000032	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	489	6	274	118
26	person	2008_000032	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	364	336	224	173
27	aeroplane	2008_000033	333.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	476	104	257	61
28	bottle	2008_000034	500.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	338.0	45	6	362	234
29	person	2008_000034	500.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	338.0	103	1	336	156
...	...	...	...	...	...	...	...	...	...
31531	train	2011_003260	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	349	1	314	140
31532	train	2011_003260	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	67	1	208	165
31533	person	2011_003260	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	330	310	226	182
31534	person	2011_003260	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	343	328	228	182
31535	person	2011_003260	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	367	336	266	192
31536	person	2011_003260	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	487	458	274	190
31537	person	2011_003260	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	428	410	243	192
31538	person	2011_003260	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	29	14	218	182
31539	person	2011_003260	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	47	35	208	184
31540	bicycle	2011_003261	500.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	375.0	186	132	332	245
31541	person	2011_003261	500.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	375.0	186	130	292	210
31542	bottle	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	307	283	194	106
31543	bottle	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	248	223	173	97
31544	bottle	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	253	215	272	154
31545	diningtable	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	386	80	359	116
31546	person	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	190	2	359	45
31547	person	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	219	98	222	49
31548	person	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	226	116	192	24
31549	person	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	266	178	139	31
31550	person	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	397	341	153	39
31551	person	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	430	364	172	54
31552	person	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	448	336	221	71
31553	person	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	500	285	359	77
31554	person	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	200	111	60	1
31555	person	2011_003262	359.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	246	221	66	1
31556	car	2011_003269	500.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	334.0	309	19	461	347
31557	bottle	2011_003271	500.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	499	98	405	1
31558	train	2011_003274	375.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	384	1	215	113
31559	aeroplane	2011_003275	403.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	448	42	282	50
31560	bird	2011_003276	400.0	/Users/adammenges/Dropbox/Datasets/VOC/VOC 201...	500.0	292	264	217	142