First, here's the config from their darknet/yolo repo:
[net]
# Testing
batch=1
subdivisions=1
# Training
# batch=64
# subdivisions=8
width=608
height=608
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1
learning_rate=0.001
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky
[maxpool]
size=2
stride=2
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky
[maxpool]
size=2
stride=2
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
[maxpool]
size=2
stride=2
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[maxpool]
size=2
stride=2
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[maxpool]
size=2
stride=2
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
#######
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[route]
layers=-9
[convolutional]
batch_normalize=1
size=1
stride=1
pad=1
filters=64
activation=leaky
[reorg]
stride=2
[route]
layers=-1,-4
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=425
activation=linear
[region]
anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
bias_match=1
classes=80
coords=4
num=5
softmax=1
jitter=.3
rescore=1
object_scale=5
noobject_scale=1
class_scale=1
coord_scale=1
absolute=1
thresh = .6
random=1
This config can be found here: https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolo.cfg
This is the model we'll be building. We'll be referencing it later, but first let's import and prepare the data.
In [1]:
# Imports, helpers, and constants
# numpy/tensorflow
import numpy as np
import tensorflow as tf
# various keras bits
from keras import backend as K
from keras.models import Model
from keras.regularizers import l2
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
# keras layers
from keras.layers import (Conv2D,
                          GlobalAveragePooling2D,
                          Input,
                          Lambda,
                          MaxPooling2D,
                          LeakyReLU,
                          concatenate,
                          BatchNormalization)
# Images
import PIL
from PIL import Image, ImageDraw, ImageFont
# System
import os
import sys
import random
from functools import reduce
# data
import python_voc_parser as voc
# tools
import functools
from functools import partial
# ---
IMAGE_SET_PATH_2012 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2012/Training/VOC2012/ImageSets/Main/trainval.txt'
ANNOTATIONS_PATH_2012 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2012/Training/VOC2012/Annotations'
IMAGE_PATH_2012 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2012/Training/VOC2012/JPEGImages'
IMAGE_SET_PATH_2007 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2007/Training/VOC2007/ImageSets/Main/trainval.txt'
ANNOTATIONS_PATH_2007 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2007/Training/VOC2007/Annotations'
IMAGE_PATH_2007 = '/Users/adammenges/Dropbox/Datasets/VOC/VOC 2007/Training/VOC2007/JPEGImages'
# # temp for macbook
# IMAGE_SET_PATH_2012 = '/Users/adammenges/Desktop/VOCdevkit/VOC2012/ImageSets/Main/trainval.txt'
# ANNOTATIONS_PATH_2012 = '/Users/adammenges/Desktop/VOCdevkit/VOC2012/Annotations'
# IMAGE_PATH_2012 = '/Users/adammenges/Desktop/VOCdevkit/VOC2012/JPEGImages'
IMAGE_SIZE = (608, 608)
CONV_MAP_SIZE = (19, 19)  # 608 / 32 = 19: the five 2x2 max pools downsample by 2^5 = 32
WEIGHT_DECAY = 0.0005
In [2]:
parser2012 = voc.VocAnnotationsParser(IMAGE_PATH_2012, IMAGE_SET_PATH_2012, ANNOTATIONS_PATH_2012)
In [3]:
df2012 = parser2012.get_annotation_dataframe()
In [4]:
df2012
Out[4]:
In [5]:
df2012.index.values
Out[5]:
In [6]:
df2012.iloc[[0]]
Out[6]:
In [7]:
df2012.iloc[[0]].img_full_path.values[0]
Out[7]:
In [8]:
from collections import defaultdict
annotations = defaultdict(list)
for i in df2012.index.values:
    filename = df2012.iloc[[i]].img_full_path.values[0]
    xmax = float(df2012.iloc[[i]].xmax.values[0])
    xmin = float(df2012.iloc[[i]].xmin.values[0])
    ymax = float(df2012.iloc[[i]].ymax.values[0])
    ymin = float(df2012.iloc[[i]].ymin.values[0])
    c = df2012.iloc[[i]].class_name.values[0]
    annotations[filename].append([xmin, xmax, ymin, ymax, c])
all_boxes = list(annotations.values())
len_boxes = [len(x) for x in all_boxes]
max_number_of_boxes = max(len_boxes)
max_loc = len_boxes.index(max_number_of_boxes)
print("this dataset has a max of {} boxes in one image; we'll use this for padding later".format(max_number_of_boxes))
In [9]:
voc2012_classes = list(set(parser2012.get_annotation_dataframe().class_name))
For reference while I'm doing this:
In [10]:
def convert_box(size, box):
    # Convert an (xmin, xmax, ymin, ymax) pixel box into normalized
    # (x_center, y_center, width, height), each in [0, 1]
    dw = 1.0 / size[0]
    dh = 1.0 / size[1]
    x = (box[0] + box[1]) / 2.0
    y = (box[2] + box[3]) / 2.0
    w = box[1] - box[0]
    h = box[3] - box[2]
    x = x * dw
    w = w * dw
    y = y * dh
    h = h * dh
    return (x, y, w, h)
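# Worked example: a box with x in [100, 300] and y in [150, 450] inside a
# 600x600 image comes out as
#   convert_box((600, 600), [100, 300, 150, 450]) == (1/3, 1/2, 1/3, 1/2)
# i.e. center at (0.333, 0.5), width 1/3 and height 1/2 of the image.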
# Keras VOC data generator, to use later with the new loss
def yolo_voc_generator(batch_size):
    while True:
        # I prob shouldn't be doing it this way...
        random_images = random.sample(list(annotations.keys()), batch_size)
        # Mmmk you know the drill...
        processed_images = [Image.open(image_path).resize((608, 608), PIL.Image.BICUBIC)
                            for image_path in random_images]
        processed_images = [np.array(image, dtype=np.float) for image in processed_images]
        processed_images = [image / 255.0 for image in processed_images]
        boxes = []  # all boxes, for all images
        for k in random_images:
            new_image_boxes = []
            original_image = Image.open(k)  # opened once per image, not once per box
            for box in annotations[k]:
                new_box = list(map(float, box[:4]))
                new_box = convert_box(original_image.size, new_box)
                label_idx = voc2012_classes.index(box[4])
                new_image_box = np.array([new_box[0], new_box[1], new_box[2], new_box[3], label_idx])
                new_image_boxes.append(new_image_box)
            if len(new_image_boxes) < max_number_of_boxes:
                for z in range(max_number_of_boxes - len(new_image_boxes)):
                    new_image_boxes.append(np.zeros(5))
            boxes.append(np.array(new_image_boxes))
        yield ([boxes, processed_images])
# For educational purposes and ease, this method can just do it on all the data
def process_data(amount=None):
    if amount:
        all_images = list(annotations.keys())[:amount]  # Only a subset for now
    else:
        all_images = list(annotations.keys())
    # Mmmk you know the drill...
    processed_images = [Image.open(image_path).resize((IMAGE_SIZE[0], IMAGE_SIZE[1]), PIL.Image.BICUBIC)
                        for image_path in all_images]
    processed_images = [np.array(image, dtype=np.float) for image in processed_images]
    processed_images = [image / 255.0 for image in processed_images]
    processed_boxes = []  # all boxes, for all images
    for k in all_images:
        new_image_boxes = []
        original_image = Image.open(k)  # opened once per image, not once per box
        for box in annotations[k]:
            new_box = list(map(float, box[:4]))
            new_box = convert_box(original_image.size, new_box)
            label_idx = voc2012_classes.index(box[4])
            new_image_box = np.array([new_box[0], new_box[1], new_box[2], new_box[3], label_idx])
            new_image_boxes.append(new_image_box)
        if len(new_image_boxes) < max_number_of_boxes:
            for z in range(max_number_of_boxes - len(new_image_boxes)):
                new_image_boxes.append(np.zeros(5))
        processed_boxes.append(np.array(new_image_boxes))
    processed_boxes = np.array(processed_boxes)
    # keep the images as float32: casting the normalized [0, 1] values
    # to uint8 would truncate them all to 0
    processed_images = np.array(processed_images, dtype=np.float32)
    return processed_images, processed_boxes
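The generator above isn't actually exercised in this run (we go through process_data instead), but a quick smoke test, assuming the VOC paths above resolve, would look like:
gen = yolo_voc_generator(4)
boxes, images = next(gen)
print(len(images), images[0].shape)  # 4 (608, 608, 3)
print(np.array(boxes).shape)         # (4, max_number_of_boxes, 5)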
In [11]:
processed_images, processed_boxes = process_data(100)
In [12]:
processed_images.shape
Out[12]:
In [13]:
processed_boxes.shape
Out[13]:
In [14]:
voc_anchors = np.array(
[[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]]
)
# voc_classes = [
# "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
# "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
# "pottedplant", "sheep", "sofa", "train", "tvmonitor"
# ]
voc_classes = voc2012_classes
Now that the data's been munged correctly, time to create our model.
(tried this on a whim, didn't realize you could have gifs in a notebook, I'll be using this liberally in the future)
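Before the big wall of layers below: the darknet config is really just one motif, conv then batch norm then leaky ReLU, repeated over and over. A small helper like this would capture it (a sketch only; the cell below deliberately spells every layer out so it can be read line-by-line against the cfg):
def conv_bn_leaky(x, filters, kernel_size):
    # hypothetical helper, not used below: the conv -> BN -> LeakyReLU motif
    x = Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=(1, 1),
        kernel_regularizer=l2(WEIGHT_DECAY),
        use_bias=False,  # batch norm supplies the bias term
        activation='linear',
        padding='same'
    )(x)
    x = BatchNormalization()(x)
    return LeakyReLU(alpha=0.1)(x)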
In [15]:
input_image_tensor = Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))
# time for conv
conv_layer_1 = Conv2D(
filters=32,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(input_image_tensor)
conv_layer_1 = BatchNormalization()(conv_layer_1)
conv_layer_1 = LeakyReLU(alpha=0.1)(conv_layer_1)
# time for max pool
maxpool_layer_1 = MaxPooling2D(
pool_size=(2, 2),
strides=(2, 2),
padding='same'
)(conv_layer_1)
# next conv
conv_layer_2 = Conv2D(
filters=64,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(maxpool_layer_1)
conv_layer_2 = BatchNormalization()(conv_layer_2)
conv_layer_2 = LeakyReLU(alpha=0.1)(conv_layer_2)
# now for a max pool
maxpool_layer_2 = MaxPooling2D(
pool_size=(2, 2),
strides=(2, 2),
padding='same'
)(conv_layer_2)
# next conv
conv_layer_3 = Conv2D(
filters=128,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(maxpool_layer_2)
conv_layer_3 = BatchNormalization()(conv_layer_3)
conv_layer_3 = LeakyReLU(alpha=0.1)(conv_layer_3)
# next conv
conv_layer_4 = Conv2D(
filters=64,
kernel_size=(1, 1),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_3)
conv_layer_4 = BatchNormalization()(conv_layer_4)
conv_layer_4 = LeakyReLU(alpha=0.1)(conv_layer_4)
# next conv
conv_layer_5 = Conv2D(
filters=128,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_4)
conv_layer_5 = BatchNormalization()(conv_layer_5)
conv_layer_5 = LeakyReLU(alpha=0.1)(conv_layer_5)
# next max pool
maxpool_layer_3 = MaxPooling2D(
pool_size=(2, 2),
strides=(2, 2),
padding='same'
)
maxpool_layer_3 = maxpool_layer_3(conv_layer_5)
# next conv
conv_layer_6 = Conv2D(
filters=256,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(maxpool_layer_3)
conv_layer_6 = BatchNormalization()(conv_layer_6)
conv_layer_6 = LeakyReLU(alpha=0.1)(conv_layer_6)
# next conv
conv_layer_7 = Conv2D(
filters=128,
kernel_size=(1, 1),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_6)
conv_layer_7 = BatchNormalization()(conv_layer_7)
conv_layer_7 = LeakyReLU(alpha=0.1)(conv_layer_7)
# next conv
conv_layer_8 = Conv2D(
filters=256,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_7)
conv_layer_8 = BatchNormalization()(conv_layer_8)
conv_layer_8 = LeakyReLU(alpha=0.1)(conv_layer_8)
# now for a max pool
maxpool_layer_4 = MaxPooling2D(
pool_size=(2, 2),
strides=(2, 2),
padding='same'
)(conv_layer_8)
# next conv
conv_layer_9 = Conv2D(
filters=512,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(maxpool_layer_4)
conv_layer_9 = BatchNormalization()(conv_layer_9)
conv_layer_9 = LeakyReLU(alpha=0.1)(conv_layer_9)
# next conv
conv_layer_10 = Conv2D(
filters=256,
kernel_size=(1, 1),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_9)
conv_layer_10 = BatchNormalization()(conv_layer_10)
conv_layer_10 = LeakyReLU(alpha=0.1)(conv_layer_10)
# next conv
conv_layer_11 = Conv2D(
filters=512,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_10)
conv_layer_11 = BatchNormalization()(conv_layer_11)
conv_layer_11 = LeakyReLU(alpha=0.1)(conv_layer_11)
# next conv
conv_layer_12 = Conv2D(
filters=256,
kernel_size=(1, 1),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_11)
conv_layer_12 = BatchNormalization()(conv_layer_12)
conv_layer_12 = LeakyReLU(alpha=0.1)(conv_layer_12)
# next conv
conv_layer_13 = Conv2D(
filters=512,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_12)
conv_layer_13 = BatchNormalization()(conv_layer_13)
conv_layer_13 = LeakyReLU(alpha=0.1)(conv_layer_13)
# now for a max pool
maxpool_layer_5 = MaxPooling2D(
pool_size=(2, 2),
strides=(2, 2),
padding='same'
)(conv_layer_13)
# next conv
conv_layer_14 = Conv2D(
filters=1024,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(maxpool_layer_5)
conv_layer_14 = BatchNormalization()(conv_layer_14)
conv_layer_14 = LeakyReLU(alpha=0.1)(conv_layer_14)
# next conv
conv_layer_15 = Conv2D(
filters=512,
kernel_size=(1, 1),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_14)
conv_layer_15 = BatchNormalization()(conv_layer_15)
conv_layer_15 = LeakyReLU(alpha=0.1)(conv_layer_15)
# next conv
conv_layer_16 = Conv2D(
filters=1024,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_15)
conv_layer_16 = BatchNormalization()(conv_layer_16)
conv_layer_16 = LeakyReLU(alpha=0.1)(conv_layer_16)
# next conv
conv_layer_17 = Conv2D(
filters=512,
kernel_size=(1, 1),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_16)
conv_layer_17 = BatchNormalization()(conv_layer_17)
conv_layer_17 = LeakyReLU(alpha=0.1)(conv_layer_17)
# next conv
conv_layer_18 = Conv2D(
filters=1024,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_17)
conv_layer_18 = BatchNormalization()(conv_layer_18)
conv_layer_18 = LeakyReLU(alpha=0.1)(conv_layer_18)
# next conv
conv_layer_19 = Conv2D(
filters=1024,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_18)
conv_layer_19 = BatchNormalization()(conv_layer_19)
conv_layer_19 = LeakyReLU(alpha=0.1)(conv_layer_19)
# next conv
conv_layer_20 = Conv2D(
filters=1024,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_19)
conv_layer_20 = BatchNormalization()(conv_layer_20)
conv_layer_20 = LeakyReLU(alpha=0.1)(conv_layer_20)
# next conv
conv_layer_21 = Conv2D(
filters=64,
kernel_size=(1, 1),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(conv_layer_13) # skipping back to layer 13 for a higher-res representation
conv_layer_21 = BatchNormalization()(conv_layer_21)
conv_layer_21 = LeakyReLU(alpha=0.1)(conv_layer_21)
###
# Reorg Helpers
###
def space_to_depth_helper(x):
    import tensorflow as tf
    return tf.space_to_depth(x, block_size=2)
def space_to_depth_output_shape_helper(input_shape):
    # halve the spatial dims, multiply channels by 4 (block_size=2)
    if input_shape[1]:
        return (input_shape[0], input_shape[1] // 2, input_shape[2] // 2,
                4 * input_shape[3])
    return (input_shape[0], None, None, 4 * input_shape[3])
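# A note on shapes: a 2x2 space_to_depth halves both spatial dimensions and
# multiplies the channel count by 4, so the 38x38x64 output of conv_layer_21
# becomes 19x19x256, which is exactly the right spatial size to concatenate
# with the 19x19x1024 map from conv_layer_20 below.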
# reorg time
space_to_depth_helper_func = Lambda(
space_to_depth_helper,
output_shape=space_to_depth_output_shape_helper,
name='space_to_depth_helper'
)(conv_layer_21)
# routing time
route_layers = [space_to_depth_helper_func, conv_layer_20]
concatenate_layer = concatenate(route_layers)
# next conv
conv_layer_22 = Conv2D(
filters=1024,
kernel_size=(3, 3),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=False, # because we're using batch norm
activation='linear',
padding='same'
)(concatenate_layer) # use concat layer
conv_layer_22 = BatchNormalization()(conv_layer_22)
conv_layer_22 = LeakyReLU(alpha=0.1)(conv_layer_22)
# final conv!
final_conv_filters = len(voc_anchors)*(5+len(voc_classes)) # see paper
conv_layer_23 = Conv2D(
filters=final_conv_filters,
kernel_size=(1, 1),
strides=(1, 1),
kernel_regularizer=l2(WEIGHT_DECAY),
use_bias=True, # because we're NOT using batch norm
activation='linear',
padding='same'
)(conv_layer_22)
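A quick sanity check on that last layer: it has len(voc_anchors) * (5 + len(voc_classes)) filters, one (x, y, w, h, objectness) plus class-scores bundle per anchor per grid cell. With VOC's 20 classes that's 5 * (5 + 20) = 125; the darknet cfg above says filters=425 because it was written for COCO's 80 classes, 5 * (5 + 80) = 425.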
In [19]:
def yolo_loss(args,
              anchors,
              num_classes):
    """
    Loss for YOLOv2: given the features from the last conv layer, compute the loss and return it
    """
    (yolo_output, true_boxes, detectors_mask, matching_true_boxes) = args
    num_anchors = len(anchors)
    object_scale = 5
    no_object_scale = 1
    class_scale = 1
    coordinates_scale = 1
    pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_convert_boxes(
        yolo_output, anchors, num_classes
    )
    yolo_output_shape = K.shape(yolo_output)
    feats = K.reshape(yolo_output, [
        -1, yolo_output_shape[1], yolo_output_shape[2], num_anchors,
        num_classes + 5
    ])
    pred_boxes = K.concatenate(
        (K.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1)
    # Expand pred x,y,w,h to allow comparison with ground truth.
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    pred_xy = K.expand_dims(pred_xy, 4)
    pred_wh = K.expand_dims(pred_wh, 4)
    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half
    true_boxes_shape = K.shape(true_boxes)
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    true_boxes = K.reshape(true_boxes, [
        true_boxes_shape[0], 1, 1, 1, true_boxes_shape[1], true_boxes_shape[2]
    ])
    true_xy = true_boxes[..., 0:2]
    true_wh = true_boxes[..., 2:4]
    # Find IOU of each predicted box with each ground truth box.
    true_wh_half = true_wh / 2.
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half
    intersect_mins = K.maximum(pred_mins, true_mins)
    intersect_maxes = K.minimum(pred_maxes, true_maxes)
    intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]
    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
    true_areas = true_wh[..., 0] * true_wh[..., 1]
    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas
    # Best IOUs for each location.
    best_ious = K.max(iou_scores, axis=4)  # Best IOU scores.
    best_ious = K.expand_dims(best_ious)
    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))
    # Losses!
    no_object_weights = (no_object_scale * (1 - object_detections) *
                         (1 - detectors_mask))
    no_objects_loss = no_object_weights * K.square(-pred_confidence)
    # Object loss
    objects_loss = (object_scale * detectors_mask *
                    K.square(1 - pred_confidence))
    # Confidence loss
    confidence_loss = objects_loss + no_objects_loss
    matching_classes = K.cast(matching_true_boxes[..., 4], 'int32')
    matching_classes = K.one_hot(matching_classes, num_classes)
    # Classification loss
    classification_loss = (class_scale * detectors_mask *
                           K.square(matching_classes - pred_class_prob))
    matching_boxes = matching_true_boxes[..., 0:4]
    # Coord loss
    coordinates_loss = (coordinates_scale * detectors_mask *
                        K.square(matching_boxes - pred_boxes))
    # Sum them together
    confidence_loss_sum = K.sum(confidence_loss)
    classification_loss_sum = K.sum(classification_loss)
    coordinates_loss_sum = K.sum(coordinates_loss)
    # Calculate final loss
    return 0.5 * (confidence_loss_sum + classification_loss_sum + coordinates_loss_sum)
def yolo_convert_boxes(feats, anchors, num_classes):
    """
    Convert features into boxes
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2])
    # Dynamic implementation of conv dims for fully convolutional model.
    conv_dims = K.shape(feats)[1:3]  # assuming channels last
    # In YOLO the height index is the innermost iteration.
    conv_height_index = K.arange(0, stop=conv_dims[0])
    conv_width_index = K.arange(0, stop=conv_dims[1])
    conv_height_index = K.tile(conv_height_index, [conv_dims[1]])
    conv_width_index = K.tile(
        K.expand_dims(conv_width_index, 0), [conv_dims[0], 1])
    conv_width_index = K.flatten(K.transpose(conv_width_index))
    conv_index = K.transpose(K.stack([conv_height_index, conv_width_index]))
    conv_index = K.reshape(conv_index, [1, conv_dims[0], conv_dims[1], 1, 2])
    conv_index = K.cast(conv_index, K.dtype(feats))
    feats = K.reshape(
        feats, [-1, conv_dims[0], conv_dims[1], num_anchors, num_classes + 5])
    conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]), K.dtype(feats))
    box_xy = K.sigmoid(feats[..., :2])
    box_wh = K.exp(feats[..., 2:4])
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.softmax(feats[..., 5:])
    # Adjust predictions to each spatial grid point and anchor size.
    box_xy = (box_xy + conv_index) / conv_dims
    box_wh = box_wh * anchors_tensor / conv_dims
    return box_xy, box_wh, box_confidence, box_class_probs
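# Decoding recap, matching the code above: for grid cell (cx, cy) and anchor
# (pw, ph), the network predicts (tx, ty, tw, th) and the box comes out as
#   bx = (sigmoid(tx) + cx) / W    by = (sigmoid(ty) + cy) / H
#   bw = pw * exp(tw) / W          bh = ph * exp(th) / H
# with W = H = 19 grid cells here, giving boxes normalized to [0, 1].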
def preprocess_true_boxes(true_boxes, anchors, image_size):
    """
    Process our true boxes and masks
    """
    height, width = image_size
    num_anchors = len(anchors)
    # 19 in our case
    conv_height = height // 32
    conv_width = width // 32
    num_box_params = true_boxes.shape[1]
    detectors_mask = np.zeros(
        (conv_height, conv_width, num_anchors, 1), dtype=np.float32)
    matching_true_boxes = np.zeros(
        (conv_height, conv_width, num_anchors, num_box_params),
        dtype=np.float32)
    for box in true_boxes:
        # scale box to convolutional feature spatial dimensions
        box_class = box[4:5]
        box = box[0:4] * np.array(
            [conv_width, conv_height, conv_width, conv_height])
        i = np.floor(box[1]).astype('int')
        j = np.floor(box[0]).astype('int')
        best_iou = 0
        best_anchor = 0
        for k, anchor in enumerate(anchors):
            # Find IOU between box shifted to origin and anchor box.
            box_maxes = box[2:4] / 2.
            box_mins = -box_maxes
            anchor_maxes = (anchor / 2.)
            anchor_mins = -anchor_maxes
            intersect_mins = np.maximum(box_mins, anchor_mins)
            intersect_maxes = np.minimum(box_maxes, anchor_maxes)
            intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
            intersect_area = intersect_wh[0] * intersect_wh[1]
            box_area = box[2] * box[3]
            anchor_area = anchor[0] * anchor[1]
            iou = intersect_area / (box_area + anchor_area - intersect_area)
            if iou > best_iou:
                best_iou = iou
                best_anchor = k
        if best_iou > 0:
            detectors_mask[i, j, best_anchor] = 1
            # adjust the final box by its anchor
            adjusted_box = np.array(
                [
                    box[0] - j, box[1] - i,
                    np.log(box[2] / anchors[best_anchor][0]),
                    np.log(box[3] / anchors[best_anchor][1]), box_class
                ],
                dtype=np.float32
            )
            matching_true_boxes[i, j, best_anchor] = adjusted_box
    return detectors_mask, matching_true_boxes
def get_detector_mask(boxes, anchors):
    """
    Process each box
    """
    detectors_mask = [0 for i in range(len(boxes))]
    matching_true_boxes = [0 for i in range(len(boxes))]
    for i, box in enumerate(boxes):
        detectors_mask[i], matching_true_boxes[i] = preprocess_true_boxes(
            box, anchors, [IMAGE_SIZE[0], IMAGE_SIZE[1]])
    return [np.array(detectors_mask), np.array(matching_true_boxes)]
# TODO: do something about this ugly hack (!!!)
# Should be done as pre-processing, but in my case we can't, so we have to do
# it for each mini batch
def hack_py_func(x):
    # Just make my own copy
    voc_anchors = np.array([[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])
    return get_detector_mask(x, voc_anchors)
def lambda_hack_box(x):
    return tf.py_func(hack_py_func, x, [tf.float32, tf.float32])[1]
def lambda_hack_mask(x):
    return tf.py_func(hack_py_func, x, [tf.float32, tf.float32])[0]
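It's worth poking at preprocess_true_boxes on its own before wiring it into the graph. A quick standalone check with one made-up normalized box (the numbers here are arbitrary):
fake_boxes = np.array([[0.5, 0.5, 0.2, 0.2, 3.0]])  # one box: x, y, w, h, class index
mask, matching = preprocess_true_boxes(fake_boxes, voc_anchors, [IMAGE_SIZE[0], IMAGE_SIZE[1]])
print(mask.shape, matching.shape)  # (19, 19, 5, 1) (19, 19, 5, 5)
print(mask.sum())                  # 1.0: exactly one anchor at one cell is responsible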
In [20]:
def craft_model(anchors, class_names):
    """
    Now a function to create our model
    """
    # Used for the loss
    detectors_mask_shape = (CONV_MAP_SIZE[0], CONV_MAP_SIZE[1], 5, 1)
    matching_boxes_shape = (CONV_MAP_SIZE[0], CONV_MAP_SIZE[1], 5, 5)
    # these two inputs end up unused; the mask and matching boxes are
    # computed in-graph by the Lambda hacks below
    detectors_mask_input = Input(shape=detectors_mask_shape)
    matching_boxes_input = Input(shape=matching_boxes_shape)
    # Create model input layers, besides the image.
    # The 56 here should match max_number_of_boxes from the data prep above
    boxes_input = Input(shape=(56, 5))
    # Process our mask and true boxes
    matching_true_boxes = Lambda(
        lambda_hack_box,
        output_shape=(19, 19, 5, 5),
        name='yolo_loss_hack_box')([boxes_input])
    detectors_mask = Lambda(
        lambda_hack_mask,
        output_shape=(19, 19, 5, 1),
        name='yolo_loss_hack_mask')([boxes_input])
    # Set the shape manually, because Keras doesn't use its own output_shape
    # here later :/ (thanks @markus)
    detectors_mask.set_shape((None, 19, 19, 5, 1))
    matching_true_boxes.set_shape((None, 19, 19, 5, 5))
    # Final layer that computes the loss; a hack around limitations in how
    # Keras losses are formed (a loss only sees y_true and y_pred)
    model_loss = Lambda(
        yolo_loss,
        output_shape=(1, ),
        name='yolo_loss',
        arguments={'anchors': anchors,
                   'num_classes': len(class_names)})([
                       conv_layer_23, boxes_input,
                       detectors_mask, matching_true_boxes
                   ])
    # Create the model
    model = Model([input_image_tensor, boxes_input], model_loss)
    return model
In [21]:
model = craft_model(voc_anchors, voc_classes)
For reference, the model summary should report:
Total params: 50,676,061
Trainable params: 50,655,389
Non-trainable params: 20,672
In [22]:
model.summary()
In [25]:
def train(model, class_names, anchors, image_data, boxes, validation_split=0.1):
    """
    Trains a yolo model
    """
    def yolo_loss_helper(y_true, y_pred):
        # The real loss is computed inside the model by the yolo_loss Lambda
        # layer, so this "loss" just passes the model output straight through
        return y_pred
    model.compile(
        optimizer='adam',
        loss={
            'yolo_loss': yolo_loss_helper
        }
    )
    # The zeros here are dummy targets; yolo_loss_helper ignores y_true
    model.fit([image_data, boxes],
              np.zeros(len(image_data)),
              validation_split=validation_split,
              batch_size=30,  # 90 training images after the 10% validation split
              epochs=5)
In [26]:
train(
    model,
    voc2012_classes,
    voc_anchors,
    processed_images,
    processed_boxes
)
print('Done')
In [ ]: