Fully Convolutional Networks (FCN)

Notes from Udacity's Self-Driving Car Nanodegree

  • Encoder extracts features that the decoder uses in later layers

Pieces:

  • Pre-train encoder on VGG/ResNet
  • Do a 1x1 convolution
  • Transposed convolutions to upsample

Skip connections are added. If VGG is used then only 3rd and 4th pooling layers are used as skip connections. Too many skip connections can lead to an explosion of the model size.


In [125]:
import numpy as np
import tensorflow as tf
import collections

1- Replace Fully Connected (FC) with 1x1 convolutions


In [7]:
# custom init with the seed set to 0 by default
def custom_init(shape, dtype=tf.float32, partition_info=None, seed=0):
    """Seeded random-normal weight initializer (seed=0 by default for reproducibility)."""
    values = tf.random_normal(shape, dtype=dtype, seed=seed)
    return values

# A 1x1 convolution with stride 1 reproduces `tf.layers.dense` applied per pixel.
def conv_1x1(x, num_outputs):
    """Project `x` to `num_outputs` channels using a 1x1 convolution."""
    return tf.layers.conv2d(x,
                            num_outputs,
                            kernel_size=1,
                            strides=1,
                            kernel_initializer=custom_init)

In [8]:
# Demo: a 1x1 convolution produces the same output as a dense layer
# when both share the same (seeded) weight initializer.
num_outputs = 2
x = tf.constant(np.random.randn(1, 2, 2, 1), dtype=tf.float32)
# `tf.layers.dense` flattens the input tensor if the rank > 2 and reshapes it back to the original rank
# as the output.
dense_out = tf.layers.dense(x, num_outputs, kernel_initializer=custom_init)
conv_out = conv_1x1(x, num_outputs)

    
with tf.Session() as sess:
    # Both layers were built with `custom_init` (seed 0), so their weights match.
    sess.run(tf.global_variables_initializer())
    
    a = sess.run(dense_out)
    b = sess.run(conv_out)
    print("Dense Output =", a)
    print("Conv 1x1 Output =", b)

    # Expect True: the recorded run below prints identical tensors.
    print("Same output? =", np.allclose(a, b, atol=1.e-5))


Dense Output = [[[[ 0.08041782 -0.42397892]
   [-0.45524108  2.40012264]]

  [[-0.14200903  0.7487002 ]
   [ 0.38274264 -2.01789618]]]]
Conv 1x1 Output = [[[[ 0.08041782 -0.42397892]
   [-0.45524108  2.40012264]]

  [[-0.14200903  0.7487002 ]
   [ 0.38274264 -2.01789618]]]]
Same output? = True

In [8]:
# Shape of the dense-layer output: (batch, H, W, num_outputs).
a.shape


Out[8]:
(1, 2, 2, 2)

In [9]:
# Shape of the 1x1-conv output — identical to the dense output's shape.
b.shape


Out[9]:
(1, 2, 2, 2)

2- Upsampling through transposed convolution


In [22]:
def upsample(x):
    """
    Upsample `x` by a factor of two along the spatial dimensions.
    :x: 4-Rank Tensor (channels-last assumed — channel count read from axis 3)
    :return: TF Operation
    """
    depth = x.shape[3]
    # A 3x3 transposed convolution with stride 2 and 'same' padding
    # doubles the height and width while keeping the channel count.
    return tf.layers.conv2d_transpose(x,
                                      depth,
                                      kernel_size=3,
                                      strides=(2, 2),
                                      padding='same')


# Smoke-test `upsample` on a random 1x4x4x3 input.
x = tf.constant(np.random.randn(1, 4, 4, 3), dtype=tf.float32)
conv = upsample(x)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    result = sess.run(conv)

    # Spatial dims double: (1, 4, 4, 3) -> (1, 8, 8, 3); see recorded output below.
    print('Input Shape: {}'.format(x.get_shape()))
    print('Output Shape: {}'.format(result.shape))


Input Shape: (1, 4, 4, 3)
Output Shape: (1, 8, 8, 3)

3- Skip connection

  • Retain information
  • Use info from multiple resolutions

Semantic Segmentation

  • Bounding boxes for object detection, easier than segmentation

    • YOLO and SSD which work well:
      • High frames per second (FPS)
      • Can detect cars, people, traffic signs, etc
  • Semantic segmentation

    • Pixel level
    • Scene understanding
    • Multiple decoders for different tasks (e.g. segmentation, depth)

Intersection over Union (IoU)

  • Intersection => TP
  • Union => classified T (TP + FP) + actually T (TP + FN) − intersection (TP)

TensorFlow Implementation


In [145]:
truth = np.array(
    [[0, 0, 0, 0],
     [1, 1, 1, 1],
     [2, 2, 2, 2],
     [3, 3, 3, 3]
    ]
)

prediction = np.array([
    [0, 0, 0, 0],
    [1, 0, 0, 1],
    [1, 2, 2, 1],
    [3, 3, 0, 3]
])

def iou1(truth, pred):
    """Compute per-class IoU and return the mean IoU over all classes.

    :param truth: 2-D integer array of ground-truth class labels.
    :param pred: 2-D integer array of predicted class labels, same shape.
    :return: mean of the per-class IoU values (also prints the per-class dict).
    """
    # Shift labels by 1 so class 0 survives the `> 0` masking below.
    t = truth + 1
    p = pred + 1

    # A matching pixel is a true positive for its class; a mismatch is a
    # false negative for the truth's class and a false positive for the
    # prediction's class.
    # NOTE: the file only does `import collections`, so Counter must be
    # accessed through the module (bare `Counter` raised NameError).
    matches = ((t == p) * t).flatten()
    tp = collections.Counter(matches[matches > 0])
    missed = ((t != p) * t).flatten()
    fn = collections.Counter(missed[missed > 0])
    spurious = ((t != p) * p).flatten()
    fp = collections.Counter(spurious[spurious > 0])

    # IoU = TP / (TP + FP + FN). Default TP to 0 for classes that were
    # never predicted correctly (bare `tp.get(class_)` returned None and
    # crashed the division for such classes).
    ious = {
        class_: tp.get(class_, 0) / count
        for class_, count in (tp + fp + fn).items()
    }

    print(ious)
    return sum(ious.values()) / len(ious)

In [146]:
iou1(truth, prediction)


{1: 0.5714285714285714, 2: 0.3333333333333333, 3: 0.5, 4: 0.75}
Out[146]:
0.5386904761904762

Tensorflow implementation


In [171]:
def mean_iou(ground_truth, prediction, num_classes):
    """Wrap `tf.metrics.mean_iou`: returns (metric_tensor, update_op)."""
    return tf.metrics.mean_iou(ground_truth, prediction, num_classes)


# Same labels/predictions as the NumPy example above; expect the same mean IoU.
ground_truth = tf.constant([
    [0, 0, 0, 0], 
    [1, 1, 1, 1], 
    [2, 2, 2, 2], 
    [3, 3, 3, 3]], dtype=tf.float32)
prediction = tf.constant([
    [0, 0, 0, 0], 
    [1, 0, 0, 1], 
    [1, 2, 2, 1], 
    [3, 3, 0, 3]], dtype=tf.float32)
    
# Build the streaming mean-IoU metric over 4 classes.
iou, iou_op = mean_iou(ground_truth, prediction, 4)

with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # need to initialize local variables for this to run `tf.metrics.mean_iou`
        sess.run(tf.local_variables_initializer())
        
        # Run the update op first to accumulate the confusion matrix,
        # then evaluate the metric tensor itself.
        sess.run(iou_op)
        # should be 0.53869
        print("Mean IoU =", sess.run(iou))


Mean IoU = 0.53869

In [ ]: