In [1]:
import numpy as np
import matplotlib.pylab as pylab
import imageio
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from imageio.core.util import asarray as imgToArr
%matplotlib inline
We perform some preprocessing on the data before feeding it into the models. Namely, we will split the data into training, validation, and test sets that contain equivalent amounts of braking and non-braking frames. Note: frames 80,000 to 100,000 correspond to congested city-center traffic following a large truck.
In [2]:
# videoFile = './data/driving.avi'
# vid = imageio.get_reader(videoFile, 'ffmpeg')
# Columns: Frame, Brake, GazeX, GazeY
dataFile = './data/cleaned_data.csv'
df = pd.read_csv(dataFile, delimiter='\t')
# Braking frames are far rarer than non-braking ones, so trim the
# non-braking frames down to the same count to balance the classes.
brake = df[df['Brake'] > 0]
nonbrake = df[df['Brake'] == 0]
nonbrake = nonbrake[:len(brake)]
df = pd.concat([brake, nonbrake])
# Drop frames with invalid (negative) or missing gaze coordinates.
df = df.drop(df[df['GazeX'] < 0].index)
df = df.drop(df[df['GazeY'] < 0].index)
df = df.dropna()
df = df.reset_index(drop=True)  # Resets the index to the usual 0, 1, 2, ...
# One-hot encode brakes; column 0: no brake, column 1: brake.
# BUG FIX: `Series.reshape` is not a pandas API (it fails with an
# AttributeError on modern pandas); reshape the underlying numpy array.
outputs = OneHotEncoder(sparse=False).fit_transform(df['Brake'].values.reshape(-1, 1))
In [3]:
def get_sub_seq(seq, start, end):
    """Return the rows seq[start:end], zero-padding the portion of the
    window that falls before index 0 and converting any all-zero row to
    the one-hot "no brake" label [1, 0]."""
    sub = seq[max(0, start):end]
    if start < 0:
        # Prepend zero rows for the part of the window before the sequence.
        sub = np.append(np.zeros((-start, 2)), sub, axis=0)
    # NOTE: when start >= 0, `sub` is a view and this writes through into
    # `seq` — same behavior as the original implementation.
    for row in range(len(sub)):
        if np.sum(sub[row]) == 0:
            sub[row] = [1, 0]
    return sub
def minibatch(data, batch_size, data_size):
    """Shuffle `data` and split it into consecutive batches of `batch_size`
    elements (the final batch may be smaller), covering `data_size` items."""
    shuffled = np.random.permutation(data)
    return [shuffled[start:start + batch_size]
            for start in range(0, data_size, batch_size)]
def get_glimpses(images, coords):
    """Extract one glimpse per (image, gaze coordinate) pair and stack
    the results into a single numpy array."""
    return np.array([get_glimpse(img, xy[0], xy[1])
                     for img, xy in zip(images, coords)])
def get_glimpse(image, x, y, stride=14):
    """Returns a subsection (glimpse) of the image centered on the given point.

    The glimpse is always (2*stride, 2*stride, channels); regions of the
    window that fall outside the frame are zero-filled.
    """
    x = int(x)  # Force to int
    y = int(y)  # Force to int
    min_x = x - stride
    max_x = x + stride
    min_y = y - stride
    max_y = y + stride
    # BUG FIX: a negative min_x/min_y wraps around in numpy slicing, so a
    # gaze point near the frame border produced an empty or wrong-sized
    # glimpse. Clamp the source window to the frame and zero-pad instead.
    glimpse = np.zeros((2 * stride, 2 * stride, image.shape[2]), dtype=image.dtype)
    src_y0, src_y1 = max(min_y, 0), min(max_y, image.shape[0])
    src_x0, src_x1 = max(min_x, 0), min(max_x, image.shape[1])
    if src_y0 < src_y1 and src_x0 < src_x1:
        glimpse[src_y0 - min_y:src_y1 - min_y, src_x0 - min_x:src_x1 - min_x, :] = \
            image[src_y0:src_y1, src_x0:src_x1, :]  # NOTE: row, column, RGB
    return imgToArr(glimpse)
In [4]:
# In training, we sometimes pull straight from the video and specify a stride length
# but we have also stored 28x28x3 glimpses for each frame
input_glimpses = np.zeros((80000, 28, 28, 3))
input_gazes = np.zeros((80000, 2))
outputs = np.zeros((80000, 2))  # NOTE: replaces the `outputs` built from the CSV above
# Each .npz file holds 10,000 frames of glimpses, gazes, and braking labels.
for batch_num in range(1, 9):
    archive = np.load("data/glimpse_batchc_{0}.npz".format(batch_num))
    lo = (batch_num - 1) * 10000
    hi = batch_num * 10000
    input_glimpses[lo:hi] = archive['frames']
    input_gazes[lo:hi] = archive['gazes']
    outputs[lo:hi] = archive['braking']
# Rows with no braking label are all-zero; map them to the one-hot "no brake" class.
outputs[outputs.sum(axis=1) == 0] = [1, 0]
# Build the length-3 braking-history sequence that precedes each frame.
sequences = np.array([get_sub_seq(outputs, i - 3, i) for i in range(len(outputs))])
sequences = sequences.reshape(-1, 3 * 2)
In [5]:
# Hold out one random batch of 10,000 frames as the test set.
test_inds = minibatch(range(len(input_glimpses)), 10000, len(input_glimpses))[0]
# PERF FIX: `i not in test_inds` scanned the whole array for every i
# (O(n^2) overall); a set gives O(1) membership checks.
test_ind_set = set(test_inds)
training_inds = [i for i in range(len(input_glimpses)) if i not in test_ind_set]
In [6]:
"""Create a logistic regression model for brake classification with 28x28x3 image input."""
# Create placeholders for inputs that will be placed via batches
image_input = tf.placeholder(tf.float32, [None, 28*28*3], name="image")
gaze_input = tf.placeholder(tf.float32, [None, 2], name="gaze")
y_ = tf.placeholder(tf.float32, [None, 2], name="output")
image_weights = tf.Variable(tf.truncated_normal([28*28*3, 2], stddev=1), name="image_weights")
gaze_weights = tf.Variable(tf.truncated_normal([2, 2], stddev=1), name="gaze_weights")
image_bias = tf.Variable(tf.truncated_normal([2], stddev=1), name="image_bias")
gaze_bias = tf.Variable(tf.truncated_normal([2], stddev=1), name="gaze_bias")
image_logits = tf.matmul(image_input, image_weights) + image_bias
gaze_logits = tf.matmul(gaze_input, gaze_weights) + gaze_bias
logits = tf.mul(tf.add(image_logits, gaze_logits), 0.5)
y = tf.nn.softmax(logits)
cross_entropy = tf.reduce_mean(tf.reduce_sum(-y_*tf.log(tf.clip_by_value(y, 1e-10,1.0)),reduction_indices=[1]))
optimizer = tf.train.AdamOptimizer().minimize(cross_entropy)
# initialization of variables
init = tf.initialize_all_variables()
# Define computations for accuracy calculation
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
In [7]:
input_glimpse_flat = input_glimpses.reshape(-1, 28*28*3)
# BUG FIX: the original re-assigned `sess = tf.Session()` on the first line
# inside the `with` block, abandoning the context-managed session and
# leaking the second (never-closed) one. Use the managed session directly.
with tf.Session() as sess:
    sess.run(init)
    indices = range(len(input_glimpses))
    for epoch in range(100):
        # One pass over the data in shuffled batches of 10,000 frames.
        batches = minibatch(indices, 10000, len(indices))
        for index_batch in batches:
            glimpses = input_glimpses[index_batch].reshape(-1, 28*28*3)
            gazes = input_gazes[index_batch]
            output = outputs[index_batch]
            sess.run(optimizer, feed_dict={image_input: glimpses, gaze_input: gazes, y_: output})
        # Report full-dataset loss and accuracy once per epoch.
        ce = sess.run(cross_entropy, feed_dict={image_input: input_glimpse_flat, gaze_input: input_gazes, y_: outputs})
        acc = sess.run(accuracy, feed_dict={image_input: input_glimpse_flat, gaze_input: input_gazes, y_: outputs})
        print("\tCross-entropy: {:.3f}\tAccuracy: {:.3f}".format(ce, acc))
In [ ]:
"""Create neural network for brake classification with 28x28x3 image input."""
# Create placeholders for inputs that will be placed via batches
image_input = tf.placeholder(tf.float32, [None, 28*28*3], name="image")
gaze_input = tf.placeholder(tf.float32, [None, 2], name="gaze")
y_ = tf.placeholder(tf.float32, [None, 2], name="output")
image_weights = tf.Variable(tf.truncated_normal([28*28*3, 1024], stddev=1), name="image_weights")
image_hidden_weights = tf.Variable(tf.truncated_normal([1024, 2], stddev=1), name="image_hidden_weights")
gaze_weights = tf.Variable(tf.truncated_normal([2, 1024], stddev=1), name="gaze_weights")
gaze_hidden_weights = tf.Variable(tf.truncated_normal([1024, 2], stddev=1), name="gaze_hidden_weights")
image_bias = tf.Variable(tf.truncated_normal([1024], stddev=1), name="image_bias")
image_hidden_bias = tf.Variable(tf.truncated_normal([2], stddev=1), name="image_hidden_bias")
gaze_bias = tf.Variable(tf.truncated_normal([1024], stddev=1), name="gaze_bias")
gaze_hidden_bias = tf.Variable(tf.truncated_normal([2], stddev=1), name="gaze_hidden_bias")
image_input_layer = tf.matmul(image_input, image_weights) + image_bias
image_hidden_layer = tf.matmul(tf.nn.relu(image_input_layer), image_hidden_weights) + image_hidden_bias
gaze_input_layer = tf.matmul(gaze_input, gaze_weights) + gaze_bias
gaze_hidden_layer = tf.matmul(tf.nn.relu(gaze_input_layer), gaze_hidden_weights) + gaze_hidden_bias
logits = tf.mul(tf.add(image_hidden_layer, gaze_hidden_layer), 0.5)
y = tf.nn.softmax(logits)
cross_entropy = tf.reduce_mean(tf.reduce_sum(-y_*tf.log(tf.clip_by_value(y, 1e-10,1.0)),reduction_indices=[1]))
optimizer = tf.train.AdamOptimizer().minimize(cross_entropy)
# initialization of variables
init = tf.initialize_all_variables()
# Define computations for accuracy calculation
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
In [ ]:
input_glimpse_flat = input_glimpses.reshape(-1, 28*28*3)
# BUG FIX: the original re-assigned `sess = tf.Session()` on the first line
# inside the `with` block, abandoning the context-managed session and
# leaking the second (never-closed) one. Use the managed session directly.
with tf.Session() as sess:
    sess.run(init)
    indices = range(len(input_glimpses))
    for epoch in range(100):
        # One pass over the data in shuffled batches of 1,000 frames.
        batches = minibatch(indices, 1000, len(indices))
        for index_batch in batches:
            glimpses = input_glimpses[index_batch].reshape(-1, 28*28*3)
            gazes = input_gazes[index_batch]
            output = outputs[index_batch]
            sess.run(optimizer, feed_dict={image_input: glimpses, gaze_input: gazes, y_: output})
        # Report full-dataset loss and accuracy once per epoch.
        ce = sess.run(cross_entropy, feed_dict={image_input: input_glimpse_flat, gaze_input: input_gazes, y_: outputs})
        acc = sess.run(accuracy, feed_dict={image_input: input_glimpse_flat, gaze_input: input_gazes, y_: outputs})
        print("\tCross-entropy: {:.3f}\tAccuracy: {:.3f}".format(ce, acc))
The feedforward network's results were promising! With just one hidden layer, we brought the initial training cross-entropy down from 11 to 10 (still bad, but it's getting better). Now, we'll try a wide-and-deep network where the deep part is a convnet on the image and the wide portion processes the driver gaze's coordinates.
In [6]:
# Define some helper methods that will abstract variable initialization and layer definitions
def weight_variable(shape, mean=0.0, wd=None):
    """Create a weight Variable initialized from a truncated normal (stddev 0.1).

    If `wd` is given, an L2 weight-decay term scaled by `wd` is added to
    the 'losses' collection.
    """
    initial = tf.truncated_normal(shape, mean=mean, stddev=0.1)
    var = tf.Variable(initial)
    if wd is not None:
        # BUG FIX: the original referenced the undefined name `var` here,
        # raising NameError whenever wd was supplied.
        weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss')
        tf.add_to_collection('losses', weight_decay)  # Store losses in a collection
    return var
def bias_variable(shape, wd=None):
    """Create a bias Variable initialized to the constant 0.5.

    If `wd` is given, an L2 weight-decay term scaled by `wd` is added to
    the 'losses' collection.
    """
    initial = tf.constant(0.5, shape=shape)
    var = tf.Variable(initial)
    if wd is not None:
        # BUG FIX: the original referenced the undefined name `var` here,
        # raising NameError whenever wd was supplied.
        weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss')
        tf.add_to_collection('losses', weight_decay)  # Store losses in a collection
    return var
def conv2d(x, W, name='conv'):
    """Apply a 2-D convolution with unit strides and 'SAME' (zero) padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME', name=name)
def max_pool_2x2(x, name='max_pool_2x2'):
    """Downsample by taking the max over non-overlapping 2x2 windows."""
    window = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME', name=name)
In [7]:
# Define convolutional computation graph
# Placeholders for parameters: image, gaze (x, y), brake sequence, output
image_input = tf.placeholder(tf.float32, [None, 28, 28, 1], name="image")  # single-channel glimpse
gaze_input = tf.placeholder(tf.float32, [None, 2], name="gaze")  # driver gaze (x, y)
brake_seq_input = tf.placeholder(tf.float32, [None, 3*2], name="brake_sequence")  # last 3 one-hot labels, flattened
y_ = tf.placeholder(tf.float32, [None, 2], name="output")  # one-hot label: [no brake, brake]
# BUG FIX: keep_prob was the constant 0.5, so dropout stayed active during
# evaluation and corrupted the reported metrics. Default to 1.0 (no
# dropout); feed {keep_prob: 0.5} in training runs to enable dropout.
keep_prob = tf.placeholder_with_default(tf.constant(1.0), shape=[], name="keep_prob")
# Convolutional net for image processing
# First layer
W_conv1 = weight_variable([5, 5, 1, 32])  # 5x5x1 filter with 32 features
b_conv1 = bias_variable([32])  # Bias for each filter
h_conv1 = tf.nn.relu(conv2d(image_input, W_conv1) + b_conv1, name='conv1')
h_pool1 = max_pool_2x2(h_conv1, name='pool1')  # 28x28 -> 14x14
# Second layer
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2, name='conv2')
h_pool2 = max_pool_2x2(h_conv2, name='pool2')  # 14x14 -> 7x7
# Fully-connected hidden layer
W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1, 'hidden1')
# Add dropout for fully-connected hidden layer
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
# Final logits
W_fc2 = weight_variable([1024, 2])
b_fc2 = bias_variable([2])
image_logits = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
# Logistic regression for human gaze
W_g = weight_variable([2, 2])
b_g = bias_variable([2])
gaze_logits = tf.matmul(gaze_input, W_g) + b_g
# Logistic regression for braking sequence
W_bs = weight_variable([3*2, 2])
b_bs = bias_variable([2])
bs_logits = tf.matmul(brake_seq_input, W_bs) + b_bs
# Learned mixing weights for combining the three logit streams
image_logits_weights = weight_variable([2, 2], mean=0.3)
gaze_logits_weights = weight_variable([2, 2], mean=0.3)
bs_logits_weights = weight_variable([2, 2], mean=0.3)
bias = bias_variable([2])
# Combine logistic and convnet
logits = tf.add(tf.matmul(image_logits, image_logits_weights) + tf.matmul(gaze_logits, gaze_logits_weights) + tf.matmul(bs_logits, bs_logits_weights), bias)
y = tf.nn.softmax(logits)
# Cross-entropy loss; clip softmax output away from 0 to avoid log(0)
cross_entropy = tf.reduce_mean(tf.reduce_sum(-y_*tf.log(tf.clip_by_value(y, 1e-10,1.0)),reduction_indices=[1]))
optimizer = tf.train.AdamOptimizer().minimize(cross_entropy)
# initialization of variables
init = tf.initialize_all_variables()
# Define computations for accuracy calculation
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
In [8]:
CONVNET_FILE_NAME = "model/convnet.ckpt"
# BUG FIX: the original re-assigned `sess = tf.Session()` on the first line
# inside the `with` block, abandoning the context-managed session and
# leaking the second (never-closed) one. Use the managed session directly.
with tf.Session() as sess:
    sess.run(init)
    saver = tf.train.Saver()
    saver.restore(sess, CONVNET_FILE_NAME)  # Load previously trained weights
    for epoch in range(100):
        # Training loop kept for reference; re-enable to continue training.
        # batches = minibatch(training_inds, 10000, len(training_inds))
        # for batch_num, index_batch in enumerate(batches):
        #     glimpses = input_glimpses[index_batch, :, :, :1]
        #     gazes = input_gazes[index_batch]
        #     seq = sequences[index_batch]
        #     output = outputs[index_batch]
        #     sess.run(optimizer, feed_dict={image_input: glimpses, gaze_input: gazes, brake_seq_input: seq, y_: output})
        # save_path = saver.save(sess, CONVNET_FILE_NAME)
        # print("Model saved in file: %s" % save_path)
        # Evaluate cross-entropy and accuracy on the held-out 10,000 frames.
        ce, acc = sess.run([cross_entropy, accuracy],
                           feed_dict={image_input: input_glimpses[test_inds, :, :, :1],
                                      gaze_input: input_gazes[test_inds],
                                      brake_seq_input: sequences[test_inds],
                                      y_: outputs[test_inds]})
        print("\tCross-entropy: {:.3f}\tAccuracy: {:.3f}".format(ce, acc))
In [ ]: