Copyright 2018 Google LLC
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
In [0]:
from __future__ import division
from __future__ import print_function

import math
import time

import gym
from gym import spaces
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython import display

from third_party import np_box_ops
import annotator, detector, dialog, environment
To specify an experiment, 3 parameters need to be defined:
- desired annotation quality: high (min_iou=0.7) or low (min_iou=0.5)
- drawing speed: fast (time_draw=7) or slow (time_draw=25)
- detector strength: weak (best MIL detector) or strong (detector trained on PASCAL 2012)
All together, this gives 8 possible experiments, 6 of which were presented in the paper.
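For reference, the full experiment grid can be enumerated as follows (a small sketch, not part of the original setup code; the notebook fixes a single configuration in the next cell):

import itertools
for min_iou, time_draw, detector_weak in itertools.product([0.5, 0.7], [7, 25], [False, True]):
  print('min_iou={}, time_draw={}, detector_weak={}'.format(min_iou, time_draw, detector_weak))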
In [0]:
# desired quality: high (min_iou=0.7) and low (min_iou=0.5)
min_iou = 0.7 # @param ["0.5", "0.7"]
# drawing speed: high (time_draw=7) and low (time_draw=25)
time_draw = 7 # @param ["7", "25"]
# if the detector is weak, we use the best MIL detector; if it is strong, we use a detector trained on PASCAL 2012
detector_weak = False # @param ['False']
Other parameters of the experiment
In [0]:
random_seed = 805 # global variable that fixes the random seed everywhere for reproducibility of results
# what kind of features will be used to represent the state
# numerical values 1-20 correspond to one hot encoding of class
predictive_fields = ['prediction_score', 'relative_size', 'avg_score', 'dif_avg_score', 'dif_max_score', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
time_verify = 1.8 # @param
# select one of the 10 folds
fold = 8 # @param
In [0]:
# Download the ground truth:
# wget https://storage.googleapis.com/iad_pascal_annotations_and_detections/pascal_gt_for_iad.h5
# Download detections with features
# wget https://storage.googleapis.com/iad_pascal_annotations_and_detections/pascal_proposals_plus_features_for_iad.h5
download_dir = ''
ground_truth = pd.read_hdf(download_dir + 'pascal_gt_for_iad.h5', 'ground_truth')
box_proposal_features = pd.read_hdf(download_dir + 'pascal_proposals_plus_features_for_iad.h5', 'box_proposal_features')
In [6]:
ground_truth.sample(n=3)
Out[6]:
In [7]:
box_proposal_features.sample(n=3)
Out[7]:
The ReplayBuffer class is used for storing transitions of interaction with the environment. Each transition consists of: a state, the action taken, the reward received, the next state, and a terminal flag.
The ReplayBuffer class provides random samples of transitions packaged as a Minibatch; a short usage sketch follows the class definition below.
In [0]:
class Minibatch:
  def __init__(self, state, action, reward, next_state, terminal):
    self.state = state
    self.action = action
    self.reward = reward
    self.next_state = next_state
    self.terminal = terminal

  def __str__(self):
    return str(zip(self.state, self.action, self.reward, self.next_state, self.terminal))

  def __getitem__(self, x):
    return self.state[x], self.action[x], self.reward[x], self.next_state[x], self.terminal[x]
class ReplayBuffer:
  def __init__(self, buffer_size=1e4):
    self.buffer_size = int(buffer_size)
    self.n = 0
    self.write_index = 0

  # Initialize numpy arrays to the full maximum size of the ReplayBuffer.
  def _init_nparray(self, state, action, reward, next_state, terminal):
    # For each column, initialize the column for the entire buffer_size.
    self.all_states = np.array([state] * self.buffer_size)
    self.all_actions = np.array([action] * self.buffer_size)
    self.all_rewards = np.array([reward] * self.buffer_size)
    self.all_next_states = np.array([next_state] * self.buffer_size)
    self.all_terminals = np.array([terminal] * self.buffer_size)
    self.n = 1
    self.write_index = 1

  def store_transition(self, state, action, reward, next_state, terminal):
    # If buffer arrays are not yet initialized, initialize them.
    if self.n == 0:
      self._init_nparray(state, action, reward, next_state, terminal)
      return
    self.all_states[self.write_index] = state
    self.all_actions[self.write_index] = action
    self.all_rewards[self.write_index] = reward
    self.all_next_states[self.write_index] = next_state
    self.all_terminals[self.write_index] = terminal
    self.write_index += 1
    if self.write_index >= self.buffer_size:
      self.write_index = 0
    # Keep track of the max index to be used for sampling.
    if self.n < self.buffer_size:
      self.n += 1

  def sample_minibatch(self, batch_size=32):
    minibatch_indices = np.random.permutation(self.n)[:batch_size]
    minibatch = Minibatch(
        self.all_states[minibatch_indices],
        self.all_actions[minibatch_indices],
        self.all_rewards[minibatch_indices],
        self.all_next_states[minibatch_indices],
        self.all_terminals[minibatch_indices],
    )
    return minibatch
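# A minimal usage sketch of the ReplayBuffer above (made-up 3-dimensional states,
# not actual environment observations):
buf = ReplayBuffer(buffer_size=100)
buf.store_transition(state=np.zeros(3), action=1, reward=-7.0, next_state=np.ones(3), terminal=False)
buf.store_transition(state=np.ones(3), action=0, reward=-1.8, next_state=np.zeros(3), terminal=True)
mb = buf.sample_minibatch(batch_size=2)
print(mb.state.shape, mb.action, mb.reward, mb.terminal)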
The following code block implements DeepMind's DQN algorithm: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
The algorithm uses Q-learning to train an estimator Q(s, a) of the expected return for taking action a in state s of the environment.
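The Q-learning target that the loss below regresses onto is r + discount_rate * max_a' Q_target(s', a') for non-terminal transitions, and just r for terminal ones. A toy NumPy sketch with made-up numbers (not values from the dataset):

rewards = np.array([-1.8, -7.0, -1.8])        # e.g. negative annotation times
terminal = np.array([False, True, False])
q_target_next = np.array([[-3.0, -2.5],       # Q_target(s', a') for each next state
                          [-4.0, -6.0],
                          [-2.0, -9.0]])
discount_rate = 0.99
terminal_mask = np.where(terminal, 0.0, 1.0)  # 1 where not terminal, 0 where terminal
targets = rewards + discount_rate * q_target_next.max(axis=1) * terminal_mask
print(targets)                                # [-4.275, -7.0, -3.78]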
In [0]:
class DQN:
  def __init__(self,
               observation_space,
               action_space,
               learning_rate=1e-3,
               batch_size=32,
               is_target_dqn=False,
               hidden_layer_sizes=[30, 30],
               discount_rate=0.99,
               target_copy_factor=0.001,
               session=None
               ):
    self.observation_space = observation_space
    self.action_space = action_space
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.is_target_dqn = is_target_dqn
    self.hidden_layer_sizes = hidden_layer_sizes
    self.discount_rate = discount_rate
    self.target_copy_factor = target_copy_factor
    self.session = session or tf.Session()
    self._initialized = False
    if not is_target_dqn:
      self._target_dqn = DQN(
          observation_space=observation_space,
          action_space=action_space,
          learning_rate=learning_rate,
          batch_size=batch_size,
          hidden_layer_sizes=hidden_layer_sizes,
          is_target_dqn=True,
          session=self.session,
      )
    # Make the net for inference and training. Requires that the target DQN is made first.
    self._make_net()
  def _make_net(self):
    observation_length = sum([len(observ) for observ in self.observation_space.sample()])
    var_scope_name = "dqn" if not self.is_target_dqn else "target_dqn"
    with tf.variable_scope(var_scope_name):
      # Placeholder for states: first dimension for batch size, second for observation vector length.
      self._state_placeholder = tf.placeholder(dtype=tf.float32, shape=(self.batch_size, observation_length))
      # Make the first hidden layer.
      self._layers = []
      self._layers.append(tf.contrib.layers.fully_connected(
          inputs=self._state_placeholder,
          num_outputs=self.hidden_layer_sizes[0],
          trainable=not self.is_target_dqn,
          variables_collections=[var_scope_name],
          scope="layer0"
      ))
      # Make subsequent hidden layers.
      for i in xrange(1, len(self.hidden_layer_sizes)):
        self._layers.append(tf.contrib.layers.fully_connected(
            inputs=self._layers[-1],
            num_outputs=self.hidden_layer_sizes[i],
            trainable=not self.is_target_dqn,
            variables_collections=[var_scope_name],
            scope="layer{}".format(i)
        ))
      # Make the action-value predictions layer.
      self._av_predictions = tf.contrib.layers.linear(
          inputs=self._layers[-1],
          num_outputs=self.action_space.n,
          trainable=not self.is_target_dqn,
          variables_collections=[var_scope_name],
          scope="av_predictions"
      )
      # If not the target DQN, make the placeholders and ops for computing the Bellman loss and training.
      if not self.is_target_dqn:
        self._action_placeholder = tf.placeholder(dtype=tf.int32, shape=(self.batch_size))
        self._reward_placeholder = tf.placeholder(dtype=tf.float32, shape=(self.batch_size))
        self._terminal_placeholder = tf.placeholder(dtype=tf.bool, shape=(self.batch_size))
        ones = tf.ones(shape=(self.batch_size))
        zeros = tf.zeros(shape=(self.batch_size))
        # Contains 1 where not terminal, 0 where terminal. (batch_size x 1)
        terminal_mask = tf.where(self._terminal_placeholder, zeros, ones)
        # Contains 1 where the action was taken. (batch_size x action_space.n)
        action_taken_mask = tf.one_hot(
            indices=self._action_placeholder,
            depth=self.action_space.n,
            on_value=1.0,
            off_value=0.0,
            dtype=tf.float32
        )
        # Contains 1 where the action was not taken. (batch_size x action_space.n)
        action_not_taken_mask = tf.one_hot(
            indices=self._action_placeholder,
            depth=self.action_space.n,
            on_value=0.0,
            off_value=1.0,
            dtype=tf.float32
        )
        # For samples that are not terminal, contains max next-step action-value predictions. (batch_size x 1)
        masked_target_av_predictions = tf.reduce_max(self._target_dqn._av_predictions, reduction_indices=[1]) * terminal_mask
        # Target values for actions taken. (batch_size x 1)
        # = r + discount_rate * max_a' Q_target(s', a') , for non-terminal transitions
        # = r                                           , for terminal transitions
        actions_taken_targets = self._reward_placeholder + self.discount_rate * masked_target_av_predictions
        actions_taken_targets = tf.reshape(actions_taken_targets, (self.batch_size, 1))
        # Target values for all actions. (batch_size x action_space.n)
        # = the target predicted av, for indices corresponding to actions taken
        # = the current predicted av, for indices corresponding to actions not taken
        all_action_targets = actions_taken_targets * action_taken_mask + self._av_predictions * action_not_taken_mask
        self._all_action_targets = all_action_targets
        # Define error, loss.
        error = all_action_targets - self._av_predictions
        self._loss = tf.reduce_sum(tf.square(error))
        # Define train op.
        opt = tf.train.AdamOptimizer(self.learning_rate)
        self._opt = opt
        self._train_op = opt.minimize(self._loss, var_list=tf.get_collection('dqn'))
        # Construct ops to copy over a weighted average of parameter values to the target net.
        copy_factor = self.target_copy_factor
        copy_factor_complement = 1 - copy_factor
        self._copy_ops = [target_var.assign(copy_factor * my_var + copy_factor_complement * target_var)
                          for (my_var, target_var)
                          in zip(tf.get_collection('dqn'), tf.get_collection('target_dqn'))]
  def _copy_to_target_dqn(self):
    assert not self.is_target_dqn, "cannot call _copy_to_target_dqn on target DQN"
    self.session.run(self._copy_ops)

  def _check_initialized(self):
    if self._initialized:
      return
    self.session.run(tf.initialize_all_variables())
    self._initialized = True

  def get_action(self, state):
    self._check_initialized()
    state_batch = (state,) * self.batch_size
    av_predictions = self.session.run(
        self._av_predictions,
        feed_dict={self._state_placeholder: state_batch}
    )
    # av_predictions currently holds a whole minibatch. Extract the first row.
    av_predictions = av_predictions[0]
    # Choose the index of the max action.
    max_action = 0
    max_action_value = -float("inf")
    for i in xrange(self.action_space.n):
      if av_predictions[i] > max_action_value:
        max_action = i
        max_action_value = av_predictions[max_action]
    return max_action

  def save_params(self):
    params = {}
    for var in tf.get_collection('dqn'):
      params[var.name] = self.session.run(var)
    return params

  def load_params(self, params):
    for var in tf.get_collection('dqn'):
      self.session.run(var.assign(params[var.name]))

  def train(self, minibatch):
    assert not self.is_target_dqn, "cannot call train() on target DQN"
    self._check_initialized()
    # Run a step of optimization with the minibatch fields.
    loss, _ = self.session.run(
        [self._loss, self._train_op],
        feed_dict={
            self._state_placeholder: minibatch.state,
            self._action_placeholder: minibatch.action,
            self._reward_placeholder: minibatch.reward,
            self._target_dqn._state_placeholder: minibatch.next_state,
            self._terminal_placeholder: minibatch.terminal,
        },
    )
    self._copy_to_target_dqn()
    return loss
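# Numeric sketch (toy values, not from a real run) of the soft update performed by the
# _copy_ops above: after each train() call, every target variable moves a small step
# (target_copy_factor) towards the corresponding online variable.
copy_factor = 0.001
my_var, target_var = 0.8, 0.2
target_var = copy_factor * my_var + (1 - copy_factor) * target_var
print(target_var)  # 0.2006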
In [0]:
the_annotator = annotator.AnnotatorSimple(ground_truth, random_seed, time_verify, time_draw, min_iou)
the_detector = detector.Detector(box_proposal_features, predictive_fields)
In [0]:
image_class = ground_truth[['image_id', 'class_id']]
image_class = image_class.drop_duplicates()
Select the training and test data according to the selected fold. We split all images into 10 approximately equal parts; each fold consists of these images together with all classes present in them.
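The boundary between folds is found with np.searchsorted on the image_id column. A toy illustration of that boundary logic, using hypothetical image ids (not real PASCAL ids):

ids = np.array([101, 101, 102, 103, 103, 103, 104])  # image_id column, one row per image+class pair
last_image_of_previous_fold = 103
# side='right' returns the index just past the last occurrence of 103, i.e. 6
print(np.searchsorted(ids, last_image_of_previous_fold, side='right'))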
In [0]:
# get a list of unique images
unique_image = image_class['image_id'].drop_duplicates()
# a list of image+class pairs
# the image_id column of the image+class pairs
image_class_array = image_class.values[:,0]
if fold == 1:
  index_image_class1 = 0
else:
  image_division1 = unique_image.iloc[502 + 501 * (fold - 2)]
  index_image_class1 = np.searchsorted(image_class_array, image_division1, side='right')
if fold == 10:
  index_image_class2 = len(image_class_array)
else:
  image_division2 = unique_image.iloc[502 + 501 * (fold - 1)]
  index_image_class2 = np.searchsorted(image_class_array, image_division2, side='right')
# the selected fold becomes the training set
image_class_trainval = image_class.iloc[index_image_class1:index_image_class2]
# the other 9 folds become the test set
image_class_test = pd.concat([image_class.iloc[0:index_image_class1],image_class.iloc[index_image_class2:]])
n_train = 500 # reserve samples for training
# permute data to get training and validation subsets
all_indeces_permuted = np.random.permutation(len(image_class_trainval))
indeces_for_train = all_indeces_permuted[0:n_train]
indeces_for_val = all_indeces_permuted[n_train:]
image_class_train = image_class_trainval.iloc[indeces_for_train]
image_class_val = image_class_trainval.iloc[indeces_for_val]
Initialise the environments used for training, validation, and testing of the annotation strategies.
In [0]:
env_train = environment.AnnotatingDataset(the_annotator, the_detector, image_class_train)
env_val = environment.AnnotatingDataset(the_annotator, the_detector, image_class_val)
env_test = environment.AnnotatingDataset(the_annotator, the_detector, image_class_test)
The AnnotatingDataset class implements an environment simulating a user annotating images of the PASCAL dataset, following the OpenAI Gym interface for RL environments.
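As a reminder of the Gym-style interaction pattern these environments expose (a sketch with random actions; the reset/step signatures match how the environments are used in the cells below, and rewards are negative annotation times):

state = env_val.reset()
terminal = False
episode_reward = 0
while not terminal:
  action = np.random.randint(0, env_val.action_space.n)  # random action, for illustration
  state, reward, terminal, _ = env_val.step(action)
  episode_reward += reward
print('episode reward:', episode_reward)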
The following code block runs the training process on the annotation environment defined above, using the DQN as the agent.
First, some initial episodes are played with random actions and their transitions are stored in the ReplayBuffer. This warm-starts the replay buffer with some experience, so that the early stages of learning do not overfit.
Next, many training iterations are performed. Each training iteration has several phases: simulating training episodes with epsilon-greedy exploration, performing neural network updates on minibatches sampled from the replay buffer, and periodically running evaluation episodes on the training and validation sets. The iteration with the best validation reward is remembered for early stopping.
Finally, the average training and validation episode rewards are plotted across the training run.
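During the training episodes, actions are selected with an epsilon-greedy rule. A standalone sketch of that rule (the helper name is illustrative; the training loop below inlines the same logic):

def epsilon_greedy_action(agent, state, epsilon, num_actions):
  # With probability epsilon explore with a random action, otherwise act greedily.
  if np.random.ranf() < epsilon:
    return np.random.randint(0, num_actions)
  return agent.get_action(state)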
Hyperparameter Explanation:
- batch_size, learning_rate, hidden_layer_sizes: architecture and optimisation settings of the DQN agent
- REPLAY_BUFFER_SIZE: maximum number of transitions kept in the replay buffer
- WARM_START_EPISODES: number of random episodes used to pre-fill the replay buffer
- EPSILON: exploration rate for epsilon-greedy action selection
- TRAINING_ITERATIONS: number of training iterations
- TRAINING_EPISODES_PER_ITERATION: episodes simulated at each training iteration
- NN_UPDATES_PER_ITERATION: gradient steps made at each training iteration
In [0]:
# Warm start episodes
tf.reset_default_graph()
# Initialize the DQN agent
agent = DQN(env_train.observation_space, env_train.action_space,
batch_size=80, # @param
learning_rate=1e-3, # @param
hidden_layer_sizes=[30, 30], # @param
discount_rate=1,
)
REPLAY_BUFFER_SIZE = 1e4 # @param
replay_buffer = ReplayBuffer(buffer_size=REPLAY_BUFFER_SIZE)
num_action_classes = env_train.action_space.n
# Warm-start the replay buffer with some random actions.
WARM_START_EPISODES = 100 # @param
for _ in xrange(WARM_START_EPISODES):
  state = env_train.reset()
  terminal = False
  while not terminal:
    # Choose a random action
    action = np.random.randint(0, num_action_classes)
    next_state, reward, terminal, _ = env_train.step(action)
    # Store the transition in the replay buffer.
    replay_buffer.store_transition(state, action, reward, next_state, terminal)
    # Get ready for next step
    state = next_state
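# Optional sanity check (not in the original notebook): confirm that the warm-start
# transitions were actually stored in the replay buffer.
print('transitions stored:', replay_buffer.n, 'of', replay_buffer.buffer_size)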
In [21]:
# Run training and validation episodes
# Run multiple training iterations. Each iteration consists of:
# - training episodes (with exploration)
# - neural network updates
# - test episodes for evaluating performance
# Exploration rate
EPSILON = 0.2 # @param
# Can experiment with dynamically decaying epsilon, e.g. EPSILON - EPSILON * (iteration / TRAINING_ITERATIONS)
TRAINING_ITERATIONS = 500 # @param
# at each training iteration TRAINING_EPISODES_PER_ITERATION episodes are simulated
TRAINING_EPISODES_PER_ITERATION = 10 # @param
# at each training iteration NN_UPDATES_PER_ITERATION gradient steps are made
NN_UPDATES_PER_ITERATION = 30 # @param
train_episode_rewards = []
val_episode_rewards = []
agent_params = {}
best_iteration = 0
best_time = -float("inf")
# the number of samples used for estimating the training and validation errors can be set smaller than the full sets for faster execution
n_for_trainerror = 200 #len(image_class_train)
n_for_valerror = 200 #len(image_class_val)
for iteration in xrange(TRAINING_ITERATIONS):
  # Simulate training episodes.
  for _ in xrange(TRAINING_EPISODES_PER_ITERATION):
    state = env_train.reset()
    terminal = False
    while not terminal:
      action = agent.get_action(state)
      # With epsilon probability, take a random action.
      if np.random.ranf() < EPSILON:
        action = np.random.randint(0, num_action_classes)
      next_state, reward, terminal, _ = env_train.step(action)
      replay_buffer.store_transition(state, action, reward, next_state, terminal)
      state = next_state
  # Do neural network updates.
  for _ in xrange(NN_UPDATES_PER_ITERATION):
    minibatch = replay_buffer.sample_minibatch(agent.batch_size)
    agent.train(minibatch)
  # Store the agent params from this iteration.
  agent_params[iteration] = agent.save_params()
  # Compute the training and validation error 20 times during the training iterations.
  if (iteration+1) % (TRAINING_ITERATIONS / 20) == 0:
    print('Episode ', iteration, end = ': ')
    # Run episodes to evaluate the train reward.
    train_reward = 0
    for i in xrange(n_for_trainerror):
      state = env_train.reset(current_index=i)
      terminal = False
      while not terminal:
        action = agent.get_action(state)
        next_state, reward, terminal, _ = env_train.step(action)
        state = next_state
        train_reward += reward
    # Store the train episode stats.
    print('average training error = ', - train_reward/n_for_trainerror)
    train_episode_rewards.append(train_reward/n_for_trainerror)
    # Run episodes to evaluate the validation reward.
    val_reward = 0
    for i in xrange(n_for_valerror):
      state = env_val.reset(current_index=i)
      terminal = False
      while not terminal:
        action = agent.get_action(state)
        next_state, reward, terminal, _ = env_val.step(action)
        state = next_state
        val_reward += reward
    # Store the validation episode stats.
    val_episode_rewards.append(val_reward/n_for_valerror)
    # Remember the iteration with the best validation reward (lowest annotation time) for early stopping.
    if val_reward/n_for_valerror > best_time:
      best_time = val_reward/n_for_valerror
      best_iteration = iteration
In [22]:
# plot the training and validation rewards
plt.plot(train_episode_rewards, 'b', label='train reward')
plt.plot(val_episode_rewards, 'g', label='validation reward')
plt.legend()
Out[22]:
In [23]:
%output_height 300
# load the agent from the iteration with the lowest validation error
print('Best iteration = ', best_iteration)
print('Best validation time = ', best_time)
agent.load_params(agent_params[best_iteration])
test_reward = 0
for i in xrange(len(image_class_test)):
  state = env_test.reset(current_index=i)
  terminal = False
  print('Episode ', i, end = ': ')
  while not terminal:
    # Take an environment step
    action = agent.get_action(state)
    if action == 0:
      print('V', end='')
    elif action == 1:
      print('D', end='')
    next_state, reward, terminal, _ = env_test.step(action)
    state = next_state
    test_reward += reward
  print()
print('Total duration of all episodes = ', -test_reward)
print('Average episode duration = ', -test_reward/len(image_class_test))