In [ ]:
from __future__ import print_function, division

In [ ]:
# If you are running on a server, launch xvfb to record game videos
# Please make sure you have xvfb installed
import os
if not os.environ.get("DISPLAY"):
    !bash ../xvfb start
    os.environ['DISPLAY'] = ':1'

If you are new to this course and want more detailed instructions on how to set up the environment and all the libraries (docker / windows / gpu / blas / etc.), you can find the essential instructions here.

Please make sure that you have bleeding-edge versions of Theano, Lasagne and AgentNet.

General-purpose library imports


In [ ]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
from timeit import default_timer as timer

from IPython.core import display

In [ ]:
# if you have a GPU, uncomment the line below
# %env THEANO_FLAGS=device=gpu0,floatX=float32

The standard toolkit of imports:


In [ ]:
import gym

from agentnet.agent import Agent
from agentnet.experiments.openai_gym.wrappers import PreprocessImage
from agentnet.memory import WindowAugmentation, LSTMCell, GRUCell
from agentnet.target_network import TargetNetwork
from agentnet.resolver import EpsilonGreedyResolver, ProbabilisticResolver
from agentnet.experiments.openai_gym.pool import EnvPool
from agentnet.learning import qlearning

import theano
import theano.tensor as T

import lasagne
from lasagne.layers import DenseLayer, Conv2DLayer, InputLayer, NonlinearityLayer
from lasagne.layers import batch_norm, get_all_params, get_output, reshape, concat, dropout
from lasagne.nonlinearities import rectify, leaky_rectify, elu, tanh, softmax

Helper function definitions

Downsample and crop the image, keeping only the most useful part of the frame.


In [ ]:
def make_env():
    env = gym.make("KungFuMaster-v0")
    env = PreprocessImage(env, height=64, width=64,
                          grayscale=True, crop=lambda img: img[60:-30, 7:])
    return env

A helper function for tracking performance during training


In [ ]:
def eval_and_plot(rewards, epoch_counter, pool, target_score, th_times, loop_times):
    rewards[epoch_counter] = np.mean(pool.evaluate(
        n_games=N_EVAL_GAMES, record_video=False, verbose=False))
    info_string = "Time (DL/All) {:.1f}/{:.1f}  epoch={}, mean_score={:.2f}"
    info_string = info_string.format(np.mean(th_times), np.mean(loop_times),
                                     epoch_counter, np.mean(rewards[epoch_counter]))
    plt.figure(figsize=(8, 5))
    plt.plot([rewards[i] for i in sorted(rewards.keys())])
    plt.grid()
    plt.ylabel("Mean reward over evaluation games")
    plt.title(info_string)
    plt.show()
    display.clear_output(wait=True)

Experiment setup

Here we just load the game and check that it works


In [ ]:
env = gym.make('KungFuMaster-v0')

In [ ]:
print(env.env.get_action_meanings())

In [ ]:
plt.imshow(env.reset())

In [ ]:
env = make_env()
plt.imshow(np.squeeze(env.reset()), interpolation='none', cmap='gray')

Global constants definition

All hyperparameters (except the number of layers and neurons) are declared here in upper case, along with global variables.


In [ ]:
N_ACTIONS = env.action_space.n
OBS_SHAPE = env.observation_space.shape
OBS_CHANNELS, OBS_HEIGHT, OBS_WIDTH = OBS_SHAPE

# These 4 constants were shown to lead to nearly state-of-the-art results on the Kung-Fu Master game
N_SIMULTANEOUS_GAMES = 10  # also known as the number of agents in exp_replay_pool
SEQ_LENGTH = 25

EVAL_EVERY_N_ITER = 100
N_EVAL_GAMES = 2

N_FRAMES_IN_BUFFER = 4  # number of consecutive frames to feed into the CNN

A2C with memory


In [ ]:
observation_layer = InputLayer((None,) + OBS_SHAPE)
prev_wnd = InputLayer(
    [None, N_FRAMES_IN_BUFFER, OBS_CHANNELS, OBS_HEIGHT, OBS_WIDTH])
new_wnd = WindowAugmentation(observation_layer, prev_wnd)
wnd_reshape = reshape(
    new_wnd, [-1,  N_FRAMES_IN_BUFFER * OBS_CHANNELS, OBS_HEIGHT, OBS_WIDTH])

In [ ]:
# TYPE YOUR CODE HERE
# provide the main body of the network: the first three convolutional layers and a dense layer on top
# you may want to change the nonlinearity - feel free to do so
# note that the filter sizes here differ from those in the papers because the image width and height are reduced
# (one possible sketch is given below this cell)
conv1 = Conv2DLayer(wnd_reshape, ...)
...
dense = DenseLayer(...)

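One possible way to fill in the cell above, using the Lasagne layers imported earlier. This is only a sketch: the filter sizes, strides, channel counts and the 512-unit dense layer are assumptions picked for the 64x64 input, and elu is just one reasonable choice of nonlinearity.

    # a possible network body; all sizes below are illustrative assumptions
    conv1 = Conv2DLayer(wnd_reshape, num_filters=32, filter_size=8, stride=4, nonlinearity=elu)
    conv2 = Conv2DLayer(conv1, num_filters=64, filter_size=4, stride=2, nonlinearity=elu)
    conv3 = Conv2DLayer(conv2, num_filters=64, filter_size=3, stride=1, nonlinearity=elu)
    dense = DenseLayer(conv3, num_units=512, nonlinearity=elu)
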
In [ ]:
<YOUR CODE>
# define a 256-neuron LSTM cell:
# - define two input layers, each with n_lstm_cells (maybe 256 is a good baseline) neurons
# - feed these two layers into `LSTMCell`, together with the
#   input layer (the last `DenseLayer` in the A2C+LSTM case) as an additional third parameter
# (one possible sketch is given below this cell)

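A possible sketch for the cell above, assuming `dense` is the last dense layer from the previous sketch and that AgentNet's `LSTMCell` returns a `(new_cell, new_out)` pair. The names `prev_cell`, `prev_out`, `new_cell`, `new_out` are illustrative and are reused in the sketches below.

    n_lstm_cells = 256  # assumed baseline size
    # input layers that carry the LSTM state from the previous tick
    prev_cell = InputLayer((None, n_lstm_cells), name="previous LSTM cell state")
    prev_out = InputLayer((None, n_lstm_cells), name="previous LSTM output")
    # the recurrence: new LSTM state computed from the previous state and the dense features
    new_cell, new_out = LSTMCell(prev_cell, prev_out, input_or_inputs=dense)
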
In [ ]:
neck_layer = concat([ <dense layer before lstm> , <output of LSTM layer> ]) # network neck

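With the illustrative names from the sketches above, the neck could simply be:

    neck_layer = concat([dense, new_out])  # dense features concatenated with the LSTM output
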
In [ ]:
<YOUR CODE>
# define the actor head as
# - logits_layer: dense(neck) with nonlinearity=None
# - policy_layer: softmax over logits_layer
# (one possible sketch is given below this cell)
...
action_layer = ProbabilisticResolver(policy_layer)

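One possible sketch for the logits and policy layers (the names follow the comments above; `NonlinearityLayer` with `softmax` is one way to turn the logits into probabilities). `action_layer` is already defined in the cell.

    logits_layer = DenseLayer(neck_layer, N_ACTIONS, nonlinearity=None)  # unnormalized action scores
    policy_layer = NonlinearityLayer(logits_layer, softmax)              # pi(a|s)
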
In [ ]:
# critic head
V_layer = DenseLayer(neck_layer, 1, nonlinearity=None)

In [ ]:
<YOUR CODE>
# `observation_layers` is the input layer of the network, as usual
# `policy_estimators` should include 1) logits_layer and 2) V_layer
# `agent_states` is a dictionary of {new_value: old_value}. Make sure to update
#    a) the previous window (input buffer, prev_wnd)  b) the previous LSTM cell state  c) the output of the LSTM cell
# `action_layers` is action_layer, as usual :)
# (one possible sketch is given below this cell)
agent = Agent(....)

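A possible sketch of the `Agent` call, assuming the illustrative layer names from the sketches above (`prev_cell`, `prev_out`, `new_cell`, `new_out`, `logits_layer`):

    agent = Agent(observation_layers=observation_layer,
                  policy_estimators=(logits_layer, V_layer),
                  agent_states={new_wnd: prev_wnd,    # frame buffer
                                new_cell: prev_cell,  # LSTM cell state
                                new_out: prev_out},   # LSTM output
                  action_layers=action_layer)
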
In [ ]:
# you may need to adjust this (increasing N_SIMULTANEOUS_GAMES is usually a good idea)
pool = EnvPool(agent, make_env, n_games=N_SIMULTANEOUS_GAMES)
replay = pool.experience_replay

In [ ]:
_, _, _, action_seq, (logits_seq, V_seq) = agent.get_sessions(
    replay,
    session_length=SEQ_LENGTH,
    experience_replay=True
)

In [ ]:
# compute pi(a|s) and log(pi(a|s)) manually [use logsoftmax]
# we can't guarantee that Theano optimizes logsoftmax automatically, since that optimization is still in development
# for more info see https://github.com/Theano/Theano/issues/2944 (from 2015)

# logits_seq.shape is (batch_size, SEQ_LENGTH, N_ACTIONS)
logits_flat = logits_seq.reshape([-1, N_ACTIONS])
policy_seq = T.nnet.softmax(logits_flat).reshape(logits_seq.shape)
logpolicy_seq = T.nnet.logsoftmax(logits_flat).reshape(logits_seq.shape)

In [ ]:
# get policy gradient
from agentnet.learning import a2c
elwise_actor_loss, elwise_critic_loss = a2c.get_elementwise_objective(
    policy=logpolicy_seq,
    treat_policy_as_logpolicy=True,
    state_values=V_seq[:, :, 0],
    actions=replay.actions[0],
    rewards=replay.rewards/10,
    is_alive=replay.is_alive,
    gamma_or_gammas=0.99,
    n_steps=None,
    return_separate=True
)

# add the losses with magic-number weights
# (you can change them more or less harmlessly; this usually just makes learning faster/slower)
# the actor and critic multipliers were chosen based on prior knowledge
# the entropy / regularization multipliers were tuned with a log-scale grid search
# NB: regularization affects exploration
reg_logits = T.mean(logits_seq ** 2)
reg_entropy = T.mean(T.sum(policy_seq * logpolicy_seq, axis=-1))
loss = 0.1 * elwise_actor_loss.mean() + 0.25 * elwise_critic_loss.mean() + \
    1e-3 * reg_entropy + 1e-3 * reg_logits

In [ ]:
# Compute weight updates, clip by norm for stability
weights = lasagne.layers.get_all_params(
    [V_layer, policy_layer], trainable=True)
grads = T.grad(loss, weights)
grads = lasagne.updates.total_norm_constraint(grads, 10)
updates = lasagne.updates.adam(grads, weights)
train_step = theano.function([], loss, updates=updates)

Train


In [ ]:
epoch_counter = 1  # starting epoch
rewards = {}  # full game rewards
target_score = 10000
loss, eval_rewards = 0, []

In [ ]:
untrained_reward = np.mean(pool.evaluate(
    n_games=5, record_video=False, verbose=False))
untrained_reward

In [ ]:
# If the stderr messages produced by pool.evaluate() pollute the output of the Jupyter cell
# and annoy you, you can do one of the following:
# 1. use warnings.filterwarnings("ignore")
# 2. use the cell magic %%capture
# 3. simply redirect stderr to /dev/null:
#    import os, sys
#    stderr_old = sys.stderr
#    sys.stderr = open(os.devnull, 'w')

In [ ]:
th_times, loop_times = [], []
for i in range(2000):
    loop_starts = timer()
    pool.update(SEQ_LENGTH)

    train_starts = timer()

    <YOUR CODE: train network (actor and critic)>
    raise NotImplementedError

    th_times.append(timer() - train_starts)
    epoch_counter += 1
    loop_times.append(timer() - loop_starts)

    # You may want to set EVAL_EVERY_N_ITER=1 for the time being
    if epoch_counter % EVAL_EVERY_N_ITER == 0:
        eval_and_plot(rewards, epoch_counter, pool,
                      target_score, th_times, loop_times)
        if rewards[epoch_counter] >= target_score:
            print("VICTORY!")
            break
        th_times, loop_times = [], []

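For reference: since `train_step` was compiled above with no inputs, the missing training call inside the loop can be as simple as the line below (treating its return value as the current loss is an assumption about how you track progress).

    loss = train_step()  # one actor-critic update on the freshly collected experience
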
In [ ]:
eval_and_plot(rewards, epoch_counter, pool, target_score, th_times, loop_times)