In this worksheet we will train a Reinforcement Learning (RL) agent to play Atari 2600 games, using the classic game of Pong as our training test bed.
The code required to train the DQN agent can be downloaded from https://sites.google.com/a/deepmind.com/dqn/
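The agent we train here is a Deep Q-Network (DQN): Q-learning with a convolutional network as the function approximator. Several of the agent parameters set below (discount, target_q, clip_delta) refer to the standard one-step DQN target,

$$ y_t = r_t + \gamma \max_{a'} Q_{\text{target}}(s_{t+1}, a'), \qquad \delta_t = \mathrm{clip}\left(y_t - Q(s_t, a_t),\ -1,\ 1\right), $$

where Q_target is a periodically refreshed copy of the online network and the TD error \delta_t is clipped before the gradient step.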
In [1]:
    
-- CUDA backend for Torch tensors
require 'cutorch'
-- CUDA-accelerated neural network modules
require 'cunn'
-- Torch wrapper around the Arcade Learning Environment (ALE)
require 'alewrap'
-- use 32-bit floats as the default CPU tensor type
torch.setdefaulttensortype('torch.FloatTensor')
-- use 4 CPU threads
torch.setnumthreads(4)
    
In [2]:
    
-- Create the Game Object
game_options = {
    -- name of the game to play (you need the ROM file for this game)
    env='pong',
    -- directory where the ROMS are stored
    game_path='/home/ubuntu/torch/install/share/lua/5.1/dqn/roms/',
    -- we want to get RGB frames
    env_params = {useRGB = true},
    -- we will repeat each action 4 times
    actrep = 4,
    -- for every new episode, play no-op actions a random number of times in [0, 30]
    random_starts = 30,
    -- use gpu
    gpu = 1,
    -- log some info messages
    verbose = 2
}
game_env = alewrap.GameEnvironment(game_options)
game_actions = game_env:getActions()
    
    Out[2]:
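Two of these options are worth a moment: actrep = 4 means every action the agent picks is applied for four consecutive emulator frames, with the rewards accumulated, and random_starts = 30 prefixes each new episode with a random number of no-op steps so that episodes do not all begin from the identical frame. Below is a purely conceptual sketch of the action-repeat behaviour; alewrap handles this internally, and the per-frame env:step used here is hypothetical:

-- Conceptual sketch only: what actrep = 4 amounts to. alewrap already does this
-- internally once actrep is set in game_options, so you never write this yourself.
local function step_with_repeat(env, action, actrep)
    local screen, reward, terminal
    local total_reward = 0
    for i = 1, actrep do
        screen, reward, terminal = env:step(action)   -- hypothetical single-frame step
        total_reward = total_reward + reward
        if terminal then break end
    end
    return screen, total_reward, terminal
end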
In [3]:
    
-- These are the valid actions for this game (a subset of ALE's full action set, which is indexed 0-17)
print(game_actions)
    
    Out[3]:
In [4]:
    
-- the dqn package provides the NeuralQLearner agent plus the network and preprocessing specs used below
require 'dqn'
agent_params = {
    -- The agent only knows about actions it can take in the environment
    actions = game_actions,
    -- we will use gpu
    gpu = 1,
    -- we will print info
    verbose = 2,
    -- learning rate for SGD
    lr=0.00025,
    -- Epsilon for epsilon-greedy exploration; start with 100% random actions
    ep=1,
    -- Drop down to 10% exploration
    ep_end=0.1,
    -- Linear decay over 1M steps
    ep_endt=1000000,
    -- Discount factor \gamma for Q-Learning
    discount=0.99,
    -- Number of consecutive frames stacked as input to the convolutional net
    hist_len=4,
    -- Learning starts after a delay of 50K actions; we do not want to overfit to early experience
    learn_start=50000,
    -- We will store the last 1M transitions
    replay_memory=1000000,
    -- We will run a learning update every 4 actions
    update_freq=4,
    -- One minibatch update per learning step
    n_replay=1,
    -- Network spec
    network= "dqn.convnet_atari3",
    -- pre-processing spec (just scale down to grayscale 84x84)
    preproc="dqn.net_downsample_2x_full_y",
    -- size of inputs after rescale (84*84)
    state_dim=7056,
    -- size of minibatch for SGD
    minibatch_size=32,
    -- we will rescale rewards to keep them within [-1, 1]
    rescale_r=1,
    -- we use Y channel
    ncols=1,
    -- buffer on GPU
    bufferSize=512,
    -- set of validation transitions to track training progress
    valid_size=500,
    -- update target Q network every 10K updates
    target_q=10000,
    -- we will clip the TD errors fed to the DQN update to [-1, 1]
    clip_delta=1,
    -- clip reward between -1,1
    min_reward=-1,
    -- clip reward between -1,1
    max_reward=1
}
agent = dqn.NeuralQLearner(agent_params)
    
    Out[4]:
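Two groups of parameters deserve unpacking. First, ep, ep_end, ep_endt and learn_start describe a linear epsilon-greedy schedule: the agent acts completely at random until learning starts, then anneals the exploration rate from 1.0 down to 0.1 over one million steps. A minimal sketch of the arithmetic these parameters imply (the real schedule is computed inside dqn.NeuralQLearner):

-- Sketch of the linear exploration schedule implied by ep / ep_end / ep_endt / learn_start.
local function epsilon_at(step, p)
    local t = math.max(0, step - p.learn_start)   -- decay only begins once learning starts
    local frac = math.min(t / p.ep_endt, 1)       -- goes from 0 to 1 over 1M steps
    return p.ep + frac * (p.ep_end - p.ep)        -- 1.0 -> 0.1
end
print(epsilon_at(0, agent_params), epsilon_at(550000, agent_params), epsilon_at(2000000, agent_params))
-- 1    0.55    0.1

Second, network = "dqn.convnet_atari3" selects the convolutional Q-network. The sketch below is an assumption about what that spec builds, modelled on the architecture published with DQN (Mnih et al., 2015): the input is hist_len * ncols = 4 channels of 84x84 preprocessed frames (hence state_dim = 7056 per frame) and the output is one Q-value per valid action. The real network is constructed inside the dqn package, so treat this purely as an illustration:

-- Illustrative stand-in for "dqn.convnet_atari3" (assumed Nature-DQN architecture).
require 'nn'
local n_actions = #game_actions
local net = nn.Sequential()
net:add(nn.SpatialConvolution(4, 32, 8, 8, 4, 4))    --  4 x 84 x 84 -> 32 x 20 x 20
net:add(nn.ReLU())
net:add(nn.SpatialConvolution(32, 64, 4, 4, 2, 2))   -- 32 x 20 x 20 -> 64 x  9 x  9
net:add(nn.ReLU())
net:add(nn.SpatialConvolution(64, 64, 3, 3, 1, 1))   -- 64 x  9 x  9 -> 64 x  7 x  7
net:add(nn.ReLU())
net:add(nn.Reshape(64 * 7 * 7))
net:add(nn.Linear(64 * 7 * 7, 512))
net:add(nn.ReLU())
net:add(nn.Linear(512, n_actions))                   -- one Q-value per valid action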
In [5]:
    
-- The following quantities are collected during evaluation of the agent
-- How many times did the agent end up in reward states
reward_counts = {}
-- How many episodes did the agent finish
episode_counts = {}
-- How long does it take to learn/test
time_history = { 0 }
-- The reward the agent gets during testing
reward_history = {}
-- The following are training measures collected by agent during training
-- Maximum q-value during training
qmax_history = {}
-- Value of the validation states
v_history = {}
-- TD error over the validation states
td_history = {}
    
In [6]:
    
screen, reward, terminal = game_env:getState()
print({screen = screen, reward = reward, terminal = terminal})
    
    Out[6]:
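The raw ALE screen is an RGB image, but the agent never sees it in that form: the preproc spec dqn.net_downsample_2x_full_y reduces each frame to its 84x84 Y (luminance) channel, which is where state_dim = 7056 comes from. Here is a rough sketch of that kind of preprocessing using the standard image package, assuming the screen tensor is batched as 1 x 3 x 210 x 160 (the screen[1] indexing in the next cell suggests it is); the exact resampling done by the dqn preprocessing network may differ slightly:

-- Approximate preprocessing sketch; the real work is done by "dqn.net_downsample_2x_full_y".
require 'image'
local rgb = screen[1]:float()          -- 3 x 210 x 160 RGB frame
local y = image.rgb2y(rgb)             -- 1 x 210 x 160 luminance (Y) channel
local small = image.scale(y, 84, 84)   -- 1 x 84 x 84, i.e. 7056 values
itorch.image(small)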
In [7]:
    
-- Take 36 random actions (sampled from the first three valid actions) and keep a copy of each raw RGB frame
local screens = {}
for i=1,36 do
    local screen, reward, terminal = game_env:step(game_actions[torch.random(3)])
    table.insert(screens, screen[1]:clone())
end
itorch.image(screens)
    
    
In [8]:
    
-- Training options
opt = {
    -- number of evaluation steps
    eval_steps = 125000,
    -- frequency of evaluation
    eval_freq = 250000,
    -- total number of training steps
    steps = 50000000,
    -- frequency of progress reporting
    prog_freq = 10000,
    -- frequency to save agent on disk
    save_freq = 125000,
    -- filename for saved agent
    name = 'Itorch_DQN3_0_1_pong_FULL_Y',
    -- for every new episode, take up to 30 random no-op steps
    random_starts = 30,
    -- we repeat every action 4 times
    actrep = 4,
}
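Before launching the loop, it helps to spell out what these numbers imply: 50M agent steps at 4 frames per action is 200M emulator frames, the agent is evaluated 200 times (every 250K steps, for 125K steps each time, at the fixed 5% exploration rate visible in the loop below), and training statistics are saved to disk every 125K steps. A quick sanity check, plain arithmetic only:

-- Plain arithmetic on the options above (not part of the DQN code).
print('total emulator frames: ', opt.steps * opt.actrep)      -- 200,000,000
print('evaluation phases:     ', opt.steps / opt.eval_freq)   -- 200
print('frames per evaluation: ', opt.eval_steps * opt.actrep) -- 500,000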
    
In [ ]:
    
local total_reward
local nrewards
local nepisodes
local episode_reward
local learn_start = agent.learn_start
local start_time = sys.clock()
local step = 0
print("Iteration ..", step)
while step < opt.steps do
    step = step + 1
    -- perceive() records the transition, runs learning updates once past learn_start, and returns the next action index
    local action_index = agent:perceive(reward, screen, terminal)
    -- game over? get next game!                                                                                                                      
    if not terminal then
        screen, reward, terminal = game_env:step(game_actions[action_index], true)
    else
        if opt.random_starts > 0 then
            screen, reward, terminal = game_env:nextRandomGame()
        else
            screen, reward, terminal = game_env:newGame()
        end
    end
    if step % opt.prog_freq == 0 then
        assert(step==agent.numSteps, 'trainer step: ' .. step ..
                ' & agent.numSteps: ' .. agent.numSteps)
        print("Training Steps: ", step)
        agent:report()
        collectgarbage()
    end
    if step%1000 == 0 then collectgarbage() end
    if step % opt.eval_freq == 0 and step > learn_start then
        print('Evaluating')
        screen, reward, terminal = game_env:newGame()
        total_reward = 0
        nrewards = 0
        nepisodes = 0
        episode_reward = 0
        local eval_time = sys.clock()
        for estep=1,opt.eval_steps do
            -- act with a fixed 5% exploration rate; testing mode means no transitions are stored and no learning happens
            local action_index = agent:perceive(reward, screen, terminal, true, 0.05)
            -- Play game in test mode (episodes don't end when losing a life)
            screen, reward, terminal = game_env:step(game_actions[action_index])
            screen, reward, terminal = game_env:step(game_actions[action_index])
            if estep%1000 == 0 then collectgarbage() end
            -- record every reward                                                                                                                    
            episode_reward = episode_reward + reward
            if reward ~= 0 then
               nrewards = nrewards + 1
            end
            if terminal then
                total_reward = total_reward + episode_reward
                episode_reward = 0
                nepisodes = nepisodes + 1
                screen, reward, terminal = game_env:nextRandomGame()
            end
        end
        eval_time = sys.clock() - eval_time
        -- exclude evaluation time from the training-time accounting
        start_time = start_time + eval_time
        agent:compute_validation_statistics()
        local ind = #reward_history+1
        total_reward = total_reward/math.max(1, nepisodes)
        -- keep a copy of the network whenever it achieves a new best evaluation score
        if #reward_history == 0 or total_reward > torch.Tensor(reward_history):max() then
            agent.best_network = agent.network:clone()
        end
        if agent.v_avg then
            v_history[ind] = agent.v_avg
            td_history[ind] = agent.tderr_avg
            qmax_history[ind] = agent.q_max
        end
        print("V", v_history[ind], "TD error", td_history[ind], "Qmax", qmax_history[ind])
        reward_history[ind] = total_reward
        reward_counts[ind] = nrewards
        episode_counts[ind] = nepisodes
        time_history[ind+1] = sys.clock() - start_time
        local time_dif = time_history[ind+1] - time_history[ind]
        local training_rate = opt.actrep*opt.eval_freq/time_dif
        print(string.format(
            '\nSteps: %d (frames: %d), reward: %.2f, epsilon: %.2f, lr: %G, ' ..
            'training time: %ds, training rate: %dfps, testing time: %ds, ' ..
            'testing rate: %dfps,  num. ep.: %d,  num. rewards: %d',
            step, step*opt.actrep, total_reward, agent.ep, agent.lr, time_dif,
            training_rate, eval_time, opt.actrep*opt.eval_steps/eval_time,
            nepisodes, nrewards))
    end
    if step % opt.save_freq == 0 or step == opt.steps then
        local filename = opt.name
        torch.save(filename .. ".t7", {
                                reward_history = reward_history,
                                reward_counts = reward_counts,
                                episode_counts = episode_counts,
                                time_history = time_history,
                                v_history = v_history,
                                td_history = td_history,
                                qmax_history = qmax_history,
                                opt = opt})
-- we are not going to bother saving agents for now.
--         torch.save(filename .. "_agent.t7", {
--                                 agent = agent,
--                                 best_model = agent.best_network,
--             })
        print('Saved:', filename .. '.t7')
        io.flush()
        collectgarbage()
    end
end
    
    Out[ ]: