In this worksheet we go over training a Reinforcement Learning (RL) agent on Atari 2600 games, using the famous game of Pong as our training test bed.
The code required to train the DQN agent can be downloaded from https://sites.google.com/a/deepmind.com/dqn/
In [1]:
--
-- This function should probably have been part of standard Lua or Torch.
--
function addpath(new_path)
package.path = package.path .. ';' .. new_path .. '/?.lua;' .. new_path .. '/?/init.lua'
end
--
-- This is where the code lives on your machine.
--
addpath('/home/ubuntu/DQN/Human_Level_Control_through_Deep_Reinforcement_Learning/dqn/')
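Lua's require resolves a module name by substituting it for the '?' placeholders in package.path, so after the call above a statement such as require 'foo' (hypothetical module name) would also search the dqn/ directory. A quick sanity check:
-- Print the module search path; the dqn/ patterns appended by addpath
-- should appear at the end.
print(package.path)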
In [2]:
--
-- The training arguments are stored in the following file. We load them here and then explain them one by one.
-- In Lua, when a process is run, the global 'arg' table contains all the command line arguments, just like argv in C.
--
arg = torch.load('/home/ubuntu/DQN/Human_Level_Control_through_Deep_Reinforcement_Learning/dqn/pong_arguments.t7')
print(arg)
Out[2]:
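If the saved pong_arguments.t7 is not available, an equivalent table can be built by hand, since it is just the flattened command line passed to the training script. Below is a minimal sketch with illustrative placeholder values only; the flag names come from the options defined in the next cell, and the exact values used for Pong are the ones stored in the .t7 file.
-- Illustrative only: placeholder values, not the exact Pong configuration.
example_arg = {'-framework', 'alewrap',
               '-env', 'pong',
               '-game_path', '/home/ubuntu/DQN/roms/',
               '-agent', 'NeuralQLearner',
               '-name', 'DQN3_0_1_pong',
               '-gpu', '0'}
print(example_arg)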
In [3]:
--
-- Load initialization functions
--
require "initenv";
In [4]:
--
-- We start by defining the valid command line arguments. Each argument is defined by its flag, followed by its
-- default value and finally a help string.
--
cmd = torch.CmdLine()
cmd:text()
cmd:text('Train Agent in Environment:')
cmd:text()
cmd:text('Options:')
cmd:option('-framework', '', 'name of training framework')
cmd:option('-env', '', 'name of environment to use')
cmd:option('-game_path', '', 'path to environment file (ROM)')
cmd:option('-env_params', '', 'string of environment parameters')
cmd:option('-pool_frms', '',
'string of frame pooling parameters (e.g.: size=2,type="max")')
cmd:option('-actrep', 1, 'how many times to repeat action')
cmd:option('-random_starts', 0, 'play action 0 between 1 and random_starts ' ..
'number of times at the start of each training episode')
cmd:option('-name', '', 'filename used for saving network and training history')
cmd:option('-network', '', 'reload pretrained network')
cmd:option('-agent', '', 'name of agent file to use')
cmd:option('-agent_params', '', 'string of agent parameters')
cmd:option('-seed', 1, 'fixed input seed for repeatable experiments')
cmd:option('-saveNetworkParams', false,
'saves the agent network in a separate file')
cmd:option('-prog_freq', 5*10^3, 'frequency of progress output')
cmd:option('-save_freq', 5*10^4, 'the model is saved every save_freq steps')
cmd:option('-eval_freq', 10^4, 'frequency of greedy evaluation')
cmd:option('-save_versions', 0, '')
cmd:option('-steps', 10^5, 'number of training steps to perform')
cmd:option('-eval_steps', 10^5, 'number of evaluation steps')
cmd:option('-verbose', 2,
'the higher the level, the more information is printed to screen')
cmd:option('-threads', 1, 'number of BLAS threads')
cmd:option('-gpu', -1, 'gpu flag')
cmd:text()
-- Here we parse the table arg to load all command line arguments into opt
opt = cmd:parse(arg)
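For any flag that does not appear in arg, cmd:parse falls back to the declared default, and values given as strings are converted to the type of that default. A toy sketch, separate from the real cmd above:
-- Toy illustration: '-lr' is absent, so its default is used; '-gpu' is
-- given as a string but converted to a number.
local toy = torch.CmdLine()
toy:option('-lr', 0.00025, 'learning rate')
toy:option('-gpu', -1, 'gpu flag')
local toy_opt = toy:parse({'-gpu', '0'})
print(toy_opt.lr, toy_opt.gpu)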
In [5]:
--
-- Observe the parameters that are required to specify the training.
--
print(opt)
Out[5]:
In [6]:
-- We do not have much RAM, so make sure to deallocate any existing agent first.
if agent then
agent = nil
collectgarbage()
collectgarbage()
end
--
-- Main setup function
-- game : the game environment
-- game_actions : valid actions that can be used in this game
-- agent : the RL agent that we will train
--
-- If you don't want too much debug info in IPython, uncomment the following line:
--opt.verbose = 0
-- run setup to load agent and game
game_env, game_actions, agent, opt = setup(opt);
Out[6]:
In [7]:
print(opt)
Out[7]:
In [8]:
print(game_actions)
screens = {}
for i=1,36 do
screen, reward, terminal = game_env:step(0)
table.insert(screens, screen[1]:clone())
end
itorch.image(screens)
Out[8]:
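Each call to game_env:step returns the raw screen (as a tensor with a leading batch dimension, which is why screen[1] is cloned above), the reward collected, and a terminal flag. A quick inspection; the exact height and width depend on the environment settings:
-- screen is a 1 x channels x height x width tensor; reward is a number,
-- terminal a boolean.
print(screen:size())
print(reward, terminal)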
In [11]:
agent
Out[11]:
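The Q-network itself is stored in agent.network (an nn module), and a snapshot of the best-scoring network seen during evaluation will later be kept in agent.best_network. Printing the module shows the layer-by-layer architecture:
-- Inspect the convolutional Q-network and the size of the action set.
print(agent.network)
print('number of actions:', #game_actions)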
In [ ]:
local learn_start = agent.learn_start
local start_time = sys.clock()
local reward_counts = {}
local episode_counts = {}
local time_history = {}
local v_history = {}
local qmax_history = {}
local td_history = {}
local reward_history = {}
local step = 0
time_history[1] = 0
local total_reward
local nrewards
local nepisodes
local episode_reward
-- When we save the agent, it contains a lot of temporary state. The proper way would be to implement
-- agent:read() and agent:write() functions, as in TransitionTable.lua.
-- But using Lua closures, we can also get a nice interface like the following:
-- slim_agent(agent) strips the bulky fields from the agent so that we do not dump GBs of data, and
-- returns a deslim function that takes no arguments and reinstates the stripped fields in the agent.
local function slim_agent(agent)
local s, a, r, s2, term = agent.valid_s, agent.valid_a, agent.valid_r, agent.valid_s2, agent.valid_term
agent.valid_s, agent.valid_a, agent.valid_r, agent.valid_s2, agent.valid_term = nil, nil, nil, nil, nil
local w, dw, g, g2, delta, delta2, deltas, tmp = agent.w, agent.dw, agent.g, agent.g2, agent.delta, agent.delta2, agent.deltas, agent.tmp
agent.w, agent.dw, agent.g, agent.g2, agent.delta, agent.delta2, agent.deltas, agent.tmp = nil, nil, nil, nil, nil, nil, nil, nil
-- The deslim function below closes over the locals captured above, so it can
-- restore the agent with the correct values. slim_agent can be called on many different
-- agent instances, and each deslim function will point to its own context.
local function deslim()
agent.valid_s, agent.valid_a, agent.valid_r, agent.valid_s2, agent.valid_term = s, a, r, s2, term
agent.w, agent.dw, agent.g, agent.g2, agent.delta, agent.delta2, agent.deltas, agent.tmp = w, dw, g, g2, delta, delta2, deltas, tmp
end
return deslim
end
local screen, reward, terminal = game_env:getState()
print("Iteration ..", step)
while step < opt.steps do
step = step + 1
local action_index = agent:perceive(reward, screen, terminal)
-- game over? get next game!
if not terminal then
screen, reward, terminal = game_env:step(game_actions[action_index], true)
else
if opt.random_starts > 0 then
screen, reward, terminal = game_env:nextRandomGame()
else
screen, reward, terminal = game_env:newGame()
end
end
if step % opt.prog_freq == 0 then
assert(step==agent.numSteps, 'trainer step: ' .. step ..
' & agent.numSteps: ' .. agent.numSteps)
print("Training Steps: ", step)
--agent:report()
collectgarbage()
end
if step%1000 == 0 then collectgarbage() end
if step % opt.eval_freq == 0 and step > learn_start then
print('Evaluating')
screen, reward, terminal = game_env:newGame()
total_reward = 0
nrewards = 0
nepisodes = 0
episode_reward = 0
local eval_time = sys.clock()
for estep=1,opt.eval_steps do
local action_index = agent:perceive(reward, screen, terminal, true, 0.05)
-- Play game in test mode (episodes don't end when losing a life)
screen, reward, terminal = game_env:step(game_actions[action_index])
if estep%1000 == 0 then collectgarbage() end
-- record every reward
episode_reward = episode_reward + reward
if reward ~= 0 then
nrewards = nrewards + 1
end
if terminal then
total_reward = total_reward + episode_reward
episode_reward = 0
nepisodes = nepisodes + 1
screen, reward, terminal = game_env:nextRandomGame()
end
end
eval_time = sys.clock() - eval_time
start_time = start_time + eval_time
agent:compute_validation_statistics()
local ind = #reward_history+1
total_reward = total_reward/math.max(1, nepisodes)
if #reward_history == 0 or total_reward > torch.Tensor(reward_history):max() then
agent.best_network = agent.network:clone()
end
if agent.v_avg then
v_history[ind] = agent.v_avg
td_history[ind] = agent.tderr_avg
qmax_history[ind] = agent.q_max
end
print("V", v_history[ind], "TD error", td_history[ind], "Qmax", qmax_history[ind])
reward_history[ind] = total_reward
reward_counts[ind] = nrewards
episode_counts[ind] = nepisodes
time_history[ind+1] = sys.clock() - start_time
local time_dif = time_history[ind+1] - time_history[ind]
local training_rate = opt.actrep*opt.eval_freq/time_dif
print(string.format(
'\nSteps: %d (frames: %d), reward: %.2f, epsilon: %.2f, lr: %G, ' ..
'training time: %ds, training rate: %dfps, testing time: %ds, ' ..
'testing rate: %dfps, num. ep.: %d, num. rewards: %d',
step, step*opt.actrep, total_reward, agent.ep, agent.lr, time_dif,
training_rate, eval_time, opt.actrep*opt.eval_steps/eval_time,
nepisodes, nrewards))
end
if step % opt.save_freq == 0 or step == opt.steps then
local deslim_agent = slim_agent(agent)
local filename = opt.name
if opt.save_versions > 0 then
filename = filename .. "_" .. math.floor(step / opt.save_versions)
end
torch.save(filename .. ".t7", {agent = agent,
model = agent.network,
best_model = agent.best_network,
reward_history = reward_history,
reward_counts = reward_counts,
episode_counts = episode_counts,
time_history = time_history,
v_history = v_history,
td_history = td_history,
qmax_history = qmax_history,
arguments=opt})
-- Restore the slimmed fields first so that agent.w is available again.
deslim_agent()
if opt.saveNetworkParams then
local nets = {network=agent.w:clone():float()}
torch.save(filename..'.params.t7', nets, 'ascii')
end
print('Saved:', filename .. '.t7')
io.flush()
collectgarbage()
end
end
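The slim/deslim trick relies on the fact that a Lua closure keeps references to the locals of the call that created it. Here is a toy version of the same pattern, with a hypothetical object and field name:
-- Stash a bulky field in a closure, clear it on the object, and restore it later.
local function detach_field(obj)
    local saved = obj.big_buffer      -- captured by the closure below
    obj.big_buffer = nil              -- obj is now cheap to serialize
    return function() obj.big_buffer = saved end
end

local obj = {big_buffer = torch.Tensor(10):fill(1)}
local restore = detach_field(obj)
-- torch.save('obj.t7', obj) would now skip the buffer
restore()
print(obj.big_buffer:sum())           -- prints 10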
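Once the loop above has written a checkpoint, it can be reloaded to inspect the training curves or the best network; the keys mirror the table passed to torch.save (assuming opt.save_versions is 0, so the filename carries no step suffix):
-- Reload the checkpoint written by the training loop above.
local ckpt = torch.load(opt.name .. '.t7')
print(ckpt.reward_history)   -- average evaluation reward per evaluation period
print(ckpt.best_model)       -- the best-scoring Q-network seen so far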