Copyright 2020 DeepMind Technologies Limited.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

RL Unplugged: Offline DQN - Atari

Guide to training an Acme DQN agent on Atari data.

Installation


In [ ]:
!pip install dm-acme
!pip install dm-acme[reverb]
!pip install dm-acme[tf]
!pip install dm-sonnet
!pip install dopamine-rl==3.0.1
!pip install atari-py
!git clone https://github.com/deepmind/deepmind-research.git
%cd deepmind-research

Imports


In [ ]:
import copy

import acme
from acme.agents.tf import actors
from acme.agents.tf.dqn import learning as dqn
from acme.tf import utils as acme_utils
from acme.utils import loggers
from rl_unplugged import atari
import sonnet as snt
import tensorflow as tf

Data


In [ ]:
game = 'Pong' #@param
run = 1  #@param

tmp_path = '/tmp/atari'
gs_path = 'gs://rl_unplugged/atari'

# Copy a single shard of the chosen game/run from the RL Unplugged GCS bucket.
!mkdir -p {tmp_path}/{game}
!gsutil cp {gs_path}/{game}/run_{run}-00000-of-00001 {tmp_path}/{game}
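
A quick, optional sanity check that the shard landed where the dataset reader expects it; the filename below simply mirrors the gsutil command above.

In [ ]:
import os

shard_path = os.path.join(tmp_path, game, f'run_{run}-00000-of-00001')
assert os.path.exists(shard_path), f'Missing shard: {shard_path}'
print('Downloaded', shard_path, '-', os.path.getsize(shard_path), 'bytes')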

Dataset and environment


In [ ]:
batch_size = 10  #@param

def discard_extras(sample):
  # Keep only (observation, action, reward, discount, next observation)
  # and drop any extra fields stored with each transition.
  return sample._replace(data=sample.data[:5])

dataset = atari.dataset(path=tmp_path, game=game, run=run, num_shards=1)
# Small batch size for this demo; experiments in the paper used batch size 256.
dataset = dataset.map(discard_extras).batch(batch_size)
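
To see what the learner will consume, you can peek at one batch. The field names below (observation, action, reward, discount, next observation) are an assumption based on the `discard_extras` slice above, so treat the printout as a sanity check rather than a spec.

In [ ]:
for sample in dataset.take(1):
  # sample.data is assumed to hold a batched transition tuple of
  # (observation, action, reward, discount, next_observation).
  for name, tensor in zip(['o_t', 'a_t', 'r_t', 'd_t', 'o_tp1'], sample.data):
    print(name, tensor.shape, tensor.dtype)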

In [ ]:
environment = atari.environment(game=game)
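
It can also help to look at the environment specs before building the Q-network; the action spec's `num_values` is what sizes the network's output layer below.

In [ ]:
print('Observation spec:', environment.observation_spec())
print('Action spec:', environment.action_spec())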

DQN learner


In [ ]:
# Get total number of actions.
num_actions = environment.action_spec().num_values

# Create the Q network.
network = snt.Sequential([
    lambda x: tf.image.convert_image_dtype(x, tf.float32),
    snt.Conv2D(32, [8, 8], [4, 4]),
    tf.nn.relu,
    snt.Conv2D(64, [4, 4], [2, 2]),
    tf.nn.relu,
    snt.Conv2D(64, [3, 3], [1, 1]),
    tf.nn.relu,
    snt.Flatten(),
    snt.nets.MLP([512, num_actions])
])
acme_utils.create_variables(network, [environment.observation_spec()])


Out[ ]:
TensorSpec(shape=(6,), dtype=tf.float32, name=None)
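
As a quick check, a forward pass on a zero-valued observation should return one Q-value per action. This is only a sketch: the batch dimension is added by hand and the all-zero observation is meaningless as input.

In [ ]:
obs_spec = environment.observation_spec()
dummy_obs = tf.zeros((1,) + tuple(obs_spec.shape), dtype=obs_spec.dtype)
print(network(dummy_obs).shape)  # Expected: (1, num_actions).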

In [ ]:
# Create a logger.
logger = loggers.TerminalLogger(label='learner', time_delta=1.)

# Create the DQN learner.
learner = dqn.DQNLearner(
    network=network,
    target_network=copy.deepcopy(network),
    discount=0.99,
    learning_rate=3e-4,
    importance_sampling_exponent=0.2,
    target_update_period=2500,
    dataset=dataset,
    logger=logger)
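
For intuition, the learner optimises (roughly) the standard Q-learning TD error sketched below. This is illustrative only: it reuses the online network for both terms, whereas Acme's DQNLearner uses the separate target network, importance weights and other details not reproduced here.

In [ ]:
for sample in dataset.take(1):
  o_t, a_t, r_t, d_t, o_tp1 = sample.data
  q_t = network(o_t)      # Q(o_t, .) for every action.
  q_tp1 = network(o_tp1)  # The learner uses the target network here.
  target = r_t + 0.99 * d_t * tf.reduce_max(q_tp1, axis=-1)
  td_error = tf.gather(q_t, a_t, batch_dims=1) - tf.stop_gradient(target)
  print('Mean squared TD error:', tf.reduce_mean(tf.square(td_error)).numpy())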

Training loop


In [ ]:
# Run a short demonstration training loop; real experiments run for many
# more learner steps.
for _ in range(100):
  learner.step()


[Learner] Loss = 0.003 | Steps = 1 | Walltime = 0
[Learner] Loss = 0.004 | Steps = 54 | Walltime = 1.126
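
If you want to keep the trained Q-network around (for example to evaluate it later without re-training), a plain TensorFlow checkpoint works because Sonnet modules are `tf.Module`s. The path below is an arbitrary choice.

In [ ]:
import os

os.makedirs('/tmp/offline_dqn', exist_ok=True)  # Arbitrary location.
checkpoint = tf.train.Checkpoint(network=network)
checkpoint_path = checkpoint.save('/tmp/offline_dqn/ckpt')
print('Saved checkpoint to', checkpoint_path)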

Evaluation


In [ ]:
# Create a logger.
logger = loggers.TerminalLogger(label='evaluation', time_delta=1.)

# Create the evaluation policy: act greedily with respect to the Q-values.
policy_network = snt.Sequential([
    network,
    lambda q: tf.argmax(q, axis=-1),
])

# Create an environment loop that runs the policy in the environment.
loop = acme.EnvironmentLoop(
    environment=environment,
    actor=actors.FeedForwardActor(policy_network=policy_network),
    logger=logger)

loop.run(5)


[Evaluation] Episode Length = 842 | Episode Return = -20.000 | Episodes = 1 | Steps = 842 | Steps Per Second = 265.850
[Evaluation] Episode Length = 792 | Episode Return = -21.000 | Episodes = 2 | Steps = 1634 | Steps Per Second = 270.043
[Evaluation] Episode Length = 812 | Episode Return = -21.000 | Episodes = 3 | Steps = 2446 | Steps Per Second = 274.792
[Evaluation] Episode Length = 812 | Episode Return = -21.000 | Episodes = 4 | Steps = 3258 | Steps Per Second = 270.967
[Evaluation] Episode Length = 812 | Episode Return = -21.000 | Episodes = 5 | Steps = 4070 | Steps Per Second = 274.253
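
The evaluation policy above is purely greedy. If you want a little exploration noise at evaluation time, an epsilon-greedy wrapper can be written with plain TensorFlow ops; the epsilon value here is an arbitrary choice for illustration, not a value taken from the paper.

In [ ]:
epsilon = 0.05  # Arbitrary value, for illustration only.

def epsilon_greedy(q_values):
  # With probability epsilon pick a uniformly random action,
  # otherwise pick the greedy action.
  batch_size = tf.shape(q_values)[0]
  num_actions = q_values.shape[-1]
  greedy_actions = tf.argmax(q_values, axis=-1, output_type=tf.int32)
  random_actions = tf.random.uniform(
      [batch_size], maxval=num_actions, dtype=tf.int32)
  explore = tf.random.uniform([batch_size]) < epsilon
  return tf.where(explore, random_actions, greedy_actions)

epsilon_greedy_policy = snt.Sequential([network, epsilon_greedy])
eval_loop = acme.EnvironmentLoop(
    environment=environment,
    actor=actors.FeedForwardActor(policy_network=epsilon_greedy_policy),
    logger=logger)
eval_loop.run(1)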