In this tutorial we'll cover how to implement Deep Deterministic Policy Gradients (DDPG), an off-policy, model-free, actor-critic policy-gradient algorithm. This work is based on Patrick Emami's earlier write-up and code, listed in the resources section.
This notebook may also use JSAnimation, which allows renders of OpenAI Gym environments to be embedded directly in the notebook.
Policy-Gradient (PG) algorithms optimize a policy end-to-end by computing noisy estimates of the gradient of the policy's expected reward and then updating the policy in the direction of that gradient. Traditionally, so-called vanilla PG methods assume a stochastic policy $\pi(a|s)$, which gives a probability distribution over actions. Policy-gradient algorithms perform a form of policy iteration: they evaluate the policy and then follow the policy gradient to maximize performance. Ideally, the algorithm sees many training examples of high rewards from good actions and low (or negative) rewards from bad actions.
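As a point of reference, one common (REINFORCE-style) form of the vanilla policy gradient is

$$\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta}\Big[\sum_t \nabla_\theta \log \pi_\theta(a_t|s_t)\, R_t\Big],$$

where $R_t$ is the return collected from time $t$ onward. In practice the expectation is replaced by a Monte Carlo estimate over sampled trajectories, which is exactly the noisy gradient estimate mentioned above.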
For Reinforcement Learning (RL) problems with continuous action spaces, vanilla PG is all but useless. You can, however, get vanilla PG to work in some RL domains that take visual inputs and have discrete action spaces, using a convolutional neural network to represent the policy.
Deep Deterministic Policy Gradients (DDPG) is a policy-gradient algorithm that uses a stochastic behavior policy for good exploration but estimates a deterministic target policy, which is much easier to learn. Because DDPG is off-policy and uses a deterministic target policy, it can exploit the Deterministic Policy Gradient theorem (stated below).
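Concretely, with a deterministic policy $\mu(s|\theta^\mu)$ and a critic $Q(s,a|\theta^Q)$, the deterministic policy gradient takes the form

$$\nabla_{\theta^\mu} J \approx \mathbb{E}_{s \sim \rho^\beta}\Big[\nabla_a Q(s,a|\theta^Q)\big|_{a=\mu(s|\theta^\mu)}\;\nabla_{\theta^\mu}\mu(s|\theta^\mu)\Big],$$

where $\rho^\beta$ is the state distribution of the stochastic behavior policy $\beta$. Being off-policy is what allows the expectation to be taken over states generated by a policy other than the one being learned.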
DDPG is an actor-critic algorithm; it uses two neural networks, one for the actor and one for the critic. These networks compute action predictions for the current state and generate a temporal-difference (TD) error signal at each time step. The input of the actor network is the current state, and the output is a single real value (more generally, a real-valued vector) representing an action chosen from a continuous action space. The critic's output is simply the estimated Q-value of the current state and the action given by the actor. The deterministic policy gradient theorem provides the update rule for the weights of the actor network; the critic network is updated from the gradients obtained from the TD error signal.
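In particular, using slowly updated target networks $\mu'$ and $Q'$ (introduced in the code below), the critic is trained to minimize the mean-squared TD error against the targets

$$y_i = r_i + \gamma\, Q'\big(s_{i+1}, \mu'(s_{i+1}|\theta^{\mu'})\,\big|\,\theta^{Q'}\big), \qquad L = \frac{1}{N}\sum_i \big(y_i - Q(s_i, a_i|\theta^Q)\big)^2,$$

which is exactly what the y_i computation and the mean-square loss in the code below implement.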
In [1]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf
# reset everything to rerun in jupyter
tf.reset_default_graph()
# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))
# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
In [2]:
from collections import deque
import random
import numpy as np
class ReplayBuffer(object):
    def __init__(self, buffer_size, random_seed=123):
        """
        The right side of the deque contains the most recent experiences
        """
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        experience = (s, a, r, t, s2)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        batch = []
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        s_batch = np.array([_[0] for _ in batch])
        a_batch = np.array([_[1] for _ in batch])
        r_batch = np.array([_[2] for _ in batch])
        t_batch = np.array([_[3] for _ in batch])
        s2_batch = np.array([_[4] for _ in batch])

        return s_batch, a_batch, r_batch, t_batch, s2_batch
    def clear(self):
        self.buffer.clear()
        self.count = 0
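As a quick sanity check, here is a minimal usage sketch of the buffer above (the transition values and shapes are made up for illustration, roughly matching Pendulum-v0):

buffer = ReplayBuffer(buffer_size=10000, random_seed=123)
s = np.zeros(3)        # current state (illustrative 3-dimensional observation)
a = np.array([0.0])    # action (illustrative 1-dimensional action)
r = -1.0               # reward
terminal = False
s2 = np.ones(3)        # next state
buffer.add(s, a, r, terminal, s2)
if buffer.size() >= 1:
    s_batch, a_batch, r_batch, t_batch, s2_batch = buffer.sample_batch(1)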
In [3]:
import tensorflow as tf
import numpy as np
import gym
from gym import wrappers
import tflearn
class ActorNetwork(object):
"""
Input to the network is the state, output is the action
under a deterministic policy.
The output layer activation is a tanh to keep the action
between -2 and 2
"""
    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau

        # Actor Network
        with tf.name_scope('actor'):
            self.inputs, self.out, self.scaled_out = self.create_actor_network()
        self.network_params = tf.trainable_variables()

        # Target Network
        with tf.name_scope('target_actor'):
            self.target_inputs, self.target_out, self.target_scaled_out = self.create_actor_network()
        self.target_network_params = tf.trainable_variables()[
            len(self.network_params):]

        # Op for periodically updating target network with online network
        # weights
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                                  tf.multiply(self.target_network_params[i], 1. - self.tau))
             for i in range(len(self.target_network_params))]

        # This gradient will be provided by the critic network
        self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])

        # Combine the gradients here
        self.actor_gradients = tf.gradients(
            self.scaled_out, self.network_params, -self.action_gradient)

        # Optimization Op
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).\
            apply_gradients(zip(self.actor_gradients, self.network_params))

        self.num_trainable_vars = len(
            self.network_params) + len(self.target_network_params)

    def create_actor_network(self):
        inputs = tflearn.input_data(shape=[None, self.s_dim])
        net = tflearn.fully_connected(inputs, 400, activation='relu')
        net = tflearn.fully_connected(net, 300, activation='relu')
        # Final layer weights are init to Uniform[-3e-3, 3e-3]
        w_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
        out = tflearn.fully_connected(
            net, self.a_dim, activation='tanh', weights_init=w_init)
        # Scale output to -action_bound to action_bound
        scaled_out = tf.multiply(out, self.action_bound)
        return inputs, out, scaled_out

    def train(self, inputs, a_gradient):
        with tf.name_scope('train_actor'):
            self.sess.run(self.optimize, feed_dict={
                self.inputs: inputs,
                self.action_gradient: a_gradient
            })

    def predict(self, inputs):
        return self.sess.run(self.scaled_out, feed_dict={
            self.inputs: inputs
        })

    def predict_target(self, inputs):
        return self.sess.run(self.target_scaled_out, feed_dict={
            self.target_inputs: inputs
        })

    def update_target_network(self):
        self.sess.run(self.update_target_network_params)

    def get_num_trainable_vars(self):
        return self.num_trainable_vars
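In symbols, the actor_gradients op above forms the sampled deterministic policy gradient

$$\nabla_{\theta^\mu} J \approx \sum_i \nabla_a Q(s,a|\theta^Q)\big|_{s=s_i,\,a=\mu(s_i)}\;\nabla_{\theta^\mu}\mu(s|\theta^\mu)\big|_{s=s_i}.$$

The action gradient is negated (-self.action_gradient) because apply_gradients performs gradient descent; descending along the negated gradient is equivalent to ascending on Q. Note that tf.gradients sums rather than averages over the minibatch; in practice Adam's per-parameter step-size normalization makes this constant scale factor largely irrelevant.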
In [6]:
class CriticNetwork(object):
"""
Input to the network is the state and action, output is Q(s,a).
The action must be obtained from the output of the Actor network.
"""
def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.learning_rate = learning_rate
self.tau = tau
# Create the critic network
with tf.name_scope('critic'):
self.inputs, self.action, self.out = self.create_critic_network()
self.network_params = tf.trainable_variables()[num_actor_vars:]
# Target Network
with tf.name_scope('target_critic'):
self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]
# Op for periodically updating target network with online network
# weights with regularization
self.update_target_network_params = \
[self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
for i in range(len(self.target_network_params))]
# Network target (y_i)
with tf.name_scope('y_i'):
self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])
# Define loss and optimization Op
with tf.name_scope('loss'):
self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
with tf.name_scope('train'):
self.optimize = tf.train.AdamOptimizer(
self.learning_rate).minimize(self.loss)
# Get the gradient of the net w.r.t. the action.
# For each action in the minibatch (i.e., for each x in xs),
# this will sum up the gradients of each critic output in the minibatch
# w.r.t. that action. Each output is independent of all
# actions except for one.
self.action_grads = tf.gradients(self.out, self.action)
def create_critic_network(self):
with tf.name_scope('inputs'):
inputs = tflearn.input_data(shape=[None, self.s_dim])
with tf.name_scope('action'):
action = tflearn.input_data(shape=[None, self.a_dim])
net = tflearn.fully_connected(inputs, 400, activation='relu')
# Add the action tensor in the 2nd hidden layer
# Use two temp layers to get the corresponding weights and biases
t1 = tflearn.fully_connected(net, 300)
t2 = tflearn.fully_connected(action, 300)
net = tflearn.activation(
tf.matmul(net, t1.W) + tf.matmul(action, t2.W) + t2.b, activation='relu')
# linear layer connected to 1 output representing Q(s,a)
# Weights are init to Uniform[-3e-3, 3e-3]
w_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
out = tflearn.fully_connected(net, 1, weights_init=w_init)
return inputs, action, out
def train(self, inputs, action, predicted_q_value):
with tf.name_scope('train_critic'):
return self.sess.run([self.out, self.optimize], feed_dict={
self.inputs: inputs,
self.action: action,
self.predicted_q_value: predicted_q_value
})
def predict(self, inputs, action):
return self.sess.run(self.out, feed_dict={
self.inputs: inputs,
self.action: action
})
def predict_target(self, inputs, action):
return self.sess.run(self.target_out, feed_dict={
self.target_inputs: inputs,
self.target_action: action
})
def action_gradients(self, inputs, actions):
return self.sess.run(self.action_grads, feed_dict={
self.inputs: inputs,
self.action: actions
})
def update_target_network(self):
self.sess.run(self.update_target_network_params)
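Both the actor and the critic keep a target copy of their weights; the update_target_network_params ops in the two classes implement the soft update

$$\theta' \leftarrow \tau\,\theta + (1-\tau)\,\theta',$$

with a small $\tau$ ($\tau = 0.001$ in the hyper-parameters below), so the targets used for the TD error change slowly and learning stays stable.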
In [7]:
def train(sess, env, actor, critic):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Added exploration noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + (1. / (1. + i))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))

                break


def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax_value", episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()

    return summary_ops, summary_vars
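One thing to note: the loop above explores by adding a simple decaying offset, 1. / (1. + i), to the actor's action. The original DDPG paper instead uses temporally correlated Ornstein-Uhlenbeck noise. A minimal sketch of such a process is shown below; it is not part of the code above, and the parameters (theta, sigma) are illustrative defaults:

class OrnsteinUhlenbeckNoise(object):
    """Temporally correlated exploration noise (illustrative alternative to the 1/(1+i) term)."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = np.copy(self.mu)

    def reset(self):
        # Call at the start of each episode
        self.state = np.copy(self.mu)

    def __call__(self):
        # Mean-reverting random walk: dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state

Inside the step loop this would be used as a = actor.predict(...) + noise(), with noise.reset() called after each env.reset().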
In [ ]:
# ==========================
# Training Parameters
# ==========================
# Max training episodes
MAX_EPISODES = 50000
# Max episode length
MAX_EP_STEPS = 1000
# Base learning rate for the Actor network
ACTOR_LEARNING_RATE = 0.0001
# Base learning rate for the Critic Network
CRITIC_LEARNING_RATE = 0.001
# Discount factor
GAMMA = 0.99
# Soft target update param
TAU = 0.001
# ===========================
# Utility Parameters
# ===========================
# Render gym env during training
RENDER_ENV = True
# Use Gym Monitor
GYM_MONITOR_EN = True
# Gym environment
ENV_NAME = 'Pendulum-v0'
# Directory for storing gym results
MONITOR_DIR = './results/gym_ddpg'
# Directory for storing tensorboard summary results
SUMMARY_DIR = './results/tf_ddpg'
RANDOM_SEED = 1234
# Size of replay buffer
BUFFER_SIZE = 10000
MINIBATCH_SIZE = 64
with tf.Session() as sess:
    env = gym.make(ENV_NAME)
    np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    env.seed(RANDOM_SEED)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound = env.action_space.high
    # Ensure action bound is symmetric
    assert (env.action_space.high == -env.action_space.low)

    with tf.name_scope('actor_network'):
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)
    with tf.name_scope('critic_network'):
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars())

    if GYM_MONITOR_EN:
        if not RENDER_ENV:
            env = wrappers.Monitor(
                env, MONITOR_DIR, video_callable=False, force=True)
        else:
            env = wrappers.Monitor(env, MONITOR_DIR, force=True)

    train(sess, env, actor, critic)

    if GYM_MONITOR_EN:
        # Closing the Monitor wrapper flushes the recorded results
        env.close()
In [ ]: