In [3]:
import tensorflow as tf
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline
tf.__version__
Out[3]:
In [4]:
env_name = 'Pendulum-v0'
env = gym.make(env_name)
obs = env.reset()
print('Observation Space:', env.observation_space.shape[0])
print('Action Space:', env.action_space.shape[0])
# Sample a random action and take a single step to inspect the interface
b = env.action_space.sample()
print(b.shape)
a = env.step(env.action_space.sample())
In [5]:
# The action is a single float between -2 and 2 (the torque applied to the pendulum)
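A quick sanity check of the comment above (assuming the standard Pendulum-v0 action space), reading the bounds straight from the environment:
In [ ]:
# The action space of Pendulum-v0 is a 1-D Box; this should print [-2.] [2.]
print(env.action_space.low, env.action_space.high)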
In [16]:
tf.reset_default_graph()
n_inputs = env.observation_space.shape[0]
n_outputs = env.action_space.shape[0]
n_hidden = 4
n_layers = 2
learning_rate = 0.01
# Input placeholders
x = tf.placeholder(tf.float32, [None, n_inputs])
#y = tf.placeholder(tf.float32, [None, n_outputs])
# Layers
current_input = x
for i in range(n_layers):
    h_layer = tf.contrib.layers.fully_connected(current_input, n_hidden)
    current_input = h_layer
output_layer = tf.contrib.layers.fully_connected(inputs=current_input, num_outputs=n_outputs, activation_fn=None)
# Squash the network output with a sigmoid and rescale it to the action range [-2, 2]
decision = tf.nn.sigmoid(output_layer) * 4 - 2
# Exploration: sample the actual action as Gaussian noise around the deterministic output
action = tf.random_normal([1, 1], dtype=tf.float32, mean=0.0, stddev=0.3) + decision
# Surrogate loss: assume the action we have taken is the best one and pull the
# deterministic output towards it. stop_gradient is needed here; otherwise the
# decision terms cancel and every gradient is identically zero.
loss = tf.nn.l2_loss(tf.stop_gradient(action) - decision)
optimiser = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimiser.compute_gradients(loss)
gradients = [grad for grad, var in grads_and_vars]
# Apply gradients
gradient_placeholders = []
grad_and_vars_feed = []
for grad, var in grads_and_vars:
    grad_placeholder = tf.placeholder(tf.float32, grad.get_shape())
    gradient_placeholders.append(grad_placeholder)
    grad_and_vars_feed.append((grad_placeholder, var))
# Training op
training_op = optimiser.apply_gradients(grad_and_vars_feed)
# Init ops
init_op = tf.global_variables_initializer()
saver = tf.train.Saver()
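The placeholder indirection above exists because the per-step gradients are not applied immediately: they are collected while playing and only later re-weighted by the normalized discounted returns, then averaged and fed back in (see the training loop below). A minimal NumPy sketch of that re-weighting step, using hypothetical toy numbers (two steps, one variable):
In [ ]:
# Hypothetical illustration of the credit-assignment step used in the training loop:
# each step's gradient is scaled by its normalized return, then the results are averaged
step_gradients = [np.array([0.5, -1.0]), np.array([0.2, 0.3])]   # toy per-step gradients for one variable
normalized_returns = [1.2, -0.7]                                  # toy normalized returns
mean_gradient = np.mean([r * g for r, g in zip(normalized_returns, step_gradients)], axis=0)
print(mean_gradient)  # this is the kind of value fed into a gradient placeholder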
In [17]:
def discount_rewards(rewards, discount_rate):
    discounted_rewards = np.empty(len(rewards))
    cumulative_rewards = 0
    for i in reversed(range(len(rewards))):
        cumulative_rewards = rewards[i] + cumulative_rewards * discount_rate
        discounted_rewards[i] = cumulative_rewards
    return discounted_rewards

def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean) / reward_std for discounted_rewards in all_discounted_rewards]
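As an illustrative check of the helpers above (toy rewards chosen only for the example): with a discount rate of 0.8, the rewards [10, 0, -50] become [-22, -40, -50], since each entry is that step's reward plus the discounted total of everything that follows.
In [ ]:
# Toy example: discounted returns accumulate from the end of the episode backwards
print(discount_rewards([10, 0, -50], discount_rate=0.8))   # [-22. -40. -50.]
print(discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8))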
In [18]:
n_game_per_update = 10
n_max_steps = 300
n_iterations = 100
save_iterations = 10
discount_rate = 0.95
actions_record = []
model_path = "./checkpoint.ckpt"

sess = tf.InteractiveSession()
init_op.run()
# Resume from a previously saved checkpoint; comment this out on the very first run
saver.restore(sess, model_path)

for iteration in range(n_iterations):
    print("\rIteration: {}".format(iteration), end="")
    all_rewards = []
    all_gradients = []
    for game in range(n_game_per_update):
        current_rewards = []
        current_gradients = []
        obs = env.reset()
        for step in range(n_max_steps):
            if game == 0:
                env.render()
            action_val, gradients_val = sess.run([action, gradients], feed_dict={x: obs.reshape(-1, n_inputs)})
            obs, reward, done, info = env.step(action_val[0])
            #actions_record.append(action_val[0][0])
            current_rewards.append(reward)
            current_gradients.append(gradients_val)
            if done:
                break
        all_rewards.append(current_rewards)
        all_gradients.append(current_gradients)
    # Weight each step's gradients by its normalized discounted return, then average
    all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
    feed_dict = {}
    for var_index, gradient_placeholder in enumerate(gradient_placeholders):
        mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
                                  for game_index, rewards in enumerate(all_rewards)
                                  for step, reward in enumerate(rewards)], axis=0)
        feed_dict[gradient_placeholder] = mean_gradients
    sess.run(training_op, feed_dict=feed_dict)
    if iteration % save_iterations == 0:
        saver.save(sess, model_path)

print('---Done---')
env.close()
In [11]:
obs = env.reset()
scores = []
with tf.Session() as sess:
    saver.restore(sess, model_path)
    for i in range(100):
        obs = env.reset()
        total_reward = 0
        for step in range(200):
            action_val = action.eval(feed_dict={x: obs.reshape(1, n_inputs)})
            obs, reward, done, info = env.step(action_val[0])
            total_reward += reward
            if done:
                break
        scores.append(total_reward)
env.close()
print('Mean Score: ', np.mean(scores))
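Since matplotlib is already imported, an optional follow-up (a sketch, not part of the original run) is to look at the spread of the evaluation scores rather than just the mean:
In [ ]:
# Optional: visualise the distribution of evaluation scores
plt.hist(scores, bins=20)
plt.xlabel('Total reward per episode')
plt.ylabel('Number of episodes')
plt.show()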