In [1]:
import gym 
import numpy as np 
import tensorflow as tf
import matplotlib.pyplot as plt
from matplotlib import animation

%matplotlib inline

In [2]:
env = gym.make('CartPole-v0')
obs = env.reset()



Basic Policy


In [3]:
def basic_policy(obs):
    # obs = [cart position, cart velocity, pole angle, pole angular velocity]
    # push the cart right (action 1) when the pole leans right, left otherwise
    if obs[2] > 0:
        action = 1
    else:
        action = 0
    return action
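
As a quick sanity check (this cell is not in the original notebook), an observation whose pole angle is positive should map to a push to the right:

# hypothetical example observation: [cart position, cart velocity, pole angle, pole angular velocity]
sample_obs = np.array([0.0, 0.0, 0.05, 0.0])  # pole leaning slightly to the right
print(basic_policy(sample_obs))  # -> 1 (push right)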

In [4]:
max_steps = 250
n_episodes = 100

total_rewards = np.empty(n_episodes)

for e in range(n_episodes):
    obs = env.reset()
    episode_reward = 0
    for s in range(max_steps):
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        if done:
            break
    total_rewards[e] = episode_reward
    
    
print('Mean:', np.mean(total_rewards))
print('Std:', np.std(total_rewards))
print('Max:', np.max(total_rewards))
print('Min:', np.min(total_rewards))


Mean: 42.47
Std: 8.91566598746
Max: 64.0
Min: 25.0

Neural Network


In [5]:
tf.reset_default_graph()

n_inputs = env.observation_space.shape[0]
n_outputs = 1 # the probability of going left 
n_hidden = 4
learning_rate = 0.01

# Input placeholder
x = tf.placeholder(tf.float32, [None, n_inputs])

# Layers (two hidden layers of n_hidden units, linear output)
hidden_1 = tf.contrib.layers.fully_connected(inputs=x, num_outputs=n_hidden)
hidden_2 = tf.contrib.layers.fully_connected(inputs=hidden_1, num_outputs=n_hidden)
output_layer = tf.contrib.layers.fully_connected(inputs=hidden_2, num_outputs=n_outputs, activation_fn=None)

# Sigmoid gives P(left); concatenate into a two-way distribution and sample an action
left_prob = tf.sigmoid(output_layer)
both_prob = tf.concat(values=[left_prob, 1-left_prob], axis=1)

action = tf.multinomial(tf.log(both_prob), 1)


# Policy Gradient
# First, assume the action we took was the best one:
# if action == 1 we pushed right, so the target probability of going left is 0;
# if action == 0 we pushed left, so the target probability of going left is 1.
y = 1 - tf.to_float(action)

cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=output_layer)
optimiser = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimiser.compute_gradients(cross_entropy)

gradients = [grad for grad, var in grads_and_vars]

# Placeholders for the reward-weighted, averaged gradients that are fed back in at training time
gradient_placeholders = []
grad_and_vars_feed = []

for grad, var in grads_and_vars:
    grad_placeholder = tf.placeholder(tf.float32, grad.get_shape())
    gradient_placeholders.append(grad_placeholder)
    grad_and_vars_feed.append((grad_placeholder, var))
    
# Training op
training_op = optimiser.apply_gradients(grad_and_vars_feed)

# Init ops
init_op = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=100)
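
To see what the sampling op does, here is a minimal sketch (not part of the original notebook, assuming TensorFlow 1.x): tf.multinomial treats its first argument as logits, so feeding log-probabilities samples action 0 with probability left_prob.

probs = tf.constant([[0.8, 0.2]])  # [P(left), P(right)]
sample = tf.multinomial(tf.log(probs), 1)
with tf.Session() as s:
    print(s.run(sample))  # prints [[0]] roughly 80% of the time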

In [6]:
def discount_rewards(rewards, discount_rate):    
    discounted_rewards = np.empty(len(rewards))
    cumulative_rewards = 0
    for i in reversed(range(len(rewards))):
        cumulative_rewards = rewards[i] + cumulative_rewards*discount_rate
        discounted_rewards[i] = cumulative_rewards
    
    return discounted_rewards

def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    
    return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]
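
A quick worked example (not in the original notebook): with discount_rate = 0.95, the rewards [1, 1, 1] accumulate from the last step backwards.

print(discount_rewards([1, 1, 1], 0.95))
# [2.8525 1.95   1.    ]  -- last: 1; middle: 1 + 0.95*1; first: 1 + 0.95*1.95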

In [7]:
n_game_per_update = 10
n_max_steps = 200
n_iterations = 300

save_iterations = 10
saved_at = []
discount_rate = 0.95
model_path = "./checkpoints/"

sess = tf.InteractiveSession()
init_op.run()

env = gym.make("CartPole-v0")
env = gym.wrappers.Monitor(env, './gym_upload')

for iteration in range(n_iterations):
    #print("\rIteration: {}".format(iteration), end="")
    all_rewards = []
    all_gradients = []
    
    for game in range(n_game_per_update):
        current_rewards = []
        current_gradients = []
        obs = env.reset()
        
        for step in range(n_max_steps):
            #env.render()
            action_val, gradients_val = sess.run([action, gradients], feed_dict={x:obs.reshape(-1, n_inputs)})
            obs, reward, done, info = env.step(action_val[0][0])
            #reward -= 0.5*np.abs(obs[0]) # optional reward shaping to keep the cart near the center
            current_rewards.append(reward)
            current_gradients.append(gradients_val)
            if done:
                break
        all_rewards.append(current_rewards)
        all_gradients.append(current_gradients)
    
    all_rewards_normal = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
    feed_dict = {}
    for var_index, gradient_placeholder in enumerate(gradient_placeholders):
        # weight each step's gradient by its normalized return, then average over all steps
        mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
                                  for game_index, rewards in enumerate(all_rewards_normal)
                                  for step, reward in enumerate(rewards)], axis=0)
        feed_dict[gradient_placeholder] = mean_gradients
        
    sess.run(training_op, feed_dict=feed_dict)
    
    if iteration % save_iterations == 0:
        saver.save(sess, model_path+'ckpt', global_step=iteration)
        saved_at.append(iteration)
        print('Iteration: ', iteration, 'Average Reward: ', np.mean([len(r) for r in all_rewards]))

env.close()


Iteration:  0 Average Reward:  21.3
Iteration:  10 Average Reward:  21.4
Iteration:  20 Average Reward:  19.5
Iteration:  30 Average Reward:  42.6
Iteration:  40 Average Reward:  87.9
Iteration:  50 Average Reward:  126.4
Iteration:  60 Average Reward:  171.9
Iteration:  70 Average Reward:  185.6
Iteration:  80 Average Reward:  198.0
Iteration:  90 Average Reward:  193.6
Iteration:  100 Average Reward:  197.0
Iteration:  110 Average Reward:  156.0
Iteration:  120 Average Reward:  159.5
Iteration:  130 Average Reward:  200.0
Iteration:  140 Average Reward:  200.0
Iteration:  150 Average Reward:  200.0
Iteration:  160 Average Reward:  192.0
Iteration:  170 Average Reward:  200.0
Iteration:  180 Average Reward:  200.0
Iteration:  190 Average Reward:  194.1
Iteration:  200 Average Reward:  199.2
Iteration:  210 Average Reward:  186.1
Iteration:  220 Average Reward:  171.2
Iteration:  230 Average Reward:  156.3
Iteration:  240 Average Reward:  199.3
Iteration:  250 Average Reward:  200.0
Iteration:  260 Average Reward:  200.0
Iteration:  270 Average Reward:  200.0
Iteration:  280 Average Reward:  200.0
Iteration:  290 Average Reward:  200.0
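
The saved checkpoints make it easy to replay the learned policy. A minimal sketch (hypothetical usage, not in the original notebook) that restores the latest checkpoint and runs one greedy episode on a fresh environment:

env = gym.make('CartPole-v0')  # fresh, unmonitored environment
saver.restore(sess, tf.train.latest_checkpoint(model_path))
obs = env.reset()
done = False
while not done:
    p_left = sess.run(left_prob, feed_dict={x: obs.reshape(1, -1)})
    obs, reward, done, info = env.step(0 if p_left[0][0] >= 0.5 else 1)  # act greedily on P(left)
env.close()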

In [ ]:
gym.upload('./gym_upload', api_key='')