In [3]:
import tensorflow as tf 
import numpy as np 
import gym 
import matplotlib.pyplot as plt

%matplotlib inline

tf.__version__


Out[3]:
'1.2.0'

In [4]:
env_name = 'Pendulum-v0'
env = gym.make(env_name)
obs = env.reset()

print('Observation Space:', env.observation_space.shape[0])
print('Action Space:', env.action_space.shape[0])

b = env.action_space.sample()
print(b.shape)
a = env.step(env.action_space.sample())


[2017-06-24 15:09:42,832] Making new env: Pendulum-v0
Observation Space: 3
Action Space: 1
(1,)

In [5]:
# action is a float between -2 and 2
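
The bounds need not be hard-coded; gym's Box spaces expose them directly, so a quick sanity check (a small sketch using the env created above):

print('Action low: ', env.action_space.low)
print('Action high:', env.action_space.high)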

Neural Network


In [16]:
tf.reset_default_graph()

n_inputs = env.observation_space.shape[0]
n_outputs = env.action_space.shape[0]
n_hidden = 4
n_layers = 2
learning_rate = 0.01


# Input placeholders
x = tf.placeholder(tf.float32, [None, n_inputs])
#y = tf.placeholder(tf.float32, [None, n_outputs])

# Layers
current_input = x

for i in range(n_layers):
    h_layer = tf.contrib.layers.fully_connected(current_input, n_hidden)
    current_input = h_layer
    

output_layer = tf.contrib.layers.fully_connected(inputs=current_input, num_outputs=n_outputs, activation_fn=None)

# Squash the raw output with a sigmoid and rescale it to the pendulum's
# action range of [-2, 2]
decision = tf.nn.sigmoid(output_layer) * 4 - 2

# Explore by adding Gaussian noise to the deterministic decision.
# stop_gradient treats the sampled action as a fixed target; without it the
# (action - decision) terms cancel and every gradient is exactly zero.
action = tf.stop_gradient(tf.random_normal([1, 1], dtype=tf.float32, mean=0.0, stddev=0.3) + decision)

# Assume the action we have taken is the best and regress the decision
# towards it; the gradients are reweighted by discounted rewards below.
loss = tf.nn.l2_loss(action - decision)
optimiser = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimiser.compute_gradients(loss)

gradients = [grad for grad, var in grads_and_vars]


# Apply gradients
gradient_placeholders = []
grad_and_vars_feed = []

for grad, var in grads_and_vars:
    grad_placeholder = tf.placeholder(tf.float32, grad.get_shape())
    gradient_placeholders.append(grad_placeholder)
    grad_and_vars_feed.append((grad_placeholder, var))
    
# Training op
training_op = optimiser.apply_gradients(grad_and_vars_feed)


# Init ops
init_op = tf.global_variables_initializer()
saver = tf.train.Saver()
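
Rather than calling minimize(), the gradients are computed per variable, fed back in through placeholders (so they can first be scaled by the discounted rewards), and only then applied. A small sketch to inspect what compute_gradients returned and confirm every variable has a gradient of the expected shape:

for grad, var in grads_and_vars:
    print(var.name, grad.get_shape())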

In [17]:
def discount_rewards(rewards, discount_rate):    
    discounted_rewards = np.empty(len(rewards))
    cumulative_rewards = 0
    for i in reversed(range(len(rewards))):
        cumulative_rewards = rewards[i] + cumulative_rewards*discount_rate
        discounted_rewards[i] = cumulative_rewards
    
    return discounted_rewards

def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    
    return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]
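
As a quick illustration of the discounting, with a rate of 0.8 the reward sequence [10, 0, -50] becomes [-22, -40, -50]: each entry is that step's reward plus 0.8 times the discounted total of everything after it.

discount_rewards([10, 0, -50], discount_rate=0.8)
# array([-22., -40., -50.])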

In [18]:
n_game_per_update = 10
n_max_steps = 300
n_iterations = 100
save_iterations = 10
discount_rate = 0.95

actions_record = []

model_path = "./checkpoint.ckpt"

sess = tf.InteractiveSession()
init_op.run()
saver.restore(sess, model_path)

for iteration in range(n_iterations):
    print("\rIteration: {}".format(iteration), end="")
    all_rewards = []
    all_gradients = []
    
    for game in range(n_game_per_update):
        current_rewards = []
        current_gradients = []
        obs = env.reset()
        
        for step in range(n_max_steps):
            if game ==0:
                env.render()
            action_val, gradients_val = sess.run([action, gradients], feed_dict={x:obs.reshape(-1, n_inputs)})
            obs, reward, done, info = env.step(action_val[0])
            #actions_record.append(action_val[0][0])
            current_rewards.append(reward)
            current_gradients.append(gradients_val)
            if done:
                break
        all_rewards.append(current_rewards)
        all_gradients.append(current_gradients)
        
    all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
    feed_dict = {}
    for var_index, gradient_placeholder in enumerate(gradient_placeholders):
        mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
                                  for game_index, rewards in enumerate(all_rewards)
                                  for step, reward in enumerate(rewards)], axis=0)
        feed_dict[gradient_placeholder] = mean_gradients
        
    sess.run(training_op, feed_dict=feed_dict)
    
    if iteration % save_iterations ==0:
        saver.save(sess, model_path)

        
print('---Done---')
env.close()


INFO:tensorflow:Restoring parameters from ./checkpoint.ckpt
[2017-06-24 15:25:43,581] Restoring parameters from ./checkpoint.ckpt
Iteration: 9
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-18-42ed0af2deb9> in <module>()
     25         for step in range(n_max_steps):
     26             if game ==0:
---> 27                 env.render()
     28             action_val, gradients_val = sess.run([action, gradients], feed_dict={x:obs.reshape(-1, n_inputs)})
     29             obs, reward, done, info = env.step(action_val[0])

/home/ppyht2/.local/lib/python3.5/site-packages/gym/core.py in render(self, mode, close)
    151             elif mode not in modes:
    152                 raise error.UnsupportedMode('Unsupported rendering mode: {}. (Supported modes for {}: {})'.format(mode, self, modes))
--> 153         return self._render(mode=mode, close=close)
    154 
    155     def close(self):

/home/ppyht2/.local/lib/python3.5/site-packages/gym/core.py in _render(self, mode, close)
    283 
    284     def _render(self, mode='human', close=False):
--> 285         return self.env.render(mode, close)
    286 
    287     def _close(self):

/home/ppyht2/.local/lib/python3.5/site-packages/gym/core.py in render(self, mode, close)
    151             elif mode not in modes:
    152                 raise error.UnsupportedMode('Unsupported rendering mode: {}. (Supported modes for {}: {})'.format(mode, self, modes))
--> 153         return self._render(mode=mode, close=close)
    154 
    155     def close(self):

/home/ppyht2/.local/lib/python3.5/site-packages/gym/envs/classic_control/pendulum.py in _render(self, mode, close)
     85             self.imgtrans.scale = (-self.last_u/2, np.abs(self.last_u)/2)
     86 
---> 87         return self.viewer.render(return_rgb_array = mode=='rgb_array')
     88 
     89 def angle_normalize(x):

/home/ppyht2/.local/lib/python3.5/site-packages/gym/envs/classic_control/rendering.py in render(self, return_rgb_array)
    102             arr = arr.reshape(buffer.height, buffer.width, 4)
    103             arr = arr[::-1,:,0:3]
--> 104         self.window.flip()
    105         self.onetime_geoms = []
    106         return arr

/home/ppyht2/.local/lib/python3.5/site-packages/pyglet/window/xlib/__init__.py in flip(self)
    495         # TODO canvas.flip?
    496         if self.context:
--> 497             self.context.flip()
    498 
    499         self._sync_resize()

/home/ppyht2/.local/lib/python3.5/site-packages/pyglet/gl/xlib.py in flip(self)
    355         if self._vsync:
    356             self._wait_vsync()
--> 357         glx.glXSwapBuffers(self.x_display, self.glx_window)
    358 
    359 class XlibContextARB(XlibContext13):

KeyboardInterrupt: 
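
matplotlib is imported at the top but never used; if the commented-out actions_record line in the training loop is re-enabled, the distribution of sampled torques can be inspected after training. A sketch:

plt.hist(actions_record, bins=50)
plt.xlabel('Torque')
plt.ylabel('Count')
plt.show()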

In [11]:
obs = env.reset()
scores = []
with tf.Session() as sess:
    saver.restore(sess, model_path)
    for i in range(100):
        obs = env.reset()
        total_reward = 0
        for step in range(200):
            action_val = action.eval(feed_dict={x: obs.reshape(1, n_inputs)})
            obs, reward, done, info = env.step(action_val[0])
            total_reward+=reward
            if done:
                break
        scores.append(total_reward)
env.close()

print('Mean Score: ', np.mean(scores))


INFO:tensorflow:Restoring parameters from ./checkpoint.ckpt
[2017-06-24 15:17:33,317] Restoring parameters from ./checkpoint.ckpt
Mean Score:  -1601.46622121
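
For context, the same evaluation loop can be run with random actions to get a baseline (a sketch; the environment is re-created since it was closed above, and on Pendulum-v0 returns are always negative, so less negative is better):

env = gym.make(env_name)
scores = []
for i in range(100):
    obs = env.reset()
    total_reward = 0
    for step in range(200):
        obs, reward, done, info = env.step(env.action_space.sample())
        total_reward += reward
        if done:
            break
    scores.append(total_reward)
env.close()
print('Random policy mean score:', np.mean(scores))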