In [1]:
import gym
env = gym.make('CartPole-v0')
obs = env.reset()
# observation: cart position, cart velocity, pole angle, pole angular velocity
obs
Out[1]:
In [2]:
img = env.render(mode='rgb_array')
In [3]:
img.shape
Out[3]:
In [4]:
obs, reward, done, info = env.step(1)
obs
Out[4]:
In [5]:
reward
Out[5]:
In [6]:
done
Out[6]:
In [7]:
info
Out[7]:
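Before writing a policy, it can help to confirm what the environment expects; the small optional check below uses the same env created above and standard Gym attributes (CartPole-v0 has two discrete actions and a 4-dimensional observation).
In [ ]:
# Quick sanity check on the environment's spaces (assumes the env created above).
print(env.action_space)           # Discrete(2): 0 = push left, 1 = push right
print(env.observation_space)      # Box(4,): the 4-dimensional observation
print(env.action_space.sample())  # a random valid action, useful for smoke tests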
In [8]:
def basic_policy(obs):
    # Hard-coded policy: push the cart in the direction the pole is leaning.
    angle = obs[2]
    return 0 if angle < 0 else 1

totals = []
for episode in range(500):
    episode_rewards = 0
    obs = env.reset()
    for step in range(1000):  # at most 1000 steps per episode
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards += reward
        if done:
            break
    totals.append(episode_rewards)

import numpy as np
print(np.mean(totals), np.std(totals), np.min(totals), np.max(totals))
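The same rollout loop reappears almost verbatim in the evaluation cell at the end of the notebook. As a sketch (the name run_episode is mine, not part of the original notebook), it can be factored into a helper that works with any policy function, assuming the classic Gym API used here (reset() returns the observation, step() returns a 4-tuple).
In [ ]:
# Hypothetical helper (not in the original notebook): run one episode with a
# given policy function and return the total undiscounted reward.
def run_episode(env, policy, n_max_steps=1000, render=False):
    obs = env.reset()
    episode_rewards = 0
    for step in range(n_max_steps):
        if render:
            env.render()
        obs, reward, done, info = env.step(policy(obs))
        episode_rewards += reward
        if done:
            break
    return episode_rewards

# e.g. [run_episode(env, basic_policy) for _ in range(500)] reproduces the loop above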
In [9]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected

learning_rate = 0.01
n_inputs = 4    # size of the observation vector
n_hidden = 4
n_outputs = 1   # probability of taking action 0 (push left)

initializer = tf.contrib.layers.variance_scaling_initializer()

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = fully_connected(X, n_hidden, activation_fn=tf.nn.elu, weights_initializer=initializer)
logits = fully_connected(hidden, n_outputs, activation_fn=None, weights_initializer=initializer)
outputs = tf.nn.sigmoid(logits)                        # p(action 0)
prob = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(prob), num_samples=1)   # sample an action from the policy

# Treat the sampled action as if it were the correct one: the target probability
# of action 0 is 1.0 when action 0 was sampled, 0.0 otherwise.
y = 1.0 - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)

optimizer = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(cross_entropy)
print(grads_and_vars)
gradients = [grad for grad, var in grads_and_vars]

# Feed reward-weighted, averaged gradients back in through placeholders instead of
# letting the optimizer apply the raw per-step gradients.
gradient_placeholders = []
grads_and_vars_feed = []
for grad, var in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=var.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, var))
training_op = optimizer.apply_gradients(grads_and_vars_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

def discount_rewards(rewards, dis_rate):
    # Discounted return at each step: reward plus discounted future rewards.
    discounted_rewards = np.empty(len(rewards))
    cumulative_rewards = 0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * dis_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards

def discount_and_normalize_rewards(all_rewards, dis_rate):
    # Normalize the discounted returns across all episodes in the batch.
    all_dis_rewards = [discount_rewards(rewards, dis_rate) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_dis_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(rewards - reward_mean) / reward_std for rewards in all_dis_rewards]
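A quick worked check of discount_rewards (the rewards below are made up purely for illustration): with a discount rate of 0.95, each step's return is its reward plus 0.95 times the next step's return.
In [ ]:
# Illustrative input only: three rewards, the last strongly negative.
discount_rewards([10, 0, -50], dis_rate=0.95)
# step 2: -50
# step 1:   0 + 0.95 * (-50)   = -47.5
# step 0:  10 + 0.95 * (-47.5) = -35.125
# -> array([-35.125, -47.5, -50.])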
In [19]:
n_iters = 101
n_max_steps = 1000
n_games_per_update = 10   # train on batches of 10 episodes
save_iters = 10
dis_rate = 0.95

with tf.Session() as sess:
    # On a first run, initialize the variables instead of restoring:
    # init.run()
    saver.restore(sess, './my_dnn_pg.ckpt')   # continue training from a saved checkpoint
    for i in range(n_iters):
        all_rewards = []     # one list of rewards per episode
        all_gradients = []   # one list of per-step gradients per episode
        for game in range(n_games_per_update):
            cur_rewards = []
            cur_gradients = []
            obs = env.reset()
            for step in range(n_max_steps):
                # Sample an action and record the gradients that would make it more likely.
                action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                cur_rewards.append(reward)
                cur_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(cur_rewards)
            all_gradients.append(cur_gradients)

        # Replace raw rewards with normalized discounted returns (the action "scores").
        all_rewards = discount_and_normalize_rewards(all_rewards, dis_rate)
        feed_dict = {}
        for var_index, grad_placeholder in enumerate(gradient_placeholders):
            # Average each variable's gradients over all steps of all episodes,
            # weighting each step by its normalized return.
            mean_gradients = np.mean(
                [reward * all_gradients[game_index][step][var_index]
                 for game_index, rewards in enumerate(all_rewards)
                 for step, reward in enumerate(rewards)],
                axis=0)
            feed_dict[grad_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if i % save_iters == 0:
            print(i)
            saver.save(sess, './my_dnn_pg.ckpt')
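One way to write what this cell computes: since the sigmoid cross-entropy with the sampled action as the label equals $-\log \pi_\theta(a_t \mid s_t)$, the fed-in gradient for the batch of episodes is

$$
g \;=\; \frac{1}{\sum_e T_e}\sum_{e}\sum_{t=1}^{T_e} \hat{A}_{e,t}\,\nabla_\theta\!\left[-\log \pi_\theta(a_{e,t}\mid s_{e,t})\right],
$$

where $\hat{A}_{e,t}$ is the normalized discounted return stored in all_rewards. The optimizer then takes an (Adam) descent step along $g$, which amounts to ascent on the return-weighted log-likelihood, i.e. the REINFORCE update.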
In [ ]:
sess = tf.Session()
saver.restore(sess, './my_dnn_pg.ckpt')

def dnn_policy(obs):
    # Sample an action from the trained policy network.
    action_val = sess.run(action, feed_dict={X: obs.reshape(1, n_inputs)})
    return action_val[0][0]

totals = []
for episode in range(20):
    episode_rewards = 0
    obs = env.reset()
    for step in range(2000):
        env.render()
        action_val = dnn_policy(obs)
        obs, reward, done, info = env.step(action_val)
        episode_rewards += reward
        if done:
            break
    print(episode, episode_rewards)
    totals.append(episode_rewards)

print(np.mean(totals), np.std(totals), np.min(totals), np.max(totals))
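After the rendered evaluation it is worth releasing resources explicitly; env.close() and sess.close() are the standard Gym and TF 1.x calls for this.
In [ ]:
# Optional cleanup: close the render window and release the TF session.
env.close()
sess.close()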
In [ ]: