In [1]:
import gym

env = gym.make('CartPole-v0')
obs = env.reset()

# cart position, cart velocity, pole angle, pole angular velocity
obs


Out[1]:
array([ 0.01421779, -0.00525222, -0.01279807, -0.04138449])
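
The observation layout can be cross-checked against the environment's declared spaces (a standard Gym check, not part of the original run):

In [ ]:
# CartPole-v0: 4-dimensional continuous observations, 2 discrete actions
print(env.observation_space)   # expected: Box(4,)
print(env.action_space)        # expected: Discrete(2)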

In [2]:
# render the current frame as an RGB array instead of opening a window
img = env.render(mode='rgb_array')

In [3]:
img.shape


Out[3]:
(400, 600, 3)

In [4]:
# action 1 pushes the cart to the right (action 0 pushes it to the left)
obs, reward, done, info = env.step(1)
obs


Out[4]:
array([ 0.01411275,  0.1900509 , -0.01362576, -0.33807769])

In [5]:
reward


Out[5]:
1.0

In [6]:
done


Out[6]:
False

In [7]:
info


Out[7]:
{}

In [8]:
def basic_policy(obs):
    # push left (0) when the pole leans left, right (1) when it leans right
    angle = obs[2]
    return 0 if angle < 0 else 1

import numpy as np

totals = []
for episode in range(500):
    episode_rewards = 0
    obs = env.reset()
    for step in range(1000):
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards += reward
        if done:
            break
    totals.append(episode_rewards)
print(np.mean(totals), np.std(totals), np.min(totals), np.max(totals))


41.74 9.012679956594486 24.0 70.0

In [9]:
import tensorflow as tf

from tensorflow.contrib.layers import fully_connected

learning_rate = 0.01

n_inputs = 4
n_hidden = 4
n_outputs = 1

initializer = tf.contrib.layers.variance_scaling_initializer()

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = fully_connected(X, n_hidden, activation_fn=tf.nn.elu, weights_initializer=initializer)
logits = fully_connected(hidden, n_outputs, activation_fn=None, weights_initializer=initializer)
outputs = tf.nn.sigmoid(logits)   # estimated probability of action 0 (push left)

# probabilities of [action 0, action 1]; sample one action from them
prob = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(prob), num_samples=1)

# target label: act as if the sampled action were the correct one
y = 1.0 - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)

optimizer = tf.train.AdamOptimizer(learning_rate)

# compute the gradients now, but apply them later, after scaling by the discounted rewards
grads_and_vars = optimizer.compute_gradients(cross_entropy)

print(grads_and_vars)
gradients = [grad for grad, var in grads_and_vars]

# placeholders used to feed the mean, reward-weighted gradients back in
gradient_placeholders = []
grads_and_vars_feed = []
for grad, var in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=var.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, var))
training_op = optimizer.apply_gradients(grads_and_vars_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

def discount_rewards(rewards, dis_rate):
    # walk backwards through the episode, accumulating discounted returns
    discounted_rewards = np.empty(len(rewards))
    cumulative_rewards = 0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * dis_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards

def discount_and_normalize_rewards(all_rewards, dis_rate):
    # discount each episode, then normalize across all episodes combined
    all_dis_rewards = [discount_rewards(rewards, dis_rate) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_dis_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(rewards - reward_mean) / reward_std for rewards in all_dis_rewards]


WARNING:tensorflow:From /home/han/anaconda3/envs/tf-gpu/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From <ipython-input-9-4391138a82c7>:20: multinomial (from tensorflow.python.ops.random_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.random.categorical instead.
WARNING:tensorflow:From <ipython-input-9-4391138a82c7>:22: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
[(<tf.Tensor 'gradients/fully_connected/MatMul_grad/tuple/control_dependency_1:0' shape=(4, 4) dtype=float32>, <tf.Variable 'fully_connected/weights:0' shape=(4, 4) dtype=float32_ref>), (<tf.Tensor 'gradients/fully_connected/BiasAdd_grad/tuple/control_dependency_1:0' shape=(4,) dtype=float32>, <tf.Variable 'fully_connected/biases:0' shape=(4,) dtype=float32_ref>), (<tf.Tensor 'gradients/fully_connected_1/MatMul_grad/tuple/control_dependency_1:0' shape=(4, 1) dtype=float32>, <tf.Variable 'fully_connected_1/weights:0' shape=(4, 1) dtype=float32_ref>), (<tf.Tensor 'gradients/fully_connected_1/BiasAdd_grad/tuple/control_dependency_1:0' shape=(1,) dtype=float32>, <tf.Variable 'fully_connected_1/biases:0' shape=(1,) dtype=float32_ref>)]
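
As a quick sanity check on the discounting helpers (not part of the original run), a tiny hand-checkable example:

In [ ]:
# with dis_rate = 0.8: -50, then 0 + 0.8*(-50) = -40, then 10 + 0.8*(-40) = -22
discount_rewards([10, 0, -50], dis_rate=0.8)   # expected: array([-22., -40., -50.])

# normalization subtracts the mean and divides by the std of all discounted
# rewards across the whole batch of episodes
discount_and_normalize_rewards([[10, 0, -50], [10, 20]], dis_rate=0.8)

The deprecation warnings above also point at newer names for two of the ops used here. A minimal, unexecuted substitution sketch (same behaviour, assuming TF 1.13+):

In [ ]:
# drop-in replacements suggested by the deprecation warnings (sketch, not run here)
action = tf.random.categorical(tf.log(prob), num_samples=1)   # instead of tf.multinomial
y = 1.0 - tf.cast(action, tf.float32)                         # instead of tf.to_float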

In [19]:
n_iters = 101
n_max_steps = 1000
n_games_per_update = 10
save_iters = 10
dis_rate = 0.95

with tf.Session() as sess:
    # first run: init.run(); later runs resume from the saved checkpoint
    #init.run()
    saver.restore(sess, './my_dnn_pg.ckpt')
    for i in range(n_iters):
        all_rewards = []
        all_gradients = []
        # play a batch of games with the current policy, recording rewards and gradients
        for game in range(n_games_per_update):
            cur_rewards = []
            cur_gradients = []
            obs = env.reset()
            for step in range(n_max_steps):
                action_val, gradients_val = sess.run([action, gradients],
                                                     feed_dict={X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                cur_rewards.append(reward)
                cur_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(cur_rewards)
            all_gradients.append(cur_gradients)
        # weight each step's gradients by its normalized discounted return, then average
        all_rewards = discount_and_normalize_rewards(all_rewards, dis_rate)
        feed_dict = {}
        for var_index, grad_placeholder in enumerate(gradient_placeholders):
            mean_gradients = np.mean(
                [reward * all_gradients[game_index][step][var_index]
                    for game_index, rewards in enumerate(all_rewards)
                    for step, reward in enumerate(rewards)],
                axis=0)
            feed_dict[grad_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if i % save_iters == 0:
            print(i)
            saver.save(sess, './my_dnn_pg.ckpt')


INFO:tensorflow:Restoring parameters from ./my_dnn_pg.ckpt
0
10
20
30
40
50
60
70
80
90
100

In [ ]:
import numpy as np

sess = tf.Session()
saver.restore(sess, './my_dnn_pg.ckpt')

def dnn_policy(obs):
    # pick an action by sampling from the trained policy network
    action_val = sess.run(action, feed_dict={X: obs.reshape(1, n_inputs)})
    return action_val[0][0]

totals = []
for episode in range(20):
    episode_rewards = 0
    obs = env.reset()
    for step in range(2000):
        env.render()
        action_val = dnn_policy(obs)
        obs, reward, done, info = env.step(action_val)
        episode_rewards += reward
        if done:
            break
    print(episode, episode_rewards)
    totals.append(episode_rewards)
print(np.mean(totals), np.std(totals), np.min(totals), np.max(totals))


INFO:tensorflow:Restoring parameters from ./my_dnn_pg.ckpt
0 200.0
1 200.0
2 200.0
3 128.0
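
Not in the original notebook: once evaluation is finished, the render window and the session can be released explicitly:

In [ ]:
env.close()    # closes the window opened by env.render()
sess.close()   # releases the TensorFlow session created above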

In [ ]: