Basic Reinforcement Learning

Simple examples of RL using TensorFlow and OpenAI Gym

Create the CartPole environment


In [1]:
import gym

In [2]:
env = gym.make('CartPole-v0')


[2016-11-01 14:17:07,474] Making new env: CartPole-v0

Check the action and observation space dimensions


In [3]:
env.action_space, env.observation_space


Out[3]:
(Discrete(2), Box(4,))

Check observation space boundaries


In [4]:
list(zip(env.observation_space.low, env.observation_space.high))


Out[4]:
[(-4.7999999999999998, 4.7999999999999998),
 (-3.4028234663852886e+38, 3.4028234663852886e+38),
 (-0.41887902047863906, 0.41887902047863906),
 (-3.4028234663852886e+38, 3.4028234663852886e+38)]

I would guess the dimensions are:

  • cart's horizontal position
  • cart's horizontal velocity
  • pole's angle
  • pole's angular velocity

The random guessing algorithm

We will use a linear model to map observation states to actions (the policy) and perform a random search over its weights to find the best parameters.


In [5]:
import numpy as np
from tqdm import tqdm

In [6]:
N_MODELS = 10000 # models to try
MAX_STEPS = 200  # steps per episode

Because the action space is discrete, we only need the sign of the linear model's output, so the weights can be sampled uniformly in [-1, 1]. However, if we wanted to include a bias term, it would presumably have to be sampled on the scale of the observation space boundaries.
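As an illustration of that last point (a sketch only, not used in the search below), a bias term could be sampled on the scale of the bounded observation dimensions:

# Sketch: sample biases on the scale of the bounded dimensions (position, angle),
# ignoring the velocity dimensions whose bounds are effectively float32 max.
bounded = np.abs(env.observation_space.high) < 1e6
bias_scale = env.observation_space.high[bounded].max()  # ~4.8 for CartPole-v0
biases = np.random.uniform(-bias_scale, bias_scale, size=N_MODELS)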


In [7]:
# generate model weights: [n_models, n_params]
models = 2 * np.random.random([N_MODELS, env.observation_space.shape[0]]) - 1 # uniform in [-1, +1]

In [8]:
class LinearAgent(object):
    def __init__(self, weights=None):
        self._weights = weights
        
    @property
    def weights(self):
        return self._weights

    @weights.setter
    def weights(self, new_weights):
        self._weights = new_weights
        
    def action(self, state):
        a = np.dot(self._weights, state)
        return int(a > 0)

In [9]:
def play_episode(env, agent, max_steps=MAX_STEPS):
    observations = []
    rewards = []
    o = env.reset()
    for t in range(max_steps):
        a = agent.action(o)
        o, r, done, info = env.step(a)
        rewards.append(r)
        observations.append(o)
        if done:
            break
    return observations, rewards

In [10]:
def evaluate_agent(env, agent, n_experiments=20, max_steps=MAX_STEPS):
    episode_rewards = []
    for e in range(n_experiments):
        observations, rewards = play_episode(env, agent, max_steps)
        
        avg_reward = np.array(rewards).sum() / max_steps
        episode_rewards.append(avg_reward)
    
    episode_rewards = np.array(episode_rewards)
    return episode_rewards.sum(), episode_rewards.mean()

In [11]:
model_metrics = np.zeros((models.shape[0], 2)) # total and mean reward for each model
for i, m in tqdm(enumerate(models)):
    agent = LinearAgent(m)
    model_metrics[i, :] = evaluate_agent(env, agent)


10000it [01:23, 120.30it/s]

In [12]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [13]:
plt.hist(model_metrics[:,1], bins=50)
plt.title('Distribution of expected cumulative reward')
plt.xlabel('mean reward')
plt.show()



In [14]:
print('fraction of successful models:', (model_metrics[:,1] == 1.0).mean())


fraction of successful models: 0.0327

Visualize the observation space

Sample observations using random policies


In [15]:
observations = []
for m_id in np.random.randint(0, models.shape[0], size=200):
    agent = LinearAgent(models[m_id])
    os, rs = play_episode(env, agent)
    observations.extend(os)
    
observations = np.array(observations)

Assign actions to each observation using the best-performing model


In [16]:
# pick the best model
m_id = model_metrics.argmax(axis=0)[1]
weights = models[m_id]

In [17]:
# assign actions
actions = np.dot(observations, weights)
actions = (actions > 0).astype(int)

In [18]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2, random_state=0)
observations_pca = pca.fit_transform(observations)

In [19]:
plt.scatter(observations_pca[:,0], observations_pca[:,1], s=3, c=actions)
plt.title('PCA projection of the observation space')
plt.show()


The hill-climbing algorithm


In [20]:
n_experiments = 500
noise_scale = 1.0
experiment_steps = np.zeros(n_experiments)
for i in tqdm(range(n_experiments)):
    old_weights = 2 * np.random.random(env.observation_space.shape) - 1 # uniform in [-1, +1]
    agent = LinearAgent()
    r_best = 0.0
    for step in range(MAX_STEPS):
        noise = np.random.randn(*old_weights.shape)
        new_weights = old_weights + noise_scale * noise
        
        agent.weights = new_weights
        r_total, r_mean = evaluate_agent(env, agent)
        if r_mean == 1.0:
            experiment_steps[i] = step + 1
            break
        if r_mean > r_best:
            r_best = r_mean
            old_weights = new_weights


100%|██████████| 500/500 [02:34<00:00,  3.10it/s]

In [21]:
plt.hist(experiment_steps, bins=40)
plt.title('Distribution of steps for hill-climbing algorithm')
plt.xlabel('steps')
plt.show()


The '0' bin means the run did not converge, i.e. no weight vector reached a mean reward of 1.0 within MAX_STEPS updates.
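As a quick check (a small addition using the experiment_steps array from above, not part of the original run), we can print the fraction of runs that did converge:

# fraction of hill-climbing runs that reached mean reward 1.0 within MAX_STEPS updates
print('converged fraction:', (experiment_steps > 0).mean())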

Policy gradient algorithm


In [22]:
import tensorflow as tf

In [23]:
tf.reset_default_graph()

In [24]:
n_state_dims = env.observation_space.shape[0]
n_action_dims = env.action_space.shape[0]
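# note: env.action_space.shape works for Discrete spaces in this (2016) gym release;
# newer versions expose the number of actions as env.action_space.n instead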
n_state_dims, n_action_dims


Out[24]:
(4, 2)

Build the graph


In [25]:
with tf.name_scope('policy'):
    states_in = tf.placeholder(tf.float32, [None, n_state_dims], name='states')
    W = tf.Variable(tf.truncated_normal([n_state_dims, n_action_dims], stddev=0.1), name='W')
    # b = tf.Variable(tf.constant(0.0, shape=[n_action_dims]), name='b')
    
    logits = tf.matmul(states_in, W)
    probs = tf.nn.softmax(logits, name='probabilities')

In [26]:
with tf.name_scope('policy_gradient'):
    actions_in = tf.placeholder(tf.int32, [None], name='actions')
    rewards_in = tf.placeholder(tf.float32, [None], name='cumulative_rewards')
    
    actions_one_hot = tf.one_hot(actions_in, n_action_dims)
    # because the actions are mutually exclusive it should be ok to use cross-entropy
    entropies = tf.nn.softmax_cross_entropy_with_logits(logits, actions_one_hot)
    weighted_entropies = tf.mul(entropies, rewards_in)
    loss = tf.reduce_sum(weighted_entropies, name='loss')
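In effect this is the REINFORCE surrogate loss: softmax_cross_entropy_with_logits yields -log π(a_t | s_t), so loss = Σ_t R_t · (-log π(a_t | s_t)), where R_t is the discounted cumulative reward fed in through rewards_in. Minimizing it pushes up the log-probability of actions in proportion to the reward that followed them.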

In [27]:
with tf.name_scope('optimizer'):
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = tf.train.AdamOptimizer(0.003).minimize(loss, global_step=global_step)

Create the session


In [28]:
init_op = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init_op)

In [29]:
def policy(state):
    "Return sampled action given state"
    a_probs = sess.run(probs, feed_dict={states_in: [state]})[0]
    a = np.random.choice(2, p=a_probs)
    return a

def policy_grad_descent(states, actions, cumulative_rewards):
    "Run single gradient update and return loss"
    feed_dict = {states_in: states, actions_in: actions, rewards_in: cumulative_rewards}
    _, loss_val = sess.run([train_op, loss], feed_dict=feed_dict)
    return loss_val

In [30]:
def play_policy(env, policy, n_trajectories=20, n_steps=MAX_STEPS):
    "Generate trajectories given policy"
    trajectories = []
    for i in range(n_trajectories):
        s = env.reset()
        trajectory = []
        for step in range(n_steps):
            a = policy(s)
            new_s, r, done, info = env.step(a)
            trajectory.append([s, a, r])
            if done:
                break
            s = new_s
        trajectories.append(trajectory)
    return trajectories

In [31]:
def calc_cumulative_rewards(rewards, gamma=0.99):
    """Given rewards at each step calculate 
    discounted cumulative future reward for each step"""
    cumulative_rewards = np.zeros_like(rewards)
    discounts = np.power(gamma, np.arange(len(rewards)))
    for l in range(len(rewards)):
        future_rewards = rewards[l:]
        discounted = (future_rewards * discounts[:len(future_rewards)])
        cumulative_rewards[l] = discounted.sum()
    return cumulative_rewards

def update_policy(trajectories, policy_grad_descent, n_steps=MAX_STEPS):
    "Run policy gradient descent on each trajectory"
    losses = []
    total_rewards = []
    for trajectory in trajectories:
        states, actions, rewards = zip(*trajectory)
        cumulative_rewards = calc_cumulative_rewards(rewards)
        loss = policy_grad_descent(states, actions, cumulative_rewards)
        losses.append(loss / n_steps)
        total_reward = np.sum(rewards) / n_steps
        total_rewards.append(total_reward)
    return np.mean(losses), np.mean(total_rewards)
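For reference, calc_cumulative_rewards computes the discounted return for each step t: R_t = Σ_{k≥t} γ^{k−t} r_k with γ = 0.99, so earlier actions are credited with all of the (discounted) reward that follows them.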

In [32]:
n_epochs = 100
avg_losses = []
avg_rewards = []
for _ in tqdm(range(n_epochs)):
    trajectories = play_policy(env, policy)
    avg_loss, avg_reward = update_policy(trajectories, policy_grad_descent)
    avg_losses.append(avg_loss)
    avg_rewards.append(avg_reward)
    if avg_reward == 1.0:
        print('early stopping')
        break


 97%|█████████▋| 97/100 [00:48<00:02,  1.31it/s]
early stopping


In [33]:
fig, ax1 = plt.subplots()

ax2 = ax1.twinx()
ax1.plot(avg_losses, 'r')
ax2.plot(avg_rewards, 'g')

ax1.set_xlabel('epochs')
ax1.set_ylabel('total loss', color='r')
ax2.set_ylabel('avg rewards', color='g')

plt.show()


As the policy gets better, trajectories get longer. That is why the total loss increases over time, even though the optimizer's objective is to minimize it.
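One possible tweak (a sketch, not applied above) is to normalize each trajectory's loss by its actual length rather than by the fixed MAX_STEPS, which makes the loss curve less dependent on episode length (though the cumulative rewards themselves still grow as episodes get longer):

# inside update_policy, normalize by the actual trajectory length:
losses.append(loss / len(trajectory))  # instead of loss / n_steps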

