In [1]:
import gym
In [2]:
env = gym.make('CartPole-v0')
Check the dimensions of the action and observation spaces
In [3]:
env.action_space, env.observation_space
Out[3]:
Check observation space boundaries
In [4]:
list(zip(env.observation_space.low, env.observation_space.high))
Out[4]:
I would guess the dimensions are: cart position, cart velocity, pole angle, and pole angular velocity.
We will use a linear model to map observation states to actions (the policy), and perform random search over its weights to find the best parameters.
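In formula form (my notation): the policy is $a = \mathbb{1}[\,w^\top s > 0\,]$, where $s \in \mathbb{R}^4$ is the observation and $w$ is a randomly sampled weight vector.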
In [5]:
import numpy as np
from tqdm import tqdm
In [6]:
N_MODELS = 10000 # models to try
MAX_STEPS = 200 # steps per episode
Because the action space is discrete, we only need the sign of the linear output, so the weights can be sampled uniformly in [-1, 1]. However, if we wanted to use a bias term, it should presumably be sampled on a scale matching the boundaries of the observation space (?)
In [7]:
# generate model weights: [n_models, n_params]
models = 2 * np.random.random([N_MODELS, env.observation_space.shape[0]]) - 1 # uniform in [-1, +1]
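One way to sidestep the bias-scale question (a sketch of my own, not used in the rest of this notebook) is to fold the bias into the weights by appending a constant 1 to the observation, so the bias gets sampled on the same [-1, 1] scale as the other weights:

# hypothetical variant: bias folded into the weights via an augmented state
n_obs = env.observation_space.shape[0]
models_with_bias = 2 * np.random.random([N_MODELS, n_obs + 1]) - 1  # last column acts as the bias

def augmented_action(weights, state):
    s = np.append(state, 1.0)  # constant feature for the bias
    return int(np.dot(weights, s) > 0)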
In [8]:
class LinearAgent(object):
    def __init__(self, weights=None):
        self._weights = weights

    @property
    def weights(self):
        return self._weights

    @weights.setter
    def weights(self, new_weights):
        self._weights = new_weights

    def action(self, state):
        a = np.dot(self._weights, state)
        return int(a > 0)
In [9]:
def play_episode(env, agent, max_steps=MAX_STEPS):
    observations = []
    rewards = []
    o = env.reset()
    for t in range(max_steps):
        a = agent.action(o)
        o, r, done, info = env.step(a)
        rewards.append(r)
        observations.append(o)
        if done:
            break
    return observations, rewards
In [10]:
def evaluate_agent(env, agent, n_experiments=20, max_steps=MAX_STEPS):
    episode_rewards = []
    for e in range(n_experiments):
        observations, rewards = play_episode(env, agent, max_steps)
        avg_reward = np.array(rewards).sum() / max_steps
        episode_rewards.append(avg_reward)
    episode_rewards = np.array(episode_rewards)
    return episode_rewards.sum(), episode_rewards.mean()
In [11]:
model_metrics = np.zeros((models.shape[0], 2))  # total and mean reward for each model
for i, m in tqdm(enumerate(models)):
    agent = LinearAgent(m)
    model_metrics[i, :] = evaluate_agent(env, agent)
In [12]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [13]:
plt.hist(model_metrics[:,1], bins=50)
plt.title('Distribution of expected cumulative reward')
plt.xlabel('mean reward')
plt.show()
In [14]:
print('fraction of successful models:', (model_metrics[:,1] == 1.0).mean())
Sample observations using random policies
In [15]:
observations = []
for m_id in np.random.randint(0, models.shape[0], size=200):
    agent = LinearAgent(models[m_id])
    os, rs = play_episode(env, agent)
    observations.extend(os)
observations = np.array(observations)
Assign actions to each observation using the best-performing model
In [16]:
# pick the best model
m_id = model_metrics.argmax(axis=0)[1]
weights = models[m_id]
In [17]:
# assign actions
actions = np.dot(observations, weights)
actions = (actions > 0).astype(int)
In [18]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2, random_state=0)
observations_pca = pca.fit_transform(observations)
In [19]:
plt.scatter(observations_pca[:,0], observations_pca[:,1], s=3, c=actions)
plt.title('PCA projection of the observation space')
plt.show()
In [20]:
n_experiments = 500
noise_scale = 1.0
experiment_steps = np.zeros(n_experiments)
for i in tqdm(range(n_experiments)):
    old_weights = 2 * np.random.random(env.observation_space.shape) - 1  # uniform in [-1, +1]
    agent = LinearAgent()
    r_best = 0.0
    for step in range(MAX_STEPS):
        noise = np.random.randn(*old_weights.shape)
        new_weights = old_weights + noise_scale * noise
        agent.weights = new_weights
        r_total, r_mean = evaluate_agent(env, agent)
        if r_mean == 1.0:
            experiment_steps[i] = step + 1
            break
        if r_mean > r_best:
            r_best = r_mean
            old_weights = new_weights
In [21]:
plt.hist(experiment_steps, bins=40)
plt.title('Distribution of steps for hill-climbing algorithm')
plt.xlabel('steps')
plt.show()
The '0' bin counts runs where the algorithm did not converge within MAX_STEPS weight updates
In [22]:
import tensorflow as tf
In [23]:
tf.reset_default_graph()
In [24]:
n_state_dims = env.observation_space.shape[0]
n_action_dims = env.action_space.n  # Discrete(2) -> 2
n_state_dims, n_action_dims
Out[24]:
build graph
In [25]:
with tf.name_scope('policy'):
    states_in = tf.placeholder(tf.float32, [None, n_state_dims], name='states')
    W = tf.Variable(tf.truncated_normal([n_state_dims, n_action_dims], stddev=0.1), name='W')
    # b = tf.Variable(tf.constant(0.0, shape=[n_action_dims]), name='b')
    logits = tf.matmul(states_in, W)
    probs = tf.nn.softmax(logits, name='probabilities')
In [26]:
with tf.name_scope('policy_gradient'):
    actions_in = tf.placeholder(tf.int32, [None], name='actions')
    rewards_in = tf.placeholder(tf.float32, [None], name='cumulative_rewards')
    actions_one_hot = tf.one_hot(actions_in, n_action_dims)
    # because the actions are mutually exclusive it should be ok to use cross-entropy
    entropies = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=actions_one_hot)
    weighted_entropies = tf.multiply(entropies, rewards_in)
    loss = tf.reduce_sum(weighted_entropies, name='loss')
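For reference, this construction is the usual REINFORCE objective: the cross-entropy term equals $-\log \pi_W(a_t \mid s_t)$ for the action actually taken, so minimizing

$$L = \sum_t -\log \pi_W(a_t \mid s_t)\, R_t$$

(with $R_t$ the discounted future return fed in as cumulative_rewards) is equivalent to following the policy-gradient estimate $\sum_t \nabla_W \log \pi_W(a_t \mid s_t)\, R_t$.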
In [27]:
with tf.name_scope('optimizer'):
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = tf.train.AdamOptimizer(0.003).minimize(loss, global_step=global_step)
create session
In [28]:
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)
In [29]:
def policy(state):
    "Return sampled action given state"
    a_probs = sess.run(probs, feed_dict={states_in: [state]})[0]
    a = np.random.choice(2, p=a_probs)
    return a

def policy_grad_descent(states, actions, cumulative_rewards):
    "Run single gradient update and return loss"
    feed_dict = {states_in: states, actions_in: actions, rewards_in: cumulative_rewards}
    _, loss_val = sess.run([train_op, loss], feed_dict=feed_dict)
    return loss_val
In [30]:
def play_policy(env, policy, n_trajectories=20, n_steps=MAX_STEPS):
    "Generate trajectories given policy"
    trajectories = []
    for i in range(n_trajectories):
        s = env.reset()
        trajectory = []
        for step in range(n_steps):
            a = policy(s)
            new_s, r, done, info = env.step(a)
            trajectory.append([s, a, r])
            if done:
                break
            s = new_s
        trajectories.append(trajectory)
    return trajectories
In [31]:
def calc_cumulative_rewards(rewards, gamma=0.99):
    """Given rewards at each step calculate
    discounted cumulative future reward for each step"""
    cumulative_rewards = np.zeros_like(rewards)
    discounts = np.power(gamma, np.arange(len(rewards)))
    for l in range(len(rewards)):
        future_rewards = rewards[l:]
        discounted = (future_rewards * discounts[:len(future_rewards)])
        cumulative_rewards[l] = discounted.sum()
    return cumulative_rewards

def update_policy(trajectories, policy_grad_descent, n_steps=MAX_STEPS):
    "Run policy gradient descent on each trajectory"
    losses = []
    total_rewards = []
    for trajectory in trajectories:
        states, actions, rewards = zip(*trajectory)
        cumulative_rewards = calc_cumulative_rewards(rewards)
        loss = policy_grad_descent(states, actions, cumulative_rewards)
        losses.append(loss / n_steps)
        total_reward = np.sum(rewards) / n_steps
        total_rewards.append(total_reward)
    return np.mean(losses), np.mean(total_rewards)
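As a quick sanity check of calc_cumulative_rewards on a toy three-step episode (my own example, values worked out by hand: 1 + 0.99 + 0.99^2 = 2.9701, then 1 + 0.99 = 1.99, then 1.0):

# toy check, not part of the training loop
print(calc_cumulative_rewards([1.0, 1.0, 1.0]))  # expect approximately [2.9701, 1.99, 1.0]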
In [32]:
n_epochs = 100
avg_losses = []
avg_rewards = []
for _ in tqdm(range(n_epochs)):
    trajectories = play_policy(env, policy)
    avg_loss, avg_reward = update_policy(trajectories, policy_grad_descent)
    avg_losses.append(avg_loss)
    avg_rewards.append(avg_reward)
    if avg_reward == 1.0:
        print('early stopping')
        break
In [33]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(avg_losses, 'r')
ax2.plot(avg_rewards, 'g')
ax1.set_xlabel('epochs')
ax1.set_ylabel('total loss', color='r')
ax2.set_ylabel('avg rewards', color='g')
plt.show()
As the policy gets better, trajectories get longer, so each epoch's loss is summed over more steps. That is why the total loss increases over time even though the optimizer is minimizing it.
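One way to make the loss curve easier to read (a variation I am only sketching here, not something used above) is to normalize each trajectory's loss by its actual length instead of MAX_STEPS, so the metric no longer grows just because episodes last longer:

# hypothetical variant of update_policy that normalizes by the real trajectory length
def update_policy_normalized(trajectories, policy_grad_descent):
    losses, total_rewards = [], []
    for trajectory in trajectories:
        states, actions, rewards = zip(*trajectory)
        cumulative_rewards = calc_cumulative_rewards(rewards)
        loss = policy_grad_descent(states, actions, cumulative_rewards)
        losses.append(loss / len(trajectory))  # per-step loss
        total_rewards.append(np.sum(rewards) / MAX_STEPS)
    return np.mean(losses), np.mean(total_rewards)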
In [ ]: