This notebook builds on qlearning.ipynb
to implement Expected Value SARSA.
The policy we will use is the epsilon-greedy policy: the agent takes the optimal action with probability $(1-\epsilon)$ and otherwise samples an action uniformly at random. Note that the agent can occasionally pick the optimal action during random sampling by pure chance.
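Under this policy, Expected Value SARSA evaluates the next state by its expected value over actions:

$$V_{\pi}(s) = \sum_{a_i} \pi(a_i \mid s)\, Q(s, a_i), \qquad \pi(a_i \mid s) = \begin{cases} 1 - \epsilon + \frac{\epsilon}{|A|}, & \text{if } a_i \text{ is the greedy action},\\ \frac{\epsilon}{|A|}, & \text{otherwise,} \end{cases}$$

where $|A|$ is the number of legal actions in state $s$.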
In [1]:
#XVFB will be launched if you run on a server
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    os.environ['DISPLAY'] = ':1'
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [3]:
from qlearning import QLearningAgent
class EVSarsaAgent(QLearningAgent):
    """
    An agent that changes some of the Q-learning functions to implement Expected Value SARSA.
    Note: this demo assumes that your implementation of QLearningAgent.update uses get_value(next_state).
    If it doesn't, please add
        def update(self, state, action, reward, next_state):
    and implement it for Expected Value SARSA's V(s')
    """

    def get_value(self, state):
        """
        Returns Vpi for current state under epsilon-greedy policy:
        V_{pi}(s) = sum _{over a_i} {pi(a_i | s) * Q(s, a_i)}

        Hint: all other methods from QLearningAgent are still accessible.
        """
        epsilon = self.epsilon
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return 0.0
        if len(possible_actions) == 0:
            return 0.0

        best_action = self.get_best_action(state)
        n_actions = len(possible_actions)
        pi_random = epsilon / n_actions

        state_value = 0
        for action in possible_actions:
            qvalue = self.get_qvalue(state, action)
            if action == best_action:
                # greedy action: pi(a_i | s) = 1 - epsilon + epsilon / n_actions
                pi = 1 - epsilon + epsilon / n_actions
            else:
                # non-greedy action: pi(a_i | s) = epsilon / n_actions
                pi = pi_random
            # accumulate sum over a_i of pi(a_i | s) * Q(s, a_i)
            state_value += pi * qvalue
        return state_value
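If your QLearningAgent.update does not go through get_value(next_state) as the docstring above assumes, you would need to override update as well. Below is a minimal sketch of such an override, assuming the QLearningAgent from qlearning.ipynb exposes self.alpha, self.discount, get_qvalue and set_qvalue (adjust the names if your implementation differs):

class EVSarsaAgentExplicitUpdate(EVSarsaAgent):
    # Sketch only: assumes alpha, discount, get_qvalue and set_qvalue exist on QLearningAgent.
    def update(self, state, action, reward, next_state):
        gamma = self.discount
        learning_rate = self.alpha
        # TD target: reward plus the discounted expected value of the next state
        # under the epsilon-greedy policy (computed by get_value above).
        td_target = reward + gamma * self.get_value(next_state)
        new_qvalue = (1 - learning_rate) * self.get_qvalue(state, action) + learning_rate * td_target
        self.set_qvalue(state, action, new_qvalue)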
In [4]:
import gym, gym.envs.toy_text
env = gym.envs.toy_text.CliffWalkingEnv()
n_actions = env.action_space.n
print(env.__doc__)
In [5]:
# Our cliffworld has one difference from what's shown in the picture: there is no wall.
# The agent can walk as close to the cliff as it wishes. x: start, T: exit, C: cliff, o: flat ground
env.render()
In [6]:
def play_and_train(env, agent, t_max=10**4):
    """This function should
    - run a full game, actions given by agent.get_action(s)
    - train the agent using agent.update(...) whenever possible
    - return the total reward"""
    total_reward = 0.0
    s = env.reset()

    for t in range(t_max):
        a = agent.get_action(s)
        next_s, r, done, _ = env.step(a)
        agent.update(s, a, r, next_s)

        s = next_s
        total_reward += r
        if done:
            break

    return total_reward
In [7]:
from qlearning import QLearningAgent
agent_sarsa = EVSarsaAgent(alpha=0.25, epsilon=0.2, discount=0.99,
                           get_legal_actions=lambda s: range(n_actions))

agent_ql = QLearningAgent(alpha=0.25, epsilon=0.2, discount=0.99,
                          get_legal_actions=lambda s: range(n_actions))
In [8]:
from IPython.display import clear_output
from pandas import DataFrame
moving_average = lambda x, span=100: DataFrame({'x': np.asarray(x)}).x.ewm(span=span).mean().values

rewards_sarsa, rewards_ql = [], []

for i in range(5000):
    rewards_sarsa.append(play_and_train(env, agent_sarsa))
    rewards_ql.append(play_and_train(env, agent_ql))
    # Note: agent.epsilon stays constant

    if i % 100 == 0:
        clear_output(True)
        print('EVSARSA mean reward =', np.mean(rewards_sarsa[-100:]))
        print('QLEARNING mean reward =', np.mean(rewards_ql[-100:]))
        plt.title("epsilon = %s" % agent_ql.epsilon)
        plt.plot(moving_average(rewards_sarsa), label='ev_sarsa')
        plt.plot(moving_average(rewards_ql), label='qlearning')
        plt.grid()
        plt.legend()
        plt.ylim(-500, 0)
        plt.show()
Let's now see what the algorithms learned by visualizing their actions at every state.
In [9]:
def draw_policy(env, agent):
    """ Prints CliffWalkingEnv policy with arrows. Hard-coded. """
    n_rows, n_cols = env._cliff.shape

    actions = '^>v<'

    for yi in range(n_rows):
        for xi in range(n_cols):
            if env._cliff[yi, xi]:
                print(" C ", end='')
            elif (yi * n_cols + xi) == env.start_state_index:
                print(" X ", end='')
            elif (yi * n_cols + xi) == n_rows * n_cols - 1:
                print(" T ", end='')
            else:
                print(" %s " % actions[agent.get_best_action(yi * n_cols + xi)], end='')
        print()
In [10]:
print("Q-Learning")
draw_policy(env, agent_ql)
print("SARSA")
draw_policy(env, agent_sarsa)
In [11]:
from submit import submit_sarsa
submit_sarsa(rewards_ql, rewards_sarsa, 'tonatiuh_rangel@hotmail.com', 'nyRcvftwV7KAwvaI')
Here are some of the things you can do if you feel like it: