In [ ]:
import sys, os
if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):
    !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash
    !touch .setup_complete
# This code creates a virtual display to draw game images on.
# It will have no effect if your machine has a monitor.
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    os.environ['DISPLAY'] = ':1'
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
from collections import defaultdict
import random
import math
import numpy as np
class QLearningAgent:
    def __init__(self, alpha, epsilon, discount, get_legal_actions):
        """
        Q-Learning Agent
        based on https://inst.eecs.berkeley.edu/~cs188/sp19/projects.html

        Instance variables you have access to
          - self.epsilon (exploration prob)
          - self.alpha (learning rate)
          - self.discount (discount rate aka gamma)

        Functions you should use
          - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable}
            which returns legal actions for a state
          - self.get_qvalue(state, action)
            which returns Q(state, action)
          - self.set_qvalue(state, action, value)
            which sets Q(state, action) := value

        !!!Important!!!
        Note: please avoid using self._qvalues directly.
        There's a special self.get_qvalue/set_qvalue for that.
        """
        self.get_legal_actions = get_legal_actions
        self._qvalues = defaultdict(lambda: defaultdict(lambda: 0))
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount

    def get_qvalue(self, state, action):
        """ Returns Q(state, action) """
        return self._qvalues[state][action]

    def set_qvalue(self, state, action, value):
        """ Sets the Q-value for [state, action] to the given value """
        self._qvalues[state][action] = value

    #---------------------START OF YOUR CODE---------------------#

    def get_value(self, state):
        """
        Compute your agent's estimate of V(s) using current q-values:
        V(s) = max_over_action Q(state, action) over possible actions.
        Note: please take into account that q-values can be negative.
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return 0.0
        if len(possible_actions) == 0:
            return 0.0

        <YOUR CODE>

        return value

    def update(self, state, action, reward, next_state):
        """
        You should do your Q-value update here:
           Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))
        """
        # agent parameters
        gamma = self.discount
        learning_rate = self.alpha

        <YOUR CODE>

        self.set_qvalue(state, action, <YOUR CODE: Q-value>)

    def get_best_action(self, state):
        """
        Compute the best action to take in a state (using current q-values).
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return None
        if len(possible_actions) == 0:
            return None

        <YOUR CODE>

        return best_action

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.

        With probability self.epsilon, we should take a random action,
        otherwise the best policy action (self.get_best_action).

        Note: To pick randomly from a list, use random.choice(list).
              To pick True or False with a given probability, generate a uniform
              number in [0, 1] and compare it with your probability.
        """
        # Pick Action
        possible_actions = self.get_legal_actions(state)
        action = None

        # If there are no legal actions, return None
        if len(possible_actions) == 0:
            return None

        # agent parameters:
        epsilon = self.epsilon

        <YOUR CODE>

        return chosen_action
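For reference, the cell below sketches one possible way to complete the four methods. It is a minimal sketch, not necessarily the intended solution: it subclasses the QLearningAgent defined above (the name SketchQLearningAgent is only for illustration) and implements the greedy state value, the one-step Q-learning update, and the epsilon-greedy action choice described in the docstrings.
In [ ]:
class SketchQLearningAgent(QLearningAgent):
    # A reference sketch, not the official solution.
    def get_value(self, state):
        possible_actions = self.get_legal_actions(state)
        if len(possible_actions) == 0:
            return 0.0
        # V(s) = max_a Q(s, a); correct even when all q-values are negative
        return max(self.get_qvalue(state, a) for a in possible_actions)

    def update(self, state, action, reward, next_state):
        gamma = self.discount
        learning_rate = self.alpha
        # Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))
        target = reward + gamma * self.get_value(next_state)
        new_q = (1 - learning_rate) * self.get_qvalue(state, action) + learning_rate * target
        self.set_qvalue(state, action, new_q)

    def get_best_action(self, state):
        possible_actions = self.get_legal_actions(state)
        if len(possible_actions) == 0:
            return None
        # argmax over q-values; ties go to the first action found
        return max(possible_actions, key=lambda a: self.get_qvalue(state, a))

    def get_action(self, state):
        possible_actions = self.get_legal_actions(state)
        if len(possible_actions) == 0:
            return None
        # explore with probability epsilon, otherwise exploit
        if random.random() < self.epsilon:
            return random.choice(list(possible_actions))
        return self.get_best_action(state)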
In [ ]:
import gym
env = gym.make("Taxi-v3")
n_actions = env.action_space.n
In [ ]:
agent = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,
                       get_legal_actions=lambda s: range(n_actions))
In [ ]:
def play_and_train(env, agent, t_max=10**4):
"""
This function should
- run a full game, actions given by agent's e-greedy policy
- train agent using agent.update(...) whenever it is possible
- return total reward
"""
total_reward = 0.0
s = env.reset()
for t in range(t_max):
# get agent to pick action given state s.
a = <YOUR CODE>
next_s, r, done, _ = env.step(a)
# train (update) agent for state s
<YOUR CODE>
s = next_s
total_reward += r
if done:
break
return total_reward
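For reference, a typical interaction loop looks like the sketch below (assuming the agent interface above; the name play_and_train_sketch is used only to avoid clobbering your own version, and the old gym step API with four return values is assumed).
In [ ]:
def play_and_train_sketch(env, agent, t_max=10**4):
    """Run one episode, updating the agent after every transition; return the total reward."""
    total_reward = 0.0
    s = env.reset()
    for t in range(t_max):
        a = agent.get_action(s)            # epsilon-greedy action
        next_s, r, done, _ = env.step(a)   # old gym API: (next_state, reward, done, info)
        agent.update(s, a, r, next_s)      # one-step Q-learning update
        s = next_s
        total_reward += r
        if done:
            break
    return total_reward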
In [ ]:
from IPython.display import clear_output
rewards = []
for i in range(1000):
    rewards.append(play_and_train(env, agent))
    agent.epsilon *= 0.99

    if i % 100 == 0:
        clear_output(True)
        print('eps =', agent.epsilon, 'mean reward =', np.mean(rewards[-10:]))
        plt.plot(rewards)
        plt.show()
Use the agent to train efficiently on CartPole-v0.
This environment has a continuous set of possible states, so you will have to group them into bins somehow.
The simplest way is to use round(x, n_digits) (or np.round) to round a real number to a given number of digits.
The tricky part is picking the right n_digits for each state dimension so that training is effective.
Note that you don't need to convert the state to integers; you just need a hashable tuple of values of any kind.
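For example, a raw CartPole observation can be turned into a hashable key as in the cell below (the single digit of rounding and the example values are arbitrary, purely to illustrate the idea).
In [ ]:
raw_state = np.array([0.0312, -0.4581, 0.0271, 0.7350])  # a made-up CartPole observation
binned = tuple(np.round(raw_state, 1))  # round every dimension to one digit
print(binned)  # a hashable tuple, so it can be used as a key in the q-value table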
In [ ]:
env = gym.make("CartPole-v0")
n_actions = env.action_space.n
print("first state:%s" % (env.reset()))
plt.imshow(env.render('rgb_array'))
In [ ]:
all_states = []
for _ in range(1000):
    all_states.append(env.reset())
    done = False
    while not done:
        s, r, done, _ = env.step(env.action_space.sample())
        all_states.append(s)
        if done:
            break
all_states = np.array(all_states)
for obs_i in range(env.observation_space.shape[0]):
    plt.hist(all_states[:, obs_i], bins=20)
    plt.show()
In [ ]:
from gym.core import ObservationWrapper
class Binarizer(ObservationWrapper):
    def observation(self, state):
        # hint: you can do this with round(x, n_digits)
        # you may pick a different n_digits for each dimension
        state = <YOUR CODE: round the state to some number of digits>
        return tuple(state)
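One possible way to fill in observation() is sketched below. The per-dimension digit counts are a guess, not a recommended setting; tune them against the state histograms plotted below.
In [ ]:
from gym.core import ObservationWrapper

class SketchBinarizer(ObservationWrapper):
    # illustrative digit counts for (position, velocity, angle, angular velocity)
    N_DIGITS = (0, 1, 2, 1)

    def observation(self, state):
        # round each component to its own number of digits and return a hashable tuple
        return tuple(round(float(x), n) for x, n in zip(state, self.N_DIGITS))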
In [ ]:
env = Binarizer(gym.make("CartPole-v0").env)
In [ ]:
all_states = []
for _ in range(1000):
    all_states.append(env.reset())
    done = False
    while not done:
        s, r, done, _ = env.step(env.action_space.sample())
        all_states.append(s)
        if done:
            break
all_states = np.array(all_states)
for obs_i in range(env.observation_space.shape[0]):
    plt.hist(all_states[:, obs_i], bins=20)
    plt.show()
Now let's train a policy that uses the binarized state space.
Tips:
It can be helpful to track how many distinct states the agent has seen (len(QLearningAgent._qvalues)), but this is not required.
In [ ]:
agent = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,
                       get_legal_actions=lambda s: range(n_actions))
In [ ]:
rewards = []
for i in range(10000):
    rewards.append(play_and_train(env, agent))

    # OPTIONAL: <YOUR CODE: adjust epsilon>

    if i % 100 == 0:
        clear_output(True)
        print('eps =', agent.epsilon, 'mean reward =', np.mean(rewards[-10:]))
        plt.plot(rewards)
        plt.show()
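For the optional epsilon adjustment, one common pattern is a multiplicative decay with a lower bound, as in the sketch below (the 0.99 factor and the 0.01 floor are arbitrary illustrative values, not tuned ones).
In [ ]:
# inside the training loop, one possible epsilon schedule:
# decay exploration over time, but keep a small floor so the agent never stops exploring
agent.epsilon = max(agent.epsilon * 0.99, 0.01)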