In [2]:
import gym
import numpy as np
In [3]:
env = gym.make("Taxi-v2")
In [37]:
env.reset() # reset the env; returns the initial (random) state as an integer
Out[37]:
In [38]:
env.observation_space.n # number of possible values in this state space
Out[38]:
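Taxi-v2 has 500 discrete states: 25 taxi positions on the 5x5 grid, times 5 passenger locations (the four marked cells plus inside the taxi), times 4 destinations. A quick sanity check of that factorisation (a minimal sketch based on the standard Taxi-v2 layout):
In [ ]:
assert 5 * 5 * 5 * 4 == env.observation_space.n # 25 taxi positions * 5 passenger locations * 4 destinations = 500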
In [39]:
env.action_space.n # number of possible actions
# print(env.action_space)
# 0 = down
# 1 = up
# 2 = right
# 3 = left
# 4 = pickup
# 5 = drop-off
Out[39]:
In [40]:
env.render()
# In this environment the yellow square represents the taxi, the pipe character ("|") represents a wall, the blue letter marks the pick-up location, and the purple letter marks the drop-off location. The taxi turns green when it has a passenger aboard.
In [57]:
env.env.s = 114 # manually set the underlying env to the encoded state 114
env.render()
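State 114 is just an encoded integer. The underlying TaxiEnv (reached through env.env, as above) provides a decode helper that unpacks it into taxi row, taxi column, passenger-location index and destination index. A minimal sketch, assuming gym's internal decode method is available in this version:
In [ ]:
# unpack the encoded state into its components
taxi_row, taxi_col, passenger_loc, destination = env.env.decode(114)
print(taxi_row, taxi_col, passenger_loc, destination)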
In [60]:
state, reward, done, info = env.step(1) # take action 1 (up)
In [59]:
env.render()
The environment is considered solved when you successfully pick up a passenger and drop them off at their desired location. Taxi-v2's reward scheme is -1 per step, -10 for an illegal pickup or drop-off, and +20 for a successful drop-off, so upon solving it you receive a reward of 20 and done equals True.
In [63]:
def taxiRandomSearch(env):
    """ Randomly pick an action and keep guessing until the env is solved
    :param env: Gym Taxi-v2 env
    :return: number of steps required to solve the Gym Taxi-v2 env
    """
    state = env.reset()
    stepCounter = 0
    reward = None
    while reward != 20:  # reward 20 means that the env has been solved
        state, reward, done, info = env.step(env.action_space.sample())
        stepCounter += 1
    return stepCounter
In [75]:
print(taxiRandomSearch(env))
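The step count varies wildly from run to run, so averaging over a few runs gives a better feel for how inefficient random search is. A small sketch:
In [ ]:
# average the number of steps over ten random-search episodes
print(np.mean([taxiRandomSearch(env) for _ in range(10)]))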
In short, the problem is solved many times in a row (each run is called an episode), and after every step the Q-table (the agent's memory) is updated so that later episodes can be solved more efficiently.
In [83]:
Q = np.zeros([env.observation_space.n, env.action_space.n]) # Q-table (memory): stores the estimated value of taking each action in each state
G = 0 # accumulated reward for each episode
alpha = 0.618 # learning rate
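The update applied in the training loop below is the standard Q-learning (Bellman) update, written here with the discount factor left at 1, exactly as in the code:

Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \max_{a'} Q(s', a') - Q(s, a) \right]

where s is the current state, a the chosen action, r the reward received, and s' the resulting state.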
In [84]:
Q[114]
Out[84]:
In [85]:
def taxiQlearning(env):
    """ basic Q-learning algo
    :param env: Gym Taxi-v2 env
    :return: None
    """
    for episode in range(1, 1001):
        stepCounter = 0
        done = False
        G, reward = 0, 0
        state = env.reset()
        while not done:
            action = np.argmax(Q[state])  # 1: find action with highest value/reward at the given state
            state2, reward, done, info = env.step(action)  # 2: take that 'best action' and store the future state
            Q[state, action] += alpha * (reward + np.max(Q[state2]) - Q[state, action])  # 3: update the q-value using the Bellman equation
            G += reward
            state = state2
            stepCounter += 1
        if episode % 50 == 0:
            print('Episode {} Total Reward: {}'.format(episode, G))
            print('Steps required for this episode: %i' % stepCounter)
In [86]:
taxiQlearning(env)
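After training, the Q-table can be used as a greedy policy: from any reset state, repeatedly take the action with the highest Q-value. A minimal evaluation sketch (the helper name taxiGreedyRun is not from the original notebook; Taxi-v2's built-in 200-step limit guarantees the loop ends even if the policy is imperfect):
In [ ]:
def taxiGreedyRun(env):
    """ Follow the learned greedy policy for one episode and return the step count """
    state = env.reset()
    done = False
    steps = 0
    while not done:
        state, reward, done, info = env.step(np.argmax(Q[state]))
        steps += 1
    return steps

print(taxiGreedyRun(env))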
In [88]:
print(Q[14])
In [ ]: