Q-Learning: Tabular

``````

In [97]:

## Frozen Lake environment. Reach the goal without falling. (Note: 'FrozenLake-v0' used below is the 4x4 map; 'FrozenLake8x8-v0' is the 8x8 variant.)

``````
``````

In [2]:

import gym
import numpy as np
import matplotlib as mpl
import matplotlib.pylab as plt
from __future__ import division,print_function
%matplotlib inline

``````
``````

In [3]:

env = gym.make('FrozenLake-v0')

``````
``````

[2017-04-14 10:01:55,106] Making new env: FrozenLake-v0

``````
``````

In [4]:

type(env)

``````
``````

Out[4]:

gym.wrappers.time_limit.TimeLimit

``````
``````

In [103]:

def choose_egreedy(q_state, env_action_space, eps):
    '''
    Epsilon-greedy action selection.

    q_state          : Action-value vector Q[state, :] for the current state.
    env_action_space : The environment's action space; must provide .sample().
    eps              : Probability of choosing a uniformly random action.

    Returns the greedy action (argmax of q_state) with probability 1 - eps,
    otherwise an action sampled at random from the action space.
    '''
    ## Based on https://github.com/ShangtongZhang/reinforcement-learning-an-introduction/blob/master/chapter06/CliffWalking.py
    ## With probability eps choose a random action: a single Bernoulli trial
    ## (binomial with one trial, success probability eps).
    ## np.random.binomial returns the number of successes over the trials —
    ## think of it as flipping a coin whose probability of heads is eps.
    if np.random.binomial(1, eps) == 1:
        ## Explore: sample a random action from the action space
        return env_action_space.sample()
    else:
        ## Exploit: pick the action with the highest estimated value
        return np.argmax(q_state)

``````
``````

In [123]:

## Tabular Q-learning on the FrozenLake environment.
## State-action values, initialised to zero for every (state, action) pair.
## NOTE: this initialisation was commented out in the original cell, which
## only worked because of stale notebook kernel state — it must run here.
Q = np.zeros((env.observation_space.n, env.action_space.n))
## Step size (learning rate)
alpha = .5
## Discount parameter
gamma = .99
## Number of episodes to simulate
n_episodes = 30000
## Epsilon: exploration probability for the epsilon-greedy policy
eps = 0.4
## Number of steps taken in each episode (recorded for plotting/diagnostics)
steps_episode = np.zeros(n_episodes)
## range instead of Python-2-only xrange, consistent with the __future__
## imports at the top of the file
for episode in range(n_episodes):
    ## Get first state
    state = env.reset()
    done = False
    steps = 0
    while not done:
        ## Choose action from S (state) derived from Q (epsilon-greedily)
        action = choose_egreedy(Q[state, :], env.action_space, eps)
        steps += 1
        ## Take action and observe next state and reward
        next_state, reward, done, info = env.step(action)
        ## Q-learning update: move Q(s, a) toward the bootstrapped target
        ## r + gamma * max_a' Q(s', a')
        Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state, :]) - Q[state, action])
        state = next_state
    steps_episode[episode] = steps

``````
``````

In [124]:

## Plot the number of steps taken per episode to inspect learning progress
plt.plot(steps_episode)

``````
``````

Out[124]:

[<matplotlib.lines.Line2D at 0x7f7a22d03650>]

``````
``````

In [ ]:

``````