Q-Learning: Tabular


In [97]:
## Frozen Lake 4x4 environment ('FrozenLake-v0'). Reach the goal without falling into a hole

In [2]:
from __future__ import division, print_function
import gym
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
env = gym.make('FrozenLake-v0')


[2017-04-14 10:01:55,106] Making new env: FrozenLake-v0

In [4]:
type(env)


Out[4]:
gym.wrappers.time_limit.TimeLimit

In [103]:
def choose_egreedy(q_state, env_action_space, eps):
    '''
    q_state : Action-value vector Q[s, :] for the current state
    env_action_space : The environment's action space (used to sample a random action)
    eps : Probability of choosing a random action
    '''
    ## Adapted from https://github.com/ShangtongZhang/reinforcement-learning-an-introduction/blob/master/chapter06/CliffWalking.py
    ## With probability eps choose a random action: a single Bernoulli trial
    ## (np.random.binomial with one trial and success probability eps),
    ## i.e. a coin flip that lands heads with probability eps.
    if np.random.binomial(1, eps) == 1:
        ## Explore: sample a random action from the action space
        return env_action_space.sample()
    else:
        ## Exploit: greedy action with respect to the current estimates
        return np.argmax(q_state)

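A quick sanity check (a minimal sketch, assuming the choose_egreedy definition and the env created above): with eps = 0.4 and the four FrozenLake actions, the greedy action should be selected roughly 1 - eps + eps/4 = 70% of the time, and each of the other actions about 10% of the time.

In [ ]:
## Count how often each action is picked for a toy value vector where action 2 is best
q_toy = np.array([0.1, 0.2, 0.9, 0.0])
counts = np.zeros(env.action_space.n)
for _ in range(10000):
    counts[choose_egreedy(q_toy, env.action_space, eps=0.4)] += 1
print(counts / counts.sum())
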
In [123]:
## State-action values, initialised to zero
Q = np.zeros((env.observation_space.n, env.action_space.n))
## Step size
alpha = .5
## Discount parameter
gamma = .99
## Number of episodes to simulate
n_episodes = 30000
## Epsilon
eps = 0.4
steps_episode = np.zeros(n_episodes)
for episode in range(n_episodes):
    ## Get first state
    state = env.reset()
    done = False
    steps = 0
    while not done:
        ## Choose action A from state S derived from Q (epsilon-greedily)
        action = choose_egreedy(Q[state, :], env.action_space, eps)
        steps += 1
        ## Take action and observe next state and reward
        next_state, reward, done, info = env.step(action)
        ## Q-learning update: move Q(S,A) towards the bootstrapped target R + gamma * max_a Q(S',a)
        Q[state, action] += alpha*(reward + gamma*np.max(Q[next_state, :]) - Q[state, action])
        state = next_state
    steps_episode[episode] = steps

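The steps-per-episode curve alone is hard to interpret on FrozenLake, since an episode can also end early by falling into a hole, so it is worth evaluating the learned policy directly. The cell below is a minimal sketch, assuming the Q table trained above: it runs fresh episodes acting fully greedily (eps = 0) and reports the fraction that reach the goal (final reward of 1).

In [ ]:
## Evaluate the greedy policy derived from Q over a number of fresh episodes
n_eval = 1000
successes = 0
for _ in range(n_eval):
    state = env.reset()
    done = False
    while not done:
        state, reward, done, info = env.step(np.argmax(Q[state, :]))
    successes += reward  ## reward is 1 only when the goal is reached
print('Greedy policy success rate: {:.3f}'.format(successes / n_eval))
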
In [124]:
plt.plot(steps_episode)


Out[124]:
[<matplotlib.lines.Line2D at 0x7f7a22d03650>]
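
The raw per-episode step counts are quite noisy; a simple moving average (a sketch below, where the window size of 500 is an arbitrary choice) makes the trend easier to read as Q converges.

In [ ]:
## Smooth the per-episode step counts with a simple moving average
window = 500
smoothed = np.convolve(steps_episode, np.ones(window) / window, mode='valid')
plt.plot(smoothed)
plt.xlabel('Episode')
plt.ylabel('Steps per episode (moving average)')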
