Q-Learning: Tabular


In [97]:
## Frozen Lake 4x4 environment ('FrozenLake-v0'). Reach the goal without falling into a hole

In [2]:
from __future__ import division, print_function
import gym
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
env = gym.make('FrozenLake-v0')


[2017-04-14 10:01:55,106] Making new env: FrozenLake-v0

In [4]:
type(env)


Out[4]:
gym.wrappers.time_limit.TimeLimit

In [103]:
def choose_egreedy(q_state, env_action_space, eps):
    '''
    q_state : Action-value vector Q[s, :] for the current state
    env_action_space : The environment's action space (used to sample a random action)
    eps : Probability of choosing a random action
    '''
    ## Adapted from https://github.com/ShangtongZhang/reinforcement-learning-an-introduction/blob/master/chapter06/CliffWalking.py
    ## With probability eps choose a random action: a single Bernoulli trial
    ## (np.random.binomial with one trial and success probability eps),
    ## i.e. a coin flip that lands heads with probability eps.
    if np.random.binomial(1, eps) == 1:
        ## Explore: sample a random action from the action space
        return env_action_space.sample()
    else:
        ## Exploit: greedy action with respect to the current estimates
        return np.argmax(q_state)

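A quick sanity check (a minimal sketch, assuming the choose_egreedy definition and the env created above): with eps = 0.4 and the four FrozenLake actions, the greedy action should be selected roughly 1 - eps + eps/4 = 70% of the time, and each of the other actions about 10% of the time.

In [ ]:
## Count how often each action is picked for a toy value vector where action 2 is best
q_toy = np.array([0.1, 0.2, 0.9, 0.0])
counts = np.zeros(env.action_space.n)
for _ in range(10000):
    counts[choose_egreedy(q_toy, env.action_space, eps=0.4)] += 1
print(counts / counts.sum())
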
In [123]:
## State-action values, initialised to zero
Q = np.zeros((env.observation_space.n, env.action_space.n))
## Step size
alpha = .5
## Discount parameter
gamma = .99
## Number of episodes to simulate
n_episodes = 30000
## Epsilon
eps = 0.4
steps_episode = np.zeros(n_episodes)
for episode in range(n_episodes):
    ## Get first state
    state = env.reset()
    done = False
    steps = 0
    while not done:
        ## Choose action A from state S derived from Q (epsilon-greedily)
        action = choose_egreedy(Q[state, :], env.action_space, eps)
        steps += 1
        ## Take action and observe next state and reward
        next_state, reward, done, info = env.step(action)
        ## Q-learning update: move Q(S,A) towards the bootstrapped target R + gamma * max_a Q(S',a)
        Q[state, action] += alpha*(reward + gamma*np.max(Q[next_state, :]) - Q[state, action])
        state = next_state
    steps_episode[episode] = steps

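The steps-per-episode curve alone is hard to interpret on FrozenLake, since an episode can also end early by falling into a hole, so it is worth evaluating the learned policy directly. The cell below is a minimal sketch, assuming the Q table trained above: it runs fresh episodes acting fully greedily (eps = 0) and reports the fraction that reach the goal (final reward of 1).

In [ ]:
## Evaluate the greedy policy derived from Q over a number of fresh episodes
n_eval = 1000
successes = 0
for _ in range(n_eval):
    state = env.reset()
    done = False
    while not done:
        state, reward, done, info = env.step(np.argmax(Q[state, :]))
    successes += reward  ## reward is 1 only when the goal is reached
print('Greedy policy success rate: {:.3f}'.format(successes / n_eval))
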
In [124]:
plt.plot(steps_episode)


Out[124]:
[<matplotlib.lines.Line2D at 0x7f7a22d03650>]
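
The raw per-episode step counts are quite noisy; a simple moving average (a sketch below, where the window size of 500 is an arbitrary choice) makes the trend easier to read as Q converges.

In [ ]:
## Smooth the per-episode step counts with a simple moving average
window = 500
smoothed = np.convolve(steps_episode, np.ones(window) / window, mode='valid')
plt.plot(smoothed)
plt.xlabel('Episode')
plt.ylabel('Steps per episode (moving average)')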
