Example 6.6: Cliff Walking

https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node65.html#fig:cliff

field: 4 x 12; start at (3, 0), goal at (3, 11), cliff along (3, 1)..(3, 10)
move:   0        0 = up
      1   3      1 = left, 3 = right
        2        2 = down

In [1]:
import numpy as np
import matplotlib.pyplot as plt

class CliffWalking(object):

    position_init = (3,  0)
    position_goal = (3, 11)

    # Cliff cells occupy the bottom row between start and goal.
    cliff = np.zeros((4, 12), dtype=int)
    cliff[3, 1:11] = 1

    def __init__(self):
        self.epsilon = 0.1  # exploration rate
        self.alpha = 0.1    # step size
        self.gamma = 1      # undiscounted task

        self.init()

    def init(self):
        # Tiny random positive values break ties among untried actions.
        self.Q = np.random.uniform(size=(4, 12, 4), high=np.finfo(float).tiny)
        self.Q[self.position_goal[0], self.position_goal[1]] = 1
        # Mask actions that would leave the grid with -inf so they are
        # never chosen greedily and are rejected during exploration.
        self.Q[ 0, :, 0] = -np.inf  # no "up" from the top row
        self.Q[-1, :, 2] = -np.inf  # no "down" from the bottom row
        self.Q[:,  0, 1] = -np.inf  # no "left" from the first column
        self.Q[:, -1, 3] = -np.inf  # no "right" from the last column
    
    def choose(self, position):
        # epsilon-greedy selection; when exploring, resample until an
        # action that stays on the grid (finite Q) is drawn.
        y, x = position
        if np.random.binomial(1, self.epsilon) == 1:
            while True:
                action = np.random.randint(4)
                if np.isfinite(self.Q[y, x, action]):
                    break
        else:
            action = np.argmax(self.Q[y, x])
        return action

    def move(self, action, position):
        # 0 = up, 1 = left, 2 = down, 3 = right
        y, x = position
        return {
            0: (y-1, x  ),
            1: (y  , x-1),
            2: (y+1, x  ),
            3: (y  , x+1),
        }[action]

    def greedy(self):
        # Roll out the current greedy policy from the start, recording
        # (action, state) pairs until the goal or the cliff is reached.
        episode = []
        y, x = self.position_init
        while True:
            action = np.argmax(self.Q[y, x])
            episode.append((action, (y, x)))
            y_, x_ = self.move(action=action, position=(y, x))
            if (y_, x_) == self.position_goal:
                return episode
            if self.cliff[y_, x_]:
                return episode
            y, x = y_, x_

    def upk_sarsa(self):
        # One episode of on-policy Sarsa:
        # Q(s,a) += alpha*(r + gamma*Q(s',a') - Q(s,a)).
        # Rewards: +1 for reaching the goal, -100 for stepping into the
        # cliff (both terminal here), -1 per ordinary step.
        y, x = self.position_init
        action = self.choose(position=(y, x))
        while True:
            y_, x_ = self.move(action=action, position=(y, x))
            if (y_, x_) == self.position_goal:
                self.Q[y, x, action] += self.alpha*(1 - self.Q[y, x, action])
                return 1
            if self.cliff[y_, x_]:
                self.Q[y, x, action] += self.alpha*(-100 - self.Q[y, x, action])
                return -100
            action_ = self.choose(position=(y_, x_))
            self.Q[y, x, action] += self.alpha*(-1 + self.gamma*self.Q[y_, x_, action_] - self.Q[y, x, action])
            action = action_
            y, x = y_, x_

    def upk_QL(self):
        # One episode of off-policy Q-learning:
        # Q(s,a) += alpha*(r + gamma*max_a' Q(s',a') - Q(s,a)).
        # Same rewards as upk_sarsa.
        y, x = self.position_init
        while True:
            action = self.choose(position=(y, x))
            y_, x_ = self.move(action=action, position=(y, x))
            if (y_, x_) == self.position_goal:
                self.Q[y, x, action] += self.alpha*(1 - self.Q[y, x, action])
                return 1
            if self.cliff[y_, x_]:
                self.Q[y, x, action] += self.alpha*(-100 - self.Q[y, x, action])
                return -100
            self.Q[y, x, action] += self.alpha*(-1 + self.gamma*np.max(self.Q[y_, x_]) - self.Q[y, x, action])
            y, x = y_, x_

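The greedy trajectories below are easier to read on the grid itself. A small helper, not part of the original notebook (the show_path name is my own), that prints the (action, state) pairs returned by greedy():

In [ ]:
def show_path(episode, shape=(4, 12)):
    # Mark each visited cell with the action taken there; C marks the cliff.
    grid = np.full(shape, '.', dtype='U1')
    grid[CliffWalking.cliff.astype(bool)] = 'C'
    for action, (y, x) in episode:
        grid[y, x] = 'ULDR'[action]  # 0=up, 1=left, 2=down, 3=right
    grid[CliffWalking.position_goal] = 'G'
    for row in grid:
        print(''.join(row))
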
In [5]:
cw = CliffWalking()
for _ in range(10000):
    cw.upk_sarsa()
cw.greedy()


Out[5]:
[(0, (3, 0)),
 (0, (2, 0)),
 (3, (1, 0)),
 (3, (1, 1)),
 (3, (1, 2)),
 (3, (1, 3)),
 (3, (1, 4)),
 (3, (1, 5)),
 (3, (1, 6)),
 (3, (1, 7)),
 (3, (1, 8)),
 (3, (1, 9)),
 (3, (1, 10)),
 (2, (1, 11)),
 (2, (2, 11))]
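
Sarsa's greedy route climbs to row 1 and keeps one row of clearance above the cliff: the "safe" path from the book's figure. Its on-policy updates price in the occasional epsilon-greedy slip off the edge, so the cells next to the cliff look worse.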

In [6]:
cw = CliffWalking()
for _ in range(10000):
    cw.upk_QL()
cw.greedy()


Out[6]:
[(0, (3, 0)),
 (3, (2, 0)),
 (3, (2, 1)),
 (3, (2, 2)),
 (3, (2, 3)),
 (3, (2, 4)),
 (3, (2, 5)),
 (3, (2, 6)),
 (3, (2, 7)),
 (3, (2, 8)),
 (3, (2, 9)),
 (3, (2, 10)),
 (2, (2, 11))]
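
Q-learning's greedy route instead hugs the cliff along row 2: the shorter "optimal" path, since its max-backup evaluates the greedy policy and ignores the exploration risk.

The two learners can also be compared during training, as in the book's online-performance figure. upk_sarsa and upk_QL return only the terminal reward (+1 or -100), but that is enough to see how often each method falls off the cliff while learning. A minimal sketch, not part of the original notebook (the run and episode counts are arbitrary choices):

In [ ]:
def terminal_rewards(update, runs=20, episodes=500):
    # Terminal reward per episode, averaged over independent runs.
    out = []
    for _ in range(runs):
        cw = CliffWalking()
        out.append([getattr(cw, update)() for _ in range(episodes)])
    return np.mean(out, axis=0)

plt.plot(terminal_rewards('upk_sarsa'), label='Sarsa')
plt.plot(terminal_rewards('upk_QL'), label='Q-learning')
plt.xlabel('episode')
plt.ylabel('terminal reward (avg over 20 runs)')
plt.legend()
plt.show()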