Example 6.5: Windy Gridworld https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node64.html#fig:windy

field: 7 x 10 
move:   0
      1   3
        2

In [1]:
import numpy as np
import matplotlib.pyplot as plt

class WindyGridworld(object):

    wind_strength = [0,0,0,1,1,1,2,2,1,0,0]

    Q = np.zeros((7, 10, 4))

    position_init = (3, 0)
    goal = (3, 7)

    def __init__(self):
        self.epsilon = 0.2
        self.alpha = 0.1
        self.gamma = 1

        self.Q[:] = np.random.uniform(size=self.Q.shape, high=np.finfo(np.float).tiny)
        self.Q[ 0,:,0] = -np.inf
        self.Q[-1,:,2] = -np.inf
        self.Q[:, 0,1] = -np.inf
        self.Q[:,-1,3] = -np.inf

    def choose(self, position):
        y, x = position
        if np.random.binomial(1, self.epsilon) == 1:
            while True:
                action = np.random.randint(4)
                if np.isfinite(self.Q[y,x,action]):
                    break
        else:
            action = np.argmax(self.Q[y,x])
        return action

    def move(self, action, position):
        ws = self.wind_strength
        y, x = position
        return {
            0: (max(0,-ws[x  ]+y-1), x  ),
            1: (max(0,-ws[x-1]+y  ), x-1),
            2: (max(0,-ws[x  ]+y+1), x  ),
            3: (max(0,-ws[x+1]+y  ), x+1),
        }[action]

    def upk(self):
        episode = []
        position = self.position_init
        action = self.choose(position=position)
        while True:
            episode.append((action, position))
            y , x  = position
            y_, x_ = self.move(action=action, position=position)
            if (y_, x_) == self.goal:
                self.Q[y, x, action] += self.alpha*(self.gamma - self.Q[y, x, action])
                return episode
            action_ = self.choose(position=(y_, x_))
            self.Q[y, x, action] += self.alpha*(-1+self.gamma*self.Q[y_, x_, action_] - self.Q[y, x, action])
            action = action_
            position = y_, x_

In [2]:
wg = WindyGridworld()
for _ in xrange(100000):
    episode = wg.upk()
    wg.epsilon *= 0.9999

In [3]:
print episode


[(3, (3, 0)), (3, (3, 1)), (3, (3, 2)), (3, (2, 3)), (3, (1, 4)), (3, (0, 5)), (3, (0, 6)), (3, (0, 7)), (3, (0, 8)), (2, (0, 9)), (2, (1, 9)), (2, (2, 9)), (2, (3, 9)), (2, (4, 9)), (2, (5, 9)), (1, (6, 9)), (1, (5, 8))]

In [4]:
wg.epsilon


Out[4]:
9.07544679180226e-06