Example 6.5: Windy Gridworld https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node64.html#fig:windy
field: 7 x 10
move (action index -> direction):
      0 = up
1 = left    3 = right
      2 = down
In [1]:
import numpy as np
import matplotlib.pyplot as plt
class WindyGridworld(object):
    """Sarsa (on-policy TD control) agent for the Windy Gridworld.

    The grid has ``n_rows`` x ``n_cols`` cells addressed as ``(y, x)`` with
    row 0 at the top.  Actions are encoded as 0 = up, 1 = left, 2 = down,
    3 = right.  Every move is additionally shifted upwards by the wind
    strength of the column the agent is *leaving*.

    Attributes:
        Q: per-instance array of shape ``(n_rows, n_cols, 4)`` holding the
            action values.  Actions that would step off the grid are fixed
            at ``-inf`` so they are never selected.
        epsilon: probability of taking an exploratory (random) action.
        alpha: learning rate of the Sarsa update.
        gamma: discount factor.
    """

    n_rows = 7
    n_cols = 10
    # Upward wind strength, one entry per column.
    wind_strength = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]
    position_init = (3, 0)
    goal = (3, 7)
    # Reward received on every transition, including the one into the goal.
    step_reward = -1
    # action index -> (row delta, column delta)
    _deltas = {0: (-1, 0), 1: (0, -1), 2: (1, 0), 3: (0, 1)}

    def __init__(self):
        self.epsilon = 0.2
        self.alpha = 0.1
        self.gamma = 1
        # Each instance owns its table; a class-level array would be shared
        # (and overwritten) by every agent that gets constructed.
        # Values are ~0 but distinct, so argmax breaks ties randomly.
        self.Q = np.random.uniform(
            high=np.finfo(float).tiny,
            size=(self.n_rows, self.n_cols, len(self._deltas)),
        )
        # Forbid actions that would leave the grid.
        self.Q[0, :, 0] = -np.inf   # up from the top row
        self.Q[-1, :, 2] = -np.inf  # down from the bottom row
        self.Q[:, 0, 1] = -np.inf   # left from the first column
        self.Q[:, -1, 3] = -np.inf  # right from the last column

    def choose(self, position):
        """Pick an action for ``position`` epsilon-greedily.

        With probability ``epsilon`` a uniformly random *legal* action
        (finite Q value) is returned, otherwise the greedy one.

        Args:
            position: ``(y, x)`` cell of the agent.

        Returns:
            The action index as an ``int`` in ``0..3``.
        """
        y, x = position
        values = self.Q[y, x]
        if np.random.random() < self.epsilon:
            legal = np.flatnonzero(np.isfinite(values))
            return int(np.random.choice(legal))
        return int(np.argmax(values))

    def move(self, action, position):
        """Return the cell reached by taking ``action`` from ``position``.

        The wind of the column being left pushes the agent upwards; the
        result is clamped to the grid so the returned cell is always valid.

        Args:
            action: action index (0 = up, 1 = left, 2 = down, 3 = right).
            position: ``(y, x)`` cell of the agent.

        Returns:
            The new ``(y, x)`` cell.

        Raises:
            KeyError: if ``action`` is not one of the four action indices.
        """
        y, x = position
        dy, dx = self._deltas[action]
        new_y = y + dy - self.wind_strength[x]
        new_x = x + dx
        return (
            min(max(new_y, 0), self.n_rows - 1),
            min(max(new_x, 0), self.n_cols - 1),
        )

    def upk(self):
        """Run one Sarsa episode from ``position_init`` to ``goal``.

        ``Q`` is updated in place after every step.

        Returns:
            The list of ``(action, position)`` pairs taken, in order.  The
            goal cell itself is not appended; the last pair is the move
            that entered it.
        """
        episode = []
        position = self.position_init
        action = self.choose(position=position)
        while True:
            episode.append((action, position))
            y, x = position
            next_position = self.move(action=action, position=position)
            if next_position == self.goal:
                # Terminal transition: the goal has value 0, so the TD
                # target is just the step reward.
                self.Q[y, x, action] += self.alpha * (
                    self.step_reward - self.Q[y, x, action]
                )
                return episode
            next_action = self.choose(position=next_position)
            next_y, next_x = next_position
            target = (
                self.step_reward
                + self.gamma * self.Q[next_y, next_x, next_action]
            )
            self.Q[y, x, action] += self.alpha * (
                target - self.Q[y, x, action]
            )
            action = next_action
            position = next_position
In [2]:
# Train for 100000 episodes, shrinking the exploration rate a little after
# each one so the policy becomes progressively greedier.
wg = WindyGridworld()
for _ in range(100000):  # range works on Python 2 and 3 (xrange is 2-only)
    episode = wg.upk()
    wg.epsilon *= 0.9999
In [3]:
# Show the (action, position) trajectory of the last training episode.
print(episode)
In [4]:
# Display the exploration rate left after the per-episode decay.
wg.epsilon
Out[4]: