https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node65.html#fig:cliff
field: 4 x 12
move (action encoding, laid out by direction):
      0
    1   3
      2
i.e. 0 = up, 1 = left, 2 = down, 3 = right.
In [1]:
import numpy as np
import matplotlib.pyplot as plt

class CliffWalking(object):
    # 4 x 12 grid: start in the bottom-left corner, goal in the bottom-right,
    # cliff along the bottom row in between.
    position_init = (3, 0)
    position_goal = (3, 11)
    cliff = np.zeros((4, 12), dtype=int)
    cliff[3, 1:11] = 1

    def __init__(self):
        self.epsilon = 0.1
        self.alpha = 0.1
        self.gamma = 1
        self.init()

    def init(self):
        # Tiny random values break ties between actions; actions that would
        # leave the grid are set to -inf so argmax never picks them.
        self.Q = np.random.uniform(size=(4, 12, 4), high=np.finfo(float).tiny)
        self.Q[self.position_goal[0], self.position_goal[1]] = 1
        self.Q[ 0, :, 0] = -np.inf  # no moving up from the top row
        self.Q[-1, :, 2] = -np.inf  # no moving down from the bottom row
        self.Q[:,  0, 1] = -np.inf  # no moving left from the first column
        self.Q[:, -1, 3] = -np.inf  # no moving right from the last column

    def choose(self, position):
        # epsilon-greedy: with probability epsilon pick a random legal action,
        # otherwise pick the greedy one.
        y, x = position
        if np.random.binomial(1, self.epsilon) == 1:
            while True:
                action = np.random.randint(4)
                if np.isfinite(self.Q[y, x, action]):
                    break
        else:
            action = np.argmax(self.Q[y, x])
        return action

    def move(self, action, position):
        # 0 = up, 1 = left, 2 = down, 3 = right
        y, x = position
        return {
            0: (y-1, x  ),
            1: (y  , x-1),
            2: (y+1, x  ),
            3: (y  , x+1),
        }[action]

    def greedy(self):
        # Follow the greedy policy from the start; terminates only if that
        # policy reaches the goal or falls off the cliff.
        episode = []
        y, x = self.position_init
        while True:
            action = np.argmax(self.Q[y, x])
            episode.append((action, (y, x)))
            y_, x_ = self.move(action=action, position=(y, x))
            if (y_, x_) == self.position_goal:
                return episode
            if self.cliff[y_, x_]:
                return episode
            y, x = y_, x_

    def upk_sarsa(self):
        # One episode of Sarsa (on-policy TD control): update Q towards the
        # value of the action actually chosen in the next state.
        y, x = self.position_init
        action = self.choose(position=(y, x))
        while True:
            y_, x_ = self.move(action=action, position=(y, x))
            if (y_, x_) == self.position_goal:
                self.Q[y, x, action] += self.alpha*(1 - self.Q[y, x, action])
                return 1
            if self.cliff[y_, x_]:
                self.Q[y, x, action] += self.alpha*(-100 - self.Q[y, x, action])
                return -100
            action_ = self.choose(position=(y_, x_))
            self.Q[y, x, action] += self.alpha*(-1 + self.gamma*self.Q[y_, x_, action_] - self.Q[y, x, action])
            action = action_
            y, x = y_, x_

    def upk_QL(self):
        # One episode of Q-learning (off-policy TD control): update Q towards
        # the value of the best next action, regardless of the one taken.
        y, x = self.position_init
        while True:
            action = self.choose(position=(y, x))
            y_, x_ = self.move(action=action, position=(y, x))
            if (y_, x_) == self.position_goal:
                self.Q[y, x, action] += self.alpha*(1 - self.Q[y, x, action])
                return 1
            if self.cliff[y_, x_]:
                self.Q[y, x, action] += self.alpha*(-100 - self.Q[y, x, action])
                return -100
            self.Q[y, x, action] += self.alpha*(-1 + self.gamma*np.max(self.Q[y_, x_]) - self.Q[y, x, action])
            y, x = y_, x_
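The episode returned by greedy() is just a list of (action, position) pairs, which is hard to read at a glance. A minimal sketch of a printer that draws the episode as arrows on the grid (the render_path name and the arrow glyphs are my own, not part of the notebook above):
In [ ]:
def render_path(cw, episode):
    # '.' = free cell, 'C' = cliff, 'G' = goal; each step of the episode
    # is drawn as an arrow in the cell it was taken from.
    grid = np.where(cw.cliff == 1, 'C', '.').astype(object)
    grid[cw.position_goal] = 'G'
    arrows = {0: '^', 1: '<', 2: 'v', 3: '>'}
    for action, (y, x) in episode:
        grid[y, x] = arrows[action]
    print('\n'.join(''.join(row) for row in grid))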
In [5]:
cw = CliffWalking()
for _ in range(10000):
    cw.upk_sarsa()
cw.greedy()
Out[5]:
In [6]:
cw = CliffWalking()
for _ in range(10000):
    cw.upk_QL()
cw.greedy()
Out[6]:
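For a rough comparison in the spirit of the figures in the linked chapter, one can also record the terminal reward of every training episode and plot smoothed curves for both methods. A sketch under my own choices (the 100-episode moving average, and terminal reward as a stand-in for the full episode return):
In [ ]:
cw_sarsa, cw_ql = CliffWalking(), CliffWalking()
rewards_sarsa = [cw_sarsa.upk_sarsa() for _ in range(10000)]
rewards_ql = [cw_ql.upk_QL() for _ in range(10000)]

# Moving average so the occasional -100 cliff falls do not swamp the plot.
kernel = np.ones(100) / 100
plt.plot(np.convolve(rewards_sarsa, kernel, mode='valid'), label='Sarsa')
plt.plot(np.convolve(rewards_ql, kernel, mode='valid'), label='Q-learning')
plt.xlabel('episode')
plt.ylabel('terminal reward (100-episode moving average)')
plt.legend()
plt.show()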