Example 4.3: Gambler's Problem
https://webdocs.cs.ualberta.ca/~sutton/book/code/gambler.lisp
In [2]:
import numpy as np
import matplotlib.pyplot as plt
V = np.zeros((101,))
V[100] = 1
p = np.float(0.45)
def backup_action(s, a):
return p * V[s+a] + (1-p) * V[s-a]
def value_update():
dif = []
for s in xrange(1, 100):
old_V = V[s]
V[s] = max(backup_action(s, a) for a in xrange(1, min(s, 100-s)+1))
dif.append(abs(old_V - V[s]))
return max(dif)
V_h = []
def vi(epsilon=1e-20):
V[0:100] = np.linspace(0, 0.99, 100)
while True:
max_dif = value_update()
# print max_dif
V_h.append(V.copy())
if epsilon > max_dif:
break
def policy(s, epsilon=1e-8):
best_value = -1
for a in xrange(1, min(s, 100-s)+1):
this_value = backup_action(s, a)
if this_value > best_value + epsilon:
best_value = this_value
best_action = a
return best_action
# def policy(s):
# a = range(1, min(s, 100-s)+1)
# i = np.argmax([backup_action(s=s, a=ai) for ai in a])
# return a[i]
In [3]:
vi(epsilon=1e-20)
In [4]:
plt.plot(V - np.linspace(0, 1, 101))
Out[4]:
In [5]:
plt.bar(range(1,100), [policy(s) for s in xrange(1,100)])
Out[5]:
In [6]:
plt.bar(range(1,100), [policy(s, epsilon=0) for s in xrange(1,100)])
Out[6]:
In [7]:
s = 49
plt.plot(range(1, min(s, 100-s)+1), [backup_action(s=s, a=a) for a in xrange(1, min(s, 100-s)+1)])
Out[7]:
In [8]:
p = np.float(0.5)
vi(epsilon=1e-20)
plt.bar(range(1,100), [policy(s) for s in xrange(1,100)])
Out[8]:
In [9]:
p = np.float(0.49)
vi(epsilon=1e-20)
plt.bar(range(1,100), [policy(s) for s in xrange(1,100)])
Out[9]:
In [10]:
p = np.float(0.51)
vi(epsilon=1e-20)
plt.bar(range(1,100), [policy(s) for s in xrange(1,100)])
Out[10]:
In [11]:
plt.plot(V - np.linspace(0, 1, 101))
Out[11]: