In [1]:
    
# %load /Users/facai/Study/book_notes/preconfig.py
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
#sns.set(font='SimHei', font_scale=2.5)
#plt.rcParams['axes.grid'] = False
import numpy as np
import pandas as pd
#pd.options.display.max_rows = 20
import logging
logging.basicConfig()
logger = logging.getLogger()
from IPython.display import Image
import enum
    
In [2]:
    
Image('./res/iterative_policy_evaluation.png')
    
    Out[2]:
In [3]:
    
Image('./res/ex4_1.png')
    
    Out[3]:
In [4]:
    
class Action(enum.Enum):
    EAST = enum.auto()
    WEST = enum.auto()
    SOUTH = enum.auto()
    NORTH = enum.auto()
    
    @staticmethod
    def move(x, y, action):
        # (x, y) = (row, column); east/west change the column, south/north the row
        if action == Action.EAST:
            return x, y + 1
        elif action == Action.WEST:
            return x, y - 1
        elif action == Action.SOUTH:
            return x + 1, y
        elif action == Action.NORTH:
            return x - 1, y
            
            
class GridWorld(object):
    def move(self, s, action):
        # 4x4 gridworld of Example 4.1: states 0 and 15 are terminal,
        # every other move costs -1, and moving off the grid leaves the state unchanged
        if s == 0 or s == 15:
            return s, 0
        elif 0 < s < 15:
            x = s // 4
            y = s % 4
            x1, y1 = Action.move(x, y, action)
            if 0 <= x1 < 4 and 0 <= y1 < 4:
                s1 = x1 * 4 + y1
                return s1, -1
            else:
                return s, -1
        else:
            raise ValueError('s {} must be in [0, 15]'.format(s))
        
        
class RandomPolicy(object):
    
    def __init__(self, grid_world):
        self._grid_world = grid_world
        
        self._v = np.zeros((4, 4))
        # ravel() returns a view, so writing to _v_flatten updates _v in place
        self._v_flatten = self._v.ravel()
        
        self._delta = 0
    
    def iterate(self):
        # one in-place sweep of iterative policy evaluation over all states
        v = self._v.copy()
        
        for s in range(0, 16):
            self.update_value(s)
        
        # largest change of any state value during this sweep
        self._delta = np.max(np.abs(v - self._v))
        
        return self._v.copy()
    
    def get_pi(self, s):
        # equiprobable random policy: pi(a | s) = 0.25 for each of the four actions
        return [(0.25, (s, a)) for a in [Action.EAST,
                                         Action.WEST,
                                         Action.SOUTH,
                                         Action.NORTH]]
    
    def update_value(self, s):
        # V(s) = \sum_a \pi(a | s) \sum_{s', r} p(s', r | s, a) [r + \gamma V(s')]
        # with deterministic transitions (p = 1) and \gamma = 1
        vs = []
        for (prob, (s0, a)) in self.get_pi(s):
            s1, r = self._grid_world.move(s0, a)
            vs.append(prob * (r + self._v_flatten[s1]))
        logger.debug('vs: {}'.format(vs))
        self._v_flatten[s] = np.sum(vs)
    
In [5]:
    
# logger.setLevel(logging.DEBUG)
r = RandomPolicy(GridWorld())
for _ in range(100):
    r.iterate()
pd.DataFrame(np.round(r.iterate()))
    
    Out[5]:
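A fixed number of sweeps works here, but one can instead stop once successive sweeps change the value function by less than a small tolerance. A minimal sketch, assuming a hypothetical threshold theta (not part of the cell above):
In [ ]:
    
# sweep until successive value functions differ by less than theta
r = RandomPolicy(GridWorld())
theta = 1e-4
v_old = np.zeros((4, 4))
while True:
    v_new = r.iterate()
    if np.max(np.abs(v_new - v_old)) < theta:
        break
    v_old = v_new
pd.DataFrame(np.round(v_new))
    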
policy improvement theorem: if $q_\pi(s, \pi'(s)) \geq v_\pi(s)$ for all $s \in \mathcal{S}$, then $v_{\pi'}(s) \geq v_\pi(s)$ for all $s \in \mathcal{S}$.
=> policy improvement: take the new greedy policy $\pi'(s) = \operatorname{arg \, max}_a q_\pi(s, a)$, which by the theorem is at least as good as $\pi$ (see the sketch below).
If there are ties in the policy improvement step, each maximizing action can be given a share of the probability of being selected in the new greedy policy.
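A minimal sketch of this greedy improvement step for the gridworld above, using the random-policy values just computed ($\gamma = 1$, deterministic transitions); greedy_policy is a hypothetical helper, not the book's pseudocode:
In [ ]:
    
def greedy_policy(grid_world, v):
    # one-step lookahead: q(s, a) = r + gamma * V(s'), with gamma = 1
    v_flat = v.ravel()
    policy = {}
    for s in range(16):
        if s in (0, 15):              # terminal states: nothing to choose
            policy[s] = None
            continue
        q = {}
        for a in Action:
            s1, reward = grid_world.move(s, a)
            q[a] = reward + v_flat[s1]
        best = max(q.values())
        # ties: keep every maximizing action (they share the probability mass)
        policy[s] = [a for a, qa in q.items() if np.isclose(qa, best)]
    return policy

greedy_policy(GridWorld(), r.iterate())
    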
In [6]:
    
Image('./res/fig4_1.png')
    
    Out[6]:
policy iteration:
\begin{align*} \pi_0 \overset{E}{\longrightarrow} v_{\pi_0} \overset{I}{\longrightarrow} \pi_1 \overset{E}{\longrightarrow} v_{\pi_1} \overset{I}{\longrightarrow} \pi_2 \overset{E}{\longrightarrow} \cdots \overset{I}{\longrightarrow} \pi_\ast \overset{E}{\longrightarrow} v_{\pi_\ast} \end{align*}
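A minimal sketch of this evaluation/improvement loop on the same gridworld ($\gamma = 1$, deterministic moves); evaluate and improve are hypothetical helpers, and evaluation is truncated to a fixed number of sweeps since a poor policy may never reach a terminal state:
In [ ]:
    
def evaluate(grid_world, policy, sweeps=100):
    # truncated iterative policy evaluation for a deterministic policy, gamma = 1
    v = np.zeros(16)
    for _ in range(sweeps):
        for s in range(1, 15):                    # states 0 and 15 are terminal
            s1, r = grid_world.move(s, policy[s])
            v[s] = r + v[s1]
    return v

def improve(grid_world, v):
    # greedy one-step lookahead over the four actions
    policy = {}
    for s in range(1, 15):
        best_a, best_q = None, -np.inf
        for a in Action:
            s1, r = grid_world.move(s, a)
            if r + v[s1] > best_q:
                best_a, best_q = a, r + v[s1]
        policy[s] = best_a
    return policy

world = GridWorld()
policy = {s: Action.NORTH for s in range(1, 15)}  # arbitrary initial policy
for _ in range(5):                                # E -> I, repeated
    v = evaluate(world, policy)
    policy = improve(world, v)
pd.DataFrame(np.round(v.reshape(4, 4)))
    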
In [7]:
    
Image('./res/policy_iteration.png')
    
    Out[7]:
In [8]:
    
Image('./res/fig4_2.png')
    
    Out[8]:
In [9]:
    
Image('./res/value_iteration.png')
    
    Out[9]:
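A minimal sketch of value iteration on the same gridworld, turning the Bellman optimality equation into a sweep-by-sweep update ($\gamma = 1$, deterministic moves); theta is a hypothetical stopping tolerance:
In [ ]:
    
world = GridWorld()
v = np.zeros(16)
theta = 1e-4
while True:
    delta = 0.0
    for s in range(1, 15):                        # states 0 and 15 are terminal
        old = v[s]
        # Bellman optimality update: V(s) <- max_a [r + gamma * V(s')]
        v[s] = max(rew + v[s1]
                   for s1, rew in (world.move(s, a) for a in Action))
        delta = max(delta, abs(old - v[s]))
    if delta < theta:
        break
pd.DataFrame(np.round(v.reshape(4, 4)))
    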
In [10]:
    
Image('./res/gpi.png')
    
    Out[10]:
In [ ]: