In [1]:
# %load /Users/facai/Study/book_notes/preconfig.py
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
#sns.set(font='SimHei', font_scale=2.5)
#plt.rcParams['axes.grid'] = False
import numpy as np
import pandas as pd
#pd.options.display.max_rows = 20
#import sklearn
#import itertools
#import logging
#logger = logging.getLogger()
#from IPython.display import SVG
from IPython.display import Image
In [3]:
Image('./res/fig3_1.png')
Out[3]:
finite MDP: the sets of states, actions and rewards all have a finite number of elements.
\begin{align} & p(s', r \mid s, a) \doteq \operatorname{Pr} \{ S_t = s', R_t = r \mid S_{t-1} = s, A_{t-1} = a \} \\ & \displaystyle \sum_{s' \in \mathcal{S}} \sum_{r \in \mathcal{R}} p(s', r \mid s, a) = 1 \quad \text{, for all $s \in \mathcal{S}$, $a \in \mathcal{A}(s)$} \end{align}
The agent-environment boundary represents the limit of the agent's absolute control, not of its knowledge.
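A quick sanity check (my own sketch, not from the book): encode the dynamics of a hypothetical two-state MDP as a table p[(s, a)] -> {(s', r): probability} and verify that the normalization above holds for every state-action pair.
In [ ]:
# Sketch: a hypothetical two-state MDP with dynamics p(s', r | s, a)
# stored as {(s, a): {(s', r): probability}}.
p = {
    ('s0', 'stay'): {('s0', 0): 1.0},
    ('s0', 'go'):   {('s1', 1): 0.8, ('s0', 0): 0.2},
    ('s1', 'stay'): {('s1', 0): 0.9, ('s0', -1): 0.1},
    ('s1', 'go'):   {('s0', 2): 1.0},
}
# p(s', r | s, a) must sum to 1 over all (s', r) for each (s, a)
for (s, a), outcomes in p.items():
    assert abs(sum(outcomes.values()) - 1.0) < 1e-12, (s, a)
print('all state-action pairs sum to 1')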
In [5]:
# Transition Graph
Image('./res/ex3_3.png')
Out[5]:
Reward design for a maze-escaping task: 0 for escaping from the maze and -1 at all other times (the constant -1 per step pushes the agent to escape as quickly as possible).
With $\gamma = 0.9$, $R_1 = 2$, and all subsequent rewards equal to $7$:
$G_1 = \frac{7}{1 - \gamma} = 70$
$G_0 = R_1 + \gamma G_1 = 2 + 0.9 \times 70 = 65$
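A quick numerical check of these two returns (a sketch; the geometric series is truncated at 1000 terms):
In [ ]:
# Check G_1 and G_0 numerically: gamma = 0.9, R_1 = 2, then an endless run of 7s.
gamma = 0.9
G1 = sum(7 * gamma**k for k in range(1000))   # truncated geometric series ~= 7 / (1 - gamma)
G0 = 2 + gamma * G1
print(G1, G0)                                 # ~70.0, ~65.0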
Bellman equation for $v_\pi$:
\begin{equation} v_\pi(s) \doteq \displaystyle \sum_a \pi(a \mid s) \sum_{s', r} p(s', r \mid s, a) \left [ r + \gamma v_\pi(s') \right ] \quad \text{, for all $s \in \mathcal{S}$} \end{equation}
The value functions $v_\pi$ and $q_\pi$ can be estimated from experience, e.g., by Monte Carlo methods (averaging many sampled returns); a toy sketch follows.
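A minimal Monte Carlo sketch (assuming the hypothetical toy MDP p defined in the sanity-check cell above, and a policy that picks either action with probability 1/2): average many sampled discounted returns to estimate $v_\pi(s_0)$.
In [ ]:
# Sketch: Monte Carlo estimate of v_pi(s0) for the toy MDP p above,
# under the policy that picks 'stay' or 'go' uniformly at random.
import random

gamma = 0.9

def step(s, a):
    """Sample (s', r) from the toy dynamics p."""
    pairs = list(p[(s, a)].items())               # [((s', r), probability), ...]
    s_next, r = random.choices([sr for sr, _ in pairs],
                               weights=[pr for _, pr in pairs])[0]
    return s_next, r

def sampled_return(s, horizon=200):
    """One (truncated) discounted return starting from state s."""
    g, discount = 0.0, 1.0
    for _ in range(horizon):
        a = random.choice(['stay', 'go'])
        s, r = step(s, a)
        g += discount * r
        discount *= gamma
    return g

# average many sampled returns -> estimate of v_pi(s0)
np.mean([sampled_return('s0') for _ in range(5000)])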
In [31]:
# Example 3.5: state-value function of the 5x5 gridworld under the
# equiprobable random policy, gamma = 0.9 (iterative policy evaluation).
from scipy.signal import convolve2d

gamma = 0.9
value = np.zeros((5, 5))
# 4-neighbour kernel: up, down, left, right
kernel = np.array([[0, 1, 0],
                   [1, 0, 1],
                   [0, 1, 0]])
# number of off-grid neighbours of each cell (1 on edges, 2 in corners)
off_grid = 4 - convolve2d(np.ones((5, 5)), kernel, mode='same', boundary='fill', fillvalue=0)

iteration_nums = 100
for _ in range(iteration_nums):
    # sum of in-grid neighbour values (off-grid neighbours contribute 0 here)
    neighbor_sum = convolve2d(value, kernel, mode='same', boundary='fill', fillvalue=0)
    # each action has probability 1/4; an in-grid move gives reward 0,
    # an off-grid move gives reward -1 and leaves the agent where it is
    new_value = (gamma * neighbor_sum + off_grid * (-1.0 + gamma * value)) / 4.0
    # A -> A': every action from A yields +10 and teleports to A'
    new_value[0, 1] = 10 + gamma * value[-1, 1]
    # B -> B': every action from B yields +5 and teleports to B'
    new_value[0, -2] = 5 + gamma * value[2, -2]
    value = new_value
pd.DataFrame(value)
Out[31]:
optimal policy:
\begin{align} v_\ast(s) & \doteq \displaystyle \max_\pi v_\pi(s) \quad \text{ for all } s \in \mathcal{S} \\ & = \max_a \sum_{s', r} p(s', r \mid s, a) \left [ r + \gamma v_\ast(s') \right ] \end{align}
Any policy that is greedy with respect to the optimal evaluation function $v_\ast$ is an optimal policy.
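A sketch (not the book's code): sweeping the optimality backup $\max_a \sum_{s', r} p(s', r \mid s, a)[r + \gamma v(s')]$ over the same 5×5 gridworld of Example 3.5 (value iteration) converges to $v_\ast$; the greedy actions with respect to the result then form an optimal policy.
In [ ]:
# Sketch: value iteration for v_* on the 5x5 gridworld of Example 3.5
# (gamma = 0.9, A/B teleports, reward -1 for trying to leave the grid).
gamma = 0.9
v_star = np.zeros((5, 5))
moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]          # up, down, left, right

def backup(v, i, j):
    """Optimality backup: max over actions of r + gamma * v(s') from cell (i, j)."""
    if (i, j) == (0, 1):                            # A -> A', reward +10 for every action
        return 10 + gamma * v[4, 1]
    if (i, j) == (0, 3):                            # B -> B', reward +5 for every action
        return 5 + gamma * v[2, 3]
    candidates = []
    for di, dj in moves:
        ni, nj = i + di, j + dj
        if 0 <= ni < 5 and 0 <= nj < 5:
            candidates.append(gamma * v[ni, nj])    # in-grid move, reward 0
        else:
            candidates.append(-1 + gamma * v[i, j]) # off-grid move: reward -1, stay put
    return max(candidates)

for _ in range(200):
    v_star = np.array([[backup(v_star, i, j) for j in range(5)] for i in range(5)])
pd.DataFrame(v_star.round(1))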
optimal action-value function:
\begin{align} q_\ast(s, a) & \doteq \max_\pi q_\pi(s, a) \quad \text{ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$} \\ & = \mathbb{E} [ R_{t+1} + \gamma v_\ast(S_{t+1}) \mid S_t = s, A_t = a ] \\ & = \sum_{s', r} p(s', r \mid s, a) \left [ r + \gamma \max_{a'} q_\ast (s', a') \right ] \end{align}
Explicitly solving the Bellman optimality equation relies on at least three assumptions that are rarely true in practice: (1) we accurately know the dynamics of the environment; (2) we have enough computational resources to complete the computation of the solution; (3) the states have the Markov property.
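Given $v_\ast$, a one-step lookahead recovers $q_\ast$, and the greedy action is optimal. A sketch reusing gamma, moves and v_star from the value-iteration cell above (ties broken toward the first action):
In [ ]:
# Sketch: greedy (optimal) action via the one-step lookahead q_*(s, a),
# reusing gamma, moves and v_star from the value-iteration cell above.
action_names = ['up', 'down', 'left', 'right']

def q_star(v, i, j):
    """q_*(s, a) for the four actions from cell (i, j)."""
    if (i, j) == (0, 1):                            # A: every action -> A', reward +10
        return [10 + gamma * v[4, 1]] * 4
    if (i, j) == (0, 3):                            # B: every action -> B', reward +5
        return [5 + gamma * v[2, 3]] * 4
    qs = []
    for di, dj in moves:
        ni, nj = i + di, j + dj
        if 0 <= ni < 5 and 0 <= nj < 5:
            qs.append(gamma * v[ni, nj])            # in-grid move, reward 0
        else:
            qs.append(-1 + gamma * v[i, j])         # off-grid move: reward -1, stay put
    return qs

greedy_policy = [[action_names[int(np.argmax(q_star(v_star, i, j)))] for j in range(5)]
                 for i in range(5)]
pd.DataFrame(greedy_policy)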
In [ ]: