In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

from numpy import diag, dot
from numpy.linalg import pinv

from mdpsolver import normalize, stationary



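The `stationary` helper is imported from a local `mdpsolver` module. Below is a minimal sketch of what it is assumed to compute — the stationary distribution $d$ satisfying $d^{\top} P = d^{\top}$, normalized to sum to one — in case that module is unavailable; the name `stationary_sketch` is illustrative only and not part of `mdpsolver`.

In [ ]:
# Hypothetical stand-in for mdpsolver.stationary: the left eigenvector
# of P associated with eigenvalue 1, normalized to a distribution.
def stationary_sketch(P):
    vals, vecs = np.linalg.eig(P.T)
    idx = np.argmin(np.abs(vals - 1.0))
    d = np.real(vecs[:, idx])
    return d / d.sum()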
The Least Squares (Monte Carlo) Solution:

$$ v_{\pi} = (I - P_{\pi} \Gamma)^{-1} r_{\pi} $$

$$ \Phi \theta \approx v_{\pi} $$

$$ \theta_{LS} = (\Phi^{\top} D_{\pi} \Phi)^{-1} \Phi^{\top} D_{\pi} v_{\pi} $$

The ETD Solution:

$$ \theta_{ETD} = \left( \Phi^{\top} M (I - P_{\pi} \Gamma \Lambda )^{-1} (I - P_{\pi} \Gamma ) \Phi \right)^{-1} \Phi^{\top} M (I - P_{\pi} \Gamma \Lambda)^{-1} r_{\pi} $$

The TD Solution:

$$ \theta_{TD} = (\Phi^{\top} D_{\pi} (I - P_{\pi} \Gamma) \Phi)^{-1} \Phi^{\top} D_{\pi} r_{\pi} $$
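Since the example below uses tabular features ($\Phi = I$), all three solutions reduce to the true value function, assuming $D_{\pi}$ and $M$ are invertible:

$$ \theta_{LS} = \theta_{TD} = \theta_{ETD} = (I - P_{\pi} \Gamma)^{-1} r_{\pi} = v_{\pi} $$

which is why the three weight vectors computed below are identical.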

Example: Chicken


In [3]:
# Number of states
ns = 8

# Identity matrix
I = np.eye(ns)

# Define the transition matrix under the target policy
P = np.diag(np.ones(ns-1), 1)
P[-1,0] = 1
P_pi = P

# Calculate the stationary distribution under the target policy
d_pi = stationary(P).reshape(-1, 1)
D_pi = np.diag(d_pi.ravel())

# Define the expected reward under the target policy
rvec = np.zeros(ns)
rvec[-1] = 1

# Define the transition matrix under the behavior policy
P_mu = np.array([
        [0.0, 1, 0, 0, 0, 0, 0, 0],
        [0.0, 0, 1, 0, 0, 0, 0, 0],
        [0.0, 0, 0, 1, 0, 0, 0, 0],
        [0.0, 0, 0, 0, 1, 0, 0, 0],
        [0.5, 0, 0, 0, 0, 0.5, 0, 0],
        [0.5, 0, 0, 0, 0, 0, 0.5, 0],
        [0.5, 0, 0, 0, 0, 0, 0, 0.5],
        [1.0, 0, 0, 0, 0, 0, 0, 0],
], dtype=float)

# Calculate the stationary distribution under the behavior policy
d_mu = stationary(P_mu).reshape(-1, 1)

# Define the interest for each state
ivec = np.ones(ns)
imat = np.diag(ivec)

# Define the gamma matrix
gmvec = np.ones(ns) * 0.9
gmvec[0] = 0
G = np.diag(gmvec)

# Define the lambda matrix
lmvec = np.zeros(ns)
L = np.diag(lmvec)

# Define the feature matrix
X = np.eye(ns)

###############################################################
# Solve the emphasis equation
###############################################################
# Compute the "warp" matrix
P_lm = I - pinv(I - P @ G @ L) @ (I - P @ G)

# Compute the emphasis distribution
d_i = imat @ d_mu
mvec = d_i.T @ pinv(I - P_lm)
M = np.diag(mvec.ravel())

# Compute "A" matrix
A = X.T @ M @ (I - P_lm) @ X

# Compute "b" vector
b = X.T @ M @ pinv(I - P @ G @ L) @ rvec

# Solve the equation Aw = b
w_etd = pinv(A) @ b

###############################################################
# Solve for the TD solution
###############################################################
w_td = pinv(X.T @ D_pi @ (I - P_pi @ G) @ X) @ X.T @ D_pi @ rvec


###############################################################
# Solve for the least-squares solution
###############################################################
w_ls = pinv(X.T @ D_pi @ X) @ X.T @ D_pi @ pinv(I - P_pi @ G) @ rvec
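
A quick inspection (not in the original cell): the emphasis weights in `mvec` can be compared with the behavior distribution `d_mu` to see how the emphasis algebra reweights states under the target policy's discounting.

In [ ]:
# Compare the behavior-policy distribution with the emphasis weights;
# ETD effectively weights each state's update by mvec rather than d_mu.
print("d_mu:", d_mu.ravel())
print("mvec:", mvec.ravel())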

In [4]:
w_etd


Out[4]:
array([ 0.4782969,  0.531441 ,  0.59049  ,  0.6561   ,  0.729    ,
        0.81     ,  0.9      ,  1.       ])

In [5]:
w_td


Out[5]:
array([ 0.4782969,  0.531441 ,  0.59049  ,  0.6561   ,  0.729    ,
        0.81     ,  0.9      ,  1.       ])

In [6]:
w_ls


Out[6]:
array([ 0.4782969,  0.531441 ,  0.59049  ,  0.6561   ,  0.729    ,
        0.81     ,  0.9      ,  1.       ])
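
As a sanity check (not part of the original notebook), the true value function from the Monte Carlo formula can be computed directly and compared against the three weight vectors; with tabular features they should all agree.

In [ ]:
# True value function: v_pi = (I - P_pi Gamma)^{-1} r_pi.
v_pi = pinv(I - P_pi @ G) @ rvec
print(np.allclose(w_etd, v_pi), np.allclose(w_td, v_pi), np.allclose(w_ls, v_pi))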

In [7]:
plt.bar(np.arange(ns), w_ls)


Out[7]:
<Container object of 8 artists>

In [26]:
X.T @ (I - P @ G) @ X


Out[26]:
array([[ 1. , -0.9,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  1. , -0.9,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  1. , -0.9,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  1. , -0.9,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  1. , -0.9,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  1. , -0.9,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. , -0.9],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. ]])
