In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from numpy import diag, dot
from numpy.linalg import pinv
from mdpsolver import normalize, stationary
The Least Squares (Monte Carlo) Solution:

$$ v_{\pi} = (I - P_{\pi} \Gamma)^{-1} r_{\pi} $$
$$ \Phi \theta \approx v_{\pi} $$

$$ \theta_{LS} = (\Phi^{\top} D_{\pi} \Phi)^{-1} \Phi^{\top} D_{\pi} v_{\pi} $$

The ETD Solution:
$$ \theta_{ETD} = \left( \Phi^{\top} M (I - P_{\pi} \Gamma \Lambda)^{-1} (I - P_{\pi} \Gamma) \Phi \right)^{-1} \Phi^{\top} M (I - P_{\pi} \Gamma \Lambda)^{-1} r_{\pi} $$

The TD Solution:
$$ \theta_{TD} = (\Phi^{\top} D_{\pi} (I - P_{\pi} \Gamma) \Phi)^{-1} \Phi^{\top} D_{\pi} r_{\pi} $$
In [3]:
# Number of states
ns = 8
# Identity matrix
I = np.eye(ns)
# Define the transition matrix under the target policy: a deterministic
# cycle 0 -> 1 -> ... -> ns-1 -> 0
P = np.diag(np.ones(ns - 1), 1)
P[-1, 0] = 1
P_pi = P
# Calculate the stationary distribution under the target policy
d_pi = stationary(P).reshape(-1, 1)
D_pi = np.diag(d_pi.ravel())
# Define the expected reward under the target policy
# (reward of 1 only on the transition out of the last state)
rvec = np.zeros(ns)
rvec[-1] = 1
# Define the transition matrix under the behavior policy.
# NOTE: dtype must be the builtin `float`; the `np.float` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
P_mu = np.array([
    [0.0, 1, 0, 0, 0, 0, 0, 0],
    [0.0, 0, 1, 0, 0, 0, 0, 0],
    [0.0, 0, 0, 1, 0, 0, 0, 0],
    [0.0, 0, 0, 0, 1, 0, 0, 0],
    [0.5, 0, 0, 0, 0, 0.5, 0, 0],
    [0.5, 0, 0, 0, 0, 0, 0.5, 0],
    [0.5, 0, 0, 0, 0, 0, 0, 0.5],
    [1.0, 0, 0, 0, 0, 0, 0, 0],
], dtype=float)
# Calculate the stationary distribution under the behavior policy
d_mu = stationary(P_mu).reshape(-1, 1)
# Define the interest for each state (uniform interest)
ivec = np.ones(ns)
imat = np.diag(ivec)
# Define the gamma (state-dependent discount) matrix; discounting is cut
# to zero at state 0, terminating the episode there
gmvec = np.ones(ns) * 0.9
gmvec[0] = 0
G = np.diag(gmvec)
# Define the lambda (bootstrapping) matrix; all zeros = pure one-step TD
lmvec = np.zeros(ns)
L = np.diag(lmvec)
# Define the feature matrix (tabular / one-hot features)
X = np.eye(ns)
###############################################################
# Solve the emphasis equation
###############################################################
# Compute the "warp" matrix: P_lm = I - (I - P G L)^{-1} (I - P G)
P_lm = I - pinv(I - P @ G @ L) @ (I - P @ G)
# Compute the emphasis distribution: m^T = (i * d_mu)^T (I - P_lm)^{-1}
d_i = imat @ d_mu
mvec = d_i.T @ pinv(I - P_lm)
M = np.diag(mvec.ravel())
# Compute "A" matrix
A = X.T @ M @ (I - P_lm) @ X
# Compute "b" vector
b = X.T @ M @ pinv(I - P @ G @ L) @ rvec
# Solve the equation A w = b
w_etd = pinv(A) @ b
###############################################################
# Solve for the TD solution
###############################################################
w_td = pinv(X.T @ D_pi @ (I - P_pi @ G) @ X) @ X.T @ D_pi @ rvec
###############################################################
# Solve for the least-squares solution
###############################################################
w_ls = pinv(X.T @ D_pi @ X) @ X.T @ D_pi @ pinv(I - P_pi @ G) @ rvec
In [4]:
w_etd
Out[4]:
In [5]:
w_td
Out[5]:
In [6]:
w_ls
Out[6]:
In [7]:
plt.bar(np.arange(ns), w_ls)
Out[7]:
In [26]:
X.T @ (I - P @ G) @ X
Out[26]:
In [ ]: