In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
In [2]:
from mdpsolver import *
In [3]:
def random_binary(nrows, ncols, row_sum):
    """Generate a random binary matrix with a fixed number of ones per row.

    Parameters
    ----------
    nrows : int
        Number of rows in the returned matrix.
    ncols : int
        Number of columns in the returned matrix.
    row_sum : int
        Number of ones in each row; must satisfy ``0 <= row_sum <= ncols``.

    Returns
    -------
    numpy.ndarray
        A ``(nrows, ncols)`` float array whose entries are 0.0 or 1.0, with
        exactly ``row_sum`` ones per row, positions chosen uniformly at
        random and independently for each row.

    Raises
    ------
    ValueError
        If ``row_sum`` is negative or greater than ``ncols``.
    """
    if not 0 <= row_sum <= ncols:
        raise ValueError("row_sum must satisfy 0 <= row_sum <= ncols")
    # Template row with `row_sum` leading ones; each output row is an
    # independent random permutation of this template.
    template = np.zeros(ncols)
    template[:row_sum] = 1
    ret = np.empty((nrows, ncols))
    for i in range(nrows):
        ret[i] = np.random.permutation(template)
    return ret
In [4]:
ns = 8
states = np.arange(ns)
# Target policy transition matrix: deterministic cycle 0 -> 1 -> ... -> ns-1 -> 0
P_pi = np.diag(np.ones(ns - 1), 1)
P_pi[-1, 0] = 1
# Target policy stationary distribution
d_pi = stationary(P_pi)
# Behavior policy transition matrix.
# NOTE: the deprecated alias `np.float` was removed in NumPy 1.24;
# the builtin `float` is the correct replacement (same dtype: float64).
P_mu = np.array([
    [0.0, 1, 0, 0, 0, 0, 0, 0],
    [0.0, 0, 1, 0, 0, 0, 0, 0],
    [0.0, 0, 0, 1, 0, 0, 0, 0],
    [0.0, 0, 0, 0, 1, 0, 0, 0],
    [0.5, 0, 0, 0, 0, 0.5, 0, 0],
    [0.5, 0, 0, 0, 0, 0, 0.5, 0],
    [0.5, 0, 0, 0, 0, 0, 0, 0.5],
    [1.0, 0, 0, 0, 0, 0, 0, 0],
], dtype=float)
# Behavior policy stationary distribution
d_mu = stationary(P_mu)
# Reward vector under the target policy: +1 on entering the final state
rvec = np.zeros(ns)
rvec[-1] = 1
# State-dependent parameters: discount is 0 at state 0 (episode boundary)
gamma = 0.9 * np.ones(ns)
gamma[0] = 0
lmbda = 0.0 * np.ones(ns)
interest = np.ones(ns)
# True values: least-squares solution with a full-rank (identity) feature matrix
v_true = ls_solver(P_pi, rvec, np.eye(ns), gamma, d=d_pi)
In [5]:
# Error of the trivial all-zero predictor, as a point of comparison.
baseline_error = rmse(v_true, np.zeros_like(v_true), d_pi)
print("Baseline: RMSE if all values are zero")
print(baseline_error)
In [6]:
num_trials = 5000
phi_length = 6
row_sum = 3
# Store the approximate value vector produced by each method in each trial
ls_lst = []
td_lst = []
etd_lst = []
for i in range(num_trials):
    print("Trial: %d" % i, end="\r")
    # Random binary feature matrix for this trial
    X = random_binary(ns, phi_length, row_sum)
    # Least-squares solution (weighted projection of the true values)
    w_ls = ls_solver(P_pi, rvec, X, gamma, d=d_pi)
    v_ls = X @ w_ls  # matmul already returns a fresh array; np.copy was redundant
    # TD fixed-point solution
    w_td = td_solver(P_pi, rvec, X, gamma, lmbda, d=d_pi)
    v_td = X @ w_td
    # Emphatic TD solution (uses both target and behavior distributions)
    w_etd = etd_solver(P_pi, rvec, X, gamma, lmbda, interest, d_pi=d_pi, d_mu=d_mu)
    v_etd = X @ w_etd
    # Store the data
    ls_lst.append(v_ls)
    td_lst.append(v_td)
    etd_lst.append(v_etd)
# Average the approximate values over all trials
ls_avg = np.mean(ls_lst, axis=0)
td_avg = np.mean(td_lst, axis=0)
etd_avg = np.mean(etd_lst, axis=0)
# Per-trial weighted RMSE against the true values
ls_errors = pd.Series([rmse(v_true, v, d_pi) for v in ls_lst])
td_errors = pd.Series([rmse(v_true, v, d_pi) for v in td_lst])
etd_errors = pd.Series([rmse(v_true, v, d_pi) for v in etd_lst])
# Summarize the error distributions
df = pd.DataFrame()
df['least-squares'] = ls_errors
df['TD'] = td_errors
df['ETD'] = etd_errors
df.describe()
Out[6]:
In [7]:
print(df.describe().to_string())
In [8]:
# Plot the average approximate state values for each method.
# Bars are labeled so the legend identifies which series is which.
fig, ax = plt.subplots()
width = 0.3
ax.bar(states, ls_avg, width, color='r', label='least-squares')
ax.bar(states + width, td_avg, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate state values')
ax.set_xlabel('state')
ax.set_ylabel('value')
ax.legend()
plt.show()
In [9]:
# Plot each method's deviation from the true values, per state.
fig, ax = plt.subplots()
width = 0.3
ax.bar(states, ls_avg - v_true, width, color='r', label='least-squares')
ax.bar(states + width, td_avg - v_true, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg - v_true, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate values less true values')
ax.set_xlabel('state')
ax.set_ylabel('value difference')
ax.legend()
plt.show()
In [10]:
# Weighted RMSE of each method's averaged solution against the true values.
for label, v_avg in [("Least squares", ls_avg), ("TD", td_avg), ("ETD", etd_avg)]:
    print("%s approximation error" % label)
    print(rmse(v_avg, v_true, d_pi))
In [11]:
# Plot each method's deviation from the least-squares solution, per state.
# (The old title claimed "less true values", but the bars are differences
# from the LS solution; the dead commented-out bar line is removed.)
fig, ax = plt.subplots()
width = 0.3
ax.bar(states + width, td_avg - ls_avg, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg - ls_avg, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate values less least-squares values')
ax.set_xlabel('state')
ax.set_ylabel('value difference')
ax.legend()
plt.show()
In [12]:
# Weighted RMS distance of each method's averaged solution from the LS one.
for label, v_avg in [("TD", td_avg), ("ETD", etd_avg)]:
    print("RMS difference between %s and LS solution" % label)
    print(rmse(ls_avg, v_avg, d_pi))
In [13]:
ns = 8
states = np.arange(ns)
# Target policy transition matrix: deterministic cycle 0 -> 1 -> ... -> ns-1 -> 0
P_pi = np.diag(np.ones(ns - 1), 1)
P_pi[-1, 0] = 1
# Target policy stationary distribution
d_pi = stationary(P_pi)
# Behavior policy transition matrix.
# NOTE: the deprecated alias `np.float` was removed in NumPy 1.24;
# the builtin `float` is the correct replacement (same dtype: float64).
P_mu = np.array([
    [0.0, 1, 0, 0, 0, 0, 0, 0],
    [0.0, 0, 1, 0, 0, 0, 0, 0],
    [0.0, 0, 0, 1, 0, 0, 0, 0],
    [0.0, 0, 0, 0, 1, 0, 0, 0],
    [0.5, 0, 0, 0, 0, 0.5, 0, 0],
    [0.5, 0, 0, 0, 0, 0, 0.5, 0],
    [0.5, 0, 0, 0, 0, 0, 0, 0.5],
    [1.0, 0, 0, 0, 0, 0, 0, 0],
], dtype=float)
# Behavior policy stationary distribution
d_mu = stationary(P_mu)
# Reward vector under the target policy: +1 on entering the final state
rvec = np.zeros(ns)
rvec[-1] = 1
# State-dependent parameters; this experiment repeats the first with lambda = 1
gamma = 0.9 * np.ones(ns)
gamma[0] = 0
lmbda = 1.0 * np.ones(ns)
interest = np.ones(ns)
# True values: least-squares solution with a full-rank (identity) feature matrix
v_true = ls_solver(P_pi, rvec, np.eye(ns), gamma, d=d_pi)
In [14]:
num_trials = 5000
phi_length = 6
row_sum = 3
# Store the approximate value vector produced by each method in each trial
ls_lst = []
td_lst = []
etd_lst = []
for i in range(num_trials):
    print("Trial: %d" % i, end="\r")
    # Random binary feature matrix for this trial
    X = random_binary(ns, phi_length, row_sum)
    # Least-squares solution (weighted projection of the true values)
    w_ls = ls_solver(P_pi, rvec, X, gamma, d=d_pi)
    v_ls = X @ w_ls  # matmul already returns a fresh array; np.copy was redundant
    # TD fixed-point solution
    w_td = td_solver(P_pi, rvec, X, gamma, lmbda, d=d_pi)
    v_td = X @ w_td
    # Emphatic TD solution (uses both target and behavior distributions)
    w_etd = etd_solver(P_pi, rvec, X, gamma, lmbda, interest, d_pi=d_pi, d_mu=d_mu)
    v_etd = X @ w_etd
    # Store the data
    ls_lst.append(v_ls)
    td_lst.append(v_td)
    etd_lst.append(v_etd)
# Average the approximate values over all trials
ls_avg = np.mean(ls_lst, axis=0)
td_avg = np.mean(td_lst, axis=0)
etd_avg = np.mean(etd_lst, axis=0)
# Per-trial weighted RMSE against the true values
ls_errors = pd.Series([rmse(v_true, v, d_pi) for v in ls_lst])
td_errors = pd.Series([rmse(v_true, v, d_pi) for v in td_lst])
etd_errors = pd.Series([rmse(v_true, v, d_pi) for v in etd_lst])
# Summarize the error distributions
df = pd.DataFrame()
df['least-squares'] = ls_errors
df['TD'] = td_errors
df['ETD'] = etd_errors
df.describe()
Out[14]:
In [15]:
# Plot the average approximate state values for each method (lambda = 1 run).
fig, ax = plt.subplots()
width = 0.3
ax.bar(states, ls_avg, width, color='r', label='least-squares')
ax.bar(states + width, td_avg, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate state values')
ax.set_xlabel('state')
ax.set_ylabel('value')
ax.legend()
plt.show()
In [16]:
# Plot each method's deviation from the true values, per state (lambda = 1 run).
fig, ax = plt.subplots()
width = 0.3
ax.bar(states, ls_avg - v_true, width, color='r', label='least-squares')
ax.bar(states + width, td_avg - v_true, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg - v_true, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate values less true values')
ax.set_xlabel('state')
ax.set_ylabel('value difference')
ax.legend()
plt.show()
In [17]:
# Weighted RMSE of each method's averaged solution against the true values.
for label, v_avg in [("Least squares", ls_avg), ("TD", td_avg), ("ETD", etd_avg)]:
    print("%s approximation error" % label)
    print(rmse(v_avg, v_true, d_pi))
In [18]:
# Plot each method's deviation from the least-squares solution, per state.
# (The old title said "RMS Difference", but the bars are raw per-state
# differences, not an RMS aggregate.)
fig, ax = plt.subplots()
width = 0.3
ax.bar(states + width, td_avg - ls_avg, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg - ls_avg, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate values less least-squares values')
ax.set_xlabel('state')
ax.set_ylabel('value difference')
ax.legend()
plt.show()
In [19]:
# Weighted RMS distance of each method's averaged solution from the LS one.
for label, v_avg in [("TD", td_avg), ("ETD", etd_avg)]:
    print("RMS difference between %s and LS solution" % label)
    print(rmse(ls_avg, v_avg, d_pi))
In [26]:
%%javascript
// Read the notebook's filename from the DOM and push it into the Python
// kernel as the variable `theNotebook` (consumed by the %env cell below).
// NOTE(review): relies on the classic-notebook frontend — both the
// `data-notebook-name` body attribute and the `IPython.notebook` API are
// absent in JupyterLab / Notebook 7; confirm the runtime environment.
var attribs = document.body.attributes;
var cmd = "theNotebook = " + "'" + attribs['data-notebook-name'].value + "'";
IPython.notebook.kernel.execute(cmd);
In [27]:
%env __NOTEBOOK_NAME $theNotebook
In [28]:
%%bash
# Convert this notebook to HTML. The `ipython nbconvert` spelling was
# deprecated in IPython 3 and removed in IPython 5; the command now lives
# under the `jupyter` entry point.
echo "jupyter nbconvert --to html "$__NOTEBOOK_NAME""
jupyter nbconvert --to html "$__NOTEBOOK_NAME"
In [ ]: