In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
In [2]:
from mdpsolver import *
In [3]:
def random_binary(nrows, ncols, row_sum):
    """Generate a random binary matrix with a fixed number of ones per row.

    Parameters
    ----------
    nrows : int
        Number of rows in the returned matrix.
    ncols : int
        Number of columns in the returned matrix.
    row_sum : int
        Number of ones in each row; must satisfy ``0 <= row_sum <= ncols``.

    Returns
    -------
    numpy.ndarray
        A ``(nrows, ncols)`` float array whose entries are 0.0 or 1.0, with
        exactly ``row_sum`` ones per row, positions chosen uniformly at
        random and independently for each row.

    Raises
    ------
    ValueError
        If ``row_sum`` is negative or greater than ``ncols``.
    """
    if not 0 <= row_sum <= ncols:
        raise ValueError("row_sum must satisfy 0 <= row_sum <= ncols")
    # Template row with `row_sum` leading ones; each output row is an
    # independent random permutation of this template.
    template = np.zeros(ncols)
    template[:row_sum] = 1
    ret = np.empty((nrows, ncols))
    for i in range(nrows):
        ret[i] = np.random.permutation(template)
    return ret
In [4]:
ns = 8
states = np.arange(ns)
# Target policy transition matrix: deterministic cycle 0 -> 1 -> ... -> ns-1 -> 0
P_pi = np.diag(np.ones(ns - 1), 1)
P_pi[-1, 0] = 1
# Target policy stationary distribution
d_pi = stationary(P_pi)
# Behavior policy transition matrix.
# NOTE: the deprecated alias `np.float` was removed in NumPy 1.24;
# the builtin `float` is the correct replacement (same dtype: float64).
P_mu = np.array([
    [0.0, 1, 0, 0, 0, 0, 0, 0],
    [0.0, 0, 1, 0, 0, 0, 0, 0],
    [0.0, 0, 0, 1, 0, 0, 0, 0],
    [0.0, 0, 0, 0, 1, 0, 0, 0],
    [0.5, 0, 0, 0, 0, 0.5, 0, 0],
    [0.5, 0, 0, 0, 0, 0, 0.5, 0],
    [0.5, 0, 0, 0, 0, 0, 0, 0.5],
    [1.0, 0, 0, 0, 0, 0, 0, 0],
], dtype=float)
# Behavior policy stationary distribution
d_mu = stationary(P_mu)
# Reward vector under the target policy: +1 on entering the final state
rvec = np.zeros(ns)
rvec[-1] = 1
# State-dependent parameters: discount is 0 at state 0 (episode boundary)
gamma = 0.9 * np.ones(ns)
gamma[0] = 0
lmbda = 0.0 * np.ones(ns)
interest = np.ones(ns)
# True values: least-squares solution with a full-rank (identity) feature matrix
v_true = ls_solver(P_pi, rvec, np.eye(ns), gamma, d=d_pi)
In [5]:
# Error of the trivial all-zero predictor, as a point of comparison.
baseline_error = rmse(v_true, np.zeros_like(v_true), d_pi)
print("Baseline: RMSE if all values are zero")
print(baseline_error)
In [6]:
num_trials = 5000
phi_length = 6
row_sum = 3
# Store the approximate value vector produced by each method in each trial
ls_lst = []
td_lst = []
etd_lst = []
for i in range(num_trials):
    print("Trial: %d" % i, end="\r")
    # Random binary feature matrix for this trial
    X = random_binary(ns, phi_length, row_sum)
    # Least-squares solution (weighted projection of the true values)
    w_ls = ls_solver(P_pi, rvec, X, gamma, d=d_pi)
    v_ls = X @ w_ls  # matmul already returns a fresh array; np.copy was redundant
    # TD fixed-point solution
    w_td = td_solver(P_pi, rvec, X, gamma, lmbda, d=d_pi)
    v_td = X @ w_td
    # Emphatic TD solution (uses both target and behavior distributions)
    w_etd = etd_solver(P_pi, rvec, X, gamma, lmbda, interest, d_pi=d_pi, d_mu=d_mu)
    v_etd = X @ w_etd
    # Store the data
    ls_lst.append(v_ls)
    td_lst.append(v_td)
    etd_lst.append(v_etd)
# Average the approximate values over all trials
ls_avg = np.mean(ls_lst, axis=0)
td_avg = np.mean(td_lst, axis=0)
etd_avg = np.mean(etd_lst, axis=0)
# Per-trial weighted RMSE against the true values
ls_errors = pd.Series([rmse(v_true, v, d_pi) for v in ls_lst])
td_errors = pd.Series([rmse(v_true, v, d_pi) for v in td_lst])
etd_errors = pd.Series([rmse(v_true, v, d_pi) for v in etd_lst])
# Summarize the error distributions
df = pd.DataFrame()
df['least-squares'] = ls_errors
df['TD'] = td_errors
df['ETD'] = etd_errors
df.describe()
Out[6]:
In [7]:
print(df.describe().to_string())
In [8]:
# Plot the average approximate state values for each method.
# Bars are labeled so the legend identifies which series is which.
fig, ax = plt.subplots()
width = 0.3
ax.bar(states, ls_avg, width, color='r', label='least-squares')
ax.bar(states + width, td_avg, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate state values')
ax.set_xlabel('state')
ax.set_ylabel('value')
ax.legend()
plt.show()
In [9]:
# Plot each method's deviation from the true values, per state.
fig, ax = plt.subplots()
width = 0.3
ax.bar(states, ls_avg - v_true, width, color='r', label='least-squares')
ax.bar(states + width, td_avg - v_true, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg - v_true, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate values less true values')
ax.set_xlabel('state')
ax.set_ylabel('value difference')
ax.legend()
plt.show()
In [10]:
# Weighted RMSE of each method's averaged solution against the true values.
for label, v_avg in [("Least squares", ls_avg), ("TD", td_avg), ("ETD", etd_avg)]:
    print("%s approximation error" % label)
    print(rmse(v_avg, v_true, d_pi))
In [11]:
# Plot each method's deviation from the least-squares solution, per state.
# (The old title claimed "less true values", but the bars are differences
# from the LS solution; the dead commented-out bar line is removed.)
fig, ax = plt.subplots()
width = 0.3
ax.bar(states + width, td_avg - ls_avg, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg - ls_avg, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate values less least-squares values')
ax.set_xlabel('state')
ax.set_ylabel('value difference')
ax.legend()
plt.show()
In [12]:
# Weighted RMS distance of each method's averaged solution from the LS one.
for label, v_avg in [("TD", td_avg), ("ETD", etd_avg)]:
    print("RMS difference between %s and LS solution" % label)
    print(rmse(ls_avg, v_avg, d_pi))
In [13]:
ns = 8
states = np.arange(ns)
# Target policy transition matrix: deterministic cycle 0 -> 1 -> ... -> ns-1 -> 0
P_pi = np.diag(np.ones(ns - 1), 1)
P_pi[-1, 0] = 1
# Target policy stationary distribution
d_pi = stationary(P_pi)
# Behavior policy transition matrix.
# NOTE: the deprecated alias `np.float` was removed in NumPy 1.24;
# the builtin `float` is the correct replacement (same dtype: float64).
P_mu = np.array([
    [0.0, 1, 0, 0, 0, 0, 0, 0],
    [0.0, 0, 1, 0, 0, 0, 0, 0],
    [0.0, 0, 0, 1, 0, 0, 0, 0],
    [0.0, 0, 0, 0, 1, 0, 0, 0],
    [0.5, 0, 0, 0, 0, 0.5, 0, 0],
    [0.5, 0, 0, 0, 0, 0, 0.5, 0],
    [0.5, 0, 0, 0, 0, 0, 0, 0.5],
    [1.0, 0, 0, 0, 0, 0, 0, 0],
], dtype=float)
# Behavior policy stationary distribution
d_mu = stationary(P_mu)
# Reward vector under the target policy: +1 on entering the final state
rvec = np.zeros(ns)
rvec[-1] = 1
# State-dependent parameters; this experiment repeats the first with lambda = 1
gamma = 0.9 * np.ones(ns)
gamma[0] = 0
lmbda = 1.0 * np.ones(ns)
interest = np.ones(ns)
# True values: least-squares solution with a full-rank (identity) feature matrix
v_true = ls_solver(P_pi, rvec, np.eye(ns), gamma, d=d_pi)
In [14]:
num_trials = 5000
phi_length = 6
row_sum = 3
# Store the approximate value vector produced by each method in each trial
ls_lst = []
td_lst = []
etd_lst = []
for i in range(num_trials):
    print("Trial: %d" % i, end="\r")
    # Random binary feature matrix for this trial
    X = random_binary(ns, phi_length, row_sum)
    # Least-squares solution (weighted projection of the true values)
    w_ls = ls_solver(P_pi, rvec, X, gamma, d=d_pi)
    v_ls = X @ w_ls  # matmul already returns a fresh array; np.copy was redundant
    # TD fixed-point solution
    w_td = td_solver(P_pi, rvec, X, gamma, lmbda, d=d_pi)
    v_td = X @ w_td
    # Emphatic TD solution (uses both target and behavior distributions)
    w_etd = etd_solver(P_pi, rvec, X, gamma, lmbda, interest, d_pi=d_pi, d_mu=d_mu)
    v_etd = X @ w_etd
    # Store the data
    ls_lst.append(v_ls)
    td_lst.append(v_td)
    etd_lst.append(v_etd)
# Average the approximate values over all trials
ls_avg = np.mean(ls_lst, axis=0)
td_avg = np.mean(td_lst, axis=0)
etd_avg = np.mean(etd_lst, axis=0)
# Per-trial weighted RMSE against the true values
ls_errors = pd.Series([rmse(v_true, v, d_pi) for v in ls_lst])
td_errors = pd.Series([rmse(v_true, v, d_pi) for v in td_lst])
etd_errors = pd.Series([rmse(v_true, v, d_pi) for v in etd_lst])
# Summarize the error distributions
df = pd.DataFrame()
df['least-squares'] = ls_errors
df['TD'] = td_errors
df['ETD'] = etd_errors
df.describe()
Out[14]:
In [15]:
# Plot the average approximate state values for each method (lambda = 1 run).
fig, ax = plt.subplots()
width = 0.3
ax.bar(states, ls_avg, width, color='r', label='least-squares')
ax.bar(states + width, td_avg, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate state values')
ax.set_xlabel('state')
ax.set_ylabel('value')
ax.legend()
plt.show()
In [16]:
# Plot each method's deviation from the true values, per state (lambda = 1 run).
fig, ax = plt.subplots()
width = 0.3
ax.bar(states, ls_avg - v_true, width, color='r', label='least-squares')
ax.bar(states + width, td_avg - v_true, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg - v_true, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate values less true values')
ax.set_xlabel('state')
ax.set_ylabel('value difference')
ax.legend()
plt.show()
In [17]:
# Weighted RMSE of each method's averaged solution against the true values.
for label, v_avg in [("Least squares", ls_avg), ("TD", td_avg), ("ETD", etd_avg)]:
    print("%s approximation error" % label)
    print(rmse(v_avg, v_true, d_pi))
In [18]:
# Plot each method's deviation from the least-squares solution, per state.
# (The old title said "RMS Difference", but the bars are raw per-state
# differences, not an RMS aggregate.)
fig, ax = plt.subplots()
width = 0.3
ax.bar(states + width, td_avg - ls_avg, width, color='g', label='TD')
ax.bar(states + 2*width, etd_avg - ls_avg, width, color='b', label='ETD')
# formatting
ax.set_title('Approximate values less least-squares values')
ax.set_xlabel('state')
ax.set_ylabel('value difference')
ax.legend()
plt.show()
In [19]:
# Weighted RMS distance of each method's averaged solution from the LS one.
for label, v_avg in [("TD", td_avg), ("ETD", etd_avg)]:
    print("RMS difference between %s and LS solution" % label)
    print(rmse(ls_avg, v_avg, d_pi))
In [26]:
%%javascript
// Read the notebook's filename from the DOM and push it into the Python
// kernel as the variable `theNotebook` (consumed by the %env cell below).
// NOTE(review): relies on the classic-notebook frontend — both the
// `data-notebook-name` body attribute and the `IPython.notebook` API are
// absent in JupyterLab / Notebook 7; confirm the runtime environment.
var attribs = document.body.attributes;
var cmd = "theNotebook = " + "'" + attribs['data-notebook-name'].value + "'";
IPython.notebook.kernel.execute(cmd);
In [27]:
%env __NOTEBOOK_NAME $theNotebook
In [28]:
%%bash
# Convert this notebook to HTML. The `ipython nbconvert` spelling was
# deprecated in IPython 3 and removed in IPython 5; the command now lives
# under the `jupyter` entry point.
echo "jupyter nbconvert --to html "$__NOTEBOOK_NAME""
jupyter nbconvert --to html "$__NOTEBOOK_NAME"
In [ ]: