Testing Algorithm Performance in the Off-Policy Setting


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [4]:
import algos
import features
import parametric
import policy
import chicken
from agents import OffPolicyAgent, OnPolicyAgent
from rlbench import *

Assessing Learning Algorithms

In theory, it is possible to solve directly for the value functions that the learning algorithms are trying to estimate, but in practice an approximation suffices: running LSTD with λ = 1 for many steps yields the values that minimize the mean squared error (MSE), while running it with λ = 0 yields the values that minimize the mean squared projected Bellman error (MSPBE). These serve as the reference targets for the experiments below.
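
For reference, the direct solve amounts to a linear system: applying the discount to the successor state (as gm_p does below), the value function satisfies v = r + P Γ v, where P is the target policy's state-transition matrix, r the expected immediate rewards, and Γ a diagonal matrix of the successor-state discounts. The sketch below solves this system with plain numpy; the transition matrix, rewards, and discounts are made-up placeholders, not the actual Chicken dynamics.

In [ ]:
# Minimal sketch of solving the Bellman equation directly (illustrative only;
# P, r, and gamma below are placeholders, not the chicken environment's dynamics)
def exact_values(P, r, gamma):
    """Solve v = r + P @ diag(gamma) @ v, with the discount applied to the successor state."""
    n = len(r)
    return np.linalg.solve(np.eye(n) - P @ np.diag(gamma), r)

# toy 3-state cycle 0 -> 1 -> 2 -> 0 with reward 1 for leaving state 2
# and discounting switched off at state 0
P_toy = np.array([[0.0, 1.0, 0.0],
                  [0.0, 0.0, 1.0],
                  [1.0, 0.0, 0.0]])
r_toy = np.array([0.0, 0.0, 1.0])
gm_toy = np.array([0.0, 0.9, 0.9])
exact_values(P_toy, r_toy, gm_toy)   # -> [0.81, 0.9, 1.0]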


In [15]:
# define the experiment
num_states = 8
num_features = 8

# set up environment
env = chicken.Chicken(num_states)

# set up policy
pol_pi = policy.FixedPolicy({s: {0: 1} for s in env.states})

# set feature mapping
phi = features.RandomBinary(num_features, num_features // 2, random_seed=101011)
# phi = features.Int2Unary(num_states)

# run the algorithms for enough time to get reliable convergence
num_steps = 20000

# state-dependent gamma
gm_dct = {s: 0.9 for s in env.states}
gm_dct[0] = 0
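# discounting is switched off at state 0, so returns are truncated there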
gm_func = parametric.MapState(gm_dct)
gm_p_func = parametric.MapNextState(gm_dct)

# the TD(1) solution should minimize the mean-squared error
update_params = {
    'gm': gm_func,
    'gm_p': gm_p_func,
    'lm': 1.0,
}
lstd_1 = OnPolicyAgent(algos.LSTD(phi.length), pol_pi, phi, update_params)
run_episode(lstd_1, env, num_steps)
mse_values = lstd_1.get_values(env.states)

# the TD(0) solution should minimize the MSPBE
update_params = {
    'gm': gm_func,
    'gm_p': gm_p_func,
    'lm': 0.0,
}
lstd_0 = OnPolicyAgent(algos.LSTD(phi.length), pol_pi, phi, update_params)
run_episode(lstd_0, env, num_steps)
mspbe_values = lstd_0.get_values(env.states)

What do the target values look like?


In [16]:
# Plot the states against their target values
xvals = list(sorted(env.states))
y_mse = [mse_values[s] for s in xvals]
y_mspbe = [mspbe_values[s] for s in xvals]

# Mean squared error (MSE) optimal values
plt.bar(xvals, y_mse)
plt.title('MSE-optimal values')
plt.show()

# MSPBE optimal values
plt.bar(xvals, y_mspbe)
plt.title('MSPBE-optimal values')
plt.show()



In [17]:
y_mse


Out[17]:
[0.47829653322696686,
 0.53144136071205139,
 0.59048984944820404,
 0.76457551121711731,
 0.72899995744228363,
 0.91847451031208038,
 0.79152511060237885,
 0.89152491092681885]

In [18]:
y_mspbe


Out[18]:
[0.52969549596309662,
 0.58855029940605164,
 0.65394493937492371,
 0.72660551965236664,
 0.62296865880489349,
 0.69218727946281433,
 0.58472633361816406,
 0.83406646549701691]

Actual Testing

We have a number of learning algorithms available to try, collected in the algorithm registry:


In [6]:
algos.algo_registry


Out[6]:
{'ETD': algos.ETD,
 'GTD': algos.GTD,
 'GTD2': algos.GTD2,
 'LSTD': algos.LSTD,
 'TD': algos.TD,
 'TDC': algos.TDC}
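
Each entry in the registry is the algorithm's class. For instance, a single algorithm can be pulled out and constructed from the length of the feature vector, mirroring how the loop below instantiates each one (the variable name here is just for illustration):

In [ ]:
# construct a single algorithm from the registry; the constructor takes the
# number of features, i.e. the length of the weight vector
td = algos.algo_registry['TD'](phi.length)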

These algorithms are handed to an OffPolicyAgent, which takes care of the function approximation and manages the parameters passed to the learning algorithm on each update.
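
The essential piece of off-policy bookkeeping (presumably handled inside the agent; the sketch below is illustrative rather than taken from OffPolicyAgent) is the importance-sampling ratio ρ = π(a|s) / μ(a|s), which reweights each transition so that data gathered under the behavior policy μ can be used to evaluate the target policy π:

In [ ]:
# Self-contained sketch of the importance-sampling correction, using plain
# dictionaries in place of the policy objects defined below (illustrative only)
def importance_ratio(pi_probs, mu_probs, s, a):
    """pi_probs and mu_probs map state -> {action: probability}."""
    return pi_probs[s][a] / mu_probs[s][a]

pi_probs = {s: {0: 1.0, 1: 0.0} for s in range(8)}
mu_probs = {s: ({0: 1.0, 1: 0.0} if s < 4 else {0: 0.5, 1: 0.5}) for s in range(8)}
importance_ratio(pi_probs, mu_probs, 6, 0)   # 2.0: action 0 is twice as likely under pi as under mu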


In [7]:
# set up algorithm parameters
update_params = {
    'alpha': 0.02,      # primary step size
    'beta': 0.002,      # secondary step size (used by the gradient-TD methods)
    'gm': 0.9,          # discount for the current state
    'gm_p': 0.9,        # discount for the next state
    'lm': 0.0,          # eligibility trace decay (lambda)
    'lm_p': 0.0,        # trace decay for the next state
    'interest': 1.0,    # interest weighting (used by ETD)
}

# Define the target policy: always take action 0
pol_pi = policy.FixedPolicy({s: {0: 1} for s in env.states})
# Define the behavior policy: matches the target policy in states 0-3,
# but chooses between the two actions equiprobably in states 4-7
pol_mu = policy.FixedPolicy({s: {0: 1} if s < 4 else {0: 0.5, 1: 0.5} for s in env.states})


# Run all available algorithms 
max_steps = 50000
for name, alg in algos.algo_registry.items():    
    # Set up the agent, run the experiment, get state-values
    agent = OffPolicyAgent(alg(phi.length), pol_pi, pol_mu, phi, update_params)
    mse_lst = run_errors(agent, env, max_steps, mse_values)
    mspbe_lst = run_errors(agent, env, max_steps, mspbe_values)

    # Plot the error of the learned values against each set of target values
    xdata = np.arange(max_steps)
    plt.plot(xdata, mse_lst, label='vs. MSE-optimal values')
    plt.plot(xdata, mspbe_lst, label='vs. MSPBE-optimal values')
#     plt.plot(xdata, np.log(mse_lst))
#     plt.plot(xdata, np.log(mspbe_lst))

    # Format and label the graph
    plt.ylim(0, 2)
    plt.title(name)
    plt.xlabel('Timestep')
    plt.ylabel('Error')
    plt.legend()
    plt.show()


