In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import copy
import numpy as np
import seaborn as sns
import pandas as pd
import jax.experimental.optimizers as jaxopt  # moved to jax.example_libraries.optimizers in newer JAX releases
import matplotlib.pyplot as plt
import scipy
import imitation.tabular_irl as tirl
import imitation.examples.model_envs as menv
sns.set(context='notebook')
np.random.seed(42)
In [2]:
# Construct a small random MDP and compute "expert" statistics for it:
# the MCE-optimal policy, its state occupancy measure, and the resulting
# feature expectations that the IRL step will try to match.
mdp = menv.RandomMDP(
    n_states=16,
    n_actions=3,
    branch_factor=2,
    horizon=10,
    random_obs=True,
    obs_dim=5,
    generator_seed=42)
V, Q, pi = tirl.mce_partition_fh(mdp)            # soft Bellman backup -> MCE policy
Dt, D = tirl.mce_occupancy_measures(mdp, pi=pi)  # per-timestep and total state occupancy
demo_counts = D @ mdp.observation_matrix         # expert feature expectations
obs_dim, = demo_counts.shape
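For orientation, it can help to peek at the demonstration statistics computed above before running IRL. The sketch below is purely diagnostic and uses only the arrays already defined in this cell, no new API.
# Diagnostic only: inspect the expert statistics the IRL step will try to match.
# D is the total state-occupancy vector and demo_counts = D @ Phi are the
# expert feature expectations.
print("occupancy shape:", D.shape, "total occupancy mass:", D.sum())
print("expert feature expectations:", demo_counts)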
In [3]:
# Fit a linear reward model with MCE IRL, matching the expert occupancy D.
rmodel = tirl.LinearRewardModel(obs_dim)
opt = jaxopt.sgd(0.1)
final_weights, D_fake = tirl.mce_irl(
    mdp, opt, rmodel, D, linf_eps=1e-1)
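If linf_eps is read as an L-infinity convergence threshold on the occupancy gap (that reading is an assumption about mce_irl's stopping rule, not something shown here), the fit can be checked directly with NumPy:
# Sketch: compare expert and recovered occupancy measures elementwise.
# Assumes D and D_fake are same-shaped state-occupancy vectors.
occ_gap = np.abs(D - D_fake).max()
print("L-infinity occupancy gap:", occ_gap, "(threshold was 1e-1)")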
In [4]:
# Same again with a two-hidden-layer MLP reward model and a tighter
# convergence threshold on the occupancy gap.
rmodel = tirl.MLPRewardModel(obs_dim, [32, 32])
opt = jaxopt.sgd(0.1)
final_weights, D_fake = tirl.mce_irl(
    mdp, opt, rmodel, D, linf_eps=1e-2)
In [5]:
# Same experiments, but on a grid world (CliffWorld) with (x, y) observations.
mdp = menv.CliffWorld(
    width=7,
    height=4,
    horizon=8,
    use_xy_obs=True)
V, Q, pi = tirl.mce_partition_fh(mdp)
Dt, D = tirl.mce_occupancy_measures(mdp, pi=pi)
demo_counts = D @ mdp.observation_matrix
obs_dim, = demo_counts.shape
rmodel = tirl.LinearRewardModel(obs_dim)
opt = jaxopt.adam(1)
final_weights, D_fake = tirl.mce_irl(
    mdp, opt, rmodel, D, linf_eps=0.1)
mdp.draw_value_vec(D)
plt.title("Cliff World $p(s)$")
plt.xlabel('x-coord')
plt.ylabel('y-coord')
plt.show()
mdp.draw_value_vec(D_fake)
plt.title("Occupancy for linear reward function")
plt.show()
plt.subplot(1, 2, 1)
mdp.draw_value_vec(rmodel.out(mdp.observation_matrix))
plt.title("Inferred reward")
plt.subplot(1, 2, 2)
mdp.draw_value_vec(mdp.reward_matrix)
plt.title("True reward")
plt.show()
In [6]:
# Wide single-hidden-layer MLP reward model on the same grid world.
rmodel = tirl.MLPRewardModel(obs_dim, [1024,], activation='Relu')
opt = jaxopt.adam(1e-3)
final_weights, D_fake_mlp = tirl.mce_irl(
    mdp, opt, rmodel, D, linf_eps=3e-2, print_interval=250)
mdp.draw_value_vec(D_fake_mlp)
plt.title("Occupancy for MLP reward function")
plt.show()
plt.subplot(1, 2, 1)
mdp.draw_value_vec(rmodel.out(mdp.observation_matrix))
plt.title("Inferred reward")
plt.subplot(1, 2, 2)
mdp.draw_value_vec(mdp.reward_matrix)
plt.title("True reward")
plt.show()
Notice that the inferred reward looks nothing like the true reward, yet the recovered occupancy measure still (roughly) matches the expert occupancy measure. This is expected: MCE IRL only constrains the learned reward to reproduce the expert's occupancy and feature expectations, and many different reward functions satisfy that constraint, so the reward itself is not uniquely identified.
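The same point can be made numerically. The sketch below reuses only objects already defined in this notebook (rmodel.out, mdp.observation_matrix, mdp.reward_matrix, D, D_fake_mlp); one would expect a weak reward correlation alongside a small occupancy gap.
# Sketch: quantify "occupancy matches, reward does not".
inferred_reward = np.ravel(rmodel.out(mdp.observation_matrix))
true_reward = np.ravel(mdp.reward_matrix)
print("reward correlation:", np.corrcoef(inferred_reward, true_reward)[0, 1])
print("occupancy L-infinity gap:", np.abs(D - D_fake_mlp).max())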