In [1]:
import matplotlib
from matplotlib import pyplot
import numpy as np
import sys
sys.path.append("..")
from hiora_cartpole import features
from hiora_cartpole import fourier_fa
from hiora_cartpole import driver
import gym
env = gym.make('MountainCar-v0')
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
four_n_weights, four_feature_vec \
    = fourier_fa.make_feature_vec(state_ranges,
                                  n_acts=3,
                                  order=7)
#fv = feature_vec(cartpole.observation_space.sample(), cartpole.action_space.sample())
from hiora_cartpole import linfa
experience = linfa.init(lmbda=0.9,
                        init_alpha=1.0,
                        epsi=0.01,
                        feature_vec=four_feature_vec,
                        n_weights=four_n_weights,
                        act_space=env.action_space,
                        theta=None,
                        is_use_alpha_bounds=True)
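For reference, the Fourier basis (Konidaris et al., 2011) behind fourier_fa: each feature is cos(pi * c . s) for a coefficient vector c in {0, ..., order}^d, with the state s scaled to [0, 1]^d. A minimal sketch, assuming make_feature_vec scales states this way and stacks one block of features per action (fourier_features here is hypothetical, not the hiora_cartpole API):

import itertools

def fourier_features(state, state_ranges, order):
    low, high = state_ranges
    s = (state - low) / (high - low)  # scale state to [0, 1]^d
    coeffs = np.array(list(itertools.product(range(order + 1), repeat=len(s))))
    return np.cos(np.pi * np.dot(coeffs, s))  # (order+1)^d features

With d=2 state dimensions and order 7 that is 8**2 = 64 features per action, so four_n_weights should come out as 3 * 64 = 192 under this layout.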
In [75]:
experience, steps_per_episode, alpha_per_episode \
    = driver.train(env, linfa, experience, n_episodes=2000, max_steps=200, is_render=False)
# Credits: http://matplotlib.org/examples/api/two_scales.html
fig, ax1 = pyplot.subplots()
ax1.plot(steps_per_episode, color='b')
ax2 = ax1.twinx()
ax2.plot(alpha_per_episode, color='r')
pyplot.show()
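driver.train presumably runs linfa's Sarsa(lambda) updates episode by episode. For orientation, the textbook update with accumulating traces looks like this (a sketch, not hiora_cartpole's actual code):

def sarsa_lambda_step(theta, e, phi, phi_next, r, alpha, gamma=1.0, lmbda=0.9):
    # TD error from the current and successor state-action features
    delta = r + gamma * np.dot(theta, phi_next) - np.dot(theta, phi)
    e = gamma * lmbda * e + phi  # accumulating eligibility trace
    return theta + alpha * delta * e, e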
In [59]:
steps_per_episode
Out[59]:
In [66]:
%matplotlib notebook
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
driver.plot_2D_V(state_ranges, env.action_space, four_feature_vec, experience.theta)
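plot_2D_V presumably evaluates the greedy value max_a theta . phi(s, a) on a grid over the two state dimensions. A sketch of the idea (plot_V_grid is a hypothetical stand-in, not the driver API):

def plot_V_grid(state_ranges, feature_vec, theta, n_acts=3, res=50):
    xs = np.linspace(state_ranges[0][0], state_ranges[1][0], res)
    ys = np.linspace(state_ranges[0][1], state_ranges[1][1], res)
    V = np.empty((res, res))
    for i, x in enumerate(xs):
        for j, y in enumerate(ys):
            s = np.array([x, y])
            V[j, i] = max(np.dot(theta, feature_vec(s, a)) for a in range(n_acts))
    fig, ax = pyplot.subplots()
    ax.imshow(V, origin='lower', extent=[xs[0], xs[-1], ys[0], ys[-1]], aspect='auto')
    pyplot.show()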
In [69]:
experience, steps_per_episode, alpha_per_episode \
    = driver.train(env, linfa, experience, n_episodes=10, max_steps=200, is_render=True)
#pyplot.plot(steps_per_episode)
#pyplot.show()
In [6]:
from hiora_cartpole import features
env = gym.make('MountainCar-v0')
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
tilec_n_weights, tilec_feature_vec = features.make_feature_vec(state_ranges, 3, [9, 9], 5)
#fv = feature_vec(cartpole.observation_space.sample(), cartpole.action_space.sample())
from hiora_cartpole import linfa
fexperience = linfa.init(lmbda=0.9,
                         init_alpha=1.0,
                         epsi=0.01,
                         feature_vec=tilec_feature_vec,
                         n_weights=tilec_n_weights,
                         act_space=env.action_space,
                         theta=None,
                         is_use_alpha_bounds=True)
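For comparison with the Fourier setup: tile coding with 5 tilings of 9x9 tiles presumably gives 9 * 9 * 5 = 405 binary features per action, i.e. tilec_n_weights = 3 * 405 = 1215 under the same per-action blocking assumption. A rough sketch of how one state maps to its 5 active features (tile_indices is hypothetical, not the features module's API):

def tile_indices(state, state_ranges, n_tiles=9, n_tilings=5):
    low, high = state_ranges
    s = (state - low) / (high - low)               # scale to [0, 1]^2
    idxs = []
    for t in range(n_tilings):
        offset = float(t) / (n_tiles * n_tilings)  # stagger each tiling
        ij = np.minimum(np.floor((s + offset) * n_tiles).astype(int), n_tiles - 1)
        idxs.append(t * n_tiles**2 + ij[0] * n_tiles + ij[1])
    return idxs                                    # 5 active indices out of 405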
In [7]:
fexperience, steps_per_episode, alpha_per_episode \
    = driver.train(env, linfa, fexperience, n_episodes=100, max_steps=200, is_render=True)
# Credits: http://matplotlib.org/examples/api/two_scales.html
fig, ax1 = pyplot.subplots()
ax1.plot(steps_per_episode, color='b')
ax2 = ax1.twinx()
ax2.plot(alpha_per_episode, color='r')
pyplot.show()
In [103]:
%matplotlib notebook
driver.plot_2D_V(state_ranges, env.action_space, tilec_feature_vec, fexperience.theta)
Gehring's tilecoding is incredibly slow!
In [4]:
%time driver.train(env, linfa, fexperience, n_episodes=100, max_steps=200, is_render=False)
Out[4]:
In [12]:
from hiora_cartpole import easytile_fa
env = gym.make('MountainCar-v0')
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
easyt_n_weights, easyt_feature_vec = easytile_fa.make_feature_vec(state_ranges, 3, [9, 9], 5)
#fv = feature_vec(cartpole.observation_space.sample(), cartpole.action_space.sample())
from hiora_cartpole import linfa
eexperience = linfa.init(lmbda=0.9,
                         init_alpha=0.005,
                         epsi=0.01,
                         feature_vec=easyt_feature_vec,
                         n_weights=easyt_n_weights,
                         act_space=env.action_space,
                         theta=None,
                         is_use_alpha_bounds=False)
It's quite sensitive to lambda.
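One way to quantify that sensitivity: sweep lambda and compare average episode lengths over the tail of training. (A sketch re-using the calls above; the runtimes add up quickly.)

for lmbda in (0.0, 0.5, 0.8, 0.9, 0.95):
    exp = linfa.init(lmbda=lmbda, init_alpha=0.005, epsi=0.01,
                     feature_vec=easyt_feature_vec, n_weights=easyt_n_weights,
                     act_space=env.action_space, theta=None,
                     is_use_alpha_bounds=False)
    exp, steps, _ = driver.train(env, linfa, exp, n_episodes=1000,
                                 max_steps=200, is_render=False)
    print("lambda %.2f: avg steps %.1f" % (lmbda, np.mean(steps[-200:])))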
In [21]:
eexperience, steps_per_episode, alpha_per_episode \
    = driver.train(env, linfa, eexperience, n_episodes=5000, max_steps=200, is_render=False)
# Credits: http://matplotlib.org/examples/api/two_scales.html
fig, ax1 = pyplot.subplots()
ax1.plot(steps_per_episode, color='b')
ax2 = ax1.twinx()
ax2.plot(alpha_per_episode, color='r')
pyplot.show()
In [22]:
fig, ax1 = pyplot.subplots()
ax1.plot(eexperience.theta)
pyplot.show()
In [23]:
%matplotlib notebook
driver.plot_2D_V(state_ranges, env.action_space, easyt_feature_vec, -eexperience.theta)
In [24]:
eexperience, steps_per_episode, alpha_per_episode \
    = driver.train(env, linfa, eexperience, n_episodes=20, max_steps=200, is_render=True)
In [28]:
%time driver.train(env, linfa, fexperience, n_episodes=100, max_steps=200, is_render=False)
Out[28]:
Faster.
In [9]:
# What do I want?
#
# - Write one procedure that just trains for a number of episodes.
#
# - Write another procedure that keeps a running average of episode lengths and
# stops training when the average doesn't change much anymore.
#
# - Possibly write a procedure that returns the sequence of Q functions
# resulting from training.
next_dtimestep = driver.make_next_dtimestep(env, linfa.think)
train_and_prep = driver.make_train_and_prep(env, next_dtimestep, linfa.wrapup)
episode_nr, last_avg, experience \
    = driver.train_until_converged(env=env,
                                   train_and_prep=train_and_prep,
                                   init_experience=experience,
                                   max_steps=100,
                                   max_episodes=10000,
                                   avg_window=200,
                                   max_diff=1)
print(episode_nr, last_avg)
cnts_dtimesteps = driver.cnts_dtimesteps_iter(env, train_and_prep, experience,
                                              100)
thetas = driver.train_return_thetas(cnts_dtimesteps, 1000)
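# Mean squared change between consecutive theta snapshots; this should
# shrink toward zero as training settles.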
sqes = 1.0 / experience.theta.shape[0] * np.sum(np.diff(thetas, axis=0) ** 2, axis=1)
pyplot.plot(sqes)
pyplot.show()
sums = np.sum(np.abs(thetas), axis=1)
pyplot.plot(sums)
pyplot.show()
with np.load("hard-earned-theta.npz") as data:
    old_theta = data['arr_0']
print(np.sum(old_theta))
#hard_earned_theta = np.copy(experience.theta)
#np.savez_compressed("hard-earned-theta", hard_earned_theta)
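The running-average stopping rule sketched in the comment at the top of this cell boils down to something like the following (a hypothetical helper, not necessarily what driver.train_until_converged actually does):

def converged(steps_per_episode, avg_window=200, max_diff=1):
    if len(steps_per_episode) < 2 * avg_window:
        return False
    prev = np.mean(steps_per_episode[-2*avg_window:-avg_window])
    curr = np.mean(steps_per_episode[-avg_window:])
    return abs(curr - prev) <= max_diff  # averages stopped moving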
In [11]:
np.array([[1, 2], [3, 4]])[0]
Out[11]:
array([1, 2])
In [4]:
env.close()