In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
import numpy as np
import tempfile
import tensorflow as tf
from tf_rl.controller import DiscreteDeepQ, HumanController
from tf_rl import simulate
from tf_rl.models import MLP
from maddux.rl_experiments.throwing import ThrowingArm
In [4]:
LOG_DIR = tempfile.mkdtemp()
print(LOG_DIR)
In [5]:
game = ThrowingArm()
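Before building the controller, it can help to confirm the environment's dimensions; a small optional check (assuming ThrowingArm exposes the observation_size and num_actions attributes that the setup cell below relies on):
In [ ]:
# Optional sanity check: these attributes are passed to the MLP and
# DiscreteDeepQ controller in the next cell.
print "observation size:", game.observation_size
print "number of actions:", game.num_actions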
In [6]:
# TensorFlow business - it is always good to reset the graph before creating a new controller.
tf.reset_default_graph()
session = tf.InteractiveSession()
# This little guy will let us run tensorboard
# tensorboard --logdir [LOG_DIR]
journalist = tf.train.SummaryWriter(LOG_DIR)
# Brain maps from observation to Q values for different actions.
# Here it is done using a multi-layer perceptron with 2 hidden layers.
brain = MLP([game.observation_size,], [200, 200, game.num_actions],
            [tf.tanh, tf.tanh, tf.identity])
# The optimizer to use. Here we use RMSProp as recommended
# by the publication
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)
# DiscreteDeepQ object
current_controller = DiscreteDeepQ(game.observation_size, game.num_actions, brain,
                                   optimizer, session, discount_rate=0.90,
                                   exploration_period=5000, max_experience=10000,
                                   store_every_nth=4, train_every_nth=4,
                                   summary_writer=journalist)
session.run(tf.initialize_all_variables())
session.run(current_controller.target_network_update)
# graph was not available when journalist was created
journalist.add_graph(session.graph_def)
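Conceptually, the controller scores every discrete action with a Q-value predicted by the brain and, outside of random exploration, picks the highest-scoring one. A minimal NumPy sketch of that greedy choice, with made-up Q-values rather than the network's real output:
In [ ]:
# Illustration only: greedy action selection over Q-values. The q_values
# array here is invented; in the controller above the values come from
# running the MLP "brain" on the current observation.
q_values = np.array([0.1, 0.7, -0.2, 0.4])
greedy_action = int(np.argmax(q_values))
print "greedy action:", greedy_action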
In [ ]:
performances = []
try:
    for game_idx in range(10000):
        game = ThrowingArm()
        game_iterations = 0

        observation = game.observe()
        while game_iterations < 1000 and not game.is_over():
            action = current_controller.action(observation)
            reward = game.collect_reward(action)
            new_observation = game.observe()
            current_controller.store(observation, action, reward, new_observation)
            current_controller.training_step()
            observation = new_observation
            game_iterations += 1
        performance = float(game_iterations - game.distance_to_target) / game.distance_to_target
        performances.append(performance)
        if game_idx % 50 == 0:
            print "\rGame %d: iterations before success %d." % (game_idx, game_iterations),
            print "Distance to target: %s" % (game.distance_to_target),
            print "Last 5 rewards: {}".format(game.collected_rewards[-5:]),
except KeyboardInterrupt:
    print "Interrupted"
In [8]:
game.max_distance
Out[8]:
In [ ]: