In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
from __future__ import print_function

import numpy as np
import tempfile
import tensorflow as tf

from tf_rl.controller import HumanController, DDQN as DiscreteDeepQ
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
In [3]:
LOG_DIR = tempfile.mkdtemp()
print(LOG_DIR)
In [4]:
current_settings = {
    'objects': [
        'friend',
        'enemy',
    ],
    'colors': {
        'hero': 'yellow',
        'friend': 'green',
        'enemy': 'red',
    },
    'object_reward': {
        'friend': 0.1,
        'enemy': -0.1,
    },
    'hero_bounces_off_walls': False,
    'world_size': (700, 500),
    'hero_initial_position': [400, 300],
    'hero_initial_speed': [0, 0],
    "maximum_speed": [50, 50],
    "object_radius": 10.0,
    "num_objects": {
        "friend": 25,
        "enemy": 25,
    },
    "num_observation_lines": 32,
    "observation_line_length": 120.,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty": -0.0,
    "delta_v": 50
}
In [5]:
# create the game simulator
g = KarpathyGame(current_settings)
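Before building a controller it can be useful to look at the input/output sizes the simulator exposes; both attributes are used below when constructing the MLP and the DiscreteDeepQ object, so this cell is only an illustrative peek.
In [ ]:
# Inspect the observation and action dimensions implied by current_settings.
# These attributes are used later when building the Q network.
print("observation size:", g.observation_size)
print("number of actions:", g.num_actions)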
In [6]:
human_control = False

if human_control:
    # WSAD CONTROL (requires extra setup - check out README)
    current_controller = HumanController({b"w": 3, b"d": 0, b"s": 1, b"a": 2})
else:
    # Tensorflow business - it is always good to reset a graph before creating a new controller.
    tf.reset_default_graph()
    session = tf.InteractiveSession()

    # This little guy will let us run tensorboard
    #     tensorboard --logdir [LOG_DIR]
    journalist = tf.train.SummaryWriter(LOG_DIR)

    # Brain maps from observation to Q values for different actions.
    # Here it is done using a multi layer perceptron with 2 hidden
    # layers.
    brain = MLP([g.observation_size,], [200, 200, g.num_actions],
                [tf.tanh, tf.tanh, tf.identity])

    # The optimizer to use. Here we use RMSProp as recommended
    # by the publication.
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

    # DiscreteDeepQ object
    current_controller = DiscreteDeepQ(g.observation_size, g.num_actions, brain, optimizer, session,
                                       discount_rate=0.99, exploration_period=5000, max_experience=10000,
                                       store_every_nth=4, train_every_nth=4,
                                       summary_writer=journalist)

    session.run(tf.initialize_all_variables())
    session.run(current_controller.target_network_update)
    # graph was not available when journalist was created
    journalist.add_graph(session.graph_def)
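Before launching the full training loop, here is a rough sketch of the interaction that `simulate` drives on every step: observe, act, advance the world, collect the reward, store the transition, and train. It only illustrates the assumed tf_rl interface (`observe`, `perform_action`, `step`, `collect_reward` on the game; `action`, `store`, `training_step` on the controller); the real `simulate` call below additionally handles timing, visualization and saving.
In [ ]:
# Sketch of a single environment/controller interaction, assuming the
# tf_rl interface described above. Not needed for training itself.
obs = g.observe()
act = current_controller.action(obs)          # epsilon-greedy action from the Q network
g.perform_action(act)
g.step(1.0 / 30.0)                            # advance the simulation by roughly one frame
reward = g.collect_reward()
new_obs = g.observe()
current_controller.store(obs, act, reward, new_obs)
current_controller.training_step()            # one minibatch update of the Q network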
In [ ]:
FPS = 30
ACTION_EVERY = 3

fast_mode = False
if fast_mode:
    WAIT, VISUALIZE_EVERY = False, 20
else:
    WAIT, VISUALIZE_EVERY = True, 1

try:
    if True:  # with tf.device("/cpu:0"):
        simulate(simulation=g,
                 controller=current_controller,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=0.001,
                 save_path=None)
except KeyboardInterrupt:
    print("Interrupted")
In [8]:
session.run(current_controller.target_network_update)
In [10]:
current_controller.q_network.input_layer.Ws[0].eval()
In [11]:
current_controller.target_q_network.input_layer.Ws[0].eval()
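The two cells above print the first-layer weights of the online Q network and of the target network. A quick way to quantify how far apart they are is sketched below, using only NumPy on the same tensors; assuming the target network is updated as a moving average of the online weights (a soft update), the difference shrinks over successive updates rather than vanishing at once.
In [ ]:
# Compare online vs. target first-layer weights (illustrative check only).
W_online = current_controller.q_network.input_layer.Ws[0].eval()
W_target = current_controller.target_q_network.input_layer.Ws[0].eval()
print("max abs difference:", np.abs(W_online - W_target).max())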
In [12]:
g.plot_reward(smoothing=100)
Starting with the ray pointing all the way to the right, there is one row per ray, in clockwise order. The numbers for each ray describe the closest object visible along it: a normalized distance for each object type and for the wall, followed by the speed of the observed object. Finally, the last two numbers in the representation correspond to the speed of the hero.
In [13]:
g.__class__ = KarpathyGame  # rebind the instance to the (possibly autoreloaded) class definition
np.set_printoptions(formatter={'float': (lambda x: '%.2f' % (x,))})
x = g.observe()
new_shape = (x[:-2].shape[0]//g.eye_observation_size, g.eye_observation_size)
print(x[:-2].reshape(new_shape))
print(x[-2:])
g.to_html()
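To make the printed rows easier to read, the columns of each ray can be labeled. The layout assumed here (one normalized distance per object type, then the wall distance, then the observed object's speed) matches the reshape above, but the exact ordering is an assumption about KarpathyGame rather than something taken from this notebook.
In [ ]:
# Hypothetical per-ray column labels, assuming the layout
# [dist_friend, dist_enemy, dist_wall, speed_x, speed_y].
ray_columns = ['dist_' + o for o in current_settings['objects']] + ['dist_wall', 'speed_x', 'speed_y']
assert len(ray_columns) == g.eye_observation_size
print(ray_columns)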
In [ ]: