In [1]:
    
%load_ext autoreload
%autoreload 2
%matplotlib inline
    
In [2]:
    
from __future__ import print_function

import numpy as np
import tempfile
import tensorflow as tf

from tf_rl.controller import HumanController, DDQN as DiscreteDeepQ
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
    
In [3]:
    
LOG_DIR = tempfile.mkdtemp()
print(LOG_DIR)
    
    
In [4]:
    
current_settings = {
    'objects': [
        'friend',
        'enemy',
    ],
    'colors': {
        'hero':   'yellow',
        'friend': 'green',
        'enemy':  'red',
    },
    'object_reward': {
        'friend': 0.1,
        'enemy': -0.1,
    },
    'hero_bounces_off_walls': False,
    'world_size': (700, 500),
    'hero_initial_position': [400, 300],
    'hero_initial_speed':    [0,   0],
    'maximum_speed':         [50, 50],
    'object_radius': 10.0,
    'num_objects': {
        'friend': 25,
        'enemy':  25,
    },
    'num_observation_lines': 32,
    'observation_line_length': 120.,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -0.0,
    'delta_v': 50,
}
    
In [5]:
    
# create the game simulator
g = KarpathyGame(current_settings)
    
In [6]:
    
human_control = False
if human_control:
    # WSAD CONTROL (requires extra setup - check out README)
    current_controller = HumanController({b"w": 3, b"d": 0, b"s": 1, b"a": 2})
else:
    # Tensorflow business - it is always good to reset a graph before creating a new controller.
    tf.reset_default_graph()
    session = tf.InteractiveSession()
    # This little guy will let us run tensorboard
    #      tensorboard --logdir [LOG_DIR]
    journalist = tf.train.SummaryWriter(LOG_DIR)
    # Brain maps from observation to Q values for different actions.
    # Here it is done using a multi-layer perceptron with two hidden
    # layers.
    brain = MLP([g.observation_size,], [200, 200, g.num_actions], 
                [tf.tanh, tf.tanh, tf.identity])
    
    # The optimizer to use. Here we use RMSProp, as recommended
    # by the publication.
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)
    # DiscreteDeepQ object
    current_controller = DiscreteDeepQ(g.observation_size, g.num_actions, brain, optimizer, session,
                                       discount_rate=0.99, exploration_period=5000, max_experience=10000, 
                                       store_every_nth=4, train_every_nth=4,
                                       summary_writer=journalist)
    
    session.run(tf.initialize_all_variables())
    session.run(current_controller.target_network_update)
    # graph was not available when journalist was created  
    journalist.add_graph(session.graph_def)
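
For reference, the value that DiscreteDeepQ trains the brain towards is the usual Q-learning target computed with the separate target network. Below is a minimal NumPy sketch of that quantity; the variable names are illustrative rather than the library's internals, and the DDQN variant additionally selects the maximizing action with the online network.

import numpy as np

def q_learning_target(rewards, next_q_values, discount_rate=0.99):
    # Bellman target: r + gamma * max_a' Q_target(s', a')
    # rewards:       shape (batch,)
    # next_q_values: shape (batch, num_actions), produced by the target network
    return rewards + discount_rate * next_q_values.max(axis=1)

# toy example with 2 transitions and 4 actions
r      = np.array([0.1, -0.1])
q_next = np.array([[0.2, 0.5, 0.1, 0.0],
                   [0.0, 0.3, 0.4, 0.1]])
print(q_learning_target(r, q_next))   # -> approximately [0.595, 0.296]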
    
In [ ]:
    
FPS          = 30
ACTION_EVERY = 3
    
fast_mode = False
if fast_mode:
    WAIT, VISUALIZE_EVERY = False, 20
else:
    WAIT, VISUALIZE_EVERY = True, 1
    
try:
    # to force CPU execution, wrap this call in `with tf.device("/cpu:0"):`
    simulate(simulation=g,
             controller=current_controller,
             fps=FPS,
             visualize_every=VISUALIZE_EVERY,
             action_every=ACTION_EVERY,
             wait=WAIT,
             disable_training=False,
             simulation_resolution=0.001,
             save_path=None)
except KeyboardInterrupt:
    print("Interrupted")
    
    
In [8]:
    
session.run(current_controller.target_network_update)
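
target_network_update is the op that pulls the lagged target network towards the current Q network. The idea it implements is sketched below in plain NumPy; this is illustrative only, as the actual op is a set of TensorFlow assign operations built inside DiscreteDeepQ.

import numpy as np

# soft update: theta_target <- (1 - rate) * theta_target + rate * theta
rate         = 0.01                      # cf. DiscreteDeepQ's target_network_update_rate
theta        = np.array([1.0, 2.0])      # toy Q-network weights
theta_target = np.array([0.0, 0.0])      # lagged target-network weights
theta_target = (1.0 - rate) * theta_target + rate * theta
print(theta_target)                      # -> [0.01, 0.02]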
    
In [10]:
    
current_controller.q_network.input_layer.Ws[0].eval()
    
In [11]:
    
current_controller.target_q_network.input_layer.Ws[0].eval()
    
    
In [12]:
    
g.plot_reward(smoothing=100)
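
Here smoothing=100 averages the reward over a window of recent rewards so the trend is visible through the noise; a plain rolling mean like the one below produces a comparable curve (illustrative only, not plot_reward's exact implementation).

import numpy as np

def rolling_mean(values, window=100):
    # moving average over `window` consecutive rewards
    kernel = np.ones(window) / window
    return np.convolve(values, kernel, mode='valid')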
    
    
Starting with the ray pointing all the way to the right, the printout below has one row per observation ray, in clockwise order; each row contains the numbers observed along that ray. Finally, the last two numbers in the representation correspond to the speed of the hero.
In [13]:
    
# re-bind the class so %autoreload picks up any edits made to KarpathyGame
g.__class__ = KarpathyGame
np.set_printoptions(formatter={'float': (lambda x: '%.2f' % (x,))})
x = g.observe()
# drop the last two entries (hero speed) and view the rest as one row per observation ray
new_shape = (x[:-2].shape[0]//g.eye_observation_size, g.eye_observation_size)
print(x[:-2].reshape(new_shape))
print(x[-2:])
g.to_html()
    
    
In [ ]: