In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from __future__ import print_function

import numpy as np
import tempfile
import tensorflow as tf

from tf_rl.controller import HumanController, DDQN as DiscreteDeepQ
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP

In [3]:
LOG_DIR = tempfile.mkdtemp()
print(LOG_DIR)


/tmp/tmpafM9l3

In [4]:
current_settings = {
    'objects': [
        'friend',
        'enemy',
    ],
    'colors': {
        'hero':   'yellow',
        'friend': 'green',
        'enemy':  'red',
    },
    'object_reward': {
        'friend': 0.1,
        'enemy': -0.1,
    },
    'hero_bounces_off_walls': False,
    'world_size': (700, 500),
    'hero_initial_position': [400, 300],
    'hero_initial_speed':    [0,   0],
    'maximum_speed':         [50, 50],
    'object_radius': 10.0,
    'num_objects': {
        'friend': 25,
        'enemy':  25,
    },
    'num_observation_lines': 32,
    'observation_line_length': 120.,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -0.0,
    'delta_v': 50
}

In [5]:
# create the game simulator
g = KarpathyGame(current_settings)

In [6]:
human_control = False

if human_control:
    # WSAD CONTROL (requires extra setup - check out README)
    current_controller = HumanController({b"w": 3, b"d": 0, b"s": 1, b"a": 2})
else:
    # TensorFlow business - it is always good to reset the graph before creating a new controller.
    tf.reset_default_graph()
    session = tf.InteractiveSession()

    # This little guy will let us run tensorboard
    #      tensorboard --logdir [LOG_DIR]
    journalist = tf.train.SummaryWriter(LOG_DIR)

    # The brain maps from an observation to Q values for the different actions.
    # Here it is done using a multi-layer perceptron with 2 hidden
    # layers.
    brain = MLP([g.observation_size,], [200, 200, g.num_actions], 
                [tf.tanh, tf.tanh, tf.identity])
    
    # The optimizer to use. Here we use RMSProp, as recommended
    # by the publication.
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

    # DiscreteDeepQ object
    current_controller = DiscreteDeepQ(g.observation_size, g.num_actions, brain, optimizer, session,
                                       discount_rate=0.99, exploration_period=5000, max_experience=10000, 
                                       store_every_nth=4, train_every_nth=4,
                                       summary_writer=journalist)
    
    session.run(tf.initialize_all_variables())
    session.run(current_controller.target_network_update)
    # The graph was not available when the journalist was created.
    journalist.add_graph(session.graph_def)

In [ ]:
FPS          = 30
ACTION_EVERY = 3
    
fast_mode = False
if fast_mode:
    WAIT, VISUALIZE_EVERY = False, 20
else:
    WAIT, VISUALIZE_EVERY = True, 1

    
try:
    if True:  # with tf.device("/cpu:0"):
        simulate(simulation=g,
                 controller=current_controller,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=0.001,
                 save_path=None)
except KeyboardInterrupt:
    print("Interrupted")


fps = 30.0 nearest wall = 32.5 reward = 0.0 objects eaten => enemy: 41, friend: 41

In [8]:
session.run(current_controller.target_network_update)

In [10]:
current_controller.q_network.input_layer.Ws[0].eval()


Out[10]:
array([[-0.06292054,  0.07018796,  0.05575218, ..., -0.076327  ,
         0.02628701, -0.02502313],
       [-0.0122828 , -0.03783315, -0.05952611, ..., -0.04457522,
         0.01477667,  0.03304856],
       [-0.06571824, -0.04028175,  0.02779744, ..., -0.00455148,
        -0.04327632,  0.03037542],
       ..., 
       [ 0.05008943,  0.03139383, -0.02281955, ..., -0.02654718,
        -0.01443507,  0.02257421],
       [-0.07789286,  0.05785912, -0.04746125, ...,  0.01773719,
         0.01857647, -0.01236247],
       [ 0.05106261, -0.03196249, -0.06304625, ..., -0.00868494,
         0.06353462, -0.00128598]], dtype=float32)

In [11]:
current_controller.target_q_network.input_layer.Ws[0].eval()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-11-68b98e181373> in <module>()
----> 1 current_controller.target_q_network.input_layer.Ws[0].eval()

AttributeError: 'DDQN' object has no attribute 'target_q_network'

Average Reward over time


In [12]:
g.plot_reward(smoothing=100)


Visualizing what the agent is seeing

Starting with the ray pointing all the way to the right, we have one row per ray, in clockwise order. The numbers for each ray are the following:

  • The first three numbers are normalized distances to the closest visible object (one intersecting the ray). If no object is visible, all of them are $1$. If many objects are in sight, only the closest one is reported. The numbers represent the distance to a friend, an enemy, and a wall, in that order.
  • The last two numbers represent the speed of the observed object (x and y components). The speed of a wall is, of course, zero.

Finally, the last two numbers in the representation correspond to the speed of the hero.
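
As an illustration of the layout described above (and not part of the original API), the cell below is a minimal sketch that decodes an observation by hand. It assumes five numbers per ray (three distances plus two speed components) and the hero's speed in the last two slots; the game's actual per-ray size is g.eye_observation_size, so the assertion guards that assumption.


In [ ]:
# Illustrative sketch only: decode the observation by hand, assuming
# 5 numbers per ray (3 distances + 2 speed components) and the hero's
# speed in the last 2 slots. The assumption is checked before reshaping.
NUMBERS_PER_RAY = 5  # assumed; the game's actual value is g.eye_observation_size

x = g.observe()
rays, hero_speed = x[:-2], x[-2:]
assert rays.shape[0] % NUMBERS_PER_RAY == 0, "assumed per-ray size does not match"

for i, ray in enumerate(rays.reshape(-1, NUMBERS_PER_RAY)):
    dist_friend, dist_enemy, dist_wall, speed_x, speed_y = ray
    print("ray %2d: friend=%.2f enemy=%.2f wall=%.2f object speed=(%.2f, %.2f)"
          % (i, dist_friend, dist_enemy, dist_wall, speed_x, speed_y))
print("hero speed:", hero_speed)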


In [13]:
g.__class__ = KarpathyGame  # rebind the instance to the (possibly autoreloaded) class
np.set_printoptions(formatter={'float': (lambda x: '%.2f' % (x,))})
x = g.observe()
new_shape = (x[:-2].shape[0]//g.eye_observation_size, g.eye_observation_size)
print(x[:-2].reshape(new_shape))
print(x[-2:])
g.to_html()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-d8b259d78d4d> in <module>()
      3 x = g.observe()
      4 new_shape = (x[:-2].shape[0]//g.eye_observation_size, g.eye_observation_size)
----> 5 print(x[:-2].reshape(new_shape))
      6 print(x[-2:])
      7 g.to_html()

ValueError: total size of new array must be unchanged

In [ ]: