In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [ ]:
from __future__ import print_function
import numpy as np
from tf_rl.controller import DiscreteDeepQ, NL, HumanController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
In [ ]:
# Configuration for the game world: object types, rendering colors,
# collision rewards, world geometry, and hero movement parameters.
current_settings = {
    # Types of collectible objects present in the world.
    'objects': ['friend', 'enemy'],
    # Rendering colors for each entity type.
    'colors': {
        'hero': 'yellow',
        'friend': 'green',
        'enemy': 'red',
    },
    # Reward the hero receives for colliding with each object type.
    'object_reward': {
        'friend': 0.1,
        'enemy': -0.1,
    },
    'hero_bounces_off_walls': False,
    'world_size': (700, 500),
    'hero_initial_position': [400, 300],
    'hero_initial_speed': [0, 0],
    'maximum_speed': [50, 50],
    'object_radius': 10.0,
    # Number of each object type spawned in the world.
    'num_objects': {
        'friend': 25,
        'enemy': 25,
    },
    # The hero perceives the world through evenly spaced observation rays.
    'num_observation_lines': 32,
    'observation_line_length': 120.,
    # Wall-proximity penalty (currently disabled: penalty is -0.0).
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -0.0,
    # Velocity change applied per action step.
    'delta_v': 50,
}
In [ ]:
# Create the game simulator from the settings above
# (rewards: +0.1 per 'friend' collision, -0.1 per 'enemy' collision).
g = KarpathyGame(current_settings)
In [ ]:
# Toggle between manual keyboard control and the learned controller.
human_control = False

if not human_control:
    # Deep Q-learning controller: a three-layer MLP mapping observations
    # to per-action Q-values. RMSProp is the optimizer, as recommended
    # by the publication.
    current_controller = DiscreteDeepQ(
        g.observation_size,
        [200, 200, g.num_actions],
        [NL.TANH, NL.TANH, NL.IDENTITY],
        learning_rate=0.001,
        decay=0.9,
        discount_rate=0.99,
        exploration_period=5000,
        max_experience=10000,
        store_every_nth=4,
        train_every_nth=4)
else:
    # WSAD keyboard control (requires extra setup - check out README).
    key_to_action = {b"w": 3, b"d": 0, b"s": 1, b"a": 2}
    current_controller = HumanController(key_to_action)

current_controller.initialize('')
In [ ]:
# Rendering and action-stepping parameters.
FPS = 30
ACTION_EVERY = 3

# Fast mode: never wait between frames and render only occasionally;
# otherwise render every frame in real time.
fast_mode = True
WAIT, VISUALIZE_EVERY = (False, 50) if fast_mode else (True, 1)

# Run the simulation loop until interrupted by the user.
try:
    simulate(
        simulation=g,
        controller=current_controller,
        fps=FPS,
        visualize_every=VISUALIZE_EVERY,
        action_every=ACTION_EVERY,
        wait=WAIT,
        disable_training=False,
        simulation_resolution=0.001,
        save_path=None)
except KeyboardInterrupt:
    print("Interrupted")
In [ ]:
# Plot the accumulated reward curve; smoothing=100 presumably averages
# over a 100-sample window -- confirm against KarpathyGame.plot_reward.
g.plot_reward(smoothing=100)
Starting with the ray pointing all the way to the right, there is one row per ray, listed in clockwise order. The numbers for each ray are the following:
Finally, the last two numbers in the representation correspond to the hero's speed.
In [ ]:
# Reassign the class so that method definitions reloaded via %autoreload
# take effect on the already-existing instance.
g.__class__ = KarpathyGame

# Print floats with two decimal places for readability.
np.set_printoptions(formatter={'float': lambda v: '%.2f' % (v,)})

observation = g.observe()
per_eye = g.eye_observation_size
# All but the last 4 entries are per-ray readings; show one row per ray.
rays = observation[:-4]
print(rays.reshape((rays.shape[0] // per_eye, per_eye)))
# The trailing entries of the observation vector.
print(observation[-4:])
g.to_html()
In [ ]: