In [4]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [5]:
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
import numpy as np
import tempfile
import tensorflow as tf

from tf_rl.controller import DiscreteDeepQ, HumanController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP

In [7]:
LOG_DIR = tempfile.mkdtemp()
print(LOG_DIR)


/tmp/tmpdzxofD

In [8]:
from tf_rl.simulation import DiscreteHill
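
The training loop further down only relies on a small interface from DiscreteHill: observe(), collect_reward(action), perform_action(action), is_over(), plus the shortest_path, position and target attributes. The real implementation lives in tf_rl.simulation; the sketch below is a hypothetical stand-in (the reward scale and observation encoding are guesses) just to make that interface concrete:

import random
import numpy as np

class ToyHill(object):
    """Hypothetical stand-in for DiscreteHill: walk a grid towards a random target."""
    # action index -> (dx, dy); the real action encoding may differ
    directions = [(0, 1), (0, -1), (-1, 0), (1, 0)]

    def __init__(self, board_size=5):
        self.position = (0, 0)
        self.target = (0, 0)
        while self.target == (0, 0):
            self.target = (random.randint(-board_size, board_size),
                           random.randint(-board_size, board_size))
        # optimal number of 4-directional moves (Manhattan distance)
        self.shortest_path = abs(self.target[0]) + abs(self.target[1])

    @staticmethod
    def distance(a, b):
        return abs(a[0] - b[0]) + abs(a[1] - b[1])

    def observe(self):
        # 4 numbers describing where the target sits relative to the agent
        dx = self.target[0] - self.position[0]
        dy = self.target[1] - self.position[1]
        return np.array([max(dx, 0), max(-dx, 0), max(dy, 0), max(-dy, 0)])

    def collect_reward(self, action):
        # called before perform_action: reward is the decrease in distance to the target
        dx, dy = self.directions[action]
        new_position = (self.position[0] + dx, self.position[1] + dy)
        return self.distance(self.position, self.target) - self.distance(new_position, self.target)

    def perform_action(self, action):
        dx, dy = self.directions[action]
        self.position = (self.position[0] + dx, self.position[1] + dy)

    def is_over(self):
        return self.position == self.target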

In [9]:
# TensorFlow business - it is always a good idea to reset the graph before creating a new controller.
tf.reset_default_graph()
session = tf.InteractiveSession()

# This little guy will let us run tensorboard
#      tensorboard --logdir [LOG_DIR]
journalist = tf.train.SummaryWriter(LOG_DIR)

# Brain maps from observation to Q values for different actions.
# Here it is done using a multi-layer perceptron with one hidden
# layer (10 tanh units) and a linear output layer.
brain = MLP([4,], [10, 4], 
            [tf.tanh, tf.identity])

# The optimizer to use. Here we use RMSProp as recommended
# by the publication
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

# DiscreteDeepQ object: 4-dimensional observations, 4 discrete actions.
current_controller = DiscreteDeepQ(4, 4, brain, optimizer, session,
                                   discount_rate=0.9, exploration_period=100, max_experience=10000, 
                                   store_every_nth=1, train_every_nth=4, target_network_update_rate=0.1,
                                   summary_writer=journalist)

session.run(tf.initialize_all_variables())
session.run(current_controller.target_network_update)
# The graph was not available when the journalist was created, so add it now.
journalist.add_graph(session.graph_def)
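
Two of the hyperparameters above deserve a brief note. exploration_period is the number of steps over which the exploration probability is annealed, and a target_network_update_rate of 0.1 typically means a soft update: each time the target_network_update op runs, the target network's parameters move a tenth of the way toward the online network's. The exact rule inside DiscreteDeepQ may differ, so treat the snippet below as an illustration of the usual scheme rather than the library's code:

import numpy as np

# Illustrative soft target-network update with rate r:
#     theta_target <- theta_target + r * (theta_online - theta_target)
theta_online = np.array([1.0, 2.0, 3.0])
theta_target = np.zeros(3)
r = 0.1
for step in range(20):
    theta_target += r * (theta_online - theta_target)
# theta_target is now ~88% of the way to theta_online (1 - 0.9**20 ≈ 0.878)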

In [10]:
performances = []

try:
    for game_idx in range(10000):
        game = DiscreteHill()
        game_iterations = 0

        observation = game.observe()

        while game_iterations < 50 and not game.is_over():
            action = current_controller.action(observation)
            reward = game.collect_reward(action)
            game.perform_action(action)
            new_observation = game.observe()
            current_controller.store(observation, action, reward, new_observation)
            current_controller.training_step()
            observation = new_observation
            game_iterations += 1
        performance = float(game_iterations - (game.shortest_path)) / game.shortest_path
        performances.append(performance)
        if game_idx % 100 == 0:
            print "\rGame %d: iterations before success %d." % (game_idx, game_iterations),
            print "Pos: %s, Target: %s" % (game.position, game.target),
except KeyboardInterrupt:
    print "Interrupted"


Game 9900: iterations before success 12. Pos: (-3, 7), Target: (-3, 7)
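
The performance metric in the loop is the relative overhead over the optimal route, (game_iterations - shortest_path) / shortest_path, so 0.0 means the agent walked the shortest path. A quick worked example:

# if the optimal path is 5 steps but the agent needed 12
game_iterations, shortest_path = 12, 5
float(game_iterations - shortest_path) / shortest_path   # = 1.4, i.e. 140% more steps than optimal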

In [11]:
N = 500
smooth_performances = [float(sum(performances[i:i+N])) / N for i in range(0, len(performances) - N)]

plt.plot(range(len(smooth_performances)), smooth_performances)


Out[11]:
[<matplotlib.lines.Line2D at 0x7fae08076550>]
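
The smoothing above is a sliding-window average over N consecutive games. An equivalent vectorized alternative (just an option, not what the notebook uses) is numpy's convolution with a box filter:

# moving average with a length-N box filter; mode='valid' yields
# len(performances) - N + 1 points, one more than the list comprehension above
kernel = np.ones(N) / N
smooth_np = np.convolve(performances, kernel, mode='valid')
plt.plot(smooth_np)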

In [19]:
np.average(performances[-1000:])


Out[19]:
1.6869445151941282

In [231]:
x = brain.layers[0].Ws[0].eval()
import matplotlib.pyplot as plt
%matplotlib inline
plt.matshow(x)
plt.colorbar()


Out[231]:
<matplotlib.colorbar.Colorbar instance at 0x7fa050305dd0>

In [138]:
brain.input_layer.b.eval()


Out[138]:
array([ 11.01352692,  11.28201485,  12.03692055,  12.26954937], dtype=float32)

In [88]:
game.collect_reward(0)


Out[88]:
-2.0

In [7]:
x = tf.Variable(tf.zeros((5,5)))

In [8]:
tf.clip_by_norm(x, 5)


Out[8]:
<tensorflow.python.framework.ops.Tensor at 0x7f955033c390>
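
The call above only builds the clipping op (and since x is all zeros, evaluating it would return zeros). tf.clip_by_norm(t, clip_norm) rescales t so that its L2 norm does not exceed clip_norm and leaves it untouched otherwise; a numpy sketch of the same computation:

def clip_by_norm_np(t, clip_norm):
    # rescale t if its L2 norm exceeds clip_norm, otherwise return it unchanged
    norm = np.sqrt(np.sum(t ** 2))
    return t if norm <= clip_norm else t * (clip_norm / norm)

clip_by_norm_np(np.full((5, 5), 2.0), 5.0)  # norm is 10, so every entry is halved to 1.0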
