In [1]:
import numpy as np
import gym
from gym.wrappers import Monitor
from numpy.random import choice
import random
from phi.api import *
import tensorflow as tf
from tfinterface.reinforcement import DQN, ExpandedStateEnv
import os
from scipy.interpolate import interp1d
import numbers



def get_run():
    try:
        with open("run.txt") as f:
            run = int(f.read().split("/n")[0])
    except:
        run = -1
    
    with open("run.txt", 'w+') as f:
        run += 1
        
        f.seek(0)
        f.write(str(run))
        f.truncate()
        
    return run

In [24]:
run = get_run()
env_logs = '/tmp/cartpole-{}'.format(run)
expansion = 3

env = gym.make('LunarLander-v2')
# env = Monitor(env, env_logs)
env = ExpandedStateEnv(env, expansion)
                
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * expansion
model_path = os.getcwd() + "/Q-network-full.model"
logs_path = "logs/run{}".format(run)


[2017-03-03 14:10:54,022] Making new env: LunarLander-v2

In [25]:
class LunarDQN(DQN):
    
    def define_Qs(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            use_bias=False,
            bias_initializer=None
        )

        net = tf.layers.dense(inputs.s, 64, activation=tf.nn.relu, name='relu_layer', **ops)
        net = tf.layers.dense(inputs.s, 32, activation=tf.nn.relu, name='relu_layer2', **ops)
        return tf.layers.dense(inputs.s, n_actions, name='linear_layer', **ops)

In [26]:
model = LunarDQN(
    n_actions, n_states,
    model_path = model_path,
    logs_path = logs_path,
    flush_secs = 3.0,
    y = 0.98,
    buffer_length=500000,
    restore = False
)

print("run: {},\n s: {},\n a: {},\n r: {},\n Qs: {},\n update: {}".format(
    run, model.inputs.s, model.inputs.a, model.inputs.r, model.network.Qs, model.update
))


run: 70,
 s: Tensor("inputs/s:0", shape=(?, 24), dtype=float32, device=/device:CPU:0),
 a: Tensor("inputs/a:0", shape=(?,), dtype=int32, device=/device:CPU:0),
 r: Tensor("inputs/r:0", shape=(?,), dtype=float32, device=/device:CPU:0),
 Qs: Tensor("network/linear_layer/MatMul:0", shape=(?, 4), dtype=float32, device=/device:CPU:0),
 update: name: "network/Adam"
op: "NoOp"
input: "^network/Adam/update_network/linear_layer/kernel/ApplyAdam"
input: "^network/Adam/Assign"
input: "^network/Adam/Assign_1"
device: "/device:CPU:0"


In [27]:
k = 5000.
model.fit(
    env, 
    episodes=50000,
    max_episode_length = 2000,
    learning_rate = 0.01, #lambda t: max(0.001, k / (k + t)),
    e = 0.1 #interp1d([0, 4000], [1, 0.05], fill_value=0.05, bounds_error=False)
)


[MAX] Episode: 0, Reward: -427.44619018, e: 0.1, learning_rate: 0.01, buffer_len: 90, episode_length: 90
[NOR] Episode: 10, avg reward: -286.982311246, e: 0.1, learning_rate: 0.01, buffer_len: 740, episode_length: 60
[NOR] Episode: 20, avg reward: -181.130319211, e: 0.1, learning_rate: 0.01, buffer_len: 1388, episode_length: 56
[NOR] Episode: 30, avg reward: -242.727730193, e: 0.1, learning_rate: 0.01, buffer_len: 2354, episode_length: 65
[NOR] Episode: 40, avg reward: -238.480080357, e: 0.1, learning_rate: 0.01, buffer_len: 3070, episode_length: 64
[NOR] Episode: 50, avg reward: -243.644812841, e: 0.1, learning_rate: 0.01, buffer_len: 3738, episode_length: 52
[NOR] Episode: 60, avg reward: -365.74857608, e: 0.1, learning_rate: 0.01, buffer_len: 4469, episode_length: 68
[NOR] Episode: 70, avg reward: -314.768971952, e: 0.1, learning_rate: 0.01, buffer_len: 5167, episode_length: 55
[NOR] Episode: 80, avg reward: -292.654297829, e: 0.1, learning_rate: 0.01, buffer_len: 5857, episode_length: 63
[NOR] Episode: 90, avg reward: -274.794022053, e: 0.1, learning_rate: 0.01, buffer_len: 6532, episode_length: 59
[NOR] Episode: 100, avg reward: -337.29878289, e: 0.1, learning_rate: 0.01, buffer_len: 7339, episode_length: 132
[NOR] Episode: 110, avg reward: -298.128354506, e: 0.1, learning_rate: 0.01, buffer_len: 8153, episode_length: 57
[NOR] Episode: 120, avg reward: -298.369397818, e: 0.1, learning_rate: 0.01, buffer_len: 8900, episode_length: 66
[NOR] Episode: 130, avg reward: -238.667810887, e: 0.1, learning_rate: 0.01, buffer_len: 9828, episode_length: 145
[NOR] Episode: 140, avg reward: -272.146438488, e: 0.1, learning_rate: 0.01, buffer_len: 10544, episode_length: 68
[NOR] Episode: 150, avg reward: -392.395855865, e: 0.1, learning_rate: 0.01, buffer_len: 11531, episode_length: 112
[NOR] Episode: 160, avg reward: -276.477981779, e: 0.1, learning_rate: 0.01, buffer_len: 12465, episode_length: 53
[NOR] Episode: 170, avg reward: -257.280124225, e: 0.1, learning_rate: 0.01, buffer_len: 13349, episode_length: 72
[NOR] Episode: 180, avg reward: -345.544169009, e: 0.1, learning_rate: 0.01, buffer_len: 14279, episode_length: 120
[NOR] Episode: 190, avg reward: -295.484659797, e: 0.1, learning_rate: 0.01, buffer_len: 15240, episode_length: 88
[NOR] Episode: 200, avg reward: -310.934632821, e: 0.1, learning_rate: 0.01, buffer_len: 16351, episode_length: 142
[NOR] Episode: 210, avg reward: -463.156417479, e: 0.1, learning_rate: 0.01, buffer_len: 17476, episode_length: 126
[NOR] Episode: 220, avg reward: -298.616447343, e: 0.1, learning_rate: 0.01, buffer_len: 18912, episode_length: 211
[NOR] Episode: 230, avg reward: -358.381124428, e: 0.1, learning_rate: 0.01, buffer_len: 20369, episode_length: 81
[NOR] Episode: 240, avg reward: -383.372909775, e: 0.1, learning_rate: 0.01, buffer_len: 21892, episode_length: 137
[NOR] Episode: 250, avg reward: -230.869030934, e: 0.1, learning_rate: 0.01, buffer_len: 23224, episode_length: 263
[NOR] Episode: 260, avg reward: -272.371121979, e: 0.1, learning_rate: 0.01, buffer_len: 24991, episode_length: 242
[NOR] Episode: 270, avg reward: -293.969860889, e: 0.1, learning_rate: 0.01, buffer_len: 26738, episode_length: 145
[NOR] Episode: 280, avg reward: -312.104953421, e: 0.1, learning_rate: 0.01, buffer_len: 28488, episode_length: 130
[NOR] Episode: 290, avg reward: -353.676364073, e: 0.1, learning_rate: 0.01, buffer_len: 30081, episode_length: 147
[NOR] Episode: 300, avg reward: -354.790421099, e: 0.1, learning_rate: 0.01, buffer_len: 31808, episode_length: 111
[NOR] Episode: 310, avg reward: -403.316230425, e: 0.1, learning_rate: 0.01, buffer_len: 33429, episode_length: 193
[NOR] Episode: 320, avg reward: -249.666692909, e: 0.1, learning_rate: 0.01, buffer_len: 34890, episode_length: 168
[NOR] Episode: 330, avg reward: -367.090849686, e: 0.1, learning_rate: 0.01, buffer_len: 36331, episode_length: 125
[NOR] Episode: 340, avg reward: -257.914400105, e: 0.1, learning_rate: 0.01, buffer_len: 37695, episode_length: 117
[NOR] Episode: 350, avg reward: -439.470661099, e: 0.1, learning_rate: 0.01, buffer_len: 38883, episode_length: 111
[NOR] Episode: 360, avg reward: -295.292733234, e: 0.1, learning_rate: 0.01, buffer_len: 40258, episode_length: 107
[NOR] Episode: 370, avg reward: -349.144180937, e: 0.1, learning_rate: 0.01, buffer_len: 41420, episode_length: 108
[NOR] Episode: 380, avg reward: -313.519682263, e: 0.1, learning_rate: 0.01, buffer_len: 42620, episode_length: 151
[NOR] Episode: 390, avg reward: -395.125081298, e: 0.1, learning_rate: 0.01, buffer_len: 43873, episode_length: 184
[NOR] Episode: 400, avg reward: -374.227018892, e: 0.1, learning_rate: 0.01, buffer_len: 45178, episode_length: 163
[NOR] Episode: 410, avg reward: -276.008692778, e: 0.1, learning_rate: 0.01, buffer_len: 46519, episode_length: 146
[NOR] Episode: 420, avg reward: -300.412767327, e: 0.1, learning_rate: 0.01, buffer_len: 47685, episode_length: 82
[NOR] Episode: 430, avg reward: -383.767176121, e: 0.1, learning_rate: 0.01, buffer_len: 49122, episode_length: 138
[NOR] Episode: 440, avg reward: -417.275954842, e: 0.1, learning_rate: 0.01, buffer_len: 50477, episode_length: 112
[NOR] Episode: 450, avg reward: -284.02163945, e: 0.1, learning_rate: 0.01, buffer_len: 51774, episode_length: 102
[NOR] Episode: 460, avg reward: -383.50471301, e: 0.1, learning_rate: 0.01, buffer_len: 52982, episode_length: 100
[NOR] Episode: 470, avg reward: -447.64329634, e: 0.1, learning_rate: 0.01, buffer_len: 54115, episode_length: 120
[NOR] Episode: 480, avg reward: -341.121506809, e: 0.1, learning_rate: 0.01, buffer_len: 55269, episode_length: 91
[NOR] Episode: 490, avg reward: -326.551124974, e: 0.1, learning_rate: 0.01, buffer_len: 56540, episode_length: 133
[NOR] Episode: 500, avg reward: -356.344951674, e: 0.1, learning_rate: 0.01, buffer_len: 57691, episode_length: 119
[NOR] Episode: 510, avg reward: -360.21340113, e: 0.1, learning_rate: 0.01, buffer_len: 58873, episode_length: 148
[NOR] Episode: 520, avg reward: -433.307712834, e: 0.1, learning_rate: 0.01, buffer_len: 60011, episode_length: 143
[NOR] Episode: 530, avg reward: -328.46542743, e: 0.1, learning_rate: 0.01, buffer_len: 61123, episode_length: 101
[NOR] Episode: 540, avg reward: -272.512573615, e: 0.1, learning_rate: 0.01, buffer_len: 62273, episode_length: 153
[NOR] Episode: 550, avg reward: -444.977233715, e: 0.1, learning_rate: 0.01, buffer_len: 63545, episode_length: 97
[NOR] Episode: 560, avg reward: -485.960343913, e: 0.1, learning_rate: 0.01, buffer_len: 64683, episode_length: 106
[NOR] Episode: 570, avg reward: -475.831637953, e: 0.1, learning_rate: 0.01, buffer_len: 65849, episode_length: 125
[NOR] Episode: 580, avg reward: -457.706761594, e: 0.1, learning_rate: 0.01, buffer_len: 67033, episode_length: 164
[NOR] Episode: 590, avg reward: -313.358844289, e: 0.1, learning_rate: 0.01, buffer_len: 68081, episode_length: 98
[NOR] Episode: 600, avg reward: -445.308916834, e: 0.1, learning_rate: 0.01, buffer_len: 68962, episode_length: 86
[NOR] Episode: 610, avg reward: -391.38912075, e: 0.1, learning_rate: 0.01, buffer_len: 70155, episode_length: 119
[NOR] Episode: 620, avg reward: -390.268204894, e: 0.1, learning_rate: 0.01, buffer_len: 71095, episode_length: 93
[NOR] Episode: 630, avg reward: -504.913015153, e: 0.1, learning_rate: 0.01, buffer_len: 72073, episode_length: 96
[NOR] Episode: 640, avg reward: -434.352767638, e: 0.1, learning_rate: 0.01, buffer_len: 73146, episode_length: 114
[NOR] Episode: 650, avg reward: -486.303457151, e: 0.1, learning_rate: 0.01, buffer_len: 74339, episode_length: 201
[NOR] Episode: 660, avg reward: -478.736288867, e: 0.1, learning_rate: 0.01, buffer_len: 75562, episode_length: 108
[NOR] Episode: 670, avg reward: -456.321772751, e: 0.1, learning_rate: 0.01, buffer_len: 76697, episode_length: 92
[NOR] Episode: 680, avg reward: -372.415885918, e: 0.1, learning_rate: 0.01, buffer_len: 77874, episode_length: 117
[NOR] Episode: 690, avg reward: -325.614941266, e: 0.1, learning_rate: 0.01, buffer_len: 79040, episode_length: 103
[NOR] Episode: 700, avg reward: -371.396563223, e: 0.1, learning_rate: 0.01, buffer_len: 80231, episode_length: 137
[NOR] Episode: 710, avg reward: -404.693886108, e: 0.1, learning_rate: 0.01, buffer_len: 81467, episode_length: 115
[NOR] Episode: 720, avg reward: -333.213309455, e: 0.1, learning_rate: 0.01, buffer_len: 82864, episode_length: 138
[NOR] Episode: 730, avg reward: -360.921827804, e: 0.1, learning_rate: 0.01, buffer_len: 84145, episode_length: 132
[NOR] Episode: 740, avg reward: -362.457130995, e: 0.1, learning_rate: 0.01, buffer_len: 85381, episode_length: 120
[NOR] Episode: 750, avg reward: -400.407745427, e: 0.1, learning_rate: 0.01, buffer_len: 86700, episode_length: 141
[NOR] Episode: 760, avg reward: -326.558854641, e: 0.1, learning_rate: 0.01, buffer_len: 87889, episode_length: 96
[NOR] Episode: 770, avg reward: -458.951679234, e: 0.1, learning_rate: 0.01, buffer_len: 89099, episode_length: 89
[NOR] Episode: 780, avg reward: -388.015868978, e: 0.1, learning_rate: 0.01, buffer_len: 90437, episode_length: 142
[NOR] Episode: 790, avg reward: -376.963521115, e: 0.1, learning_rate: 0.01, buffer_len: 91849, episode_length: 140
[NOR] Episode: 800, avg reward: -351.366265408, e: 0.1, learning_rate: 0.01, buffer_len: 93043, episode_length: 159
[NOR] Episode: 810, avg reward: -371.152401056, e: 0.1, learning_rate: 0.01, buffer_len: 94263, episode_length: 93
[NOR] Episode: 820, avg reward: -371.038859161, e: 0.1, learning_rate: 0.01, buffer_len: 95543, episode_length: 123
[NOR] Episode: 830, avg reward: -394.108070027, e: 0.1, learning_rate: 0.01, buffer_len: 96881, episode_length: 161
[NOR] Episode: 840, avg reward: -337.308101188, e: 0.1, learning_rate: 0.01, buffer_len: 98071, episode_length: 73
[NOR] Episode: 850, avg reward: -410.478011782, e: 0.1, learning_rate: 0.01, buffer_len: 99114, episode_length: 83
[NOR] Episode: 860, avg reward: -400.738796184, e: 0.1, learning_rate: 0.01, buffer_len: 100395, episode_length: 124
[NOR] Episode: 870, avg reward: -359.491005763, e: 0.1, learning_rate: 0.01, buffer_len: 101469, episode_length: 109
[NOR] Episode: 880, avg reward: -430.46278473, e: 0.1, learning_rate: 0.01, buffer_len: 102779, episode_length: 107
[NOR] Episode: 890, avg reward: -331.56321129, e: 0.1, learning_rate: 0.01, buffer_len: 103882, episode_length: 105
[NOR] Episode: 900, avg reward: -349.542762717, e: 0.1, learning_rate: 0.01, buffer_len: 104878, episode_length: 97
[NOR] Episode: 910, avg reward: -379.174235146, e: 0.1, learning_rate: 0.01, buffer_len: 105845, episode_length: 76
[NOR] Episode: 920, avg reward: -460.269071114, e: 0.1, learning_rate: 0.01, buffer_len: 107098, episode_length: 91
[NOR] Episode: 930, avg reward: -486.978642861, e: 0.1, learning_rate: 0.01, buffer_len: 108361, episode_length: 84
[NOR] Episode: 940, avg reward: -436.92901612, e: 0.1, learning_rate: 0.01, buffer_len: 109536, episode_length: 84
[NOR] Episode: 950, avg reward: -377.592918855, e: 0.1, learning_rate: 0.01, buffer_len: 110961, episode_length: 198
[NOR] Episode: 960, avg reward: -442.299033272, e: 0.1, learning_rate: 0.01, buffer_len: 112132, episode_length: 137
[NOR] Episode: 970, avg reward: -442.026512399, e: 0.1, learning_rate: 0.01, buffer_len: 113489, episode_length: 83
[NOR] Episode: 980, avg reward: -409.516295262, e: 0.1, learning_rate: 0.01, buffer_len: 114773, episode_length: 175
[NOR] Episode: 990, avg reward: -428.829660688, e: 0.1, learning_rate: 0.01, buffer_len: 116211, episode_length: 102
[NOR] Episode: 1000, avg reward: -365.484269654, e: 0.1, learning_rate: 0.01, buffer_len: 117403, episode_length: 93
[NOR] Episode: 1010, avg reward: -498.174404376, e: 0.1, learning_rate: 0.01, buffer_len: 118545, episode_length: 71
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-27-06d27bfb4fbb> in <module>()
      5     max_episode_length = 2000,
      6     learning_rate = 0.01, #lambda t: max(0.001, k / (k + t)),
----> 7     e = 0.1 #interp1d([0, 4000], [1, 0.05], fill_value=0.05, bounds_error=False)
      8 )

/home/cristian/data/cristian/tfinterface/tfinterface/reinforcement/dnq.pyc in fit(self, env, learning_rate, e, print_step, episodes, max_episode_length, discount, batch_size)
    581 
    582                 S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
--> 583                 MaxQs1 = self.sess.run(self.target_network.max_Qs, feed_dict={self.inputs.s: S1})
    584 
    585                 feed_dict = self.fit_feed(S, A, R, MaxQs1, Done, _learning_rate)

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
    765     try:
    766       result = self._run(None, fetches, feed_dict, options_ptr,
--> 767                          run_metadata_ptr)
    768       if run_metadata:
    769         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
    963     if final_fetches or final_targets:
    964       results = self._do_run(handle, final_targets, final_fetches,
--> 965                              feed_dict_string, options, run_metadata)
    966     else:
    967       results = []

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1013     if handle is None:
   1014       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1015                            target_list, options, run_metadata)
   1016     else:
   1017       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_call(self, fn, *args)
   1020   def _do_call(self, fn, *args):
   1021     try:
-> 1022       return fn(*args)
   1023     except errors.OpError as e:
   1024       message = compat.as_text(e.message)

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1002         return tf_session.TF_Run(session, options,
   1003                                  feed_dict, fetch_list, target_list,
-> 1004                                  status, run_metadata)
   1005 
   1006     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

In [ ]:
import time

model_run = DQN(
    n_actions, n_states,
    model_path = model_path + ".max",
    flush_secs = 3.0,
    restore = True
)



s = env.reset()
done = False

while not done:
    a = model_run.choose_action(s, e=0.2)
    s, r, done, info = env.step(a)
    env.render()
    time.sleep(0.01)

In [ ]: