In [12]:
import numpy as np
import gym
from numpy.random import choice
import random
from phi.api import *
import tensorflow as tf
from tfinterface.reinforcement import DeepActorCritic, ExpandedStateEnv
from tfinterface.interfaces import EnvironmentInterface
from tfinterface.model_base import ModelBase
from tensorflow.python import debug as tf_debug
import os
from scipy.interpolate import interp1d
import numbers


def get_run():
    # Read the previous run number from run.txt; default to -1 if the file
    # is missing or does not contain an integer.
    try:
        with open("run.txt") as f:
            run = int(f.read().split("\n")[0])
    except (IOError, ValueError):
        run = -1

    # Increment and persist the new run number.
    run += 1
    with open("run.txt", "w") as f:
        f.write(str(run))

    return run

In [30]:
run = get_run()
env = ExpandedStateEnv("CartPole-v1", 3)
print(env.action_space)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path = os.getcwd() + "/actor-critic.model"
logs_path = "logs/run{}".format(run)

print("Run: {}".format(run))


[2017-03-04 08:38:02,318] Making new env: CartPole-v1
True
Discrete(2)
Run: 202

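ExpandedStateEnv wraps the raw gym environment so that each state fed to the agent covers the last 3 raw observations, which is why n_states is observation_space.shape[0] * 3. A minimal sketch of such a wrapper, assuming simple concatenation of the most recent observations (the actual tfinterface implementation may differ), is the hypothetical StackedStateEnv below:

import numpy as np
from collections import deque
import gym

class StackedStateEnv(object):
    """Hypothetical stand-in for ExpandedStateEnv: keeps the last `n` raw
    observations and returns them concatenated as one state vector."""
    def __init__(self, env_name, n):
        self.env = gym.make(env_name)
        self.n = n
        self.frames = deque(maxlen=n)
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

    def reset(self):
        s = self.env.reset()
        # Pad the history with the first observation so the state has full size.
        for _ in range(self.n):
            self.frames.append(s)
        return np.concatenate(list(self.frames))

    def step(self, action):
        s, r, done, info = self.env.step(action)
        self.frames.append(s)  # deque drops the oldest observation automatically
        return np.concatenate(list(self.frames)), r, done, info

    def render(self):
        return self.env.render()
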
In [31]:
model = DeepActorCritic(
    n_actions, n_states, y=0.99999, 
    buffer_length=1000000, pi=0.02,
    model_path = model_path,
    logs_path = logs_path
)

In [32]:
k = 40000.
model.fit(
    env, keep_prob=0.5,print_step=10, 
    episodes=int(1e5), max_episode_length=200e3, batch_size=32,
    learning_rate = 0.01 # lambda t: 0.05 * k / (k + t)
)


[MAX] Episode: 0, Length: 24, Reward: 24.0, buffer_len: 24
[MAX] Episode: 2, Length: 33, Reward: 33.0, buffer_len: 80
[MAX] Episode: 3, Length: 55, Reward: 55.0, buffer_len: 135
[MAX] Episode: 6, Length: 55, Reward: 55.0, buffer_len: 265
[NOR] Episode: 10, Length: 31, Avg Reward: 36.3, Learning Rate: 0.01, buffer_len: 363
Loss: 0.467469424009
[MAX] Episode: 16, Length: 58, Reward: 58.0, buffer_len: 578
[MAX] Episode: 20, Length: 59, Reward: 59.0, buffer_len: 759
[NOR] Episode: 20, Length: 59, Avg Reward: 39.6, Learning Rate: 0.01, buffer_len: 759
Loss: 0.493125587702
[MAX] Episode: 26, Length: 98, Reward: 98.0, buffer_len: 1029
[NOR] Episode: 30, Length: 45, Avg Reward: 38.9, Learning Rate: 0.01, buffer_len: 1148
Loss: 0.613241314888
[NOR] Episode: 40, Length: 26, Avg Reward: 30.1, Learning Rate: 0.01, buffer_len: 1449
Loss: 0.518278598785
[NOR] Episode: 50, Length: 16, Avg Reward: 35.3, Learning Rate: 0.01, buffer_len: 1802
Loss: 0.616812705994
[NOR] Episode: 60, Length: 37, Avg Reward: 30.8, Learning Rate: 0.01, buffer_len: 2110
Loss: 0.631880402565
[NOR] Episode: 70, Length: 21, Avg Reward: 32.8, Learning Rate: 0.01, buffer_len: 2438
Loss: 0.491368293762
[NOR] Episode: 80, Length: 14, Avg Reward: 31.6, Learning Rate: 0.01, buffer_len: 2754
Loss: 0.558840036392
[NOR] Episode: 90, Length: 40, Avg Reward: 31.0, Learning Rate: 0.01, buffer_len: 3064
Loss: 0.475624889135
[NOR] Episode: 100, Length: 23, Avg Reward: 24.2, Learning Rate: 0.01, buffer_len: 3306
Loss: 0.64003443718
[NOR] Episode: 110, Length: 14, Avg Reward: 21.3, Learning Rate: 0.01, buffer_len: 3519
Loss: 0.513172268867
[NOR] Episode: 120, Length: 19, Avg Reward: 24.8, Learning Rate: 0.01, buffer_len: 3767
Loss: 0.639866828918
[NOR] Episode: 130, Length: 48, Avg Reward: 44.3, Learning Rate: 0.01, buffer_len: 4210
Loss: 0.583773493767
[NOR] Episode: 140, Length: 31, Avg Reward: 27.5, Learning Rate: 0.01, buffer_len: 4485
Loss: 0.624247670174
[NOR] Episode: 150, Length: 28, Avg Reward: 22.7, Learning Rate: 0.01, buffer_len: 4712
Loss: 0.61384832859
[NOR] Episode: 160, Length: 36, Avg Reward: 23.8, Learning Rate: 0.01, buffer_len: 4950
Loss: 0.52715575695
[NOR] Episode: 170, Length: 54, Avg Reward: 37.3, Learning Rate: 0.01, buffer_len: 5323
Loss: 0.498015910387
[NOR] Episode: 180, Length: 30, Avg Reward: 27.0, Learning Rate: 0.01, buffer_len: 5593
Loss: 0.669825553894
[NOR] Episode: 190, Length: 24, Avg Reward: 29.1, Learning Rate: 0.01, buffer_len: 5884
Loss: 0.652291178703
[NOR] Episode: 200, Length: 12, Avg Reward: 25.4, Learning Rate: 0.01, buffer_len: 6138
Loss: 0.642940580845
[NOR] Episode: 210, Length: 53, Avg Reward: 27.5, Learning Rate: 0.01, buffer_len: 6413
Loss: 0.480633467436
[NOR] Episode: 220, Length: 34, Avg Reward: 23.5, Learning Rate: 0.01, buffer_len: 6648
Loss: 0.673851668835
[NOR] Episode: 230, Length: 34, Avg Reward: 35.9, Learning Rate: 0.01, buffer_len: 7007
Loss: 0.690881490707
[NOR] Episode: 240, Length: 92, Avg Reward: 38.8, Learning Rate: 0.01, buffer_len: 7395
Loss: 0.602312326431
[NOR] Episode: 250, Length: 25, Avg Reward: 35.7, Learning Rate: 0.01, buffer_len: 7752
Loss: 0.609916567802
[NOR] Episode: 260, Length: 16, Avg Reward: 29.0, Learning Rate: 0.01, buffer_len: 8042
Loss: 0.567219614983
[NOR] Episode: 270, Length: 50, Avg Reward: 39.4, Learning Rate: 0.01, buffer_len: 8436
Loss: 0.694430589676
[NOR] Episode: 280, Length: 46, Avg Reward: 40.0, Learning Rate: 0.01, buffer_len: 8836
Loss: 0.215962231159
[MAX] Episode: 284, Length: 149, Reward: 149.0, buffer_len: 9049
[NOR] Episode: 290, Length: 38, Avg Reward: 45.9, Learning Rate: 0.01, buffer_len: 9295
Loss: 0.47364115715
[NOR] Episode: 300, Length: 68, Avg Reward: 42.6, Learning Rate: 0.01, buffer_len: 9721
Loss: 0.6573741436
[NOR] Episode: 310, Length: 34, Avg Reward: 30.9, Learning Rate: 0.01, buffer_len: 10030
Loss: 0.5191385746
[NOR] Episode: 320, Length: 32, Avg Reward: 54.4, Learning Rate: 0.01, buffer_len: 10574
Loss: 0.520025372505
[NOR] Episode: 330, Length: 33, Avg Reward: 41.7, Learning Rate: 0.01, buffer_len: 10991
Loss: 0.767980992794
[NOR] Episode: 340, Length: 39, Avg Reward: 54.8, Learning Rate: 0.01, buffer_len: 11539
Loss: 0.593125343323
[MAX] Episode: 347, Length: 149, Reward: 149.0, buffer_len: 11992
[MAX] Episode: 348, Length: 158, Reward: 158.0, buffer_len: 12150
[NOR] Episode: 350, Length: 18, Avg Reward: 73.4, Learning Rate: 0.01, buffer_len: 12273
Loss: 0.321682721376
[NOR] Episode: 360, Length: 107, Avg Reward: 51.6, Learning Rate: 0.01, buffer_len: 12789
Loss: 0.318740606308
[MAX] Episode: 363, Length: 167, Reward: 167.0, buffer_len: 13037
[NOR] Episode: 370, Length: 87, Avg Reward: 71.7, Learning Rate: 0.01, buffer_len: 13506
Loss: 0.433030188084
[MAX] Episode: 378, Length: 362, Reward: 362.0, buffer_len: 14270
[NOR] Episode: 380, Length: 142, Avg Reward: 105.8, Learning Rate: 0.01, buffer_len: 14564
Loss: 0.120967656374
[NOR] Episode: 390, Length: 142, Avg Reward: 123.0, Learning Rate: 0.01, buffer_len: 15794
Loss: 0.702686190605
[MAX] Episode: 399, Length: 536, Reward: 536.0, buffer_len: 17684
[NOR] Episode: 400, Length: 345, Avg Reward: 223.5, Learning Rate: 0.01, buffer_len: 18029
Loss: -0.381382107735
[MAX] Episode: 401, Length: 607, Reward: 607.0, buffer_len: 18636
[MAX] Episode: 406, Length: 684, Reward: 684.0, buffer_len: 20763
[NOR] Episode: 410, Length: 302, Avg Reward: 405.5, Learning Rate: 0.01, buffer_len: 22084
Loss: -0.515115559101
[NOR] Episode: 420, Length: 190, Avg Reward: 243.1, Learning Rate: 0.01, buffer_len: 24515
Loss: -1.79723978043
[NOR] Episode: 430, Length: 226, Avg Reward: 259.0, Learning Rate: 0.01, buffer_len: 27105
Loss: -2.16159963608
[NOR] Episode: 440, Length: 269, Avg Reward: 247.1, Learning Rate: 0.01, buffer_len: 29576
Loss: -3.03325772285
[NOR] Episode: 450, Length: 239, Avg Reward: 232.3, Learning Rate: 0.01, buffer_len: 31899
Loss: -2.008664608
[NOR] Episode: 460, Length: 140, Avg Reward: 260.8, Learning Rate: 0.01, buffer_len: 34507
Loss: -2.14649581909
[NOR] Episode: 470, Length: 351, Avg Reward: 279.2, Learning Rate: 0.01, buffer_len: 37299
Loss: 1.52481806278
[NOR] Episode: 480, Length: 453, Avg Reward: 351.6, Learning Rate: 0.01, buffer_len: 40815
Loss: -3.06407260895
[MAX] Episode: 486, Length: 707, Reward: 707.0, buffer_len: 43137
[NOR] Episode: 490, Length: 316, Avg Reward: 380.6, Learning Rate: 0.01, buffer_len: 44621
Loss: -1.65523827076
[NOR] Episode: 500, Length: 303, Avg Reward: 380.1, Learning Rate: 0.01, buffer_len: 48422
Loss: -2.06640315056
[MAX] Episode: 509, Length: 772, Reward: 772.0, buffer_len: 53133
[NOR] Episode: 510, Length: 377, Avg Reward: 508.8, Learning Rate: 0.01, buffer_len: 53510
Loss: -2.52667427063
[MAX] Episode: 516, Length: 930, Reward: 930.0, buffer_len: 56895
[NOR] Episode: 520, Length: 440, Avg Reward: 553.3, Learning Rate: 0.01, buffer_len: 59043
Loss: -1.7752764225
[MAX] Episode: 528, Length: 1619, Reward: 1619.0, buffer_len: 64273
[MAX] Episode: 529, Length: 1784, Reward: 1784.0, buffer_len: 66057
[NOR] Episode: 530, Length: 644, Avg Reward: 765.8, Learning Rate: 0.01, buffer_len: 66701
Loss: -4.25634002686
[MAX] Episode: 533, Length: 2064, Reward: 2064.0, buffer_len: 69836
[NOR] Episode: 540, Length: 634, Avg Reward: 803.1, Learning Rate: 0.01, buffer_len: 74732
Loss: -4.90438556671
[NOR] Episode: 550, Length: 769, Avg Reward: 910.1, Learning Rate: 0.01, buffer_len: 83833
Loss: -6.1105465889
[NOR] Episode: 560, Length: 1162, Avg Reward: 1229.0, Learning Rate: 0.01, buffer_len: 96123
Loss: -2.94537115097
[MAX] Episode: 565, Length: 3669, Reward: 3669.0, buffer_len: 104622
[MAX] Episode: 570, Length: 7318, Reward: 7318.0, buffer_len: 119376
[NOR] Episode: 570, Length: 7318, Avg Reward: 2325.3, Learning Rate: 0.01, buffer_len: 119376
Loss: -7.73536062241
[MAX] Episode: 571, Length: 8144, Reward: 8144.0, buffer_len: 127520
[MAX] Episode: 576, Length: 200001, Reward: 200001.0, buffer_len: 336621
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-32-bb1ff83461af> in <module>()
      3     env, keep_prob=0.5,print_step=10,
      4     episodes=int(1e5), max_episode_length=200e3, batch_size=32,
----> 5     learning_rate = 0.01 # lambda t: 0.05 * k / (k + t)
      6 )

/home/cristian/data/cristian/tfinterface/tfinterface/reinforcement/deep_actor_critic.py in fit(self, env, keep_prob, learning_rate, print_step, update_target, episodes, max_episode_length, batch_size)
    609                 if self.global_step > 1:
    610                     _, summaries = self.sess.run([self.update, self.summaries], feed_dict=feed_dict)
--> 611                     self.writer.add_summary(summaries)
    612 
    613                 if self.global_step % update_target == 0:

KeyboardInterrupt: 

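The commented-out lambda in the fit call above sketches an inverse-time learning-rate decay: start at 0.05 and decay toward zero as the step count t grows past k = 40000. Whether DeepActorCritic.fit accepts a callable learning_rate is not shown here, so treat that as an assumption; the schedule itself, written out, behaves like this:

def inverse_time_lr(t, lr0=0.05, k=40000.0):
    """lr0 * k / (k + t): equals lr0 at t=0 and lr0/2 at t=k."""
    return lr0 * k / (k + t)

for t in (0, 40000, 200000):
    print("t={}: lr={:.4f}".format(t, inverse_time_lr(t)))  # 0.0500, 0.0250, 0.0083
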
In [35]:
import time

model_run = DeepActorCritic(
    n_actions, n_states,
    model_path = model_path + ".max",
    flush_secs = 3.0,
    restore = True
)
env = ExpandedStateEnv("CartPole-v1", 3)
s = env.reset()
done = False
total = 0
while not done:
    total += 1
    a = model_run.choose_action(s, 1.0, e=0.2)
    s, r, done, info = env.step(a)
    env.render()
    time.sleep(0.01)
    
print(total)


[2017-03-04 08:53:24,278] Making new env: CartPole-v1
True
---------------------------------------------------------------------------
ArgumentError                             Traceback (most recent call last)
<ipython-input-35-675e5e61d077> in <module>()
     15     a = model_run.choose_action(s, 1.0, e=0.2)
     16     s, r, done, info = env.step(a)
---> 17     env.render()
     18     time.sleep(0.01)
     19 

/usr/local/lib/python2.7/dist-packages/gym/core.pyc in render(self, mode, close)
    172             raise error.UnsupportedMode('Unsupported rendering mode: {}. (Supported modes for {}: {})'.format(mode, self, modes))
    173 
--> 174         return self._render(mode=mode, close=close)
    175 
    176     def close(self):

/usr/local/lib/python2.7/dist-packages/gym/envs/classic_control/cartpole.pyc in _render(self, mode, close)
    140         self.poletrans.set_rotation(-x[2])
    141 
--> 142         return self.viewer.render(return_rgb_array = mode=='rgb_array')

/usr/local/lib/python2.7/dist-packages/gym/envs/classic_control/rendering.pyc in render(self, return_rgb_array)
     82         self.window.clear()
     83         self.window.switch_to()
---> 84         self.window.dispatch_events()
     85         self.transform.enable()
     86         for geom in self.geoms:

/usr/local/lib/python2.7/dist-packages/pyglet/window/xlib/__init__.pyc in dispatch_events(self)
    851         # Check for the events specific to this window
    852         while xlib.XCheckWindowEvent(_x_display, _window,
--> 853                                      0x1ffffff, byref(e)):
    854             # Key events are filtered by the xlib window event
    855             # handler so they get a shot at the prefiltered event.

ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type

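The render call fails with an ArgumentError inside pyglet's XCheckWindowEvent, which appears to be an incompatibility between this pyglet build and the local Xlib ctypes bindings rather than a problem with the restored model. A headless variant of the same evaluation loop, reusing the model_run and env objects from the cell above and simply dropping the rendering, still reports the episode length:

s = env.reset()
done = False
total = 0
while not done:
    total += 1
    # Same call as in the cell above; the choose_action signature is taken from there.
    a = model_run.choose_action(s, 1.0, e=0.2)
    s, r, done, info = env.step(a)

print(total)
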
In [ ]: