In [1]:
import numpy as np
import gym
from numpy.random import choice
import random
from tensorbuilder.api import *
import tensorflow as tf

env = gym.make("CartPole-v1")


[2017-01-29 01:54:24,928] Making new env: CartPole-v1

In [17]:
def select_columns(tensor, indexes):
    # For each row i of `tensor`, take the element at column `indexes[i]`
    # (used below to pick out the probability of the action actually taken).
    idx = tf.stack((tf.range(tf.shape(indexes)[0]), indexes), 1)
    return tf.gather_nd(tensor, idx)

def discount(rewards, y):
    # Turn a list of per-step rewards into discounted returns:
    # gains[t] = rewards[t] + y * gains[t + 1]
    r_accum = 0.0
    gains = []
    for r in reversed(list(rewards)):
        r_accum = r + y * r_accum
        gains.insert(0, r_accum)

    return gains


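A quick sanity check of these helpers (a hypothetical cell, not part of the original run): discount turns per-step rewards into discounted returns, and select_columns picks, for each row of a batch, the entry at the given column index (here, the probability of the action that was taken).

In [ ]:
print(discount([1.0, 1.0, 1.0], 0.9))  # -> approximately [2.71, 1.9, 1.0]

with tf.Session() as _sess:
    probs = tf.constant([[0.2, 0.8], [0.6, 0.4]])
    acts = tf.constant([1, 0])
    print(_sess.run(select_columns(probs, acts)))  # -> approximately [0.8, 0.6]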

In [55]:
model_name = "policy-gradient-cartpole.model"
model_path = "/models/" + model_name
n_actions = env.action_space.n
n_states_env = env.observation_space.shape[0]
n_states = n_states_env * 3  # the policy sees the last 3 observations stacked together

class Model(object):
    
    def __init__(self, y, restore=False):
        
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        
        with self.graph.as_default():
            with tf.device("cpu:0"):
                s = tf.placeholder(tf.float32, [None, n_states], name='s')  # stacked observations
                a = tf.placeholder(tf.int32, [None], name='a')              # actions taken
                r = tf.placeholder(tf.float32, [None], name='r')            # discounted returns
                lr = tf.placeholder(tf.float32, [], name='lr')              # learning rate
                
                trainer = tf.train.GradientDescentOptimizer(lr)

                ops = dict(
                    trainable=True,
                    weights_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                    biases_initializer=None,
                )

                with tf.variable_scope("actor"):
                    Ps = Pipe(
                        s,
                        T
                        .relu_layer(16, **ops)
                        .softmax_layer(n_actions, scope='softmax_layer', **ops)
                    )
                Psws = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "actor")

                # probability the policy assigned to the action that was actually taken
                Psa = select_columns(Ps, a)

                # learned scalar baseline, subtracted from the returns to reduce variance
                base = tf.Variable(0.0)

                error = r - base

                # REINFORCE loss: -sum(log pi(a|s) * (return - baseline)), with clipped gradients
                loss = -tf.reduce_sum(tf.log(Psa) * error)
                gradients = trainer.compute_gradients(loss, var_list=Psws)
                gradients = [ (tf.clip_by_value(g, -1.0, 1.0), w) for g, w in gradients ]
                update = trainer.apply_gradients(gradients)

                # fit the baseline by minimizing the squared error against the returns
                loss_base = Pipe(error, tf.nn.l2_loss, tf.reduce_sum)
                gradients = trainer.compute_gradients(loss_base, var_list=[base])
                gradients = [ (tf.clip_by_value(g, -1.0, 1.0), w) for g, w in gradients ]
                update_base = trainer.apply_gradients(gradients)

                self.writer = tf.summary.FileWriter('/logs/' +  model_name)
                self.saver = tf.train.Saver()
                
                self.variables_initializer = tf.global_variables_initializer()

            if restore:
                self.saver.restore(self.sess, model_path)
            else:
                self.sess.run(self.variables_initializer)

        self.s = s; self.a = a; self.r = r;
        self.Ps = Ps; self.Psa = Psa; self.update = update; self.update_base = update_base
        self.lr = lr
                
    def next_action(self, state):
        actions = self.sess.run(self.Ps, feed_dict={self.s: [state]})[0]
        n = len(actions)

        return choice(n, p=actions)

    def train(self, s, a, r, s1, lr):
        #train
        self.train_offline([s], [a], [r], [s1], lr)
        
    def train_offline(self, S, A, R, S1, lr):
        #train
        self.sess.run(self.update, feed_dict={
            self.s: S, self.a: A, self.r: R, 
            self.lr: lr
        })
        
        self.sess.run(self.update_base, feed_dict={
            self.s: S, self.a: A, self.r: R, 
            self.lr: lr
        })

    def save(self, model_path):
        self.saver.save(self.sess, model_path)

    def restore(self, model_path):
        self.sess.close()
        self.sess = tf.Session(graph=self.graph)
        self.saver.restore(self.sess, model_path)

    @staticmethod
    def learning_rate(t, b, k):
        return b * k / (k + t)
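
The schedule learning_rate(t, b, k) = b * k / (k + t) decays the step size hyperbolically with the episode index t. A quick check (an illustrative cell, not executed in the original run) reproduces the "lr:" values printed every 50 episodes in the training log further down:

In [ ]:
for t in (0, 50, 100):
    print(Model.learning_rate(t, 0.1, 2000.0))  # 0.1, 0.09756..., 0.09523...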

In [56]:
y = 0.98
b = 0.1
k = 2000.0

model = Model(y, restore=False)

r_total = 0.0
max_r = 0.0

for t in range(200000):
    lr = model.learning_rate(t, b, k)
    s = env.reset()
    s = np.hstack((s, s, s))  # initialize the 3-observation stack with the first observation repeated
    
    S = []; A = []; R = []; S1 = []
    
    
    for j in range(10000):
        #next action
        a = model.next_action(s)

        #take step
        s1, r, done, info = env.step(a)

        # shift the observation window: drop the oldest frame, append the new one
        s1 = np.hstack((s[n_states_env:], s1))
        
        r_total += r
        
        #append values
        S.append(s); A.append(a); R.append(r); S1.append(s1)
        
        #update state
        s = s1
        
        if done: break
        
    R = discount(R, y)
        
    #train
    model.train_offline(S, A, R, S1, lr)

    save_period = 50
    if t % save_period == 0:
        print r_total / save_period, ", lr:", lr
        r_total = 0
        model.save(model_path)


0.68 , lr: 0.1
17.86 , lr: 0.0975609756098
10.54 , lr: 0.0952380952381
11.12 , lr: 0.093023255814
17.12 , lr: 0.0909090909091
18.26 , lr: 0.0888888888889
12.5 , lr: 0.0869565217391
9.7 , lr: 0.0851063829787
10.28 , lr: 0.0833333333333
9.84 , lr: 0.0816326530612
9.88 , lr: 0.08
10.76 , lr: 0.078431372549
11.14 , lr: 0.0769230769231
15.68 , lr: 0.0754716981132
44.62 , lr: 0.0740740740741
27.0 , lr: 0.0727272727273
15.12 , lr: 0.0714285714286
9.68 , lr: 0.0701754385965
9.9 , lr: 0.0689655172414
9.62 , lr: 0.0677966101695
9.5 , lr: 0.0666666666667
9.82 , lr: 0.0655737704918
9.56 , lr: 0.0645161290323
9.78 , lr: 0.0634920634921
9.68 , lr: 0.0625
9.78 , lr: 0.0615384615385
11.72 , lr: 0.0606060606061
22.4 , lr: 0.0597014925373
27.36 , lr: 0.0588235294118
28.64 , lr: 0.0579710144928
25.68 , lr: 0.0571428571429
26.62 , lr: 0.056338028169
26.4 , lr: 0.0555555555556
29.24 , lr: 0.0547945205479
39.92 , lr: 0.0540540540541
42.7 , lr: 0.0533333333333
40.84 , lr: 0.0526315789474
39.62 , lr: 0.0519480519481
40.32 , lr: 0.0512820512821
41.0 , lr: 0.0506329113924
41.54 , lr: 0.05
41.34 , lr: 0.0493827160494
44.26 , lr: 0.0487804878049
37.88 , lr: 0.0481927710843
41.04 , lr: 0.047619047619
39.56 , lr: 0.0470588235294
42.08 , lr: 0.046511627907
37.32 , lr: 0.0459770114943
38.02 , lr: 0.0454545454545
40.24 , lr: 0.0449438202247
35.1 , lr: 0.0444444444444
40.3 , lr: 0.043956043956
36.74 , lr: 0.0434782608696
34.54 , lr: 0.0430107526882
38.32 , lr: 0.0425531914894
40.32 , lr: 0.0421052631579
32.54 , lr: 0.0416666666667
36.44 , lr: 0.0412371134021
47.32 , lr: 0.0408163265306
52.36 , lr: 0.040404040404
52.38 , lr: 0.04
52.18 , lr: 0.039603960396
43.68 , lr: 0.0392156862745
60.64 , lr: 0.0388349514563
46.24 , lr: 0.0384615384615
50.78 , lr: 0.0380952380952
87.3 , lr: 0.0377358490566
74.76 , lr: 0.0373831775701
68.16 , lr: 0.037037037037
74.68 , lr: 0.0366972477064
54.7 , lr: 0.0363636363636
56.2 , lr: 0.036036036036
89.48 , lr: 0.0357142857143
88.36 , lr: 0.0353982300885
139.68 , lr: 0.0350877192982
304.22 , lr: 0.0347826086957
270.7 , lr: 0.0344827586207
294.86 , lr: 0.034188034188
1951.04 , lr: 0.0338983050847
3749.08 , lr: 0.0336134453782
6700.78 , lr: 0.0333333333333
8032.06 , lr: 0.0330578512397
2111.2 , lr: 0.0327868852459
2062.28 , lr: 0.0325203252033
2978.16 , lr: 0.0322580645161
7493.94 , lr: 0.032

KeyboardInterruptTraceback (most recent call last)
<ipython-input-56-f0e3c412df23> in <module>()
     18     for j in range(10000):
     19         #next action
---> 20         a = model.next_action(s)
     21 
     22         #take step

<ipython-input-55-b028503498ae> in next_action(self, state)
     65 
     66     def next_action(self, state):
---> 67         actions = self.sess.run(self.Ps, feed_dict={self.s: [state]})[0]
     68         n = len(actions)
     69 

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
    764     try:
    765       result = self._run(None, fetches, feed_dict, options_ptr,
--> 766                          run_metadata_ptr)
    767       if run_metadata:
    768         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
    911     # Validate and process feed_dict.
    912     if feed_dict:
--> 913       feed_dict = nest.flatten_dict_items(feed_dict)
    914       for feed, feed_val in feed_dict.items():
    915         for subfeed, subfeed_val in _feed_fn(feed, feed_val):

/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/nest.pyc in flatten_dict_items(dictionary)
    171     raise TypeError("input must be a dictionary")
    172   flat_dictionary = {}
--> 173   for i, v in six.iteritems(dictionary):
    174     if not is_sequence(i):
    175       if i in flat_dictionary:

/usr/local/lib/python2.7/dist-packages/six.pyc in iteritems(d, **kw)
    597 
    598     def iteritems(d, **kw):
--> 599         return d.iteritems(**kw)
    600 
    601     def iterlists(d, **kw):

KeyboardInterrupt: 
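
Training was stopped with a keyboard interrupt. Because the loop saves a checkpoint every 50 episodes, the latest weights can be reloaded rather than retraining from scratch (a minimal sketch, assuming the checkpoint under model_path still exists):

In [ ]:
model = Model(y, restore=True)  # rebuild the graph and restore the last saved checkpoint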

In [42]:
tf.get_variable?

In [13]:
s = env.reset()
s = np.hstack((s,s,s))
    
for i in range(100):
    a = model.next_action(s)
    s1, r, done, info = env.step(a)
    s = np.hstack((s[n_states_env:], s1))
    env.render()
    print("")

    if done:
        print(r)
        break


SFFF
FHFH
FFFH
HFFG
  (Left)

[... the same 4x4 FrozenLake grid is re-rendered after each step; the full action sequence is:
Left x4, Down, Left, Up, Down, Up, Left, Up, Left x4, Up, Down, Right x3, Down, Left, Down ...]

1.0