In [1]:
import numpy as np
import gym
from numpy.random import choice
import random
from tensorbuilder.api import *
import tensorflow as tf

env = gym.make("FrozenLake-v0")


[2017-01-29 01:11:44,406] Making new env: FrozenLake-v0

In [2]:
def select_columns(tensor, indexes):
    # for each row i of `tensor`, pick the element at column indexes[i]
    idx = tf.stack((tf.range(tf.shape(indexes)[0]), indexes), 1)
    return tf.gather_nd(tensor, idx)

def discount(rewards, y):
    # convert a reward sequence into discounted returns, accumulating backwards
    r_accum = 0.0
    gains = []
    for r in reversed(list(rewards)):
        r_accum = r + y * r_accum
        gains.insert(0, r_accum)

    return gains
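
For intuition: discount turns a raw reward sequence into discounted returns by accumulating from the end of the episode backwards, and select_columns picks, for each row of a batch, the entry at the column given by the corresponding index. A quick sanity check (not part of the original run) might look like this:

# hypothetical sanity check for the two helpers above
print(discount([0.0, 0.0, 1.0], 0.95))   # -> roughly [0.9025, 0.95, 1.0]

with tf.Session() as _sess:
    _probs = tf.constant([[0.1, 0.9], [0.7, 0.3]])
    _idx = tf.constant([1, 0])
    # picks _probs[0, 1] and _probs[1, 0]
    print(_sess.run(select_columns(_probs, _idx)))   # -> [0.9, 0.7]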

In [11]:
model_name = "policy-gradient.model"
model_path = "/models/" + model_name
n_actions = env.action_space.n
n_states = env.observation_space.n

class Model(object):
    
    def __init__(self, y, restore=False):
        
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        
        with self.graph.as_default():
            with tf.device("cpu:0"):
                s = tf.placeholder(tf.int32, [None], name='s')
                a = tf.placeholder(tf.int32, [None], name='a')
                r = tf.placeholder(tf.float32, [None], name='r')
                lr = tf.placeholder(tf.float32, [], name='lr')

                ops = dict(
                    trainable=True,
                    weights_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                    biases_initializer=None
                )


                Ps = Pipe(
                    s,
                    T
                    .one_hot(n_states)
                    .softmax_layer(n_actions, scope='softmax_layer', **ops)
                )

                # probability the policy assigned to the action actually taken
                Psa = select_columns(Ps, a)

                # REINFORCE loss: negative log-likelihood weighted by the discounted return
                loss = -tf.reduce_sum(tf.log(Psa) * r)
                update = tf.train.GradientDescentOptimizer(lr).minimize(loss)

                self.writer = tf.summary.FileWriter('/logs/' +  model_name)
                self.saver = tf.train.Saver()
                
                self.variables_initializer = tf.global_variables_initializer()

            # either restore a saved model or initialize fresh variables
            if restore:
                self.saver.restore(self.sess, model_path)
            else:
                self.sess.run(self.variables_initializer)

        self.s = s; self.a = a; self.r = r
        self.Ps = Ps; self.Psa = Psa; self.update = update
        self.lr = lr
                
    def next_action(self, state):
        # sample an action from the policy's probability distribution for this state
        actions = self.sess.run(self.Ps, feed_dict={self.s: [state]})[0]
        n = len(actions)

        return choice(n, p=actions)

    def train(self, s, a, r, s1, lr):
        # single-transition convenience wrapper around train_offline
        self.train_offline([s], [a], [r], [s1], lr)

    def train_offline(self, S, A, R, S1, lr):
        # one gradient step over a batch of (state, action, return) triples
        self.sess.run(self.update, feed_dict={
            self.s: S, self.a: A, self.r: R,
            self.lr: lr
        })

    def save(self, model_path):
        self.saver.save(self.sess, model_path)

    def restore(self, model_path):
        self.sess.close()
        self.sess = tf.Session(graph=self.graph)
        self.saver.restore(self.sess, model_path)

    @staticmethod
    def learning_rate(t, b, k):
        return b * k / (k + t)
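
The tensorbuilder Pipe inside __init__ is the whole policy network: it one-hot encodes the discrete state and pushes it through a single softmax layer, so the policy is just a softmax over a linear map of the state. As a rough plain-TensorFlow sketch of the same network (the exact layer implementation and variable scoping used by tensorbuilder's softmax_layer may differ):

# illustrative plain-TF equivalent of the Pipe above, not a drop-in replacement
def build_policy(s, n_states, n_actions):
    s_one_hot = tf.one_hot(s, n_states)          # [batch, n_states]
    logits = tf.contrib.layers.fully_connected(
        s_one_hot, n_actions,
        activation_fn=None,
        weights_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
        biases_initializer=None,
        scope='softmax_layer')
    return tf.nn.softmax(logits)                  # [batch, n_actions] action probabilities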

In [12]:
y = 0.95    # discount factor
b = 0.5     # base learning rate
k = 2000.0  # learning-rate decay constant
e = 0.05    # exploration epsilon (unused here: the policy itself is stochastic)

model = Model(y, restore=False)

r_total = 0.0

for t in range(200000):
    lr = model.learning_rate(t, b, k)
    s = env.reset()
    
    S = []; A = []; R = []; S1 = []
    
    done = False
    while not done:
        #next action
        a = model.next_action(s)

        #take step
        s1, r, done, info = env.step(a)
        r_total += r
        
        #append values
        S.append(s); A.append(a); R.append(r); S1.append(s1)
        
        #update state
        s = s1
        
    R = discount(R, y)
        
    #train
    model.train_offline(S, A, R, S1, lr)

    if t % 500 == 0:
        print r_total, "of", 500, ", lr:", lr
        r_total = 0
        model.save(model_path)


0.0 of 500 , lr: 0.5
8.0 of 500 , lr: 0.4
8.0 of 500 , lr: 0.333333333333
14.0 of 500 , lr: 0.285714285714
12.0 of 500 , lr: 0.25
19.0 of 500 , lr: 0.222222222222
26.0 of 500 , lr: 0.2
32.0 of 500 , lr: 0.181818181818
36.0 of 500 , lr: 0.166666666667
71.0 of 500 , lr: 0.153846153846
95.0 of 500 , lr: 0.142857142857
109.0 of 500 , lr: 0.133333333333
153.0 of 500 , lr: 0.125
184.0 of 500 , lr: 0.117647058824
208.0 of 500 , lr: 0.111111111111
261.0 of 500 , lr: 0.105263157895
311.0 of 500 , lr: 0.1
332.0 of 500 , lr: 0.0952380952381
359.0 of 500 , lr: 0.0909090909091
352.0 of 500 , lr: 0.0869565217391
369.0 of 500 , lr: 0.0833333333333
353.0 of 500 , lr: 0.08
369.0 of 500 , lr: 0.0769230769231
368.0 of 500 , lr: 0.0740740740741
357.0 of 500 , lr: 0.0714285714286
361.0 of 500 , lr: 0.0689655172414
375.0 of 500 , lr: 0.0666666666667
359.0 of 500 , lr: 0.0645161290323
379.0 of 500 , lr: 0.0625
366.0 of 500 , lr: 0.0606060606061

KeyboardInterruptTraceback (most recent call last)
<ipython-input-12-3c88868f2b63> in <module>()
     17     while not done:
     18         #next action
---> 19         a = model.next_action(s)
     20 
     21         #take step

<ipython-input-11-106bfd9fce64> in next_action(self, state)
     50 
     51     def next_action(self, state):
---> 52         actions = self.sess.run(self.Ps, feed_dict={self.s: [state]})[0]
     53         n = len(actions)
     54 

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
    764     try:
    765       result = self._run(None, fetches, feed_dict, options_ptr,
--> 766                          run_metadata_ptr)
    767       if run_metadata:
    768         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
    962     if final_fetches or final_targets:
    963       results = self._do_run(handle, final_targets, final_fetches,
--> 964                              feed_dict_string, options, run_metadata)
    965     else:
    966       results = []

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1012     if handle is None:
   1013       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1014                            target_list, options, run_metadata)
   1015     else:
   1016       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_call(self, fn, *args)
   1019   def _do_call(self, fn, *args):
   1020     try:
-> 1021       return fn(*args)
   1022     except errors.OpError as e:
   1023       message = compat.as_text(e.message)

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1001         return tf_session.TF_Run(session, options,
   1002                                  feed_dict, fetch_list, target_list,
-> 1003                                  status, run_metadata)
   1004 
   1005     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 
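
Each iteration of the loop above is one episode of REINFORCE (Monte Carlo policy gradient): roll out a full episode by sampling from the current policy, convert the rewards to discounted returns, then take a single gradient step on the whole trajectory with a learning rate decayed as b * k / (k + t). The counter printed every 500 episodes climbs from 0 to roughly 370 successes per 500 before the run is interrupted by hand. To measure the success rate directly, a small helper along these lines (not in the original notebook) could be used:

# hypothetical evaluation helper: fraction of episodes that reach the goal
def success_rate(model, env, episodes=1000):
    wins = 0.0
    for _ in range(episodes):
        s = env.reset()
        done = False
        while not done:
            s, r, done, _ = env.step(model.next_action(s))
        wins += r   # FrozenLake yields reward 1.0 only when the goal is reached
    return wins / episodes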

In [13]:
# roll out one episode with the learned policy, rendering each step
s = env.reset()

for i in range(100):
    a = model.next_action(s)
    s, r, done, info = env.step(a)
    env.render()
    print("")

    if done:
        print(r)
        break


SFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
FFFH
HFFG
  (Down)

SFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
FFFH
HFFG
  (Up)

SFFF
FHFH
FFFH
HFFG
  (Down)

SFFF
FHFH
FFFH
HFFG
  (Up)

SFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
FFFH
HFFG
  (Up)

SFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
FFFH
HFFG
  (Up)

SFFF
FHFH
FFFH
HFFG
  (Down)

SFFF
FHFH
FFFH
HFFG
  (Right)

SFFF
FHFH
FFFH
HFFG
  (Right)

SFFF
FHFH
FFFH
HFFG
  (Right)

SFFF
FHFH
FFFH
HFFG
  (Down)

SFFF
FHFH
FFFH
HFFG
  (Left)

SFFF
FHFH
FFFH
HFFG
  (Down)

1.0