In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os
import time


name = "actor-critic-base"

In [3]:
class Inputs(object):
    def __init__(self, n_states, scope):
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')

            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')
            self.v1 = tf.placeholder(tf.float32, [None], name='V1')
            self.done = tf.placeholder(tf.float32, [None], name='done')
            
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
            self.training = tf.placeholder(tf.bool, [], name='training')
            
            self.pi = tf.placeholder(tf.float32, [], name='pi')
            

class Critic(object):
    def __init__(self, base_model, inputs, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            self.V = base_model.define_critic_network(inputs, n_actions, n_states)

            # TD(0) target: r for terminal transitions, otherwise r + y * V(s1)
            self.target = soft_if(inputs.done, inputs.r, inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            # tf.nn.moments returns (mean, variance); take the square root to log the std
            avg_error, var_error = tf.nn.moments(self.error, [0])
            std_error = tf.sqrt(var_error)
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram('avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
class Actor(object):
    def __init__(self, base_model, inputs, target_critic, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            self.P = base_model.define_actor_network(inputs, n_actions, n_states)

            # probability the policy assigned to the action that was actually taken
            self.Pa = select_columns(self.P, inputs.a)

            # policy-gradient loss: -log pi(a|s) weighted by the critic's TD error (advantage estimate)
            self.loss = -tf.log(tf.clip_by_value(self.Pa, 1e-3, 1.0)) * target_critic.error
            self.loss = tf.reduce_mean(self.loss)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram('avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
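
The Critic regresses toward the one-step TD target (r when the transition is terminal, r + y * V(s1) otherwise), and the Actor minimizes -log pi(a|s) weighted by the critic's TD error, which acts as an advantage estimate. A minimal numpy sketch of that arithmetic on a made-up batch of two transitions (illustrative only, not tfinterface code):

import numpy as np

r    = np.array([1.0, -0.5])   # rewards (made-up values)
v1   = np.array([0.8,  0.3])   # target critic's V(s1) estimates
done = np.array([0.0,  1.0])   # 1.0 marks a terminal transition
v    = np.array([0.6,  0.2])   # online critic's V(s) estimates
pa   = np.array([0.7,  0.1])   # probability the actor gave the taken action
y    = 0.98

target = np.where(done > 0.5, r, r + y * v1)   # what soft_if computes
error  = target - v                            # TD error, used as the advantage
actor_loss = np.mean(-np.log(np.clip(pa, 1e-3, 1.0)) * error)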

In [4]:
class LunarLander(ModelBase):
    
    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1):
        self.global_max = float('-inf')

        self.replay_buffer = ExperienceReplay(max_length=buffer_length)


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")

            # online critic, a slowly-updated target critic (for stable TD targets), and the actor
            self.critic = Critic(self, self.inputs, n_actions, n_states, y, "critic")
            self.target_critic = Critic(self, self.inputs, n_actions, n_states, y, "target_critic")
            self.actor = Actor(self, self.inputs, self.target_critic, n_actions, n_states, y, "actor")

            self.update = tf.group(self.critic.update, self.actor.update)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            # soft (Polyak) update: move each target-critic variable a fraction pi toward the online critic
            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])
            
            
    def define_actor_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )
        
        net = inputs.s
        
        net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
        net = tf.nn.dropout(net, inputs.keep_prob)
        
        net = tf.layers.dense(net, n_actions, activation=tf.nn.softmax, name='P', use_bias=False, **ops)
        
        return net


    def define_critic_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )
        
        net = inputs.s
        
        net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)
        # state-value head: a single output unit, flattened from [batch, 1] to [batch]
        net = tf.layers.dense(net, 1, name='V', **ops)[:, 0]
        
        return net
    
    
    def predict_feed(self, S):
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }
    
    def predict(self, state, e = 0.0):
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        # with probability e take a uniformly random action, otherwise sample from the policy
        if random.random() < e:
            return random.randint(0, n - 1)
        else:
            return np.random.choice(n, p=actions)
    
    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }
    
    
    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        
        r_total = 0.

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.
            
            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1
                
                
                _learning_rate = learning_rate(self.global_step) if callable(learning_rate) else learning_rate
                _e = e(self.global_step) if callable(e) else e
                
                
                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r
                
                
                self.replay_buffer.append((s, a, r, s1, float(done)))
                
                
                # sample a minibatch from the replay buffer and bootstrap V(s1) from the target critic
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                
                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)
                
                
                # periodically nudge the target critic toward the online critic
                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)
                
                
                s = s1
                
            
            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, self.global_step)


            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".{score}".format(score = ep_reward))
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.
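
update_target is a soft (Polyak) update: every target-critic variable moves a fraction pi toward the matching online-critic variable, so with a small pi the target network lags behind the critic and keeps the TD targets stable. A minimal numpy sketch of the same rule (illustrative, not part of the model):

import numpy as np

pi = 0.005
target_var = np.zeros(4)    # stand-in for one target-critic variable
online_var = np.ones(4)     # stand-in for the matching online-critic variable

# t <- t + pi * (a - t), i.e. t <- (1 - pi) * t + pi * a
target_var += pi * (online_var - target_var)
print(target_var)           # each entry moves a step of size pi toward the online value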

In [5]:
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "monitor/{name}".format(name = name))
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path = "{path}/models/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/".format(path = os.getcwd())


model = LunarLander(
    n_actions, n_states, y=0.9999, 
    buffer_length=500000,
    model_path = model_path,
    logs_path = logs_path,
    restore = False,
    pi = 0.005
)


[2017-03-17 12:07:01,773] Making new env: LunarLander-v2
[2017-03-17 12:07:02,147] Creating monitor directory monitor/actor-critic-base
False
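
The networks see a state that is three raw observations wide (n_states = observation_space.shape[0] * 3), so ExpandedStateEnv(env, 3) presumably concatenates the last three observations into one state vector. A rough sketch of that kind of wrapper, assuming (not verified against tfinterface) that this is what it does:

from collections import deque
import numpy as np

class StackedObsEnv(object):
    """Hypothetical stand-in for ExpandedStateEnv: concatenates the last k observations."""
    def __init__(self, env, k):
        self.env = env
        self.k = k
        self._frames = deque(maxlen=k)

    def reset(self):
        s = self.env.reset()
        for _ in range(self.k):
            self._frames.append(s)
        return np.concatenate(self._frames)

    def step(self, a):
        s, r, done, info = self.env.step(a)
        self._frames.append(s)
        return np.concatenate(self._frames), r, done, info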

In [ ]:
k = 40000.
model.fit(
    env, print_step=10, 
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate = 0.01, # lambda t: 0.05 * k / (k + t)
    e = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False),
    keep_prob = 0.5,
    update_target_step = 1
)


[2017-03-16 22:21:40,438] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor/lunar-lander-base/openaigym.video.0.9977.video000000.mp4
[2017-03-16 22:21:42,271] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor/lunar-lander-base/openaigym.video.0.9977.video000001.mp4
[MAX] Episode: 0, Length: 141, Reward: -169.862621605, buffer_len: 141
[MAX] Episode: 5, Length: 138, Reward: -87.858969857, buffer_len: 808
[MAX] Episode: 6, Length: 139, Reward: -72.7414348717, buffer_len: 947
[2017-03-16 22:21:47,404] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor/lunar-lander-base/openaigym.video.0.9977.video000008.mp4
[MAX] Episode: 10, Length: 178, Reward: -45.7347469854, buffer_len: 2418
[NOR] Episode: 10, Length: 178, Avg Reward: -286.245647511, e: 0.397180166667, Learning Rate: 0.01, buffer_len: 2418
Loss: -10.3632049561
[MAX] Episode: 14, Length: 137, Reward: -24.5288992475, buffer_len: 3577
[NOR] Episode: 20, Length: 143, Avg Reward: -127.396939849, e: 0.392185666667, Learning Rate: 0.01, buffer_len: 6699
Loss: -5.88282966614
[2017-03-16 22:22:07,444] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor/lunar-lander-base/openaigym.video.0.9977.video000027.mp4
[NOR] Episode: 30, Length: 236, Avg Reward: -141.697028313, e: 0.389630666667, Learning Rate: 0.01, buffer_len: 8889
Loss: -7.21165847778
[MAX] Episode: 34, Length: 158, Reward: 33.342165944, buffer_len: 10008
[MAX] Episode: 36, Length: 189, Reward: 40.2056866199, buffer_len: 10324
[NOR] Episode: 40, Length: 202, Avg Reward: -75.9001141286, e: 0.387244833333, Learning Rate: 0.01, buffer_len: 10934
Loss: -8.61496925354
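
The e values printed above (0.397..., 0.392..., ...) come from the interp1d schedule passed to fit: exploration decays linearly from 0.4 at step 0 to 0.05 at step 300000 and is clamped at 0.05 afterwards (fill_value with bounds_error=False). A quick sanity check of the schedule, run on its own:

from scipy.interpolate import interp1d

e = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False)
print(e(0), e(150000), e(300000), e(500000))   # 0.4, 0.225, 0.05, 0.05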

In [6]:
env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path = "{path}/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/".format(path = os.getcwd())


model_run = LunarLander(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0
    while not done and ep < 700:
        ep += 1
        a = model_run.predict(s, 0.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)
    
    print(total)
    
env.render(close=True)


[2017-03-17 12:07:02,890] Making new env: LunarLander-v2
[2017-03-17 12:07:02,892] Finished writing results. You can upload them to the scoreboard via gym.upload('/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/actor-critic-base')
False
196.104081847
149.867198645
215.673060563
217.593832845
201.192692875
198.152117614
-35.4402700038
218.394082887
193.044127714
120.793062792
234.628837747
213.755804367
0.398475525016
219.955313615
219.817810851
-31.113964878
204.808844432
218.3261461
223.759720668
230.32005815
192.594561856
192.463367622
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-6-ac2829c8c03c> in <module>()
     26         total += r
     27         env.render()
---> 28         time.sleep(0.01)
     29 
     30     print(total)

KeyboardInterrupt: 

In [ ]: