In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os
import time
name = "actor-critic-base"
In [3]:
class Inputs(object):
    def __init__(self, n_states, scope):
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')
            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')
            self.v1 = tf.placeholder(tf.float32, [None], name='V1')
            self.done = tf.placeholder(tf.float32, [None], name='done')
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
            self.training = tf.placeholder(tf.bool, [], name='training')
            self.pi = tf.placeholder(tf.float32, [], name='pi')
class Critic(object):
    def __init__(self, base_model, inputs, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            self.V = base_model.define_critic_network(inputs, n_actions, n_states)
            self.target = soft_if(inputs.done, inputs.r, inputs.r + y * inputs.v1)
            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)
            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            # tf.nn.moments returns (mean, variance); take the square root so the
            # 'std_error' summary actually logs a standard deviation.
            avg_error, var_error = tf.nn.moments(self.error, [0])

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([tf.reduce_sum(v) for v in self.variables])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', tf.sqrt(var_error)),
                tf.summary.histogram(
                    'avg_action', Pipe(
                        inputs.a,
                        Then(tf.one_hot, n_actions),
                        Then(tf.reduce_mean, axis=0)
                    ))
            ] + [
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
class Actor(object):
    def __init__(self, base_model, inputs, target_critic, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            self.P = base_model.define_actor_network(inputs, n_actions, n_states)
            self.Pa = select_columns(self.P, inputs.a)
            self.loss = -tf.log(tf.clip_by_value(self.Pa, 1e-3, 1.0)) * target_critic.error
            self.loss = tf.reduce_mean(self.loss)
            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([tf.reduce_sum(v) for v in self.variables])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                        inputs.a,
                        Then(tf.one_hot, n_actions),
                        Then(tf.reduce_mean, axis=0)
                    ))
            ] + [
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
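For reference, the graph built above implements a one-step actor-critic update. Writing \(\gamma\) for the `y` argument and \(\delta\) for `target_critic.error`, the Critic and Actor compute

\[
\text{target} =
\begin{cases}
r & \text{if done} \\
r + \gamma\, V_1(s') & \text{otherwise,}
\end{cases}
\qquad
\delta = \text{target} - V(s),
\]
\[
\mathcal{L}_{\text{critic}} = \tfrac{1}{2}\sum_i \delta_i^2,
\qquad
\mathcal{L}_{\text{actor}} = -\,\frac{1}{N}\sum_i \log \pi(a_i \mid s_i)\,\delta_i,
\]

where \(\pi(a \mid s)\) is clipped to \([10^{-3}, 1]\) before the log for numerical stability. Because each optimizer's `var_list` is restricted to its own variable scope, the actor update does not backpropagate into the critic through \(\delta\).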
In [4]:
class LunarLander(ModelBase):

    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1):
        self.global_max = float('-inf')
        self.replay_buffer = ExperienceReplay(max_length=buffer_length)

        with self.graph.as_default(), tf.device("cpu:0"):
            self.inputs = Inputs(n_states, "inputs")
            self.critic = Critic(self, self.inputs, n_actions, n_states, y, "critic")
            self.target_critic = Critic(self, self.inputs, n_actions, n_states, y, "target_critic")
            self.actor = Actor(self, self.inputs, self.target_critic, n_actions, n_states, y, "actor")

            self.update = tf.group(self.critic.update, self.actor.update)
            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)
            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])

    def define_actor_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )
        net = inputs.s
        net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
        net = tf.nn.dropout(net, inputs.keep_prob)
        net = tf.layers.dense(net, n_actions, activation=tf.nn.softmax, name='P', use_bias=False, **ops)
        return net

    def define_critic_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )
        net = inputs.s
        net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)
        net = tf.layers.dense(net, n_actions, name='V', **ops)[:, 0]
        return net

    def predict_feed(self, S):
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }

    def predict(self, state, e=0.0):
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n - 1)
        else:
            return np.random.choice(n, p=actions)

    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }

    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10,
            update_target_step=32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        r_total = 0.

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.

            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1

                _learning_rate = learning_rate(self.global_step) if callable(learning_rate) else learning_rate
                _e = e(self.global_step) if callable(e) else e

                a = self.predict(s, e=_e)
                s1, r, done, info = env.step(a)

                r_total += r
                ep_reward += r

                self.replay_buffer.append((s, a, r, s1, float(done)))
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()

                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)

                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)

                s = s1

            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, self.global_step)

            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path=self.model_path + ".{score}".format(score=ep_reward))
                self.global_max = ep_reward

            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.
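One detail worth noting: `update_target` performs a soft (Polyak-style) update of the target critic toward the online critic,

\[
\theta_{\text{target}} \leftarrow \theta_{\text{target}} + \pi\,(\theta_{\text{critic}} - \theta_{\text{target}}),
\]

where the `pi` argument plays the role usually written \(\tau\). During `fit`, the bootstrap values `V1` are computed from `target_critic` on the next states `S1`, which is what keeps the TD targets relatively stable between soft updates.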
In [5]:
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "monitor/{name}".format(name = name))
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path = "{path}/models/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/".format(path = os.getcwd(), name = name)
model = LunarLander(
    n_actions, n_states, y=0.9999,
    buffer_length=500000,
    model_path=model_path,
    logs_path=logs_path,
    restore=False,
    pi=0.005
)
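A note on the shapes above: `ExpandedStateEnv(env, 3)` presumably concatenates the last three observations into a single state (an assumption about the tfinterface wrapper, but consistent with `n_states = env.observation_space.shape[0] * 3`). LunarLander-v2 observations are 8-dimensional, so the network input ends up with 8 * 3 = 24 features.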
In [ ]:
k = 40000.
model.fit(
    env, print_step=10,
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate=0.01,  # lambda t: 0.05 * k / (k + t)
    e=interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False),
    keep_prob=0.5,
    update_target_step=1
)
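As a quick sanity check (a sketch, not part of the original training run), the `interp1d` object passed as `e` decays the exploration rate linearly from 0.4 at step 0 to 0.05 at step 300000 and, because of `fill_value=0.05` with `bounds_error=False`, stays at 0.05 afterwards:

from scipy.interpolate import interp1d

# Same schedule as in the fit() call above.
e = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False)
print(float(e(0)), float(e(150000)), float(e(300000)), float(e(1000000)))
# 0.4 0.225 0.05 0.05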
In [6]:
env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path = "{path}/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/".format(path = os.getcwd(), name = name)
model_run = LunarLander(
    n_actions, n_states,
    model_path=model_path,
    flush_secs=3.0,
    restore=True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0

    while not done and ep < 700:
        ep += 1
        a = model_run.predict(s, 0.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)

    print(total)
env.render(close=True)