In [ ]:
from gym_torcs import TorcsEnv
import numpy as np
import random
import argparse
import json
import math
import timeit
from collections import deque

import tensorflow as tf
import keras.backend as K
from keras.models import Sequential, Model, model_from_json, load_model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, merge, Lambda
from keras.optimizers import Adam
from keras.initializations import normal, identity
In [ ]:
# Ornstein-Uhlenbeck process: returns the noise increment theta * (mu - x) + sigma * N(0, 1)
def Ornstein_Uhlenbeck(x, mu, theta, sigma):
    return theta * (mu - x) + sigma * np.random.randn(1)
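The function above returns only the noise increment theta * (mu - x) + sigma * N(0, 1); during training it is scaled by a decaying epsilon and added to each raw action, as done in playGame further below. A minimal, self-contained sketch of that pattern (the epsilon and steering values here are illustrative, not values produced by the agent):
In [ ]:
# Illustrative sketch: perturb a hypothetical steering command with OU noise.
epsilon_demo = 1.0        # exploration scale (decays during training in playGame)
steering_demo = 0.12      # hypothetical raw actor output in [-1, 1]
noise_demo = epsilon_demo * Ornstein_Uhlenbeck(steering_demo, 0.0, 0.60, 0.30)
steering_explored = steering_demo + noise_demo
print(steering_explored)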
In [ ]:
class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.num_experiences = 0
        self.buffer = deque()

    def getBatch(self, batch_size):
        # Randomly sample batch_size examples
        if self.num_experiences < batch_size:
            return random.sample(self.buffer, self.num_experiences)
        else:
            return random.sample(self.buffer, batch_size)

    def size(self):
        # Maximum capacity of the buffer
        return self.buffer_size

    def add(self, state, action, reward, new_state, done):
        experience = (state, action, reward, new_state, done)
        if self.num_experiences < self.buffer_size:
            self.buffer.append(experience)
            self.num_experiences += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def count(self):
        # If the buffer is full, this equals the buffer size;
        # otherwise it is the number of experiences stored so far
        return self.num_experiences

    def erase(self):
        self.buffer = deque()
        self.num_experiences = 0
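A quick, self-contained check of the buffer's FIFO behaviour with dummy transitions (the tiny capacity and scalar states are illustrative only; the real buffer stores 29-dimensional TORCS states):
In [ ]:
# Illustrative sketch: exercise the ReplayBuffer with dummy transitions.
demo_buff = ReplayBuffer(2)                      # capacity of 2 for demonstration
demo_buff.add([0.0], [0.1], 1.0, [0.1], False)
demo_buff.add([0.1], [0.2], 0.5, [0.2], False)
demo_buff.add([0.2], [0.3], 0.0, [0.3], True)    # oldest transition is evicted
print(demo_buff.count())                         # 2: counter is capped at the capacity
print(demo_buff.getBatch(2))                     # random sample of the stored transitions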
In [ ]:
hidden_neuron_l1 = 500
hidden_neuron_l2 = 1000

class ActorNetwork(object):
    def __init__(self, sess, state_size, action_size, BATCH_SIZE, TAU, LEARNING_RATE):
        self.sess = sess
        self.BATCH_SIZE = BATCH_SIZE
        self.TAU = TAU
        self.LEARNING_RATE = LEARNING_RATE

        K.set_session(sess)

        # Now create the model
        self.model, self.weights, self.state = self.create_actor_network(state_size, action_size)
        self.target_model, self.target_weights, self.target_state = self.create_actor_network(state_size, action_size)
        self.action_gradient = tf.placeholder(tf.float32, [None, action_size])
        self.params_grad = tf.gradients(self.model.output, self.weights, -self.action_gradient)
        grads = zip(self.params_grad, self.weights)
        self.optimize = tf.train.AdamOptimizer(LEARNING_RATE).apply_gradients(grads)
        self.sess.run(tf.initialize_all_variables())

    def train(self, states, action_grads):
        self.sess.run(self.optimize, feed_dict={
            self.state: states,
            self.action_gradient: action_grads
        })

    def target_train(self):
        # Soft update: target <- TAU * online + (1 - TAU) * target
        actor_weights = self.model.get_weights()
        actor_target_weights = self.target_model.get_weights()
        for i in range(len(actor_weights)):
            actor_target_weights[i] = self.TAU * actor_weights[i] + (1 - self.TAU) * actor_target_weights[i]
        self.target_model.set_weights(actor_target_weights)

    def create_actor_network(self, state_size, action_dim):
        print("building Actor model")
        S = Input(shape=[state_size])
        h0 = Dense(hidden_neuron_l1, activation='relu')(S)
        h1 = Dense(hidden_neuron_l2, activation='relu')(h0)
        Steering = Dense(1, activation='tanh', init=lambda shape, name: normal(shape, scale=1e-4, name=name))(h1)
        Acceleration = Dense(1, activation='sigmoid', init=lambda shape, name: normal(shape, scale=1e-4, name=name))(h1)
        Brake = Dense(1, activation='sigmoid', init=lambda shape, name: normal(shape, scale=1e-4, name=name))(h1)
        V = merge([Steering, Acceleration, Brake], mode='concat')
        model = Model(input=S, output=V)
        return model, model.trainable_weights, S
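target_train above implements the usual DDPG soft update, target_weights <- TAU * online_weights + (1 - TAU) * target_weights. A minimal NumPy sketch of that blend, independent of Keras (the weight values and TAU here are illustrative):
In [ ]:
# Illustrative sketch of the soft target update performed in target_train().
tau_demo = 0.001
online_w = np.array([0.50, -0.20])       # hypothetical online-network weights
target_w = np.array([0.40, -0.10])       # hypothetical target-network weights
target_w = tau_demo * online_w + (1 - tau_demo) * target_w
print(target_w)                          # nudged slightly toward the online weights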
In [ ]:
# Hyperparameters (also used below to build standalone models for visualization)
BUFFER_SIZE = 100000
BATCH_SIZE = 32
GAMMA = 0.99
TAU = 0.001      # Target network hyperparameter
LRA = 0.0001     # Learning rate for Actor
LRC = 0.001      # Learning rate for Critic
action_dim = 3   # Steering/Acceleration/Brake
state_dim = 29   # number of sensor inputs
state_size = 29
np.random.seed(1337)
hidden_neuron_l1 = 500
hidden_neuron_l2 = 1000
vision = False
EXPLORE = 100000.
episode_count = 20
max_steps = 10
reward = 0
done = False
step = 0
epsilon = 1
indicator = 0
#Tensorflow GPU optimization
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
from keras import backend as K
K.set_session(sess)
In [ ]:
S = Input(shape=[state_size])
h0 = Dense(hidden_neuron_l1, activation='relu')(S)
h1 = Dense(hidden_neuron_l2, activation='relu')(h0)
Steering = Dense(1,activation='tanh',init=lambda shape, name: normal(shape, scale=1e-4, name=name))(h1)
Acceleration = Dense(1,activation='sigmoid',init=lambda shape, name: normal(shape, scale=1e-4, name=name))(h1)
Brake = Dense(1,activation='sigmoid',init=lambda shape, name: normal(shape, scale=1e-4, name=name))(h1)
V = merge([Steering,Acceleration,Brake],mode='concat')
model = Model(input=S,output=V)
In [ ]:
from IPython.display import SVG
from keras.utils.visualize_util import model_to_dot
SVG(model_to_dot(model).create(prog='dot', format='svg'))
In [ ]:
from keras.utils.visualize_util import plot
plot(model, to_file='ActorModel.png')
In [ ]:
S = Input(shape=[state_size])
A = Input(shape=[action_dim],name='action2')
w1 = Dense(hidden_neuron_l1, activation='relu')(S)
a1 = Dense(hidden_neuron_l2, activation='linear')(A)
h1 = Dense(hidden_neuron_l2, activation='linear')(w1)
h2 = merge([h1,a1],mode='sum')
h3 = Dense(hidden_neuron_l2, activation='relu')(h2)
V = Dense(action_dim,activation='linear')(h3)
Criticmodel = Model(input=[S,A],output=V)
In [ ]:
from IPython.display import SVG
from keras.utils.visualize_util import model_to_dot
SVG(model_to_dot(Criticmodel).create(prog='dot', format='svg'))
In [ ]:
from keras.utils.visualize_util import plot
plot(Criticmodel, to_file='CriticModel.png')
In [ ]:
class CriticNetwork(object):
    def __init__(self, sess, state_size, action_size, BATCH_SIZE, TAU, LEARNING_RATE):
        self.sess = sess
        self.BATCH_SIZE = BATCH_SIZE
        self.TAU = TAU
        self.LEARNING_RATE = LEARNING_RATE
        self.action_size = action_size

        K.set_session(sess)

        # Now create the model
        self.model, self.action, self.state = self.create_critic_network(state_size, action_size)
        self.target_model, self.target_action, self.target_state = self.create_critic_network(state_size, action_size)
        self.action_grads = tf.gradients(self.model.output, self.action)  # GRADIENTS for policy update
        self.sess.run(tf.initialize_all_variables())

    def gradients(self, states, actions):
        return self.sess.run(self.action_grads, feed_dict={
            self.state: states,
            self.action: actions
        })[0]

    def target_train(self):
        # Soft update: target <- TAU * online + (1 - TAU) * target
        critic_weights = self.model.get_weights()
        critic_target_weights = self.target_model.get_weights()
        for i in range(len(critic_weights)):
            critic_target_weights[i] = self.TAU * critic_weights[i] + (1 - self.TAU) * critic_target_weights[i]
        self.target_model.set_weights(critic_target_weights)

    def create_critic_network(self, state_size, action_dim):
        print("building critic model")
        S = Input(shape=[state_size])
        A = Input(shape=[action_dim], name='action2')
        w1 = Dense(hidden_neuron_l1, activation='relu')(S)
        a1 = Dense(hidden_neuron_l2, activation='linear')(A)
        h1 = Dense(hidden_neuron_l2, activation='linear')(w1)
        h2 = merge([h1, a1], mode='sum')
        h3 = Dense(hidden_neuron_l2, activation='relu')(h2)
        V = Dense(action_dim, activation='linear')(h3)
        model = Model(input=[S, A], output=V)
        adam = Adam(lr=self.LEARNING_RATE)
        model.compile(loss='mse', optimizer=adam)
        return model, A, S
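In the training loop below, the critic is regressed onto the one-step Bellman target y = r + GAMMA * Q_target(s', mu_target(s')), with y = r on terminal transitions. A minimal numeric sketch of that target (all values illustrative):
In [ ]:
# Illustrative sketch of the critic's Bellman target used in playGame() below.
gamma_demo = 0.99
reward_demo = 0.8          # reward observed for one transition
target_q_demo = 1.5        # Q value predicted by the target critic for the next state/action
done_demo = False
y_demo = reward_demo if done_demo else reward_demo + gamma_demo * target_q_demo
print(y_demo)              # 2.285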
In [ ]:
def playGame(train_indicator=1):    # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.0001     # Target network hyperparameter
    LRA = 0.0001     # Learning rate for Actor
    LRC = 0.001      # Learning rate for Critic

    action_dim = 3   # Steering/Acceleration/Brake
    state_dim = 29   # number of sensor inputs

    np.random.seed(1337)

    vision = False
    EXPLORE = 100000.
    episode_count = 200
    max_steps = 10
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # TensorFlow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    # Create replay buffer

    # Generate a TORCS environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    print("load the weights")
    try:
        actor.model.load_weights("Myactormodel.h5")
        critic.model.load_weights("Mycriticmodel.h5")
        actor.target_model.load_weights("Myactormodel.h5")
        critic.target_model.load_weights("Mycriticmodel.h5")
        print("Weights loaded successfully")
    except:
        print("Cannot find the model weights")

    print("TORCS RL Start...")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * Ornstein_Uhlenbeck(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * Ornstein_Uhlenbeck(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * Ornstein_Uhlenbeck(a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code applies stochastic braking
            if random.random() <= 0.1:
                print("we apply the brakes here...")
                noise_t[0][2] = train_indicator * max(epsilon, 0) * Ornstein_Uhlenbeck(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)      # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])     # placeholder with the same shape as the critic output

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("Myactormodel.h5", overwrite=True)
                with open("Myactormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("Mycriticmodel.h5", overwrite=True)
                with open("Mycriticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")

if __name__ == "__main__":
    playGame()
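To run the trained policy without further learning, the same entry point can be called with train_indicator=0, which zeros out the exploration noise and skips the weight updates (this assumes the saved weight files from an earlier training run are available):
In [ ]:
playGame(train_indicator=0)   # evaluation run: no exploration noise, no training updates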