In [ ]:
import numpy as np
from numpy.random import rand, randint
In [ ]:
from src.agent.dp_agent import dp_agent
from src.environment.maze import maze
In [ ]:
def run_sim(env, agent, steps=100, disable_training=False):
    last_observation = None
    last_action = None
    for s in range(steps):
        new_observation = env.observe()
        reward = env.collect_reward()
        # store the last transition
        if last_observation is not None:
            agent.store(last_observation, last_action, reward, new_observation)
        # act
        new_action = agent.action(np.array([new_observation]))
        env.perform_action(new_action)
        # train
        if not disable_training:
            agent.training_step()
        # update current state as last state
        last_action = new_action
        last_observation = new_observation
In [ ]:
array = np.array([[-1, -1, -1, -1, -1],
                  [-1,  1,  4,  2, -1],
                  [-1,  0,  1, 10, -1],
                  [-1,  5,  1, 10, -1],
                  [-1, -1, -1, -1, -1]])
g = maze(array)
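In [ ]:
# Quick look at the environment interface the agents below rely on: the number
# of distinct observations (presumably one per maze cell) and the number of
# available actions (up, down, left, right).
g.observation_size, g.num_actions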
In [ ]:
agent = dp_agent(g.observation_size, g.num_actions)
init_table = np.copy(agent.table)  # keep a copy of the untrained Q-table for later comparison
In [ ]:
run_sim(g, agent, 10000)
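In [ ]:
# A quick, hedged sanity check (not part of the original workflow): init_table
# was copied before training, so the largest absolute change in the Q-table
# indicates whether the tabular agent actually updated its estimates.
np.abs(agent.table - init_table).max()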
In [ ]:
# Extract the greedy action for each maze cell from the learned Q-table.
opt_actions = [["" for i in range(5)] for j in range(5)]
for i in range(agent.num_observations):
    action = np.argmax(agent.table[i, :])
    if action == 0:
        t = "U"
    elif action == 1:
        t = "D"
    elif action == 2:
        t = "L"
    else:
        t = "R"
    opt_actions[i // 5][i % 5] = t
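In [ ]:
# A hedged sketch: view the value of the greedy action in each cell, assuming
# agent.table has shape (num_observations, num_actions) with observations laid
# out row-major over the 5x5 grid (as the loop above assumes).
np.round(agent.table.max(axis=1).reshape(5, 5), 2)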
In [ ]:
array
In [ ]:
opt_actions
In [ ]:
g.total_reward
In [ ]:
import tensorflow as tf
from src.agent.dqn.dqn import exp_replay
from src.agent.dqn.models import MLP
In [ ]:
tf.reset_default_graph()
session = tf.InteractiveSession()

# The "brain" maps an observation to Q-values for the different actions.
# Here it is done using a multi-layer perceptron with two hidden layers.
brain = MLP([1,],  # input size is 1 here; enlarge for larger observation arrays
            [20, 20, g.num_actions],
            [tf.tanh, tf.tanh, tf.identity])

# The optimizer to use. Here we use RMSProp, as recommended by the DQN publication.
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

# DQN controller with experience replay.
current_controller = exp_replay(g.observation_shape, g.num_actions, brain, optimizer, session,
                                discount_rate=0.99, exploration_period=5000, max_experience=10000,
                                store_every_nth=4, train_every_nth=4)
session.run(tf.global_variables_initializer())  # replaces the deprecated initialize_all_variables()
session.run(current_controller.target_network_update)
In [ ]:
run_sim(g, current_controller, 10000)
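In [ ]:
# A hedged sketch: recover the action the trained DQN prefers in each cell, by
# querying current_controller.action() the same way run_sim does. This assumes
# observations are the flat cell indices 0..24 and that, after the exploration
# period, action() is (close to) greedy; any residual exploration noise means
# the grid may vary slightly between runs.
dqn_actions = [["" for i in range(5)] for j in range(5)]
for i in range(25):
    a = current_controller.action(np.array([i]))
    dqn_actions[i // 5][i % 5] = "UDLR"[a]
dqn_actions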
In [ ]:
g.total_reward