In [ ]:
import numpy as np
from numpy.random import rand, randint

Simple DP agent
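
The DP agent keeps a table of estimated action values, one entry per (observation, action) pair, and updates it from the transitions it observes. As an illustration only (the actual implementation lives in src/agent/dp_agent.py and may differ), a tabular Q-learning style update looks like this:

In [ ]:
# Illustrative tabular Q-learning update -- NOT the actual dp_agent code.
#   Q[s, a] <- Q[s, a] + lr * (r + gamma * max_a' Q[s', a'] - Q[s, a])
def tabular_q_update(q_table, s, a, r, s_next, lr=0.1, gamma=0.9):
    td_target = r + gamma * np.max(q_table[s_next, :])
    q_table[s, a] += lr * (td_target - q_table[s, a])
    return q_table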


In [ ]:
from src.agent.dp_agent import dp_agent
from src.environment.maze import maze

In [ ]:
def run_sim(env, agent, steps=100, disable_training=False):
    """Run the agent in the environment for `steps` steps, training it
    on every observed transition unless disable_training is set."""
    last_observation = None
    last_action      = None

    for s in range(steps):
        new_observation = env.observe()
        reward          = env.collect_reward()

        # store the last transition (s, a, r, s') once one exists
        if last_observation is not None:
            agent.store(last_observation, last_action, reward, new_observation)

        # act
        new_action = agent.action(np.array([new_observation]))
        env.perform_action(new_action)

        # train
        if not disable_training:
            agent.training_step()

        # remember the current state as the last state
        last_action = new_action
        last_observation = new_observation

In [ ]:
array = np.array([[-1, -1, -1, -1, -1],
                  [-1,  1,  4,  2, -1],
                  [-1,  0,  1, 10, -1],
                  [-1,  5,  1, 10, -1],
                  [-1, -1, -1, -1, -1]])
g = maze(array)

In [ ]:
agent = dp_agent(g.observation_size, g.num_actions)
init_table = np.copy(agent.table)  # snapshot of the value table before training

In [ ]:
run_sim(g, agent, 10000)

In [ ]:
opt_actions = [["" for i in range(5)] for j in range(5)]
for i in range(agent.num_observations):
    action = np.argmax(agent.table[i,:])
    if action == 0:
        t = "U"
    elif action == 1:
        t = "D"
    elif action == 2:
        t = "L"
    else:
        t = "R"
    opt_actions[int(i/5)][i%5] = t
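
The greedy action for each maze cell can also be printed as a grid, which is easier to read next to the reward array shown below:

In [ ]:
# Print the greedy policy as a 5x5 grid of direction letters
for row in opt_actions:
    print(" ".join(row))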

In [ ]:
array

In [ ]:
opt_actions

In [ ]:
g.total_reward

DQN agent
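
Instead of a table, the DQN agent uses a neural network to map observations to Q-values, trains it on minibatches sampled from an experience-replay buffer, and bootstraps against a separate, periodically updated target network. The sketch below only illustrates the target computation; the exp_replay class used afterwards implements the full TensorFlow version, and the names here are illustrative rather than taken from that class:

In [ ]:
# Illustrative DQN target computation for a sampled minibatch -- NOT the exp_replay code.
def dqn_targets(rewards, q_next_target, gamma=0.99):
    # rewards:       shape (batch,), rewards observed after the stored actions
    # q_next_target: shape (batch, num_actions), Q-values of the next observations,
    #                computed with the frozen target network
    # The TD target bootstraps on the best next action under the target network.
    return rewards + gamma * q_next_target.max(axis=1)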


In [ ]:
import tensorflow as tf
from src.agent.dqn.dqn import exp_replay
from src.agent.dqn.models  import MLP

In [ ]:
tf.reset_default_graph()
session = tf.InteractiveSession()

# The brain maps an observation to Q-values for the different actions.
# Here it is done using a multi-layer perceptron with two hidden layers.
brain = MLP([1,], [20, 20, g.num_actions], # enlarge the input size for larger observation arrays
            [tf.tanh, tf.tanh, tf.identity])

# The optimizer to use. Here we use RMSProp, as recommended
# by the original DQN publication.
optimizer = tf.train.RMSPropOptimizer(learning_rate= 0.001, decay=0.9)

# Experience-replay DQN controller
current_controller = exp_replay(g.observation_shape, g.num_actions, brain, optimizer, session,
                                   discount_rate=0.99, exploration_period=5000, max_experience=10000, 
                                   store_every_nth=4, train_every_nth=4)

session.run(tf.initialize_all_variables())
session.run(current_controller.target_network_update)

In [ ]:
run_sim(g, current_controller, 10000)

In [ ]:
g.total_reward
