In [ ]:
import numpy as np
import tensorflow as tf
from numpy.random import rand, randint

from src.environment.matching import simple_matching
from src.agent.dqn.dqn import exp_replay
from src.agent.dqn.models import MLP

In [ ]:
def action2matching(n, action):
    """Decode a flat action index into an n x n matching matrix with a single 1
    at position (action // n, action % n), i.e. the pair of types to be matched."""
    m = np.zeros((n, n))
    m[action // n, action % n] = 1
    return m
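
A quick sanity check of the index-to-matrix mapping (illustrative values): with n = 3 types, action index 5 decodes to row 5 // 3 = 1 and column 5 % 3 = 2.

In [ ]:
# illustrative: action index 5 selects the pair (type 2, type 3), i.e. entry (1, 2)
action2matching(3, 5)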

In [ ]:
def run_sim(env, agent, steps=100, disable_training=False):
    last_observation = None
    last_action      = None

    for s in range(steps):
        new_observation = env.observe()
        reward          = env.collect_reward()

        # store the last transition (s, a, r, s') once the reward for it is known
        if last_observation is not None:
            agent.store(last_observation, last_action, reward, new_observation)

        # act: pick an action index and translate it into a matching matrix
        new_action = agent.action(new_observation)
        env.perform_action(action2matching(len(env.types), new_action))

        # train
        if not disable_training:
            agent.training_step()

        # the current state becomes the last state for the next iteration
        last_action = new_action
        last_observation = new_observation

In [ ]:
types = np.array([1, 2, 3])
weight_matrix = np.array([[0, 1, 2], [-1, 0, 0], [-1, -1, 0]])
# making the weights below the diagonal negative enforces that each match is counted only once
arrival_probabilities = np.array([0.2, 0.5, 0.2])
departure_probabilities = np.array([0.002, 0.002, 0.006])
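
A small illustrative check of the convention above: all positive match weights sit strictly above the diagonal, while the lower triangle is non-positive, so each pair of types is rewarded at most once.

In [ ]:
# illustrative sanity check of the weight-matrix convention
assert weight_matrix[0, 2] == 2                                       # matching type 1 with type 3 pays 2
assert np.all(weight_matrix[np.tril_indices(len(types), k=-1)] <= 0)  # the lower triangle carries no reward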

In [ ]:
g = simple_matching(types, weight_matrix, arrival_probabilities, departure_probabilities)

In [ ]:
g.num_actions
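
Assuming the environment enumerates one action per ordered pair of types, exactly as the action2matching decoder above suggests, this should equal len(types)**2 = 9 (this is an assumption about simple_matching, not something verified here).

In [ ]:
# assumption: one action per ordered (i, j) pair of types, as in action2matching above
g.num_actions, len(g.types) ** 2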

In [ ]:
g.observation_shape

In [ ]:
tf.reset_default_graph()
session = tf.InteractiveSession()

# The brain maps from an observation to Q-values for the different actions.
# Here it is done using a multi-layer perceptron with 2 hidden layers.
brain = MLP(list(g.observation_shape), [200, 200, g.num_actions],  # increase the hidden sizes for larger observation arrays
            [tf.tanh, tf.tanh, tf.identity])

# The optimizer to use. Here we use RMSProp as recommended
# by the original DQN publication.
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

# The DQN controller with experience replay
current_controller = exp_replay(g.observation_shape, g.num_actions, brain, optimizer, session,
                                discount_rate=0.99, exploration_period=5000, max_experience=10000,
                                store_every_nth=4, train_every_nth=4)

session.run(tf.global_variables_initializer())  # tf.initialize_all_variables() is deprecated
session.run(current_controller.target_network_update)

In [ ]:
obs = g.observe()

In [ ]:
current_controller.action(obs)
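
The returned index can be decoded back into a matching matrix with the helper defined above (illustrative only; the network is still untrained at this point, so the chosen pair is essentially arbitrary).

In [ ]:
# decode the (still untrained) controller's action into a matching matrix
action2matching(len(g.types), current_controller.action(obs))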

In [ ]:
a = np.zeros((2, 3))  # dummy 2 x 3 array

In [ ]:
a[np.newaxis, :]  # np.newaxis prepends a batch dimension: the shape becomes (1, 2, 3)

In [ ]:
run_sim(g, current_controller, 10000)

In [ ]:
g.state

In [ ]:
g.total_reward
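
For comparison, here is a minimal sketch of a uniform-random baseline run through the same loop. RandomAgent is a hypothetical helper (not part of src); it only implements the three methods run_sim calls and ignores the observation entirely.

In [ ]:
class RandomAgent(object):
    """Illustrative baseline: ignores the observation and picks a uniformly random action."""
    def __init__(self, num_actions):
        self.num_actions = num_actions

    def action(self, observation):
        return randint(self.num_actions)

    def store(self, observation, action, reward, new_observation):
        pass  # no experience replay

    def training_step(self):
        pass  # nothing to train

g_baseline = simple_matching(types, weight_matrix, arrival_probabilities, departure_probabilities)
run_sim(g_baseline, RandomAgent(g_baseline.num_actions), 10000, disable_training=True)
g_baseline.total_reward

If the trained controller has learned a useful matching policy, g.total_reward above should come out noticeably higher than this random baseline.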

In [ ]: