In [ ]:
import numpy as np
import tensorflow as tf
from numpy.random import rand, randint

from src.environment.matching import simple_matching
from src.agent.dqn.dqn import exp_replay
from src.agent.dqn.models import MLP

In [ ]:
def action2matching(n, action):
    """Decode a flat action index into an n x n matching matrix with a single 1
    at position (action // n, action % n), i.e. the pair of types to be matched."""
    m = np.zeros((n, n))
    m[action // n, action % n] = 1
    return m
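
A quick sanity check of the index-to-matrix mapping (illustrative values): with n = 3 types, action index 5 decodes to row 5 // 3 = 1 and column 5 % 3 = 2.

In [ ]:
# illustrative: action index 5 selects the pair (type 2, type 3), i.e. entry (1, 2)
action2matching(3, 5)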

In [ ]:
def run_sim(env, agent, steps=100, disable_training=False):
    last_observation = None
    last_action      = None

    for s in range(steps):
        new_observation = env.observe()
        reward          = env.collect_reward()

        # store the last transition (s, a, r, s') once the reward for it is known
        if last_observation is not None:
            agent.store(last_observation, last_action, reward, new_observation)

        # act: pick an action index and translate it into a matching matrix
        new_action = agent.action(new_observation)
        env.perform_action(action2matching(len(env.types), new_action))

        # train
        if not disable_training:
            agent.training_step()

        # the current state becomes the last state for the next iteration
        last_action = new_action
        last_observation = new_observation

In [ ]:
types = np.array([1, 2, 3])
weight_matrix = np.array([[0, 1, 2], [-1, 0, 0], [-1, -1, 0]])
# making the weights below the diagonal negative enforces that each match is counted only once
arrival_probabilities = np.array([0.2, 0.5, 0.2])
departure_probabilities = np.array([0.002, 0.002, 0.006])
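
A small illustrative check of the convention above: all positive match weights sit strictly above the diagonal, while the lower triangle is non-positive, so each pair of types is rewarded at most once.

In [ ]:
# illustrative sanity check of the weight-matrix convention
assert weight_matrix[0, 2] == 2                                       # matching type 1 with type 3 pays 2
assert np.all(weight_matrix[np.tril_indices(len(types), k=-1)] <= 0)  # the lower triangle carries no reward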

In [ ]:
g = simple_matching(types, weight_matrix, arrival_probabilities, departure_probabilities)

In [ ]:
g.num_actions
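
Assuming the environment enumerates one action per ordered pair of types, exactly as the action2matching decoder above suggests, this should equal len(types)**2 = 9 (this is an assumption about simple_matching, not something verified here).

In [ ]:
# assumption: one action per ordered (i, j) pair of types, as in action2matching above
g.num_actions, len(g.types) ** 2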

In [ ]:
g.observation_shape

In [ ]:
tf.reset_default_graph()
session = tf.InteractiveSession()

# The brain maps from an observation to Q-values for the different actions.
# Here it is done using a multi-layer perceptron with 2 hidden layers.
brain = MLP(list(g.observation_shape), [200, 200, g.num_actions],  # increase the hidden sizes for larger observation arrays
            [tf.tanh, tf.tanh, tf.identity])

# The optimizer to use. Here we use RMSProp as recommended
# by the original DQN publication.
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

# The DQN controller with experience replay
current_controller = exp_replay(g.observation_shape, g.num_actions, brain, optimizer, session,
                                discount_rate=0.99, exploration_period=5000, max_experience=10000,
                                store_every_nth=4, train_every_nth=4)

session.run(tf.global_variables_initializer())  # tf.initialize_all_variables() is deprecated
session.run(current_controller.target_network_update)

In [ ]:
obs = g.observe()

In [ ]:
current_controller.action(obs)
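
The returned index can be decoded back into a matching matrix with the helper defined above (illustrative only; the network is still untrained at this point, so the chosen pair is essentially arbitrary).

In [ ]:
# decode the (still untrained) controller's action into a matching matrix
action2matching(len(g.types), current_controller.action(obs))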

In [ ]:
a = np.zeros((2, 3))  # dummy 2 x 3 array

In [ ]:
a[np.newaxis, :]  # np.newaxis prepends a batch dimension: the shape becomes (1, 2, 3)

In [ ]:
run_sim(g, current_controller, 10000)

In [ ]:
g.state

In [ ]:
g.total_reward
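
For comparison, here is a minimal sketch of a uniform-random baseline run through the same loop. RandomAgent is a hypothetical helper (not part of src); it only implements the three methods run_sim calls and ignores the observation entirely.

In [ ]:
class RandomAgent(object):
    """Illustrative baseline: ignores the observation and picks a uniformly random action."""
    def __init__(self, num_actions):
        self.num_actions = num_actions

    def action(self, observation):
        return randint(self.num_actions)

    def store(self, observation, action, reward, new_observation):
        pass  # no experience replay

    def training_step(self):
        pass  # nothing to train

g_baseline = simple_matching(types, weight_matrix, arrival_probabilities, departure_probabilities)
run_sim(g_baseline, RandomAgent(g_baseline.num_actions), 10000, disable_training=True)
g_baseline.total_reward

If the trained controller has learned a useful matching policy, g.total_reward above should come out noticeably higher than this random baseline.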

In [ ]: