In [ ]:
import numpy as np
import tensorflow as tf
from numpy.random import rand, randint
from src.environment.matching import simple_matching
from src.agent.dqn.dqn import exp_replay
from src.agent.dqn.models import MLP
In [ ]:
def action2matching(n, action):
    # convert a flat action index into an n x n matching matrix with a
    # single 1 at (row, column) = (action // n, action % n)
    m = np.zeros((n, n))
    m[action // n, action % n] = 1
    return m
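In [ ]:
# Quick sanity check with hypothetical values (not in the original notebook):
# for n = 3, action index 5 should map to row 5 // 3 = 1 and column 5 % 3 = 2,
# i.e. a 3x3 matrix with a single 1 at position (1, 2).
action2matching(3, 5)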
In [ ]:
def run_sim(env, agent, steps=100, disable_training=False):
    last_observation = None
    last_action = None
    for s in range(steps):
        new_observation = env.observe()
        reward = env.collect_reward()
        # store the last transition
        if last_observation is not None:
            agent.store(last_observation, last_action, reward, new_observation)
        # act
        new_action = agent.action(new_observation)
        env.perform_action(action2matching(len(env.types), new_action))
        # train
        if not disable_training:
            agent.training_step()
        # remember the current state and action as the last ones
        last_action = new_action
        last_observation = new_observation
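In [ ]:
# Hedged sketch of a trivial baseline: a random agent exposing the same
# interface that run_sim expects (action / store / training_step). This class
# is not part of the repository; it only illustrates that run_sim can be
# driven by something other than the DQN controller, e.g.
# run_sim(g, RandomAgent(g.num_actions), 1000, disable_training=True)
# once the environment g is created below.
class RandomAgent(object):
    def __init__(self, num_actions):
        self.num_actions = num_actions

    def action(self, observation):
        # pick a matching uniformly at random, ignoring the observation
        return randint(self.num_actions)

    def store(self, observation, action, reward, new_observation):
        pass  # no experience replay needed

    def training_step(self):
        pass  # nothing to train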
In [ ]:
types = np.array([1,2,3])
weight_matrix = np.array([[0,1,2],[-1,0,0],[-1,-1,0]])
# making the weights below the diagonal negative ensures each match is counted only once
arrival_probabilities = np.array([0.2,0.5,0.2])
departure_probabilities = np.array([0.002,0.002,0.006])
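In [ ]:
# Sanity check (not in the original notebook): the strictly lower triangle of
# weight_matrix should be negative, so each pair is rewarded only once via its
# upper-triangle entry.
assert np.all(weight_matrix[np.tril_indices(len(types), -1)] < 0)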
In [ ]:
g = simple_matching(types, weight_matrix, arrival_probabilities, departure_probabilities)
In [ ]:
g.num_actions
In [ ]:
g.observation_shape
In [ ]:
tf.reset_default_graph()
session = tf.InteractiveSession()
# The brain maps an observation to Q-values for the different actions.
# Here it is done using a multi-layer perceptron with 2 hidden layers.
brain = MLP(list(g.observation_shape), [200, 200, g.num_actions],  # increase the hidden layer sizes for larger observation arrays
            [tf.tanh, tf.tanh, tf.identity])
# The optimizer to use. Here we use RMSProp as recommended
# by the publication.
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)
# DiscreteDeepQ object
current_controller = exp_replay(g.observation_shape, g.num_actions, brain, optimizer, session,
discount_rate=0.99, exploration_period=5000, max_experience=10000,
store_every_nth=4, train_every_nth=4)
session.run(tf.initialize_all_variables())
session.run(current_controller.target_network_update)
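In [ ]:
# Illustration only: exploration_period=5000 suggests the controller anneals
# its exploration probability over the first 5000 training steps. The actual
# schedule lives inside exp_replay; the linear decay and the final value of
# 0.05 below are assumptions used purely to visualise the idea.
def linear_annealing(step, total=5000, start=1.0, end=0.05):
    if step >= total:
        return end
    return start + (end - start) * step / float(total)

[linear_annealing(s) for s in (0, 2500, 5000, 10000)]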
In [ ]:
obs = g.observe()
In [ ]:
current_controller.action(obs)
In [ ]:
a = np.zeros((2,3))
In [ ]:
a[np.newaxis,:]
In [ ]:
run_sim(g, current_controller, 10000)
In [ ]:
g.state
In [ ]:
g.total_reward
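In [ ]:
# Hedged evaluation sketch: run the trained controller for a while with
# training disabled and look at how much reward accumulates during that
# window. This assumes g.total_reward keeps accumulating across run_sim calls,
# which matches how it is used above but is not verified here.
reward_before = g.total_reward
run_sim(g, current_controller, 1000, disable_training=True)
g.total_reward - reward_before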
In [ ]: