In [ ]:
%reset -f
In [ ]:
import numpy as np
import tensorflow as tf
from numpy.random import rand, randint, exponential
from src.environment.network_rm import network_rm
from src.agent.dqn.dqn import exp_replay
from src.agent.dqn.models import MLP
In [ ]:
inventory = np.array([6, 6, 6, 20])  # resource capacities; the last item is time
demand_types = np.array([[0, 0, 0, 0], [1, 1, 1, 0], [2, 0, 0, 0]])  # resources requested by each demand type
demand_values = np.array([0, 1, 20])  # revenue per demand type
demand_arrivals = np.array([0.5, 0.5])  # arrival probabilities
inter_arrival_time = 1
g = network_rm(inventory, demand_values, demand_types, demand_arrivals, inter_arrival_time)
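A quick sanity check on the environment: observation_shape and num_actions determine the sizes of the Q-network built below, and state is the raw state vector. All three attributes are used later in this notebook; nothing else about network_rm is assumed here.
In [ ]:
# Inspect the quantities the agent construction below depends on.
print("observation shape:", g.observation_shape)
print("number of actions:", g.num_actions)
print("initial state:", g.state)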
In [ ]:
tf.reset_default_graph()
session = tf.InteractiveSession()
# The brain maps an observation to Q-values for the different actions.
# Here it is done with a multi-layer perceptron with two hidden layers.
# observation_shape has to be a 1-d vector for now.
brain = MLP(list(g.observation_shape), [50, 50, g.num_actions],  # widen the hidden layers for larger observation arrays
            [tf.tanh, tf.tanh, tf.identity])
# The optimizer to use. Here we use RMSProp as recommended
# by the DQN paper.
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)
# Experience-replay DQN controller.
current_controller = exp_replay(g.observation_shape, g.num_actions, brain, optimizer, session,
                                discount_rate=0.99, exploration_period=1000, max_experience=100,
                                store_every_nth=4, train_every_nth=4)
session.run(tf.global_variables_initializer())
session.run(current_controller.target_network_update)
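The exploration_period argument typically controls how fast the controller shifts from random to greedy actions. The sketch below assumes exp_replay linearly anneals the random-action probability from 1.0 down to a small final value over exploration_period steps; that is an assumption about its internals, not something taken from this notebook.
In [ ]:
# Illustrative only: the kind of linear annealing schedule DQN controllers
# commonly use for the random-action probability (epsilon).
def linear_epsilon(step, period=1000, eps_start=1.0, eps_final=0.05):
    # Interpolate from eps_start to eps_final over `period` steps, then hold.
    frac = min(step, period) / float(period)
    return eps_start + frac * (eps_final - eps_start)

for step in [0, 250, 500, 1000, 2000]:
    print("step {:>4}: epsilon = {:.3f}".format(step, linear_epsilon(step)))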
In [ ]:
def run_sim(env, agent, steps=100, disable_training=False):
    last_observation = None
    last_action = None
    steps_taken = 0
    for s in range(steps):
        if env.terminate:
            break
        new_observation = env.observe()
        reward = env.collect_reward()
        # print(env.total_reward, " ", env.last_reward)
        # store the last transition (s, a, r, s')
        if last_observation is not None:
            agent.store(last_observation, last_action, reward, new_observation)
        # act
        new_action = agent.action(new_observation)
        env.perform_action(new_action)
        # transition
        env.transition()
        # train
        if not disable_training:
            agent.training_step()
        # remember the current state and action for the next transition
        last_action = new_action
        last_observation = new_observation
        steps_taken += 1
    return steps_taken
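Before any training run, run_sim can be called with disable_training=True to get a rough baseline for the freshly initialized (essentially random) policy. Note that run_sim as written still stores transitions even when training is disabled, so this nudges the replay buffer slightly; the episode length of 100 and the variable names below are illustrative choices.
In [ ]:
# Baseline: one episode with training updates disabled.
g_eval = network_rm(inventory, demand_values, demand_types, demand_arrivals, inter_arrival_time)
steps_run = run_sim(g_eval, current_controller, 100, disable_training=True)
print("untrained average reward per step: {}".format(g_eval.total_reward / max(steps_run, 1)))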
In [ ]:
g = network_rm(inventory, demand_values, demand_types, demand_arrivals, inter_arrival_time)
In [ ]:
g.print_()
In [ ]:
x = g.total_reward  # total reward accumulated so far; used as a baseline below
In [ ]:
T = 100
# %prun T = run_sim(g, current_controller, T)
T = run_sim(g, current_controller, T)  # T is now the number of steps actually simulated
print("average reward per step: {}".format((g.total_reward - x) / T))
In [ ]:
g.state
In [ ]:
S = 1000
window = S // 20
rewards = np.zeros(S)
for i in range(S):
    g = network_rm(inventory, demand_values, demand_types, demand_arrivals, inter_arrival_time)
    T = run_sim(g, current_controller, 100)
    rewards[i] = g.total_reward
    if i % window == 0 and i >= window:
        print("average reward: {}".format(rewards[i - window:i].mean()))
In [ ]: