In [ ]:
%reset

In [ ]:
import numpy as np
import tensorflow as tf
from numpy.random import rand, randint, exponential

from src.environment.network_rm import network_rm
from src.agent.dqn.dqn import exp_replay
from src.agent.dqn.models import MLP

In [ ]:
inventory = np.array([6,6,6,20]) # the last item is time
demand_types = np.array([[0,0,0,0], [1,1,1,0], [2,0,0,0]])
demand_values = np.array([0,1,20])
demand_arrivals = np.array([0.5, 0.5])
inter_arrival_time = 1
g = network_rm(inventory, demand_values, demand_types, demand_arrivals, inter_arrival_time)
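
The next cell is a quick sanity check (a sketch, not part of the original workflow); it only relies on attributes of network_rm that are already used elsewhere in this notebook: observation_shape, num_actions, and observe().


In [ ]:
# Inspect the freshly constructed environment: shape of the observation
# vector, number of admissible actions, and the initial observation.
print("observation shape:", g.observation_shape)
print("number of actions:", g.num_actions)
print("initial observation:", g.observe())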

In [ ]:
tf.reset_default_graph()
session = tf.InteractiveSession()

# The brain maps an observation to Q-values for the different actions.
# Here this is done with a multi-layer perceptron with two hidden
# layers.

# observation_shape has to be a 1-d vector for now
brain = MLP(list(g.observation_shape), [50, 50, g.num_actions],  # increase hidden sizes for larger observation arrays
            [tf.tanh, tf.tanh, tf.identity])

# The optimizer to use. Here we use RMSProp, as recommended
# by the publication.
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

# The DQN controller with experience replay
current_controller = exp_replay(g.observation_shape, g.num_actions, brain, optimizer, session,
                                discount_rate=0.99, exploration_period=1000, max_experience=100,
                                store_every_nth=4, train_every_nth=4)

session.run(tf.global_variables_initializer())
session.run(current_controller.target_network_update)
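
As a minimal check that the controller is wired up, we can ask the still untrained network for an action on the current observation. This is only a sketch; it reuses the same observe() and action() calls that run_sim relies on below.


In [ ]:
# Map the current observation through the controller's policy once and
# print the chosen action index.
obs = g.observe()
print("action for the initial observation:", current_controller.action(obs))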

In [ ]:
def run_sim(env, agent, steps=100, disable_training=False):
    """Simulate `env` for at most `steps` steps, letting `agent` act at every
    step (and train, unless disable_training is set). Returns the number of
    steps actually simulated."""
    last_observation = None
    last_action      = None
    steps_taken      = 0

    for _ in range(steps):
        if env.terminate:
            break
        new_observation = env.observe()
        reward          = env.collect_reward()
        # print(env.total_reward, " ", env.last_reward)

        # store the last transition in the replay memory
        if last_observation is not None:
            agent.store(last_observation, last_action, reward, new_observation)

        # act
        new_action = agent.action(new_observation)
        env.perform_action(new_action)

        # transition to the next state
        env.transition()

        # train
        if not disable_training:
            agent.training_step()

        # update the current state/action as the last state/action
        last_action = new_action
        last_observation = new_observation
        steps_taken += 1
    return steps_taken
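
Before any serious training, the disable_training flag gives a cheap baseline: simulate an episode with the current network weights but without gradient updates. A minimal sketch, assuming the same constructor arguments as above:


In [ ]:
# Baseline episode: no training updates, just act with the current weights
# and report the average reward per simulated step.
g_baseline = network_rm(inventory, demand_values, demand_types, demand_arrivals, inter_arrival_time)
steps_taken = run_sim(g_baseline, current_controller, steps=100, disable_training=True)
print("baseline average reward: {}".format(g_baseline.total_reward / max(steps_taken, 1)))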

One instance of the network RM problem


In [ ]:
g = network_rm(inventory, demand_values, demand_types, demand_arrivals, inter_arrival_time)

In [ ]:
g.print_()

In [ ]:
x = g.total_reward  # total reward accumulated so far; used below to compute this run's average

In [ ]:
T = 100
# %prun T = run_sim(g, current_controller, T)
T = run_sim(g, current_controller, T)
print("average reward: {}".format((g.total_reward - x)/T))

In [ ]:
g.state

Training


In [ ]:
S = 1000
rewards = np.zeros(S)
for i in range(S):
    g = network_rm(inventory, demand_values, demand_types, demand_arrivals, inter_arrival_time)
    T = run_sim(g, current_controller, 100)
    rewards[i] = g.total_reward
    if i%(S/20) == 0 and i >= (S/20):
        print("average reward: {}".format(rewards[(i-int(S/20)):i].mean()))

In [ ]: