Berater Environment v11

Changes from v10

  • configure a custom network that allows training to almost perfect results
  • score method for Baseline

Installation (required for Colab)


In [0]:
!pip install git+https://github.com/openai/baselines >/dev/null
!pip install gym >/dev/null

Environment


In [0]:
import numpy as np
import random

import gym
from gym.utils import seeding
from gym import spaces

def state_name_to_int(state):
    state_name_map = {
        'S': 0,
        'A': 1,
        'B': 2,
        'C': 3,
        'D': 4,
        'E': 5,
        'F': 6,
        'G': 7,
        'H': 8,
        'K': 9,
        'L': 10,
        'M': 11,
        'N': 12,
        'O': 13
    }
    return state_name_map[state]

def int_to_state_name(state_as_int):
    state_map = {
        0: 'S',
        1: 'A',
        2: 'B',
        3: 'C',
        4: 'D',
        5: 'E',
        6: 'F',
        7: 'G',
        8: 'H',
        9: 'K',
        10: 'L',
        11: 'M',
        12: 'N',
        13: 'O'
    }
    return state_map[state_as_int]
    
class BeraterEnv(gym.Env):
    """
    The Berater Problem

    Actions: 
    There are 4 discrete deterministic actions, each choosing one direction
    """
    metadata = {'render.modes': ['ansi']}
    
    showStep = False
    showDone = True
    envEpisodeModulo = 100

    def __init__(self):
#         self.map = {
#             'S': [('A', 100), ('B', 400), ('C', 200 )],
#             'A': [('B', 250), ('C', 400), ('S', 100 )],
#             'B': [('A', 250), ('C', 250), ('S', 400 )],
#             'C': [('A', 400), ('B', 250), ('S', 200 )]
#         }
        self.map = {
            'S': [('A', 300), ('B', 100), ('C', 200 )],
            'A': [('S', 300), ('B', 100), ('E', 100 ), ('D', 100 )],
            'B': [('S', 100), ('A', 100), ('C', 50 ), ('K', 200 )],
            'C': [('S', 200), ('B', 50), ('M', 100 ), ('L', 200 )],
            'D': [('A', 100), ('F', 50)],
            'E': [('A', 100), ('F', 100), ('H', 100)],
            'F': [('D', 50), ('E', 100), ('G', 200)],
            'G': [('F', 200), ('O', 300)],
            'H': [('E', 100), ('K', 300)],
            'K': [('B', 200), ('H', 300)],
            'L': [('C', 200), ('M', 50)],
            'M': [('C', 100), ('L', 50), ('N', 100)],
            'N': [('M', 100), ('O', 100)],
            'O': [('N', 100), ('G', 300)]
        }
        max_paths = 4
        self.action_space = spaces.Discrete(max_paths)
      
        positions = len(self.map)
        # observation: current position, net reward (reward - cost) of the 4 local paths, remaining reward of all locations
        # a non-existing path is encoded as -1000 and taking it does not change the position
        # look at what getObservation returns if you are confused
        low = np.append(np.append([0], np.full(max_paths, -1000)), np.full(positions, 0))
        high = np.append(np.append([positions - 1], np.full(max_paths, 1000)), np.full(positions, 1000))
        self.observation_space = spaces.Box(low=low,
                                             high=high,
                                             dtype=np.float32)
        self.reward_range = (-1, 1)

        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        self.envReward = 0
        self.envEpisodeCount = 0
        self.envStepCount = 0

        self.reset()
        self.optimum = self.calculate_customers_reward()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def iterate_path(self, state, action):
        paths = self.map[state]
        if action < len(paths):
          return paths[action]
        else:
          # sorry, no such action, stay where you are and pay a high penalty
          return (state, 1000)
      
    def step(self, action):
        destination, cost = self.iterate_path(self.state, action)
        lastState = self.state
        customerReward = self.customer_reward[destination]
        reward = (customerReward - cost) / self.optimum

        self.state = destination
        self.customer_visited(destination)
        done = destination == 'S' and self.all_customers_visited()

        stateAsInt = state_name_to_int(self.state)
        self.totalReward += reward
        self.stepCount += 1
        self.envReward += reward
        self.envStepCount += 1

        if self.showStep:
            print( "Episode: " + ("%4.0f  " % self.envEpisodeCount) + 
                   " Step: " + ("%4.0f  " % self.stepCount) + 
                   lastState + ' --' + str(action) + '-> ' + self.state + 
                   ' R=' + ("% 2.2f" % reward) + ' totalR=' + ("% 3.2f" % self.totalReward) + 
                   ' cost=' + ("%4.0f" % cost) + ' customerR=' + ("%4.0f" % customerReward) + ' optimum=' + ("%4.0f" % self.optimum)      
                   )

        if done and not self.isDone:
            self.envEpisodeCount += 1
            if BeraterEnv.showDone:
                episodes = BeraterEnv.envEpisodeModulo
                if (self.envEpisodeCount % BeraterEnv.envEpisodeModulo != 0):
                    episodes = self.envEpisodeCount % BeraterEnv.envEpisodeModulo
                print( "Done: " + 
                        ("episodes=%6.0f  " % self.envEpisodeCount) + 
                        ("avgSteps=%6.2f  " % (self.envStepCount/episodes)) + 
                        ("avgTotalReward=% 3.2f" % (self.envReward/episodes) )
                        )
                if (self.envEpisodeCount%BeraterEnv.envEpisodeModulo) == 0:
                    self.envReward = 0
                    self.envStepCount = 0

        self.isDone = done
        observation = self.getObservation(stateAsInt)
        info = {"from": lastState, "to": destination}

        return observation, reward, done, info

    def getObservation(self, position):
        result = np.array([ position, 
                               self.getPathObservation(position, 0),
                               self.getPathObservation(position, 1),
                               self.getPathObservation(position, 2),
                               self.getPathObservation(position, 3)
                              ],
                             dtype=np.float32)
        all_rest_rewards = list(self.customer_reward.values())
        result = np.append(result, all_rest_rewards)
        return result

    def getPathObservation(self, position, path):
        source = int_to_state_name(position)
        paths = self.map[source]
        if path < len(paths):
          target, cost = paths[path]
          reward = self.customer_reward[target] 
          result = reward - cost
        else:
          result = -1000

        return result

    def customer_visited(self, customer):
        self.customer_reward[customer] = 0

    def all_customers_visited(self):
        return self.calculate_customers_reward() == 0

    def calculate_customers_reward(self):
        total = 0
        for value in self.customer_reward.values():
            total += value
        return total

      
    def modulate_reward(self):
      number_of_customers = len(self.map) - 1
      number_per_consultant = int(number_of_customers/2)
#       number_per_consultant = int(number_of_customers/1.5)
      self.customer_reward = {
          'S': 0
      }
      for customer_nr in range(1, number_of_customers + 1):
        self.customer_reward[int_to_state_name(customer_nr)] = 0
      
      # every consultant only visits a few random customers
      samples = random.sample(range(1, number_of_customers + 1), k=number_per_consultant)
      key_list = list(self.customer_reward.keys())
      for sample in samples:
        self.customer_reward[key_list[sample]] = 1000

      
    def reset(self):
        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        self.modulate_reward()
        self.state = 'S'
        return self.getObservation(state_name_to_int(self.state))
      
    def render(self):
      print(self.customer_reward)

In [0]:
env = BeraterEnv()
print(env.reset())
print(env.customer_reward)


[    0.   700.   900.  -200. -1000.     0.  1000.  1000.     0.  1000.
  1000.  1000.     0.     0.     0.  1000.     0.     0.     0.]
{'S': 0, 'A': 1000, 'B': 1000, 'C': 0, 'D': 1000, 'E': 1000, 'F': 1000, 'G': 0, 'H': 0, 'K': 0, 'L': 1000, 'M': 0, 'N': 0, 'O': 0}
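
To make the raw observation easier to read: the first value is the position index, the next four are the net rewards of the local paths (customer reward minus cost, -1000 for non-existing paths), and the remaining 14 values are the still open customer rewards in the order S, A, ..., O. A small decoding helper, just as an illustration on top of the classes above:


In [0]:
def decode_observation(obs):
    # position name, net reward of the 4 local paths, remaining reward per location
    position = int_to_state_name(int(obs[0]))
    path_rewards = obs[1:5]
    rest_rewards = dict(zip('SABCDEFGHKLMNO', obs[5:]))
    return position, path_rewards, rest_rewards

print(decode_observation(env.reset()))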

Try out Environment


In [0]:
BeraterEnv.showStep = True
BeraterEnv.showDone = True

env = BeraterEnv()
print(env)
observation = env.reset()
print(observation)

for t in range(1000):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()
print(observation)


<BeraterEnv instance>
[    0.   700.   900.  -200. -1000.     0.  1000.  1000.     0.  1000.
     0.     0.     0.  1000.     0.  1000.     0.  1000.     0.]
Episode:    0   Step:    1  S --0-> A R= 0.12 totalR= 0.12 cost= 300 customerR=1000 optimum=6000
Episode:    0   Step:    2  A --3-> D R= 0.15 totalR= 0.27 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:    3  D --1-> F R=-0.01 totalR= 0.26 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    4  F --0-> D R=-0.01 totalR= 0.25 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    5  D --3-> D R=-0.17 totalR= 0.08 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    6  D --3-> D R=-0.17 totalR=-0.08 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    7  D --3-> D R=-0.17 totalR=-0.25 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    8  D --3-> D R=-0.17 totalR=-0.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    9  D --1-> F R=-0.01 totalR=-0.42 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   10  F --3-> F R=-0.17 totalR=-0.59 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   11  F --1-> E R=-0.02 totalR=-0.61 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   12  E --2-> H R= 0.15 totalR=-0.46 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:   13  H --0-> E R=-0.02 totalR=-0.48 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   14  E --3-> E R=-0.17 totalR=-0.64 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   15  E --2-> H R=-0.02 totalR=-0.66 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   16  H --0-> E R=-0.02 totalR=-0.68 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   17  E --0-> A R=-0.02 totalR=-0.69 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   18  A --0-> S R=-0.05 totalR=-0.74 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   19  S --2-> C R=-0.03 totalR=-0.78 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   20  C --1-> B R= 0.16 totalR=-0.62 cost=  50 customerR=1000 optimum=6000
Episode:    0   Step:   21  B --2-> C R=-0.01 totalR=-0.63 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   22  C --3-> L R= 0.13 totalR=-0.49 cost= 200 customerR=1000 optimum=6000
Episode:    0   Step:   23  L --3-> L R=-0.17 totalR=-0.66 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   24  L --2-> L R=-0.17 totalR=-0.83 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   25  L --0-> C R=-0.03 totalR=-0.86 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   26  C --1-> B R=-0.01 totalR=-0.87 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   27  B --1-> A R=-0.02 totalR=-0.88 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   28  A --1-> B R=-0.02 totalR=-0.90 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   29  B --1-> A R=-0.02 totalR=-0.92 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   30  A --0-> S R=-0.05 totalR=-0.97 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   31  S --1-> B R=-0.02 totalR=-0.98 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   32  B --0-> S R=-0.02 totalR=-1.00 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   33  S --3-> S R=-0.17 totalR=-1.17 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   34  S --0-> A R=-0.05 totalR=-1.22 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   35  A --3-> D R=-0.02 totalR=-1.23 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   36  D --1-> F R=-0.01 totalR=-1.24 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   37  F --2-> G R=-0.03 totalR=-1.28 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   38  G --3-> G R=-0.17 totalR=-1.44 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   39  G --3-> G R=-0.17 totalR=-1.61 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   40  G --0-> F R=-0.03 totalR=-1.64 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   41  F --2-> G R=-0.03 totalR=-1.68 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   42  G --3-> G R=-0.17 totalR=-1.84 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   43  G --0-> F R=-0.03 totalR=-1.88 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   44  F --1-> E R=-0.02 totalR=-1.89 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   45  E --3-> E R=-0.17 totalR=-2.06 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   46  E --1-> F R=-0.02 totalR=-2.08 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   47  F --3-> F R=-0.17 totalR=-2.24 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   48  F --3-> F R=-0.17 totalR=-2.41 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   49  F --2-> G R=-0.03 totalR=-2.44 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   50  G --3-> G R=-0.17 totalR=-2.61 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   51  G --0-> F R=-0.03 totalR=-2.64 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   52  F --1-> E R=-0.02 totalR=-2.66 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   53  E --1-> F R=-0.02 totalR=-2.68 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   54  F --1-> E R=-0.02 totalR=-2.69 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   55  E --3-> E R=-0.17 totalR=-2.86 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   56  E --0-> A R=-0.02 totalR=-2.88 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   57  A --3-> D R=-0.02 totalR=-2.89 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   58  D --2-> D R=-0.17 totalR=-3.06 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   59  D --0-> A R=-0.02 totalR=-3.07 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   60  A --3-> D R=-0.02 totalR=-3.09 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   61  D --3-> D R=-0.17 totalR=-3.26 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   62  D --2-> D R=-0.17 totalR=-3.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   63  D --3-> D R=-0.17 totalR=-3.59 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   64  D --2-> D R=-0.17 totalR=-3.76 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   65  D --3-> D R=-0.17 totalR=-3.92 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   66  D --0-> A R=-0.02 totalR=-3.94 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   67  A --2-> E R=-0.02 totalR=-3.96 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   68  E --0-> A R=-0.02 totalR=-3.97 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   69  A --0-> S R=-0.05 totalR=-4.02 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   70  S --0-> A R=-0.05 totalR=-4.07 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   71  A --1-> B R=-0.02 totalR=-4.09 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   72  B --1-> A R=-0.02 totalR=-4.11 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   73  A --2-> E R=-0.02 totalR=-4.12 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   74  E --0-> A R=-0.02 totalR=-4.14 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   75  A --0-> S R=-0.05 totalR=-4.19 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   76  S --1-> B R=-0.02 totalR=-4.21 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   77  B --3-> K R=-0.03 totalR=-4.24 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   78  K --0-> B R=-0.03 totalR=-4.27 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   79  B --1-> A R=-0.02 totalR=-4.29 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   80  A --2-> E R=-0.02 totalR=-4.31 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   81  E --2-> H R=-0.02 totalR=-4.32 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   82  H --3-> H R=-0.17 totalR=-4.49 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   83  H --0-> E R=-0.02 totalR=-4.51 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   84  E --1-> F R=-0.02 totalR=-4.52 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   85  F --1-> E R=-0.02 totalR=-4.54 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   86  E --3-> E R=-0.17 totalR=-4.71 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   87  E --1-> F R=-0.02 totalR=-4.72 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   88  F --1-> E R=-0.02 totalR=-4.74 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   89  E --3-> E R=-0.17 totalR=-4.91 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   90  E --2-> H R=-0.02 totalR=-4.92 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   91  H --3-> H R=-0.17 totalR=-5.09 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   92  H --3-> H R=-0.17 totalR=-5.26 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   93  H --2-> H R=-0.17 totalR=-5.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   94  H --2-> H R=-0.17 totalR=-5.59 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   95  H --3-> H R=-0.17 totalR=-5.76 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   96  H --0-> E R=-0.02 totalR=-5.77 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   97  E --2-> H R=-0.02 totalR=-5.79 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   98  H --3-> H R=-0.17 totalR=-5.96 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   99  H --1-> K R=-0.05 totalR=-6.01 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  100  K --0-> B R=-0.03 totalR=-6.04 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  101  B --1-> A R=-0.02 totalR=-6.06 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  102  A --2-> E R=-0.02 totalR=-6.07 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  103  E --0-> A R=-0.02 totalR=-6.09 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  104  A --3-> D R=-0.02 totalR=-6.11 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  105  D --0-> A R=-0.02 totalR=-6.12 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  106  A --2-> E R=-0.02 totalR=-6.14 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  107  E --0-> A R=-0.02 totalR=-6.16 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  108  A --3-> D R=-0.02 totalR=-6.17 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  109  D --3-> D R=-0.17 totalR=-6.34 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  110  D --0-> A R=-0.02 totalR=-6.36 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  111  A --3-> D R=-0.02 totalR=-6.37 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  112  D --0-> A R=-0.02 totalR=-6.39 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  113  A --0-> S R=-0.05 totalR=-6.44 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  114  S --0-> A R=-0.05 totalR=-6.49 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  115  A --0-> S R=-0.05 totalR=-6.54 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  116  S --2-> C R=-0.03 totalR=-6.57 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  117  C --3-> L R=-0.03 totalR=-6.61 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  118  L --0-> C R=-0.03 totalR=-6.64 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  119  C --3-> L R=-0.03 totalR=-6.67 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  120  L --2-> L R=-0.17 totalR=-6.84 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  121  L --3-> L R=-0.17 totalR=-7.01 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  122  L --3-> L R=-0.17 totalR=-7.17 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  123  L --1-> M R=-0.01 totalR=-7.18 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  124  M --1-> L R=-0.01 totalR=-7.19 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  125  L --1-> M R=-0.01 totalR=-7.20 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  126  M --0-> C R=-0.02 totalR=-7.22 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  127  C --1-> B R=-0.01 totalR=-7.23 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  128  B --1-> A R=-0.02 totalR=-7.24 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  129  A --1-> B R=-0.02 totalR=-7.26 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  130  B --3-> K R=-0.03 totalR=-7.29 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  131  K --0-> B R=-0.03 totalR=-7.33 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  132  B --3-> K R=-0.03 totalR=-7.36 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  133  K --1-> H R=-0.05 totalR=-7.41 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  134  H --2-> H R=-0.17 totalR=-7.58 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  135  H --0-> E R=-0.02 totalR=-7.59 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  136  E --1-> F R=-0.02 totalR=-7.61 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  137  F --2-> G R=-0.03 totalR=-7.64 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  138  G --0-> F R=-0.03 totalR=-7.67 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  139  F --2-> G R=-0.03 totalR=-7.71 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  140  G --0-> F R=-0.03 totalR=-7.74 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  141  F --1-> E R=-0.02 totalR=-7.76 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  142  E --3-> E R=-0.17 totalR=-7.92 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  143  E --2-> H R=-0.02 totalR=-7.94 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  144  H --2-> H R=-0.17 totalR=-8.11 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  145  H --1-> K R=-0.05 totalR=-8.16 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  146  K --0-> B R=-0.03 totalR=-8.19 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  147  B --3-> K R=-0.03 totalR=-8.22 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  148  K --1-> H R=-0.05 totalR=-8.28 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  149  H --1-> K R=-0.05 totalR=-8.33 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  150  K --3-> K R=-0.17 totalR=-8.49 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  151  K --0-> B R=-0.03 totalR=-8.53 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  152  B --2-> C R=-0.01 totalR=-8.53 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  153  C --2-> M R=-0.02 totalR=-8.55 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  154  M --3-> M R=-0.17 totalR=-8.72 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  155  M --2-> N R= 0.15 totalR=-8.57 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:  156  N --3-> N R=-0.17 totalR=-8.73 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  157  N --3-> N R=-0.17 totalR=-8.90 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  158  N --3-> N R=-0.17 totalR=-9.07 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  159  N --2-> N R=-0.17 totalR=-9.23 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  160  N --1-> O R=-0.02 totalR=-9.25 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  161  O --2-> O R=-0.17 totalR=-9.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  162  O --2-> O R=-0.17 totalR=-9.58 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  163  O --3-> O R=-0.17 totalR=-9.75 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  164  O --2-> O R=-0.17 totalR=-9.92 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  165  O --3-> O R=-0.17 totalR=-10.08 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  166  O --3-> O R=-0.17 totalR=-10.25 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  167  O --2-> O R=-0.17 totalR=-10.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  168  O --2-> O R=-0.17 totalR=-10.58 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  169  O --3-> O R=-0.17 totalR=-10.75 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  170  O --0-> N R=-0.02 totalR=-10.77 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  171  N --1-> O R=-0.02 totalR=-10.78 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  172  O --2-> O R=-0.17 totalR=-10.95 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  173  O --3-> O R=-0.17 totalR=-11.12 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  174  O --2-> O R=-0.17 totalR=-11.28 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  175  O --1-> G R=-0.05 totalR=-11.33 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  176  G --2-> G R=-0.17 totalR=-11.50 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  177  G --1-> O R=-0.05 totalR=-11.55 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  178  O --0-> N R=-0.02 totalR=-11.57 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  179  N --2-> N R=-0.17 totalR=-11.73 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  180  N --2-> N R=-0.17 totalR=-11.90 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  181  N --3-> N R=-0.17 totalR=-12.07 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  182  N --0-> M R=-0.02 totalR=-12.08 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  183  M --3-> M R=-0.17 totalR=-12.25 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  184  M --2-> N R=-0.02 totalR=-12.27 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  185  N --3-> N R=-0.17 totalR=-12.43 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  186  N --0-> M R=-0.02 totalR=-12.45 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  187  M --0-> C R=-0.02 totalR=-12.47 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  188  C --2-> M R=-0.02 totalR=-12.48 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  189  M --0-> C R=-0.02 totalR=-12.50 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  190  C --2-> M R=-0.02 totalR=-12.52 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  191  M --3-> M R=-0.17 totalR=-12.68 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  192  M --2-> N R=-0.02 totalR=-12.70 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  193  N --2-> N R=-0.17 totalR=-12.87 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  194  N --3-> N R=-0.17 totalR=-13.03 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  195  N --0-> M R=-0.02 totalR=-13.05 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  196  M --0-> C R=-0.02 totalR=-13.07 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  197  C --0-> S R=-0.03 totalR=-13.10 cost= 200 customerR=   0 optimum=6000
Done: episodes=     1  avgSteps=197.00  avgTotalReward=-13.10
Episode finished after 197 timesteps
[    0.  -300.  -100.  -200. -1000.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.]

Baseline


In [0]:
from copy import deepcopy
import json

class Baseline():

  def __init__(self, env, max_reward, verbose=1):
    self.env = env
    self.max_reward = max_reward
    self.verbose = verbose
    self.reset()

  def reset(self):
    self.map = self.env.map
    self.rewards = self.env.customer_reward.copy()
    
  def as_string(self, state):
    # reward/cost does not hurt, but is useless; path obscures otherwise identical states
    new_state = {
        'rewards': state['rewards'],
        'position': state['position']
    }
    return json.dumps(new_state, sort_keys=True)
  
  def is_goal(self, state):
    if state['position'] != 'S': return False
    for reward in state['rewards'].values():
      if reward != 0: return False
    return True
    

  def expand(self, state):
    states = []
    for position, cost in self.map[state['position']]:
      new_state = deepcopy(state)
      new_state['position'] = position
      new_state['rewards'][position] = 0
      reward = state['rewards'][position]
      new_state['reward'] += reward
      new_state['cost'] += cost
      new_state['path'].append(position)
      states.append(new_state)
    return states

  def search(self, root, max_depth = 25):
      closed = set()
      open = [root]

      while open:
          state = open.pop(0)
          if self.as_string(state) in closed: continue  

          closed.add(self.as_string(state))

          depth = len(state['path'])
          if depth > max_depth:
            if self.verbose > 0:
              print("Visited:", len(closed))
              print("Reached max depth, without reaching goal")
            return None

          if self.is_goal(state):
            scaled_reward = (state['reward'] - state['cost']) / self.max_reward
            state['scaled_reward'] = scaled_reward
            if self.verbose > 0:
              print("Scaled reward:", scaled_reward)            
              print("Perfect path", state['path'])
            return state

          expanded = self.expand(state)
          open += expanded
          # make this best first
          open.sort(key=lambda state: state['cost'])
        
  def find_optimum(self):
    initial_state = {
        'rewards': self.rewards.copy(),
        'position': 'S',
        'reward': 0,
        'cost': 0,
        'path': ['S']
    }
    return self.search(initial_state)
  
  def benchmark(self, model, sample_runs=100):
    self.verbose = 0
    BeraterEnv.showStep = False
    BeraterEnv.showDone = False

    perfect_rewards = []
    model_rewards = []
    for run in range(sample_runs):
      observation = self.env.reset()
      self.reset()
      
      optimum_state = self.find_optimum()
      perfect_rewards.append(optimum_state['scaled_reward'])
      
      state = np.zeros((1, 2*128))
      dones = np.zeros((1))

      for t in range(1000):
        actions, _, state, _ = model.step(observation, S=state, M=dones)
        observation, reward, done, info = self.env.step(actions[0])
        if done:
          break
      model_rewards.append(self.env.totalReward)
    return perfect_rewards, model_rewards
  
  def score(self, model, sample_runs=100):
    perfect_rewards, model_rewards = self.benchmark(model, sample_runs=sample_runs)
    
    perfect_score_mean, perfect_score_std = np.array(perfect_rewards).mean(), np.array(perfect_rewards).std()
    test_score_mean, test_score_std = np.array(model_rewards).mean(), np.array(model_rewards).std()
    
    return perfect_score_mean, perfect_score_std, test_score_mean, test_score_std
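
The search-based baseline can be tried directly on a fresh environment. Below is a small usage sketch on top of the classes defined above; it prints the optimal route for the current random scenario. The score call is only indicated as a comment because a trained model exists further down in the notebook.


In [0]:
env = BeraterEnv()
baseline = Baseline(env, max_reward=env.optimum)
optimum_state = baseline.find_optimum()   # best-first search, prints scaled reward and perfect path
# once a model has been trained (see below):
# perfect_mean, perfect_std, model_mean, model_std = baseline.score(model, sample_runs=100)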

Train model

Estimation

  • total cost when travelling all paths (back and forth): 2500
  • all rewards: 6000
  • but: rewards are much sparser while the routes stay the same, so expect a bit less
  • estimate: no illegal moves and somewhere between
    • half the travel cost: (6000 - 1250) / 6000 ≈ 0.79
    • and the full travel cost: (6000 - 2500) / 6000 ≈ 0.58 (a quick sanity check follows below)
  • additionally: the agent only sees very little of the whole scenario
    • which now changes with every episode
    • that was ok as long as the network could learn a fixed scenario
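
To check how plausible this band is, the optimum achievable reward of the search baseline can be averaged over a few random scenarios, giving an upper bound for the estimate. This is only a sketch on top of BeraterEnv and Baseline from above, not output from the original run:


In [0]:
env = BeraterEnv()
baseline = Baseline(env, max_reward=env.optimum, verbose=0)
optima = []
for _ in range(20):
    env.reset()         # draw a new random scenario
    baseline.reset()    # pick up the new customer rewards
    optima.append(baseline.find_optimum()['scaled_reward'])
print("mean=%.2f min=%.2f max=%.2f" % (np.mean(optima), np.min(optima), np.max(optima)))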

In [0]:
!rm -r logs
!mkdir logs
!mkdir logs/berater

In [0]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)


1.12.0

Step 1: Extract MLP builder from OpenAI sources


In [0]:
# copied from https://github.com/openai/baselines/blob/master/baselines/a2c/utils.py

def ortho_init(scale=1.0):
    def _ortho_init(shape, dtype, partition_info=None):
        #lasagne ortho init for tf
        shape = tuple(shape)
        if len(shape) == 2:
            flat_shape = shape
        elif len(shape) == 4: # assumes NHWC
            flat_shape = (np.prod(shape[:-1]), shape[-1])
        else:
            raise NotImplementedError
        a = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = u if u.shape == flat_shape else v # pick the one with the correct shape
        q = q.reshape(shape)
        return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
    return _ortho_init      

def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias))
        return tf.matmul(x, w)+b
      

# copied from https://github.com/openai/baselines/blob/master/baselines/common/models.py#L31
def mlp(num_layers=2, num_hidden=64, activation=tf.tanh, layer_norm=False):
    """
    Stack of fully-connected layers to be used in a policy / q-function approximator

    Parameters:
    ----------

    num_layers: int                 number of fully-connected layers (default: 2)

    num_hidden: int                 size of fully-connected layers (default: 64)

    activation:                     activation function (default: tf.tanh)

    Returns:
    -------

    function that builds fully connected network with a given input tensor / placeholder
    """
    def network_fn(X):
#         print('network_fn called')
#         Tensor("ppo2_model_4/Ob:0", shape=(1, 19), dtype=float32)
#         Tensor("ppo2_model_4/Ob_1:0", shape=(512, 19), dtype=float32)
#         print (X)
        h = tf.layers.flatten(X)
        for i in range(num_layers):
            h = fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2))
            if layer_norm:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)
          
#         Tensor("ppo2_model_4/pi/Tanh_2:0", shape=(1, 500), dtype=float32)
#         Tensor("ppo2_model_4/pi_2/Tanh_2:0", shape=(512, 500), dtype=float32)
#         print(h)
        return h

    return network_fn
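
The commented-out prints in network_fn hint at how the baselines framework uses it: the function is called with the observation placeholder and returns the last hidden layer, onto which the policy and value heads are attached. A minimal illustration of that call with the 19-value BeraterEnv observation (for demonstration only, the framework does this internally):


In [0]:
X = tf.placeholder(tf.float32, shape=(None, 19))   # batch of observations, 19 values each
network_fn = mlp(num_layers=3, num_hidden=500, layer_norm=True)
h = network_fn(X)
print(h)   # Tensor of shape (?, 500), fed into the pi/vf heads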

Step 2: Replace exotic parts

Steps:

  1. Low-level matmul replaced with a dense layer (no need for custom code here)

  2. Initializer changed to the best-practice Glorot uniform; since it does not give reliable results across runs, we fix the seed

  3. Use ReLU activations (should train faster)

  4. Standard batch normalization does not train with any configuration (no idea why), so we keep layer normalization

  5. Dropout and L2 would be nice as well, but not easy to do within the boundaries of the OpenAI framework: https://stackoverflow.com/questions/38292760/tensorflow-introducing-both-l2-regularization-and-dropout-into-the-network-do

Alternative: Using Keras API

Not done here, as no big benefit is expected and it would need to be integrated into the surrounding low-level TensorFlow model (the session needs to be reused). If you want to do this, be sure to check at least the first link.
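
For reference, a rough sketch of what the same network could look like when expressed with the Keras API. This is an assumption for illustration only; it is not integrated with the baselines session handling, and the layer normalization used above is left out because it would again require custom handling:


In [0]:
def keras_mlp(input_dim=19, num_layers=3, num_hidden=500):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.InputLayer(input_shape=(input_dim,)))
    for _ in range(num_layers):
        # layer normalization omitted, see note above
        model.add(tf.keras.layers.Dense(
            num_hidden,
            activation='relu',
            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=17)))
    return model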


In [0]:
# first the dense layer
def mlp(num_layers=2, num_hidden=64, activation=tf.tanh, layer_norm=False):
    def network_fn(X):
        h = tf.layers.flatten(X)
        for i in range(num_layers):
            h = tf.layers.dense(h, units=num_hidden, kernel_initializer=ortho_init(np.sqrt(2)))
#             h = fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2))
            if layer_norm:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)
        return h

    return network_fn

In [0]:
# then initializer, relu activations
def mlp(num_layers=2, num_hidden=64, activation=tf.nn.relu, layer_norm=False):
    def network_fn(X):
        h = tf.layers.flatten(X)
        for i in range(num_layers):
            h = tf.layers.dense(h, units=num_hidden, kernel_initializer=tf.initializers.glorot_uniform(seed=17))
            if layer_norm:
#               h = tf.layers.batch_normalization(h, center=True, scale=True)
              h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)
        return h

    return network_fn

In [0]:
%%time

# https://github.com/openai/baselines/blob/master/baselines/deepq/experiments/train_pong.py
# log_dir = logger.get_dir()
log_dir = '/content/logs/berater/'

import gym
from baselines import bench
from baselines import logger

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor
from baselines.ppo2 import ppo2

BeraterEnv.showStep = False
BeraterEnv.showDone = False

env = BeraterEnv()

wrapped_env = DummyVecEnv([lambda: BeraterEnv()])
monitored_env = VecMonitor(wrapped_env, log_dir)

# https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py
# https://github.com/openai/baselines/blob/master/baselines/common/models.py#L30
# https://arxiv.org/abs/1607.06450 for layer_norm

# lr linear from lr=1e-2 to lr=1e-4 (default lr=3e-4)
def lr_range(frac):
  # frac is the fraction of remaining updates, going from 1 down to 0
  start_lr = 1e-2
  end_lr = 1e-4
  diff_lr = start_lr - end_lr
  lr = end_lr + diff_lr * frac
  return lr
  
network = mlp(num_hidden=500, num_layers=3, layer_norm=True)
  
model = ppo2.learn(
    env=monitored_env,
    network=network,
    lr=lr_range,
    gamma=1.0,
    ent_coef=0.05,
    total_timesteps=1000000)

# model = ppo2.learn(
#     env=monitored_env,
#     network='mlp',
#     num_hidden=500,
#     num_layers=3,
#     layer_norm=True,
#     lr=lr_range,
#     gamma=1.0,
#     ent_coef=0.05,
#     total_timesteps=500000)


# model.save('berater-ppo-v11.pkl')
monitored_env.close()


Logging to /tmp/openai-2019-01-27-15-27-47-240107
-----------------------------------
| approxkl           | 1.4420375  |
| clipfrac           | 0.8397217  |
| eplenmean          | 104        |
| eprewmean          | -6.1689844 |
| explained_variance | -0.248     |
| fps                | 386        |
| nupdates           | 1          |
| policy_entropy     | 0.8404105  |
| policy_loss        | 0.19051582 |
| serial_timesteps   | 2048       |
| time_elapsed       | 5.3        |
| total_timesteps    | 2048       |
| value_loss         | 5.3930745  |
-----------------------------------
---------------------------------------
| approxkl           | 0.026781807    |
| clipfrac           | 0.1953125      |
| eplenmean          | 338            |
| eprewmean          | -9.549868      |
| explained_variance | 0.327          |
| fps                | 409            |
| nupdates           | 10             |
| policy_entropy     | 0.93414086     |
| policy_loss        | -0.00030102135 |
| serial_timesteps   | 20480          |
| time_elapsed       | 50.4           |
| total_timesteps    | 20480          |
| value_loss         | 0.088846214    |
---------------------------------------
-------------------------------------
| approxkl           | 0.016508345  |
| clipfrac           | 0.20739746   |
| eplenmean          | 33.8         |
| eprewmean          | 0.14341663   |
| explained_variance | 0.0657       |
| fps                | 403          |
| nupdates           | 20           |
| policy_entropy     | 0.90960133   |
| policy_loss        | -0.014445528 |
| serial_timesteps   | 40960        |
| time_elapsed       | 101          |
| total_timesteps    | 40960        |
| value_loss         | 0.053486325  |
-------------------------------------
-------------------------------------
| approxkl           | 0.023043975  |
| clipfrac           | 0.20336914   |
| eplenmean          | 23.3         |
| eprewmean          | 0.54283327   |
| explained_variance | 0.701        |
| fps                | 403          |
| nupdates           | 30           |
| policy_entropy     | 0.5709828    |
| policy_loss        | -0.008646036 |
| serial_timesteps   | 61440        |
| time_elapsed       | 151          |
| total_timesteps    | 61440        |
| value_loss         | 0.0075427843 |
-------------------------------------
--------------------------------------
| approxkl           | 0.036137722   |
| clipfrac           | 0.15966797    |
| eplenmean          | 18            |
| eprewmean          | 0.6485833     |
| explained_variance | 0.905         |
| fps                | 420           |
| nupdates           | 40            |
| policy_entropy     | 0.34395832    |
| policy_loss        | -0.0138049545 |
| serial_timesteps   | 81920         |
| time_elapsed       | 202           |
| total_timesteps    | 81920         |
| value_loss         | 0.0025978752  |
--------------------------------------
-------------------------------------
| approxkl           | 0.016258882  |
| clipfrac           | 0.08215332   |
| eplenmean          | 17.1         |
| eprewmean          | 0.6855001    |
| explained_variance | 0.892        |
| fps                | 409          |
| nupdates           | 50           |
| policy_entropy     | 0.22930014   |
| policy_loss        | -0.009889038 |
| serial_timesteps   | 102400       |
| time_elapsed       | 252          |
| total_timesteps    | 102400       |
| value_loss         | 0.0029221126 |
-------------------------------------
-------------------------------------
| approxkl           | 0.040627833  |
| clipfrac           | 0.0793457    |
| eplenmean          | 16.2         |
| eprewmean          | 0.69150007   |
| explained_variance | 0.937        |
| fps                | 409          |
| nupdates           | 60           |
| policy_entropy     | 0.14939314   |
| policy_loss        | -0.008367878 |
| serial_timesteps   | 122880       |
| time_elapsed       | 301          |
| total_timesteps    | 122880       |
| value_loss         | 0.0018437295 |
-------------------------------------
-------------------------------------
| approxkl           | 0.047517613  |
| clipfrac           | 0.23278809   |
| eplenmean          | 19.5         |
| eprewmean          | 0.51625      |
| explained_variance | 0.62         |
| fps                | 413          |
| nupdates           | 70           |
| policy_entropy     | 0.42431322   |
| policy_loss        | -0.023480183 |
| serial_timesteps   | 143360       |
| time_elapsed       | 351          |
| total_timesteps    | 143360       |
| value_loss         | 0.008940705  |
-------------------------------------
------------------------------------
| approxkl           | 0.022558138 |
| clipfrac           | 0.07788086  |
| eplenmean          | 16.4        |
| eprewmean          | 0.69066656  |
| explained_variance | 0.788       |
| fps                | 410         |
| nupdates           | 80          |
| policy_entropy     | 0.24044533  |
| policy_loss        | -0.01237258 |
| serial_timesteps   | 163840      |
| time_elapsed       | 400         |
| total_timesteps    | 163840      |
| value_loss         | 0.006068559 |
------------------------------------
-------------------------------------
| approxkl           | 0.008366971  |
| clipfrac           | 0.03491211   |
| eplenmean          | 15.1         |
| eprewmean          | 0.7080833    |
| explained_variance | 0.931        |
| fps                | 411          |
| nupdates           | 90           |
| policy_entropy     | 0.09522042   |
| policy_loss        | -0.004614097 |
| serial_timesteps   | 184320       |
| time_elapsed       | 450          |
| total_timesteps    | 184320       |
| value_loss         | 0.0021189214 |
-------------------------------------
-------------------------------------
| approxkl           | 0.10888699   |
| clipfrac           | 0.075805664  |
| eplenmean          | 15.4         |
| eprewmean          | 0.7098333    |
| explained_variance | 0.932        |
| fps                | 405          |
| nupdates           | 100          |
| policy_entropy     | 0.14791274   |
| policy_loss        | 0.004112877  |
| serial_timesteps   | 204800       |
| time_elapsed       | 501          |
| total_timesteps    | 204800       |
| value_loss         | 0.0018511078 |
-------------------------------------
-------------------------------------
| approxkl           | 0.00982387   |
| clipfrac           | 0.06713867   |
| eplenmean          | 16.2         |
| eprewmean          | 0.6831667    |
| explained_variance | 0.896        |
| fps                | 406          |
| nupdates           | 110          |
| policy_entropy     | 0.24413782   |
| policy_loss        | -0.007331957 |
| serial_timesteps   | 225280       |
| time_elapsed       | 552          |
| total_timesteps    | 225280       |
| value_loss         | 0.00279108   |
-------------------------------------
--------------------------------------
| approxkl           | 0.014135233   |
| clipfrac           | 0.056152344   |
| eplenmean          | 16.2          |
| eprewmean          | 0.69716674    |
| explained_variance | 0.932         |
| fps                | 403           |
| nupdates           | 120           |
| policy_entropy     | 0.17605259    |
| policy_loss        | -0.0014135699 |
| serial_timesteps   | 245760        |
| time_elapsed       | 602           |
| total_timesteps    | 245760        |
| value_loss         | 0.002320418   |
--------------------------------------
-------------------------------------
| approxkl           | 0.020926617  |
| clipfrac           | 0.06652832   |
| eplenmean          | 16.2         |
| eprewmean          | 0.6959168    |
| explained_variance | 0.777        |
| fps                | 426          |
| nupdates           | 130          |
| policy_entropy     | 0.18375051   |
| policy_loss        | 0.0067167035 |
| serial_timesteps   | 266240       |
| time_elapsed       | 652          |
| total_timesteps    | 266240       |
| value_loss         | 0.0066590556 |
-------------------------------------
--------------------------------------
| approxkl           | 0.010054722   |
| clipfrac           | 0.03149414    |
| eplenmean          | 15.6          |
| eprewmean          | 0.709         |
| explained_variance | 0.939         |
| fps                | 427           |
| nupdates           | 140           |
| policy_entropy     | 0.103629835   |
| policy_loss        | -0.0017408483 |
| serial_timesteps   | 286720        |
| time_elapsed       | 700           |
| total_timesteps    | 286720        |
| value_loss         | 0.0016504797  |
--------------------------------------
------------------------------------
| approxkl           | 0.4747235   |
| clipfrac           | 0.1262207   |
| eplenmean          | 19.8        |
| eprewmean          | 0.50933325  |
| explained_variance | 0.302       |
| fps                | 402         |
| nupdates           | 150         |
| policy_entropy     | 0.15966588  |
| policy_loss        | 0.000542385 |
| serial_timesteps   | 307200      |
| time_elapsed       | 751         |
| total_timesteps    | 307200      |
| value_loss         | 0.055644292 |
------------------------------------
--------------------------------------
| approxkl           | 0.0585206     |
| clipfrac           | 0.04699707    |
| eplenmean          | 14.9          |
| eprewmean          | 0.7075        |
| explained_variance | 0.948         |
| fps                | 401           |
| nupdates           | 160           |
| policy_entropy     | 0.099518925   |
| policy_loss        | -0.0028842394 |
| serial_timesteps   | 327680        |
| time_elapsed       | 802           |
| total_timesteps    | 327680        |
| value_loss         | 0.0017029579  |
--------------------------------------
--------------------------------------
| approxkl           | 0.016357124   |
| clipfrac           | 0.032958984   |
| eplenmean          | 15.1          |
| eprewmean          | 0.7125        |
| explained_variance | 0.958         |
| fps                | 406           |
| nupdates           | 170           |
| policy_entropy     | 0.054430917   |
| policy_loss        | -0.0020781658 |
| serial_timesteps   | 348160        |
| time_elapsed       | 853           |
| total_timesteps    | 348160        |
| value_loss         | 0.0014069453  |
--------------------------------------
-------------------------------------
| approxkl           | 0.13181257   |
| clipfrac           | 0.041870117  |
| eplenmean          | 15.4         |
| eprewmean          | 0.6958334    |
| explained_variance | 0.858        |
| fps                | 440          |
| nupdates           | 180          |
| policy_entropy     | 0.066288464  |
| policy_loss        | -0.011668324 |
| serial_timesteps   | 368640       |
| time_elapsed       | 899          |
| total_timesteps    | 368640       |
| value_loss         | 0.0045863683 |
-------------------------------------
--------------------------------------
| approxkl           | 0.007379102   |
| clipfrac           | 0.027832031   |
| eplenmean          | 14.3          |
| eprewmean          | 0.72275       |
| explained_variance | 0.971         |
| fps                | 445           |
| nupdates           | 190           |
| policy_entropy     | 0.096365094   |
| policy_loss        | -0.003078008  |
| serial_timesteps   | 389120        |
| time_elapsed       | 945           |
| total_timesteps    | 389120        |
| value_loss         | 0.00093441986 |
--------------------------------------
-------------------------------------
| approxkl           | 0.011448664  |
| clipfrac           | 0.029174805  |
| eplenmean          | 15.3         |
| eprewmean          | 0.7102501    |
| explained_variance | 0.946        |
| fps                | 449          |
| nupdates           | 200          |
| policy_entropy     | 0.0614279    |
| policy_loss        | -0.008784407 |
| serial_timesteps   | 409600       |
| time_elapsed       | 991          |
| total_timesteps    | 409600       |
| value_loss         | 0.0018629392 |
-------------------------------------
--------------------------------------
| approxkl           | 0.01342484    |
| clipfrac           | 0.04626465    |
| eplenmean          | 15.2          |
| eprewmean          | 0.7158335     |
| explained_variance | 0.974         |
| fps                | 449           |
| nupdates           | 210           |
| policy_entropy     | 0.09103564    |
| policy_loss        | -0.0051630316 |
| serial_timesteps   | 430080        |
| time_elapsed       | 1.04e+03      |
| total_timesteps    | 430080        |
| value_loss         | 0.0008239947  |
--------------------------------------
-------------------------------------
| approxkl           | 0.007953094  |
| clipfrac           | 0.028930664  |
| eplenmean          | 15.2         |
| eprewmean          | 0.7163334    |
| explained_variance | 0.969        |
| fps                | 454          |
| nupdates           | 220          |
| policy_entropy     | 0.06973121   |
| policy_loss        | -0.006585888 |
| serial_timesteps   | 450560       |
| time_elapsed       | 1.08e+03     |
| total_timesteps    | 450560       |
| value_loss         | 0.0009260485 |
-------------------------------------
--------------------------------------
| approxkl           | 0.03702754    |
| clipfrac           | 0.05444336    |
| eplenmean          | 15.5          |
| eprewmean          | 0.70675004    |
| explained_variance | 0.951         |
| fps                | 455           |
| nupdates           | 230           |
| policy_entropy     | 0.09789952    |
| policy_loss        | -0.0019488911 |
| serial_timesteps   | 471040        |
| time_elapsed       | 1.13e+03      |
| total_timesteps    | 471040        |
| value_loss         | 0.0014300046  |
--------------------------------------
--------------------------------------
| approxkl           | 0.051745325   |
| clipfrac           | 0.03125       |
| eplenmean          | 15.1          |
| eprewmean          | 0.7195833     |
| explained_variance | 0.975         |
| fps                | 449           |
| nupdates           | 240           |
| policy_entropy     | 0.065127894   |
| policy_loss        | -0.00515756   |
| serial_timesteps   | 491520        |
| time_elapsed       | 1.17e+03      |
| total_timesteps    | 491520        |
| value_loss         | 0.00075985683 |
--------------------------------------
-------------------------------------
| approxkl           | 0.007276212  |
| clipfrac           | 0.044311523  |
| eplenmean          | 15.8         |
| eprewmean          | 0.7081669    |
| explained_variance | 0.956        |
| fps                | 445          |
| nupdates           | 250          |
| policy_entropy     | 0.12242043   |
| policy_loss        | -0.004001532 |
| serial_timesteps   | 512000       |
| time_elapsed       | 1.22e+03     |
| total_timesteps    | 512000       |
| value_loss         | 0.0012700066 |
-------------------------------------
--------------------------------------
| approxkl           | 0.007015673   |
| clipfrac           | 0.024291992   |
| eplenmean          | 14.4          |
| eprewmean          | 0.72583336    |
| explained_variance | 0.983         |
| fps                | 424           |
| nupdates           | 260           |
| policy_entropy     | 0.057486854   |
| policy_loss        | -0.0027593905 |
| serial_timesteps   | 532480        |
| time_elapsed       | 1.27e+03      |
| total_timesteps    | 532480        |
| value_loss         | 0.0005704223  |
--------------------------------------
-------------------------------------
| approxkl           | 0.006855382  |
| clipfrac           | 0.028564453  |
| eplenmean          | 14.6         |
| eprewmean          | 0.72208345   |
| explained_variance | 0.977        |
| fps                | 435          |
| nupdates           | 270          |
| policy_entropy     | 0.06387735   |
| policy_loss        | -0.005685599 |
| serial_timesteps   | 552960       |
| time_elapsed       | 1.32e+03     |
| total_timesteps    | 552960       |
| value_loss         | 0.0007501704 |
-------------------------------------
-------------------------------------
| approxkl           | 0.002857305  |
| clipfrac           | 0.015625     |
| eplenmean          | 15           |
| eprewmean          | 0.7223334    |
| explained_variance | 0.983        |
| fps                | 429          |
| nupdates           | 280          |
| policy_entropy     | 0.05390006   |
| policy_loss        | -0.002011288 |
| serial_timesteps   | 573440       |
| time_elapsed       | 1.36e+03     |
| total_timesteps    | 573440       |
| value_loss         | 0.0005740882 |
-------------------------------------
--------------------------------------
| approxkl           | 0.12518914    |
| clipfrac           | 0.041137695   |
| eplenmean          | 16            |
| eprewmean          | 0.7076667     |
| explained_variance | 0.908         |
| fps                | 428           |
| nupdates           | 290           |
| policy_entropy     | 0.053669304   |
| policy_loss        | 0.00048783532 |
| serial_timesteps   | 593920        |
| time_elapsed       | 1.41e+03      |
| total_timesteps    | 593920        |
| value_loss         | 0.0021670922  |
--------------------------------------
-------------------------------------
| approxkl           | 0.009636785  |
| clipfrac           | 0.064697266  |
| eplenmean          | 16.4         |
| eprewmean          | 0.66958344   |
| explained_variance | 0.905        |
| fps                | 432          |
| nupdates           | 300          |
| policy_entropy     | 0.23065643   |
| policy_loss        | -0.010738534 |
| serial_timesteps   | 614400       |
| time_elapsed       | 1.46e+03     |
| total_timesteps    | 614400       |
| value_loss         | 0.0027685342 |
-------------------------------------
--------------------------------------
| approxkl           | 0.0031783069  |
| clipfrac           | 0.027709961   |
| eplenmean          | 15.2          |
| eprewmean          | 0.71508336    |
| explained_variance | 0.965         |
| fps                | 417           |
| nupdates           | 310           |
| policy_entropy     | 0.11050528    |
| policy_loss        | -0.0055362736 |
| serial_timesteps   | 634880        |
| time_elapsed       | 1.51e+03      |
| total_timesteps    | 634880        |
| value_loss         | 0.0011054395  |
--------------------------------------
--------------------------------------
| approxkl           | 0.0029553147  |
| clipfrac           | 0.026977539   |
| eplenmean          | 15.6          |
| eprewmean          | 0.7099167     |
| explained_variance | 0.962         |
| fps                | 431           |
| nupdates           | 320           |
| policy_entropy     | 0.13246873    |
| policy_loss        | -0.0050656684 |
| serial_timesteps   | 655360        |
| time_elapsed       | 1.55e+03      |
| total_timesteps    | 655360        |
| value_loss         | 0.0012099134  |
--------------------------------------
--------------------------------------
| approxkl           | 0.0068599028  |
| clipfrac           | 0.033691406   |
| eplenmean          | 15.1          |
| eprewmean          | 0.71875       |
| explained_variance | 0.947         |
| fps                | 433           |
| nupdates           | 330           |
| policy_entropy     | 0.071830705   |
| policy_loss        | -0.0007105068 |
| serial_timesteps   | 675840        |
| time_elapsed       | 1.6e+03       |
| total_timesteps    | 675840        |
| value_loss         | 0.0016185608  |
--------------------------------------
--------------------------------------
| approxkl           | 0.0043550213  |
| clipfrac           | 0.024291992   |
| eplenmean          | 14.9          |
| eprewmean          | 0.72900003    |
| explained_variance | 0.982         |
| fps                | 427           |
| nupdates           | 340           |
| policy_entropy     | 0.07719641    |
| policy_loss        | -0.0024399122 |
| serial_timesteps   | 696320        |
| time_elapsed       | 1.65e+03      |
| total_timesteps    | 696320        |
| value_loss         | 0.0005805964  |
--------------------------------------
-------------------------------------
| approxkl           | 0.13895583   |
| clipfrac           | 0.09851074   |
| eplenmean          | 16.9         |
| eprewmean          | 0.7007501    |
| explained_variance | 0.936        |
| fps                | 429          |
| nupdates           | 350          |
| policy_entropy     | 0.10242043   |
| policy_loss        | -0.026852302 |
| serial_timesteps   | 716800       |
| time_elapsed       | 1.7e+03      |
| total_timesteps    | 716800       |
| value_loss         | 0.0016744145 |
-------------------------------------
--------------------------------------
| approxkl           | 0.002885365   |
| clipfrac           | 0.011230469   |
| eplenmean          | 15            |
| eprewmean          | 0.7263334     |
| explained_variance | 0.983         |
| fps                | 434           |
| nupdates           | 360           |
| policy_entropy     | 0.06623655    |
| policy_loss        | -0.0028897545 |
| serial_timesteps   | 737280        |
| time_elapsed       | 1.75e+03      |
| total_timesteps    | 737280        |
| value_loss         | 0.0006023402  |
--------------------------------------
--------------------------------------
| approxkl           | 0.0031549255  |
| clipfrac           | 0.020507812   |
| eplenmean          | 14.9          |
| eprewmean          | 0.7280834     |
| explained_variance | 0.983         |
| fps                | 427           |
| nupdates           | 370           |
| policy_entropy     | 0.066166855   |
| policy_loss        | -0.0030142223 |
| serial_timesteps   | 757760        |
| time_elapsed       | 1.79e+03      |
| total_timesteps    | 757760        |
| value_loss         | 0.0005591386  |
--------------------------------------
--------------------------------------
| approxkl           | 0.0041663903  |
| clipfrac           | 0.033081055   |
| eplenmean          | 14.5          |
| eprewmean          | 0.73100007    |
| explained_variance | 0.979         |
| fps                | 447           |
| nupdates           | 380           |
| policy_entropy     | 0.08447483    |
| policy_loss        | -0.0053323675 |
| serial_timesteps   | 778240        |
| time_elapsed       | 1.84e+03      |
| total_timesteps    | 778240        |
| value_loss         | 0.00063511264 |
--------------------------------------
--------------------------------------
| approxkl           | 0.0029821154  |
| clipfrac           | 0.014282227   |
| eplenmean          | 15            |
| eprewmean          | 0.7252502     |
| explained_variance | 0.978         |
| fps                | 445           |
| nupdates           | 390           |
| policy_entropy     | 0.06973519    |
| policy_loss        | -0.0038597833 |
| serial_timesteps   | 798720        |
| time_elapsed       | 1.89e+03      |
| total_timesteps    | 798720        |
| value_loss         | 0.00070443004 |
--------------------------------------
-------------------------------------
| approxkl           | 0.0025000875 |
| clipfrac           | 0.017211914  |
| eplenmean          | 15.2         |
| eprewmean          | 0.7215833    |
| explained_variance | 0.977        |
| fps                | 442          |
| nupdates           | 400          |
| policy_entropy     | 0.062204212  |
| policy_loss        | -0.004480864 |
| serial_timesteps   | 819200       |
| time_elapsed       | 1.93e+03     |
| total_timesteps    | 819200       |
| value_loss         | 0.0007088655 |
-------------------------------------
--------------------------------------
| approxkl           | 0.006064726   |
| clipfrac           | 0.018310547   |
| eplenmean          | 15.5          |
| eprewmean          | 0.72141653    |
| explained_variance | 0.981         |
| fps                | 440           |
| nupdates           | 410           |
| policy_entropy     | 0.053719625   |
| policy_loss        | -0.0041853436 |
| serial_timesteps   | 839680        |
| time_elapsed       | 1.98e+03      |
| total_timesteps    | 839680        |
| value_loss         | 0.0005962068  |
--------------------------------------
--------------------------------------
| approxkl           | 0.0018461812  |
| clipfrac           | 0.010620117   |
| eplenmean          | 14.8          |
| eprewmean          | 0.72924995    |
| explained_variance | 0.985         |
| fps                | 430           |
| nupdates           | 420           |
| policy_entropy     | 0.03774019    |
| policy_loss        | -0.002248707  |
| serial_timesteps   | 860160        |
| time_elapsed       | 2.03e+03      |
| total_timesteps    | 860160        |
| value_loss         | 0.00048187908 |
--------------------------------------
------------------------------------
| approxkl           | 0.05437982  |
| clipfrac           | 0.036621094 |
| eplenmean          | 14.5        |
| eprewmean          | 0.7378333   |
| explained_variance | 0.902       |
| fps                | 430         |
| nupdates           | 430         |
| policy_entropy     | 0.054468013 |
| policy_loss        | -0.01444613 |
| serial_timesteps   | 880640      |
| time_elapsed       | 2.08e+03    |
| total_timesteps    | 880640      |
| value_loss         | 0.002808094 |
------------------------------------
---------------------------------------
| approxkl           | 0.001031183    |
| clipfrac           | 0.010131836    |
| eplenmean          | 15.1           |
| eprewmean          | 0.7238334      |
| explained_variance | 0.932          |
| fps                | 436            |
| nupdates           | 440            |
| policy_entropy     | 0.042608798    |
| policy_loss        | -0.00094039476 |
| serial_timesteps   | 901120         |
| time_elapsed       | 2.12e+03       |
| total_timesteps    | 901120         |
| value_loss         | 0.0014700713   |
---------------------------------------
-------------------------------------
| approxkl           | 0.0018764732 |
| clipfrac           | 0.011962891  |
| eplenmean          | 14.3         |
| eprewmean          | 0.7239167    |
| explained_variance | 0.972        |
| fps                | 400          |
| nupdates           | 450          |
| policy_entropy     | 0.044268757  |
| policy_loss        | -0.003393606 |
| serial_timesteps   | 921600       |
| time_elapsed       | 2.17e+03     |
| total_timesteps    | 921600       |
| value_loss         | 0.0009643771 |
-------------------------------------
--------------------------------------
| approxkl           | 0.0017598544  |
| clipfrac           | 0.012451172   |
| eplenmean          | 14.3          |
| eprewmean          | 0.7313333     |
| explained_variance | 0.987         |
| fps                | 435           |
| nupdates           | 460           |
| policy_entropy     | 0.042088002   |
| policy_loss        | -0.0031104754 |
| serial_timesteps   | 942080        |
| time_elapsed       | 2.22e+03      |
| total_timesteps    | 942080        |
| value_loss         | 0.00043203327 |
--------------------------------------
--------------------------------------
| approxkl           | 0.0005749549  |
| clipfrac           | 0.0078125     |
| eplenmean          | 14.4          |
| eprewmean          | 0.7337499     |
| explained_variance | 0.986         |
| fps                | 427           |
| nupdates           | 470           |
| policy_entropy     | 0.04269629    |
| policy_loss        | -0.002129221  |
| serial_timesteps   | 962560        |
| time_elapsed       | 2.27e+03      |
| total_timesteps    | 962560        |
| value_loss         | 0.00048825034 |
--------------------------------------
--------------------------------------
| approxkl           | 0.00039610395 |
| clipfrac           | 0.0034179688  |
| eplenmean          | 14.5          |
| eprewmean          | 0.73708344    |
| explained_variance | 0.984         |
| fps                | 401           |
| nupdates           | 480           |
| policy_entropy     | 0.040699374   |
| policy_loss        | -0.001574069  |
| serial_timesteps   | 983040        |
| time_elapsed       | 2.31e+03      |
| total_timesteps    | 983040        |
| value_loss         | 0.00053364615 |
--------------------------------------
CPU times: user 47min 46s, sys: 8min 56s, total: 56min 43s
Wall time: 39min 23s

In [0]:
# !ls -l $log_dir

In [0]:
from baselines.common import plot_util as pu
import matplotlib.pyplot as plt
import numpy as np

# load the monitor files that were written to log_dir during training
results = pu.load_results(log_dir)
r = results[0]

plt.ylim(0, .75)
# x-axis: cumulative environment steps, y-axis: per-episode reward, smoothed over 100 episodes
# plt.plot(np.cumsum(r.monitor.l), r.monitor.r)
plt.plot(np.cumsum(r.monitor.l), pu.smooth(r.monitor.r, radius=100))


/usr/local/lib/python3.6/dist-packages/baselines/bench/monitor.py:164: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  df.headers = headers # HACK to preserve backwards compatibility
Out[0]:
[<matplotlib.lines.Line2D at 0x7f16b5403240>]
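Optionally, the raw per-episode rewards can be overlaid with the smoothed curve to see how much the smoothing with radius=100 flattens the noise. A small sketch, reusing the results loaded above:

# overlay raw and smoothed episode rewards (optional sketch)
steps = np.cumsum(r.monitor.l)                       # cumulative environment steps per episode
plt.plot(steps, r.monitor.r, alpha=0.3, label='raw episode reward')
plt.plot(steps, pu.smooth(r.monitor.r, radius=100), label='smoothed (radius=100)')
plt.legend()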

Enjoy model


In [0]:
import numpy as np

# start a fresh episode and show the remaining customer rewards per location
observation = env.reset()
env.render()

# reference baseline; 6000 is the total customer reward available in this map
baseline = Baseline(env, max_reward=6000)


{'S': 0, 'A': 0, 'B': 0, 'C': 0, 'D': 1000, 'E': 1000, 'F': 0, 'G': 1000, 'H': 1000, 'K': 0, 'L': 1000, 'M': 0, 'N': 0, 'O': 1000}

In [0]:
# initial recurrent state for the LSTM policy (hidden + cell state, 128 units each)
state = np.zeros((1, 2*128))
dones = np.zeros((1))

# log every step of the rollout, but skip the per-episode summary
BeraterEnv.showStep = True
BeraterEnv.showDone = False

for t in range(1000):
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    observation, reward, done, info = env.step(actions[0])
    if done:
        print("Episode finished after {} timesteps, reward={}".format(t+1, env.totalReward))
        break
env.close()


Episode:    0   Step:    1  S --2-> C R=-0.03 totalR=-0.03 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:    2  C --2-> M R=-0.02 totalR=-0.05 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    3  M --1-> L R= 0.16 totalR= 0.11 cost=  50 customerR=1000 optimum=6000
Episode:    0   Step:    4  L --1-> M R=-0.01 totalR= 0.10 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    5  M --2-> N R=-0.02 totalR= 0.08 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    6  N --1-> O R= 0.15 totalR= 0.23 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:    7  O --1-> G R= 0.12 totalR= 0.35 cost= 300 customerR=1000 optimum=6000
Episode:    0   Step:    8  G --0-> F R=-0.03 totalR= 0.32 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:    9  F --0-> D R= 0.16 totalR= 0.47 cost=  50 customerR=1000 optimum=6000
Episode:    0   Step:   10  D --0-> A R=-0.02 totalR= 0.46 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   11  A --2-> E R= 0.15 totalR= 0.61 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:   12  E --2-> H R= 0.15 totalR= 0.76 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:   13  H --0-> E R=-0.02 totalR= 0.74 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   14  E --0-> A R=-0.02 totalR= 0.72 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   15  A --1-> B R=-0.02 totalR= 0.71 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   16  B --0-> S R=-0.02 totalR= 0.69 cost= 100 customerR=   0 optimum=6000
Episode finished after 16 timesteps, reward=0.6916666666666664

In [0]:
# compute the reference solution: the perfect path and its scaled reward
%time baseline.find_optimum()


Scaled reward: 0.7083333333333334
Perfect path ['S', 'B', 'C', 'M', 'L', 'M', 'N', 'O', 'G', 'F', 'D', 'F', 'E', 'H', 'E', 'A', 'B', 'S']
CPU times: user 138 ms, sys: 1.57 ms, total: 139 ms
Wall time: 143 ms
Out[0]:
{'cost': 1750,
 'path': ['S',
  'B',
  'C',
  'M',
  'L',
  'M',
  'N',
  'O',
  'G',
  'F',
  'D',
  'F',
  'E',
  'H',
  'E',
  'A',
  'B',
  'S'],
 'position': 'S',
 'reward': 6000,
 'rewards': {'A': 0,
  'B': 0,
  'C': 0,
  'D': 0,
  'E': 0,
  'F': 0,
  'G': 0,
  'H': 0,
  'K': 0,
  'L': 0,
  'M': 0,
  'N': 0,
  'O': 0,
  'S': 0},
 'scaled_reward': 0.7083333333333334}
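The scaled reward reported here appears to be the collected customer reward minus the travel cost, divided by max_reward; a quick check with the numbers above (this relation is inferred from the output, not taken from the Baseline code):

(6000 - 1750) / 6000   # = 0.7083333333333334, matches 'scaled_reward' above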

Evaluation


In [0]:
# compare the perfect (baseline) scores with the trained model over 100 sampled episodes
baseline = Baseline(env, max_reward=6000)
perfect_score_mean, perfect_score_std, test_score_mean, test_score_std = baseline.score(model, sample_runs=100)

In [0]:
# perfect scores
perfect_score_mean, perfect_score_std


Out[0]:
(0.7383333333333334, 0.03227486121839515)

In [0]:
# test scores for our model
test_score_mean, test_score_std


Out[0]:
(0.7308333333333333, 0.03404449702635918)
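For illustration only, a rough sketch of how such an evaluation could be done by hand with the trained model, reusing the LSTM state handling from the "Enjoy model" cell above. This is an assumption about what Baseline.score does internally, not its actual implementation:

import numpy as np

def rollout_scores(model, env, episodes=100, max_steps=1000):
    # roll the trained policy out and collect the scaled total reward per episode
    BeraterEnv.showStep = False          # keep the rollouts quiet
    BeraterEnv.showDone = False
    scores = []
    for _ in range(episodes):
        observation = env.reset()
        state = np.zeros((1, 2*128))     # LSTM hidden + cell state
        dones = np.zeros((1))
        for t in range(max_steps):
            actions, _, state, _ = model.step(observation, S=state, M=dones)
            observation, reward, done, info = env.step(actions[0])
            if done:
                break
        scores.append(env.totalReward)
    return np.mean(scores), np.std(scores)

# usage (hypothetical): model_score_mean, model_score_std = rollout_scores(model, env, episodes=100)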
