Berater Environment v7

Changes from v6

  1. per episode, set some customer rewards to 0 to simulate a different set of customers per consultant

Next steps

  1. consider returning to the complete observation (should give better results), as in previous notebooks
  2. configure a custom network including regularization (https://blog.openai.com/quantifying-generalization-in-reinforcement-learning/); a rough sketch follows below
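
A rough, untested sketch for next step 2. It assumes the register decorator from baselines.common.models (present on the baselines master branch); the network name 'berater_mlp' is invented here. Since ppo2's loss does not collect tf.GraphKeys.REGULARIZATION_LOSSES, the sketch applies layer normalization inside the network rather than kernel regularizers:

In [0]:
import tensorflow as tf
from baselines.common.models import register

@register("berater_mlp")  # hypothetical name, not part of baselines
def berater_mlp(num_layers=3, num_hidden=2000, activation=tf.nn.relu):
    def network_fn(X):
        h = tf.layers.flatten(X)
        for i in range(num_layers):
            h = tf.layers.dense(h, num_hidden, name='berater_fc%d' % i)
            # layer norm as in-network regularization (TF 1.x contrib API)
            h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)
        return h
    return network_fn

# usage: ppo2.learn(env=monitored_env, network='berater_mlp', ...)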

Installation (required for Colab)


In [0]:
!pip install git+https://github.com/openai/baselines >/dev/null
!pip install gym >/dev/null

Environment


In [0]:
import numpy
import random

import gym
from gym.utils import seeding
from gym import spaces

STATE_NAMES = ['S', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'K', 'L', 'M', 'N', 'O']
STATE_NAME_TO_INT = {name: index for index, name in enumerate(STATE_NAMES)}

def state_name_to_int(state):
    return STATE_NAME_TO_INT[state]

def int_to_state_name(state_as_int):
    return STATE_NAMES[state_as_int]
    
class BeraterEnv(gym.Env):
    """
    The Berater Problem

    Actions: 
    There are 4 discrete deterministic actions, each choosing one direction
    """
    metadata = {'render.modes': ['ansi']}
    
    showStep = False
    showDone = True
    envEpisodeModulo = 100

    def __init__(self):
#         self.map = {
#             'S': [('A', 100), ('B', 400), ('C', 200 )],
#             'A': [('B', 250), ('C', 400), ('S', 100 )],
#             'B': [('A', 250), ('C', 250), ('S', 400 )],
#             'C': [('A', 400), ('B', 250), ('S', 200 )]
#         }
        self.map = {
            'S': [('A', 300), ('B', 100), ('C', 200 )],
            'A': [('S', 300), ('B', 100), ('E', 100 ), ('D', 100 )],
            'B': [('S', 100), ('A', 100), ('C', 50 ), ('K', 200 )],
            'C': [('S', 200), ('B', 50), ('M', 100 ), ('L', 200 )],
            'D': [('A', 100), ('F', 50)],
            'E': [('A', 100), ('F', 100), ('H', 100)],
            'F': [('D', 50), ('E', 100), ('G', 200)],
            'G': [('F', 200), ('O', 300)],
            'H': [('E', 100), ('K', 300)],
            'K': [('B', 200), ('H', 300)],
            'L': [('C', 200), ('M', 50)],
            'M': [('C', 100), ('L', 50), ('N', 100)],
            'N': [('M', 100), ('O', 100)],
            'O': [('N', 100), ('G', 300)]
        }
        self.action_space = spaces.Discrete(4)
        # observation: position, plus up to 4 paths from that position; a non-existing path is encoded as -1000 and taking it leaves the position unchanged
        self.observation_space = spaces.Box(low=numpy.array([0,-1000,-1000,-1000,-1000]),
                                             high=numpy.array([13,1000,1000,1000,1000]),
                                             dtype=numpy.float32)
        self.reward_range = (-1, 1)

        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        self.envReward = 0
        self.envEpisodeCount = 0
        self.envStepCount = 0

        self.reset()
        self.optimum = self.calculate_customers_reward()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def iterate_path(self, state, action):
        paths = self.map[state]
        if action < len(paths):
            return paths[action]
        else:
            # sorry, no such action, stay where you are and pay a high penalty
            return (state, 1000)
      
    def step(self, action):
        destination, cost = self.iterate_path(self.state, action)
        lastState = self.state
        customerReward = self.customer_reward[destination]
        reward = (customerReward - cost) / self.optimum

        self.state = destination
        self.customer_visited(destination)
        done = destination == 'S' and self.all_customers_visited()

        stateAsInt = state_name_to_int(self.state)
        self.totalReward += reward
        self.stepCount += 1
        self.envReward += reward
        self.envStepCount += 1

        if self.showStep:
            print( "Episode: " + ("%4.0f  " % self.envEpisodeCount) + 
                   " Step: " + ("%4.0f  " % self.stepCount) + 
                   lastState + ' --' + str(action) + '-> ' + self.state + 
                   ' R=' + ("% 2.2f" % reward) + ' totalR=' + ("% 3.2f" % self.totalReward) + 
                   ' cost=' + ("%4.0f" % cost) + ' customerR=' + ("%4.0f" % customerReward) + ' optimum=' + ("%4.0f" % self.optimum)      
                   )

        if done and not self.isDone:
            self.envEpisodeCount += 1
            if BeraterEnv.showDone:
                episodes = BeraterEnv.envEpisodeModulo
                if (self.envEpisodeCount % BeraterEnv.envEpisodeModulo != 0):
                    episodes = self.envEpisodeCount % BeraterEnv.envEpisodeModulo
                print( "Done: " + 
                        ("episodes=%6.0f  " % self.envEpisodeCount) + 
                        ("avgSteps=%6.2f  " % (self.envStepCount/episodes)) + 
                        ("avgTotalReward=% 3.2f" % (self.envReward/episodes) )
                        )
                if (self.envEpisodeCount%BeraterEnv.envEpisodeModulo) == 0:
                    self.envReward = 0
                    self.envStepCount = 0

        self.isDone = done
        observation = self.getObservation(stateAsInt)
        info = {"from": self.state, "to": destination}

        return observation, reward, done, info

    def getObservation(self, position):
        result = numpy.array([ position, 
                               self.getPathObservation(position, 0),
                               self.getPathObservation(position, 1),
                               self.getPathObservation(position, 2),
                               self.getPathObservation(position, 3)
                              ],
                             dtype=numpy.float32)
        return result

    def getPathObservation(self, position, path):
        source = int_to_state_name(position)
        paths = self.map[source]
        if path < len(paths):
            target, cost = paths[path]
            reward = self.customer_reward[target]
            result = reward - cost
        else:
            result = -1000

        return result

    def customer_visited(self, customer):
        self.customer_reward[customer] = 0

    def all_customers_visited(self):
        return self.calculate_customers_reward() == 0

    def calculate_customers_reward(self):
        return sum(self.customer_reward.values())

      
    def modulate_reward(self):
        number_of_customers = len(self.map) - 1
        number_per_consultant = int(number_of_customers/2)
#         number_per_consultant = int(number_of_customers/1.5)
        self.customer_reward = {
            'S': 0
        }
        for customer_nr in range(1, number_of_customers + 1):
            self.customer_reward[int_to_state_name(customer_nr)] = 0

        # every consultant only visits a few random customers
        samples = random.sample(range(1, number_of_customers + 1), k=number_per_consultant)
        for sample in samples:
            self.customer_reward[int_to_state_name(sample)] = 1000

      
    def reset(self):
        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        self.modulate_reward()
        self.state = 'S'
        return self.getObservation(state_name_to_int(self.state))

    def render(self):
        print(self.customer_reward)

In [3]:
env = BeraterEnv()
print(env.reset())
print(env.customer_reward)


[    0.  -300.   900.  -200. -1000.]
{'S': 0, 'A': 0, 'B': 1000, 'C': 0, 'D': 1000, 'E': 0, 'F': 1000, 'G': 1000, 'H': 1000, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'O': 1000}
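
How to read this: index 0 is the current position as an int, indices 1-4 hold customer reward minus travel cost for each outgoing path, with -1000 where no path exists. A quick worked check against the values printed above (all values copied from that printout):

In [0]:
# worked check of the observation [0, -300, 900, -200, -1000] printed above
position = 0                                  # state 'S'
paths = [('A', 300), ('B', 100), ('C', 200)]  # env.map['S']
reward = {'A': 0, 'B': 1000, 'C': 0}          # from env.customer_reward above
print([position] + [reward[t] - c for t, c in paths] + [-1000])  # no 4th path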

Try out Environment


In [4]:
BeraterEnv.showStep = True
BeraterEnv.showDone = True

env = BeraterEnv()
print(env)
observation = env.reset()
print(observation)

for t in range(1000):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()
print(observation)


<BeraterEnv instance>
[    0.  -300.  -100.   800. -1000.]
Episode:    0   Step:    1  S --0-> A R=-0.05 totalR=-0.05 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:    2  A --3-> D R=-0.02 totalR=-0.07 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    3  D --1-> F R= 0.16 totalR= 0.09 cost=  50 customerR=1000 optimum=6000
Episode:    0   Step:    4  F --0-> D R=-0.01 totalR= 0.08 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    5  D --3-> D R=-0.17 totalR=-0.08 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    6  D --3-> D R=-0.17 totalR=-0.25 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    7  D --3-> D R=-0.17 totalR=-0.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    8  D --3-> D R=-0.17 totalR=-0.58 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    9  D --1-> F R=-0.01 totalR=-0.59 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   10  F --3-> F R=-0.17 totalR=-0.76 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   11  F --1-> E R= 0.15 totalR=-0.61 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:   12  E --2-> H R=-0.02 totalR=-0.62 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   13  H --0-> E R=-0.02 totalR=-0.64 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   14  E --3-> E R=-0.17 totalR=-0.81 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   15  E --2-> H R=-0.02 totalR=-0.82 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   16  H --0-> E R=-0.02 totalR=-0.84 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   17  E --0-> A R=-0.02 totalR=-0.86 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   18  A --0-> S R=-0.05 totalR=-0.91 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   19  S --2-> C R= 0.13 totalR=-0.78 cost= 200 customerR=1000 optimum=6000
Episode:    0   Step:   20  C --1-> B R=-0.01 totalR=-0.78 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   21  B --2-> C R=-0.01 totalR=-0.79 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   22  C --3-> L R=-0.03 totalR=-0.83 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   23  L --3-> L R=-0.17 totalR=-0.99 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   24  L --2-> L R=-0.17 totalR=-1.16 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   25  L --0-> C R=-0.03 totalR=-1.19 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   26  C --1-> B R=-0.01 totalR=-1.20 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   27  B --1-> A R=-0.02 totalR=-1.22 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   28  A --1-> B R=-0.02 totalR=-1.23 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   29  B --1-> A R=-0.02 totalR=-1.25 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   30  A --0-> S R=-0.05 totalR=-1.30 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   31  S --1-> B R=-0.02 totalR=-1.32 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   32  B --0-> S R=-0.02 totalR=-1.33 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   33  S --3-> S R=-0.17 totalR=-1.50 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   34  S --0-> A R=-0.05 totalR=-1.55 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   35  A --3-> D R=-0.02 totalR=-1.57 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   36  D --1-> F R=-0.01 totalR=-1.57 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   37  F --2-> G R= 0.13 totalR=-1.44 cost= 200 customerR=1000 optimum=6000
Episode:    0   Step:   38  G --3-> G R=-0.17 totalR=-1.61 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   39  G --3-> G R=-0.17 totalR=-1.78 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   40  G --0-> F R=-0.03 totalR=-1.81 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   41  F --2-> G R=-0.03 totalR=-1.84 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   42  G --3-> G R=-0.17 totalR=-2.01 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   43  G --0-> F R=-0.03 totalR=-2.04 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   44  F --1-> E R=-0.02 totalR=-2.06 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   45  E --3-> E R=-0.17 totalR=-2.23 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   46  E --1-> F R=-0.02 totalR=-2.24 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   47  F --3-> F R=-0.17 totalR=-2.41 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   48  F --3-> F R=-0.17 totalR=-2.57 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   49  F --2-> G R=-0.03 totalR=-2.61 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   50  G --3-> G R=-0.17 totalR=-2.77 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   51  G --0-> F R=-0.03 totalR=-2.81 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   52  F --1-> E R=-0.02 totalR=-2.82 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   53  E --1-> F R=-0.02 totalR=-2.84 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   54  F --1-> E R=-0.02 totalR=-2.86 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   55  E --3-> E R=-0.17 totalR=-3.02 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   56  E --0-> A R=-0.02 totalR=-3.04 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   57  A --3-> D R=-0.02 totalR=-3.06 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   58  D --2-> D R=-0.17 totalR=-3.22 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   59  D --0-> A R=-0.02 totalR=-3.24 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   60  A --3-> D R=-0.02 totalR=-3.26 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   61  D --3-> D R=-0.17 totalR=-3.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   62  D --2-> D R=-0.17 totalR=-3.59 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   63  D --3-> D R=-0.17 totalR=-3.76 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   64  D --2-> D R=-0.17 totalR=-3.92 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   65  D --3-> D R=-0.17 totalR=-4.09 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   66  D --0-> A R=-0.02 totalR=-4.11 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   67  A --2-> E R=-0.02 totalR=-4.12 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   68  E --0-> A R=-0.02 totalR=-4.14 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   69  A --0-> S R=-0.05 totalR=-4.19 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   70  S --0-> A R=-0.05 totalR=-4.24 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   71  A --1-> B R=-0.02 totalR=-4.26 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   72  B --1-> A R=-0.02 totalR=-4.27 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   73  A --2-> E R=-0.02 totalR=-4.29 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   74  E --0-> A R=-0.02 totalR=-4.31 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   75  A --0-> S R=-0.05 totalR=-4.36 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   76  S --1-> B R=-0.02 totalR=-4.37 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   77  B --3-> K R= 0.13 totalR=-4.24 cost= 200 customerR=1000 optimum=6000
Episode:    0   Step:   78  K --0-> B R=-0.03 totalR=-4.27 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   79  B --1-> A R=-0.02 totalR=-4.29 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   80  A --2-> E R=-0.02 totalR=-4.31 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   81  E --2-> H R=-0.02 totalR=-4.32 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   82  H --3-> H R=-0.17 totalR=-4.49 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   83  H --0-> E R=-0.02 totalR=-4.51 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   84  E --1-> F R=-0.02 totalR=-4.52 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   85  F --1-> E R=-0.02 totalR=-4.54 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   86  E --3-> E R=-0.17 totalR=-4.71 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   87  E --1-> F R=-0.02 totalR=-4.72 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   88  F --1-> E R=-0.02 totalR=-4.74 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   89  E --3-> E R=-0.17 totalR=-4.91 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   90  E --2-> H R=-0.02 totalR=-4.92 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   91  H --3-> H R=-0.17 totalR=-5.09 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   92  H --3-> H R=-0.17 totalR=-5.26 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   93  H --2-> H R=-0.17 totalR=-5.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   94  H --2-> H R=-0.17 totalR=-5.59 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   95  H --3-> H R=-0.17 totalR=-5.76 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   96  H --0-> E R=-0.02 totalR=-5.77 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   97  E --2-> H R=-0.02 totalR=-5.79 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   98  H --3-> H R=-0.17 totalR=-5.96 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   99  H --1-> K R=-0.05 totalR=-6.01 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  100  K --0-> B R=-0.03 totalR=-6.04 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  101  B --1-> A R=-0.02 totalR=-6.06 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  102  A --2-> E R=-0.02 totalR=-6.07 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  103  E --0-> A R=-0.02 totalR=-6.09 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  104  A --3-> D R=-0.02 totalR=-6.11 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  105  D --0-> A R=-0.02 totalR=-6.12 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  106  A --2-> E R=-0.02 totalR=-6.14 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  107  E --0-> A R=-0.02 totalR=-6.16 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  108  A --3-> D R=-0.02 totalR=-6.17 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  109  D --3-> D R=-0.17 totalR=-6.34 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  110  D --0-> A R=-0.02 totalR=-6.36 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  111  A --3-> D R=-0.02 totalR=-6.37 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  112  D --0-> A R=-0.02 totalR=-6.39 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  113  A --0-> S R=-0.05 totalR=-6.44 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  114  S --0-> A R=-0.05 totalR=-6.49 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  115  A --0-> S R=-0.05 totalR=-6.54 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  116  S --2-> C R=-0.03 totalR=-6.57 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  117  C --3-> L R=-0.03 totalR=-6.61 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  118  L --0-> C R=-0.03 totalR=-6.64 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  119  C --3-> L R=-0.03 totalR=-6.67 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  120  L --2-> L R=-0.17 totalR=-6.84 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  121  L --3-> L R=-0.17 totalR=-7.01 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  122  L --3-> L R=-0.17 totalR=-7.17 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  123  L --1-> M R=-0.01 totalR=-7.18 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  124  M --1-> L R=-0.01 totalR=-7.19 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  125  L --1-> M R=-0.01 totalR=-7.20 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  126  M --0-> C R=-0.02 totalR=-7.22 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  127  C --1-> B R=-0.01 totalR=-7.22 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  128  B --1-> A R=-0.02 totalR=-7.24 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  129  A --1-> B R=-0.02 totalR=-7.26 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  130  B --3-> K R=-0.03 totalR=-7.29 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  131  K --0-> B R=-0.03 totalR=-7.32 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  132  B --3-> K R=-0.03 totalR=-7.36 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  133  K --1-> H R=-0.05 totalR=-7.41 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  134  H --2-> H R=-0.17 totalR=-7.57 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  135  H --0-> E R=-0.02 totalR=-7.59 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  136  E --1-> F R=-0.02 totalR=-7.61 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  137  F --2-> G R=-0.03 totalR=-7.64 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  138  G --0-> F R=-0.03 totalR=-7.67 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  139  F --2-> G R=-0.03 totalR=-7.71 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  140  G --0-> F R=-0.03 totalR=-7.74 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  141  F --1-> E R=-0.02 totalR=-7.76 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  142  E --3-> E R=-0.17 totalR=-7.92 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  143  E --2-> H R=-0.02 totalR=-7.94 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  144  H --2-> H R=-0.17 totalR=-8.11 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  145  H --1-> K R=-0.05 totalR=-8.16 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  146  K --0-> B R=-0.03 totalR=-8.19 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  147  B --3-> K R=-0.03 totalR=-8.22 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  148  K --1-> H R=-0.05 totalR=-8.28 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  149  H --1-> K R=-0.05 totalR=-8.33 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  150  K --3-> K R=-0.17 totalR=-8.49 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  151  K --0-> B R=-0.03 totalR=-8.53 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  152  B --2-> C R=-0.01 totalR=-8.53 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  153  C --2-> M R=-0.02 totalR=-8.55 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  154  M --3-> M R=-0.17 totalR=-8.72 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  155  M --2-> N R=-0.02 totalR=-8.73 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  156  N --3-> N R=-0.17 totalR=-8.90 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  157  N --3-> N R=-0.17 totalR=-9.07 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  158  N --3-> N R=-0.17 totalR=-9.23 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  159  N --2-> N R=-0.17 totalR=-9.40 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  160  N --1-> O R= 0.15 totalR=-9.25 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:  161  O --2-> O R=-0.17 totalR=-9.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  162  O --2-> O R=-0.17 totalR=-9.58 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  163  O --3-> O R=-0.17 totalR=-9.75 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  164  O --2-> O R=-0.17 totalR=-9.92 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  165  O --3-> O R=-0.17 totalR=-10.08 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  166  O --3-> O R=-0.17 totalR=-10.25 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  167  O --2-> O R=-0.17 totalR=-10.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  168  O --2-> O R=-0.17 totalR=-10.58 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  169  O --3-> O R=-0.17 totalR=-10.75 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  170  O --0-> N R=-0.02 totalR=-10.77 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  171  N --1-> O R=-0.02 totalR=-10.78 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  172  O --2-> O R=-0.17 totalR=-10.95 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  173  O --3-> O R=-0.17 totalR=-11.12 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  174  O --2-> O R=-0.17 totalR=-11.28 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  175  O --1-> G R=-0.05 totalR=-11.33 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  176  G --2-> G R=-0.17 totalR=-11.50 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  177  G --1-> O R=-0.05 totalR=-11.55 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  178  O --0-> N R=-0.02 totalR=-11.57 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  179  N --2-> N R=-0.17 totalR=-11.73 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  180  N --2-> N R=-0.17 totalR=-11.90 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  181  N --3-> N R=-0.17 totalR=-12.07 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  182  N --0-> M R=-0.02 totalR=-12.08 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  183  M --3-> M R=-0.17 totalR=-12.25 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  184  M --2-> N R=-0.02 totalR=-12.27 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  185  N --3-> N R=-0.17 totalR=-12.43 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  186  N --0-> M R=-0.02 totalR=-12.45 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  187  M --0-> C R=-0.02 totalR=-12.47 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  188  C --2-> M R=-0.02 totalR=-12.48 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  189  M --0-> C R=-0.02 totalR=-12.50 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  190  C --2-> M R=-0.02 totalR=-12.52 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  191  M --3-> M R=-0.17 totalR=-12.68 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  192  M --2-> N R=-0.02 totalR=-12.70 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  193  N --2-> N R=-0.17 totalR=-12.87 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  194  N --3-> N R=-0.17 totalR=-13.03 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  195  N --0-> M R=-0.02 totalR=-13.05 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  196  M --0-> C R=-0.02 totalR=-13.07 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  197  C --0-> S R=-0.03 totalR=-13.10 cost= 200 customerR=   0 optimum=6000
Done: episodes=     1  avgSteps=197.00  avgTotalReward=-13.10
Episode finished after 197 timesteps
[    0.  -300.  -100.  -200. -1000.]

Train model

  • a random policy has a lower total reward than in the version with dense customers
  • total cost when travelling all paths (back and forth): 2500
  • additional penalty for illegal moves: 1000
  • sum of all customer rewards: 6000
  • what would a perfect score be?
  • estimate: half the travel cost and no illegal moves: (6000 - 1250) / 6000 = .79 (see the check after this list)
  • but: rewards are much sparser while the routes stay the same, so maybe expect less
  • additionally: the agent only sees very little of the whole scenario
    • which changes with every episode
    • that was ok as long as the network could learn one fixed scenario
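
The estimate above as a quick computation:

In [0]:
# back-of-the-envelope check of the estimate above
all_rewards = 6000     # 6 visited customers at 1000 each
full_tour_cost = 2500  # travelling all paths back and forth
print((all_rewards - full_tour_cost / 2) / all_rewards)  # ~0.79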

In [5]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)


1.12.0

In [0]:
!rm -r logs
!mkdir logs
!mkdir logs/berater

In [7]:
# https://github.com/openai/baselines/blob/master/baselines/deepq/experiments/train_pong.py
# log_dir = logger.get_dir()
log_dir = '/content/logs/berater/'

import gym
from baselines import bench
from baselines import logger

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor
from baselines.ppo2 import ppo2

BeraterEnv.showStep = False
BeraterEnv.showDone = False

env = BeraterEnv()

wrapped_env = DummyVecEnv([lambda: BeraterEnv()])
monitored_env = VecMonitor(wrapped_env, log_dir)

# https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py
# https://github.com/openai/baselines/blob/master/baselines/common/models.py#L30
%time model = ppo2.learn(\
    env=monitored_env,\
    network='mlp',\
    num_hidden=2000,\
    num_layers=3,\
    ent_coef=0.03,\
    total_timesteps=1000000)

# %time model = ppo2.learn(\
#     env=monitored_env,\
#     network='mlp',\
#     num_hidden=2000,\
#     num_layers=3,\
#     ent_coef=0.1,\
#     total_timesteps=500000)

# model = ppo2.learn(
#     env=monitored_env,\
#     layer_norm=True,\
#     network='mlp',\
#     num_hidden=2000,\
#     activation=tf.nn.relu,\
#     num_layers=3,\
#     ent_coef=0.03,\
#     total_timesteps=1000000)

# monitored_env = bench.Monitor(env, log_dir)
# https://en.wikipedia.org/wiki/Q-learning#Influence_of_variables
# %time model = deepq.learn(\
#         monitored_env,\
#         seed=42,\
#         network='mlp',\
#         lr=1e-3,\
#         gamma=0.99,\
#         total_timesteps=30000,\
#         buffer_size=50000,\
#         exploration_fraction=0.5,\
#         exploration_final_eps=0.02,\
#         print_freq=1000)

model.save('berater-ppo-v7.pkl')
monitored_env.close()


Logging to /tmp/openai-2019-01-03-18-48-01-372539
-------------------------------------
| approxkl           | 0.019593637  |
| clipfrac           | 0.3166504    |
| eplenmean          | 151          |
| eprewmean          | -9.909617    |
| explained_variance | -0.285       |
| fps                | 385          |
| nupdates           | 1            |
| policy_entropy     | 1.3666965    |
| policy_loss        | -0.024349716 |
| serial_timesteps   | 2048         |
| time_elapsed       | 5.31         |
| total_timesteps    | 2048         |
| value_loss         | 2.1245756    |
-------------------------------------
--------------------------------------
| approxkl           | 0.007577969   |
| clipfrac           | 0.099121094   |
| eplenmean          | 43            |
| eprewmean          | -0.16400005   |
| explained_variance | -0.0476       |
| fps                | 442           |
| nupdates           | 10            |
| policy_entropy     | 1.0529546     |
| policy_loss        | -0.0045482353 |
| serial_timesteps   | 20480         |
| time_elapsed       | 46.9          |
| total_timesteps    | 20480         |
| value_loss         | 0.1092316     |
--------------------------------------
--------------------------------------
| approxkl           | 0.014131604   |
| clipfrac           | 0.1619873     |
| eplenmean          | 37.7          |
| eprewmean          | 0.17383336    |
| explained_variance | 0.14          |
| fps                | 437           |
| nupdates           | 20            |
| policy_entropy     | 0.8798149     |
| policy_loss        | -0.0012780649 |
| serial_timesteps   | 40960         |
| time_elapsed       | 93.3          |
| total_timesteps    | 40960         |
| value_loss         | 0.025928076   |
--------------------------------------
-------------------------------------
| approxkl           | 0.01006335   |
| clipfrac           | 0.14685059   |
| eplenmean          | 40.9         |
| eprewmean          | 0.1085       |
| explained_variance | 0.17         |
| fps                | 446          |
| nupdates           | 30           |
| policy_entropy     | 0.9289736    |
| policy_loss        | 0.0007974657 |
| serial_timesteps   | 61440        |
| time_elapsed       | 140          |
| total_timesteps    | 61440        |
| value_loss         | 0.02261215   |
-------------------------------------
--------------------------------------
| approxkl           | 0.010071272   |
| clipfrac           | 0.11987305    |
| eplenmean          | 30.2          |
| eprewmean          | 0.29674998    |
| explained_variance | 0.173         |
| fps                | 446           |
| nupdates           | 40            |
| policy_entropy     | 0.74906313    |
| policy_loss        | -0.0055703856 |
| serial_timesteps   | 81920         |
| time_elapsed       | 186           |
| total_timesteps    | 81920         |
| value_loss         | 0.020835266   |
--------------------------------------
-------------------------------------
| approxkl           | 0.014220675  |
| clipfrac           | 0.114868164  |
| eplenmean          | 25.1         |
| eprewmean          | 0.40708336   |
| explained_variance | 0.302        |
| fps                | 444          |
| nupdates           | 50           |
| policy_entropy     | 0.6445336    |
| policy_loss        | -0.008773514 |
| serial_timesteps   | 102400       |
| time_elapsed       | 233          |
| total_timesteps    | 102400       |
| value_loss         | 0.0149234375 |
-------------------------------------
--------------------------------------
| approxkl           | 0.009891702   |
| clipfrac           | 0.09655762    |
| eplenmean          | 21.2          |
| eprewmean          | 0.48675       |
| explained_variance | 0.342         |
| fps                | 449           |
| nupdates           | 60            |
| policy_entropy     | 0.55703175    |
| policy_loss        | -0.0037293616 |
| serial_timesteps   | 122880        |
| time_elapsed       | 278           |
| total_timesteps    | 122880        |
| value_loss         | 0.016623318   |
--------------------------------------
--------------------------------------
| approxkl           | 0.007388264   |
| clipfrac           | 0.11425781    |
| eplenmean          | 23.4          |
| eprewmean          | 0.47516668    |
| explained_variance | 0.362         |
| fps                | 451           |
| nupdates           | 70            |
| policy_entropy     | 0.6015584     |
| policy_loss        | -0.0039213863 |
| serial_timesteps   | 143360        |
| time_elapsed       | 325           |
| total_timesteps    | 143360        |
| value_loss         | 0.013776244   |
--------------------------------------
------------------------------------
| approxkl           | 0.021204086 |
| clipfrac           | 0.14904785  |
| eplenmean          | 20.4        |
| eprewmean          | 0.5139166   |
| explained_variance | 0.432       |
| fps                | 442         |
| nupdates           | 80          |
| policy_entropy     | 0.5630118   |
| policy_loss        | 0.007254677 |
| serial_timesteps   | 163840      |
| time_elapsed       | 371         |
| total_timesteps    | 163840      |
| value_loss         | 0.014949808 |
------------------------------------
--------------------------------------
| approxkl           | 0.005994698   |
| clipfrac           | 0.072631836   |
| eplenmean          | 20.9          |
| eprewmean          | 0.5075834     |
| explained_variance | 0.371         |
| fps                | 450           |
| nupdates           | 90            |
| policy_entropy     | 0.535086      |
| policy_loss        | -0.0045562023 |
| serial_timesteps   | 184320        |
| time_elapsed       | 417           |
| total_timesteps    | 184320        |
| value_loss         | 0.0148131875  |
--------------------------------------
------------------------------------
| approxkl           | 0.017413573 |
| clipfrac           | 0.16247559  |
| eplenmean          | 23.9        |
| eprewmean          | 0.4732501   |
| explained_variance | 0.436       |
| fps                | 457         |
| nupdates           | 100         |
| policy_entropy     | 0.57642573  |
| policy_loss        | 0.010225539 |
| serial_timesteps   | 204800      |
| time_elapsed       | 463         |
| total_timesteps    | 204800      |
| value_loss         | 0.014204118 |
------------------------------------
--------------------------------------
| approxkl           | 0.017630205   |
| clipfrac           | 0.13671875    |
| eplenmean          | 20.5          |
| eprewmean          | 0.49975005    |
| explained_variance | 0.402         |
| fps                | 445           |
| nupdates           | 110           |
| policy_entropy     | 0.5470696     |
| policy_loss        | -0.0019779103 |
| serial_timesteps   | 225280        |
| time_elapsed       | 509           |
| total_timesteps    | 225280        |
| value_loss         | 0.014785086   |
--------------------------------------
-------------------------------------
| approxkl           | 0.006835807  |
| clipfrac           | 0.08239746   |
| eplenmean          | 21.7         |
| eprewmean          | 0.5004167    |
| explained_variance | 0.437        |
| fps                | 449          |
| nupdates           | 120          |
| policy_entropy     | 0.5435441    |
| policy_loss        | -0.002747289 |
| serial_timesteps   | 245760       |
| time_elapsed       | 555          |
| total_timesteps    | 245760       |
| value_loss         | 0.0132071525 |
-------------------------------------
-------------------------------------
| approxkl           | 0.022876767  |
| clipfrac           | 0.12609863   |
| eplenmean          | 22.4         |
| eprewmean          | 0.48058334   |
| explained_variance | 0.427        |
| fps                | 461          |
| nupdates           | 130          |
| policy_entropy     | 0.5264168    |
| policy_loss        | -0.009492428 |
| serial_timesteps   | 266240       |
| time_elapsed       | 600          |
| total_timesteps    | 266240       |
| value_loss         | 0.012696637  |
-------------------------------------
--------------------------------------
| approxkl           | 0.008718463   |
| clipfrac           | 0.09326172    |
| eplenmean          | 20.4          |
| eprewmean          | 0.50858337    |
| explained_variance | 0.406         |
| fps                | 459           |
| nupdates           | 140           |
| policy_entropy     | 0.5484952     |
| policy_loss        | -0.0065971324 |
| serial_timesteps   | 286720        |
| time_elapsed       | 647           |
| total_timesteps    | 286720        |
| value_loss         | 0.012461164   |
--------------------------------------
--------------------------------------
| approxkl           | 0.0053289165  |
| clipfrac           | 0.06237793    |
| eplenmean          | 19.2          |
| eprewmean          | 0.5545        |
| explained_variance | 0.444         |
| fps                | 446           |
| nupdates           | 150           |
| policy_entropy     | 0.47149378    |
| policy_loss        | -0.0009572138 |
| serial_timesteps   | 307200        |
| time_elapsed       | 692           |
| total_timesteps    | 307200        |
| value_loss         | 0.013025255   |
--------------------------------------
-------------------------------------
| approxkl           | 0.008539572  |
| clipfrac           | 0.083984375  |
| eplenmean          | 20           |
| eprewmean          | 0.5241667    |
| explained_variance | 0.381        |
| fps                | 461          |
| nupdates           | 160          |
| policy_entropy     | 0.5156527    |
| policy_loss        | -0.006648442 |
| serial_timesteps   | 327680       |
| time_elapsed       | 737          |
| total_timesteps    | 327680       |
| value_loss         | 0.012629914  |
-------------------------------------
-------------------------------------
| approxkl           | 0.0062726056 |
| clipfrac           | 0.088134766  |
| eplenmean          | 20.4         |
| eprewmean          | 0.54025006   |
| explained_variance | 0.449        |
| fps                | 459          |
| nupdates           | 170          |
| policy_entropy     | 0.49528784   |
| policy_loss        | 0.0007002303 |
| serial_timesteps   | 348160       |
| time_elapsed       | 782          |
| total_timesteps    | 348160       |
| value_loss         | 0.013654434  |
-------------------------------------
--------------------------------------
| approxkl           | 0.009324665   |
| clipfrac           | 0.095947266   |
| eplenmean          | 22.4          |
| eprewmean          | 0.488         |
| explained_variance | 0.417         |
| fps                | 452           |
| nupdates           | 180           |
| policy_entropy     | 0.56534773    |
| policy_loss        | -0.0029521256 |
| serial_timesteps   | 368640        |
| time_elapsed       | 828           |
| total_timesteps    | 368640        |
| value_loss         | 0.012802036   |
--------------------------------------
--------------------------------------
| approxkl           | 0.009771489   |
| clipfrac           | 0.10656738    |
| eplenmean          | 22.3          |
| eprewmean          | 0.5189167     |
| explained_variance | 0.375         |
| fps                | 463           |
| nupdates           | 190           |
| policy_entropy     | 0.59984696    |
| policy_loss        | -0.0051241177 |
| serial_timesteps   | 389120        |
| time_elapsed       | 873           |
| total_timesteps    | 389120        |
| value_loss         | 0.012006206   |
--------------------------------------
-------------------------------------
| approxkl           | 0.006131854  |
| clipfrac           | 0.076416016  |
| eplenmean          | 19.2         |
| eprewmean          | 0.5595833    |
| explained_variance | 0.415        |
| fps                | 458          |
| nupdates           | 200          |
| policy_entropy     | 0.4860985    |
| policy_loss        | -0.005064509 |
| serial_timesteps   | 409600       |
| time_elapsed       | 917          |
| total_timesteps    | 409600       |
| value_loss         | 0.013654921  |
-------------------------------------
-------------------------------------
| approxkl           | 0.007408309  |
| clipfrac           | 0.08679199   |
| eplenmean          | 22.8         |
| eprewmean          | 0.49958336   |
| explained_variance | 0.43         |
| fps                | 467          |
| nupdates           | 210          |
| policy_entropy     | 0.56970495   |
| policy_loss        | -0.002875534 |
| serial_timesteps   | 430080       |
| time_elapsed       | 963          |
| total_timesteps    | 430080       |
| value_loss         | 0.013359591  |
-------------------------------------
--------------------------------------
| approxkl           | 0.008439079   |
| clipfrac           | 0.0892334     |
| eplenmean          | 22            |
| eprewmean          | 0.48216668    |
| explained_variance | 0.438         |
| fps                | 458           |
| nupdates           | 220           |
| policy_entropy     | 0.54835963    |
| policy_loss        | -0.0035187965 |
| serial_timesteps   | 450560        |
| time_elapsed       | 1.01e+03      |
| total_timesteps    | 450560        |
| value_loss         | 0.01059682    |
--------------------------------------
--------------------------------------
| approxkl           | 0.01032425    |
| clipfrac           | 0.10681152    |
| eplenmean          | 20.3          |
| eprewmean          | 0.527         |
| explained_variance | 0.408         |
| fps                | 460           |
| nupdates           | 230           |
| policy_entropy     | 0.5360224     |
| policy_loss        | -0.0051227687 |
| serial_timesteps   | 471040        |
| time_elapsed       | 1.05e+03      |
| total_timesteps    | 471040        |
| value_loss         | 0.012118952   |
--------------------------------------
--------------------------------------
| approxkl           | 0.007683249   |
| clipfrac           | 0.08557129    |
| eplenmean          | 21.6          |
| eprewmean          | 0.51          |
| explained_variance | 0.438         |
| fps                | 450           |
| nupdates           | 240           |
| policy_entropy     | 0.5593899     |
| policy_loss        | -0.0035762433 |
| serial_timesteps   | 491520        |
| time_elapsed       | 1.1e+03       |
| total_timesteps    | 491520        |
| value_loss         | 0.011544634   |
--------------------------------------
CPU times: user 21min 46s, sys: 4min 14s, total: 26min
Wall time: 18min 52s

In [0]:
# !ls -l $log_dir

In [9]:
from baselines.common import plot_util as pu
results = pu.load_results(log_dir)

import matplotlib.pyplot as plt
import numpy as np
r = results[0]
plt.ylim(0, .75)
# plt.plot(np.cumsum(r.monitor.l), r.monitor.r)
plt.plot(np.cumsum(r.monitor.l), pu.smooth(r.monitor.r, radius=100))


/usr/local/lib/python3.6/dist-packages/baselines/bench/monitor.py:164: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  df.headers = headers # HACK to preserve backwards compatibility
Out[9]:
[<matplotlib.lines.Line2D at 0x7f4d35619d30>]

Enjoy model


In [10]:
import numpy as np 

observation = env.reset()
env.render()
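# dummy recurrent state and done mask for model.step below; the mlp policy is
# not recurrent, so baselines ignores these extra inputs (2*128 would match
# the default lstm state size)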
state = np.zeros((1, 2*128))
dones = np.zeros((1))

BeraterEnv.showStep = True
BeraterEnv.showDone = False

for t in range(1000):
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    observation, reward, done, info = env.step(actions[0])
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()


{'S': 0, 'A': 1000, 'B': 1000, 'C': 0, 'D': 1000, 'E': 0, 'F': 1000, 'G': 0, 'H': 0, 'K': 0, 'L': 0, 'M': 1000, 'N': 0, 'O': 1000}
Episode:    0   Step:    1  S --1-> B R= 0.15 totalR= 0.15 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:    2  B --1-> A R= 0.15 totalR= 0.30 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:    3  A --3-> D R= 0.15 totalR= 0.45 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:    4  D --1-> F R= 0.16 totalR= 0.61 cost=  50 customerR=1000 optimum=6000
Episode:    0   Step:    5  F --1-> E R=-0.02 totalR= 0.59 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    6  E --0-> A R=-0.02 totalR= 0.57 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    7  A --1-> B R=-0.02 totalR= 0.56 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    8  B --0-> S R=-0.02 totalR= 0.54 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    9  S --2-> C R=-0.03 totalR= 0.51 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   10  C --2-> M R= 0.15 totalR= 0.66 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:   11  M --2-> N R=-0.02 totalR= 0.64 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   12  N --1-> O R= 0.15 totalR= 0.79 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:   13  O --1-> G R=-0.05 totalR= 0.74 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   14  G --1-> O R=-0.05 totalR= 0.69 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   15  O --1-> G R=-0.05 totalR= 0.64 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   16  G --0-> F R=-0.03 totalR= 0.61 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   17  F --0-> D R=-0.01 totalR= 0.60 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   18  D --1-> F R=-0.01 totalR= 0.59 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   19  F --1-> E R=-0.02 totalR= 0.57 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   20  E --0-> A R=-0.02 totalR= 0.56 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   21  A --1-> B R=-0.02 totalR= 0.54 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   22  B --0-> S R=-0.02 totalR= 0.52 cost= 100 customerR=   0 optimum=6000
Episode finished after 22 timesteps
