Berater Environment v8

Changes from v7

  1. returning to a more complete observation gives better results
    • not listing all paths, but
    • the local paths plus the rest rewards of all locations

next steps

  1. configure custom network including regularization (https://blog.openai.com/quantifying-generalization-in-reinforcement-learning/)
  • better rewards
    • set discount factor (gamma) to 1
      • rewards late in the game are as good as early ones
      • no need to push the game to an end, as every move comes at a cost anyway
    • add reward for returning home once all other locations have been visited
  • better observation?
    • the network can learn all costs and all connections, as they are static
    • rewards are not static, but they are given in the observation
    • all information is there, but
    • it is very convoluted and too hard even for us as humans
    • could we make this more accessible? Would this also help?
  • create baselines to better understand what a good result is
    1. low level: always go in the direction of the greatest immediate reward (see the sketch after this list)
    2. Dijkstra
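
A minimal sketch of the first baseline, interpreting "greatest reward" as the highest immediate (customer reward - cost), which is exactly what the path observations encode. It assumes the BeraterEnv and the helper functions defined in the Environment section below; greedy_action and run_greedy_episode are hypothetical helper names, not part of the environment. A Dijkstra baseline would replace greedy_action with a shortest-path lookup.

import numpy as np

def greedy_action(env):
    # pick the local path with the highest immediate (customer reward - cost);
    # non-existing paths evaluate to -1000, so they are never chosen
    position = state_name_to_int(env.state)
    path_values = [env.getPathObservation(position, path) for path in range(4)]
    return int(np.argmax(path_values))

def run_greedy_episode(env, max_steps=1000):
    # max_steps caps the episode: once all rewards are collected, this naive
    # policy has no incentive to return to 'S' and may loop forever
    env.reset()
    total_reward = 0
    for _ in range(max_steps):
        _, reward, done, _ = env.step(greedy_action(env))
        total_reward += reward
        if done:
            break
    return total_reward

# e.g. np.mean([run_greedy_episode(BeraterEnv()) for _ in range(100)])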

Installation (required for colab)


In [0]:
!pip install git+https://github.com/openai/baselines >/dev/null
!pip install gym >/dev/null

Environment


In [0]:
import numpy as np
import random

import gym
from gym.utils import seeding
from gym import spaces

def state_name_to_int(state):
    state_name_map = {
        'S': 0,
        'A': 1,
        'B': 2,
        'C': 3,
        'D': 4,
        'E': 5,
        'F': 6,
        'G': 7,
        'H': 8,
        'K': 9,
        'L': 10,
        'M': 11,
        'N': 12,
        'O': 13
    }
    return state_name_map[state]

def int_to_state_name(state_as_int):
    state_map = {
        0: 'S',
        1: 'A',
        2: 'B',
        3: 'C',
        4: 'D',
        5: 'E',
        6: 'F',
        7: 'G',
        8: 'H',
        9: 'K',
        10: 'L',
        11: 'M',
        12: 'N',
        13: 'O'
    }
    return state_map[state_as_int]
    
class BeraterEnv(gym.Env):
    """
    The Berater Problem

    Actions: 
    There are 4 discrete deterministic actions, each choosing one direction
    """
    metadata = {'render.modes': ['ansi']}
    
    showStep = False
    showDone = True
    envEpisodeModulo = 100

    def __init__(self):
#         self.map = {
#             'S': [('A', 100), ('B', 400), ('C', 200 )],
#             'A': [('B', 250), ('C', 400), ('S', 100 )],
#             'B': [('A', 250), ('C', 250), ('S', 400 )],
#             'C': [('A', 400), ('B', 250), ('S', 200 )]
#         }
        self.map = {
            'S': [('A', 300), ('B', 100), ('C', 200 )],
            'A': [('S', 300), ('B', 100), ('E', 100 ), ('D', 100 )],
            'B': [('S', 100), ('A', 100), ('C', 50 ), ('K', 200 )],
            'C': [('S', 200), ('B', 50), ('M', 100 ), ('L', 200 )],
            'D': [('A', 100), ('F', 50)],
            'E': [('A', 100), ('F', 100), ('H', 100)],
            'F': [('D', 50), ('E', 100), ('G', 200)],
            'G': [('F', 200), ('O', 300)],
            'H': [('E', 100), ('K', 300)],
            'K': [('B', 200), ('H', 300)],
            'L': [('C', 200), ('M', 50)],
            'M': [('C', 100), ('L', 50), ('N', 100)],
            'N': [('M', 100), ('O', 100)],
            'O': [('N', 100), ('G', 300)]
        }
        max_paths = 4
        self.action_space = spaces.Discrete(max_paths)
      
        positions = len(self.map)
        # observations: position, reward of all 4 local paths, rest reward of all locations
        # non existing path is -1000 and no position change
        # look at what #getObservation returns if you are confused
        low = np.append(np.append([0], np.full(max_paths, -1000)), np.full(positions, 0))
        high = np.append(np.append([positions - 1], np.full(max_paths, 1000)), np.full(positions, 1000))
        self.observation_space = spaces.Box(low=low,
                                             high=high,
                                             dtype=np.float32)
        self.reward_range = (-1, 1)

        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        self.envReward = 0
        self.envEpisodeCount = 0
        self.envStepCount = 0

        self.reset()
        self.optimum = self.calculate_customers_reward()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def iterate_path(self, state, action):
        paths = self.map[state]
        if action < len(paths):
          return paths[action]
        else:
          # sorry, no such action, stay where you are and pay a high penalty
          return (state, 1000)
      
    def step(self, action):
        destination, cost = self.iterate_path(self.state, action)
        lastState = self.state
        customerReward = self.customer_reward[destination]
        reward = (customerReward - cost) / self.optimum

        self.state = destination
        self.customer_visited(destination)
        done = destination == 'S' and self.all_customers_visited()

        stateAsInt = state_name_to_int(self.state)
        self.totalReward += reward
        self.stepCount += 1
        self.envReward += reward
        self.envStepCount += 1

        if self.showStep:
            print( "Episode: " + ("%4.0f  " % self.envEpisodeCount) + 
                   " Step: " + ("%4.0f  " % self.stepCount) + 
                   lastState + ' --' + str(action) + '-> ' + self.state + 
                   ' R=' + ("% 2.2f" % reward) + ' totalR=' + ("% 3.2f" % self.totalReward) + 
                   ' cost=' + ("%4.0f" % cost) + ' customerR=' + ("%4.0f" % customerReward) + ' optimum=' + ("%4.0f" % self.optimum)      
                   )

        if done and not self.isDone:
            self.envEpisodeCount += 1
            if BeraterEnv.showDone:
                episodes = BeraterEnv.envEpisodeModulo
                if (self.envEpisodeCount % BeraterEnv.envEpisodeModulo != 0):
                    episodes = self.envEpisodeCount % BeraterEnv.envEpisodeModulo
                print( "Done: " + 
                        ("episodes=%6.0f  " % self.envEpisodeCount) + 
                        ("avgSteps=%6.2f  " % (self.envStepCount/episodes)) + 
                        ("avgTotalReward=% 3.2f" % (self.envReward/episodes) )
                        )
                if (self.envEpisodeCount%BeraterEnv.envEpisodeModulo) == 0:
                    self.envReward = 0
                    self.envStepCount = 0

        self.isDone = done
        observation = self.getObservation(stateAsInt)
        info = {"from": lastState, "to": destination}

        return observation, reward, done, info

    def getObservation(self, position):
        result = np.array([ position, 
                               self.getPathObservation(position, 0),
                               self.getPathObservation(position, 1),
                               self.getPathObservation(position, 2),
                               self.getPathObservation(position, 3)
                              ],
                             dtype=np.float32)
        all_rest_rewards = list(self.customer_reward.values())
        result = np.append(result, all_rest_rewards)
        return result

    def getPathObservation(self, position, path):
        source = int_to_state_name(position)
        paths = self.map[source]
        if path < len(paths):
          target, cost = paths[path]
          reward = self.customer_reward[target] 
          result = reward - cost
        else:
          result = -1000

        return result

    def customer_visited(self, customer):
        self.customer_reward[customer] = 0

    def all_customers_visited(self):
        return self.calculate_customers_reward() == 0

    def calculate_customers_reward(self):
        return sum(self.customer_reward.values())

      
    def modulate_reward(self):
      number_of_customers = len(self.map) - 1
      number_per_consultant = int(number_of_customers/2)
#       number_per_consultant = int(number_of_customers/1.5)
      self.customer_reward = {
          'S': 0
      }
      for customer_nr in range(1, number_of_customers + 1):
        self.customer_reward[int_to_state_name(customer_nr)] = 0
      
      # every consultant only visits a few random customers
      samples = random.sample(range(1, number_of_customers + 1), k=number_per_consultant)
      key_list = list(self.customer_reward.keys())
      for sample in samples:
        self.customer_reward[key_list[sample]] = 1000

      
    def reset(self):
        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        self.modulate_reward()
        self.state = 'S'
        return self.getObservation(state_name_to_int(self.state))
      
    def render(self):
      print(self.customer_reward)

In [0]:
env = BeraterEnv()
print(env.reset())
print(env.customer_reward)


[    0.   700.   900.   800. -1000.     0.  1000.  1000.  1000.     0.
     0.  1000.     0.     0.     0.  1000.  1000.     0.     0.]
{'S': 0, 'A': 1000, 'B': 1000, 'C': 1000, 'D': 0, 'E': 0, 'F': 1000, 'G': 0, 'H': 0, 'K': 0, 'L': 1000, 'M': 1000, 'N': 0, 'O': 0}
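
To make the layout of this observation vector explicit, here is a minimal sketch (assuming the env instance from the cell above) that splits it into its three parts: the current position, the four local path values, and the rest rewards of all locations.

obs = env.reset()
position, path_values, rest_rewards = obs[0], obs[1:5], obs[5:]
print('position    :', int_to_state_name(int(position)))
print('local paths :', path_values)   # (customer reward - cost), -1000 for a non-existing path
print('rest rewards:', dict(zip(env.map.keys(), rest_rewards)))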

Try out Environment


In [0]:
BeraterEnv.showStep = True
BeraterEnv.showDone = True

env = BeraterEnv()
print(env)
observation = env.reset()
print(observation)

for t in range(1000):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()
print(observation)


<BeraterEnv instance>
[    0.   700.  -100.  -200. -1000.     0.  1000.     0.     0.     0.
  1000.     0.  1000.     0.  1000.     0.  1000.     0.  1000.]
Episode:    0   Step:    1  S --0-> A R= 0.12 totalR= 0.12 cost= 300 customerR=1000 optimum=6000
Episode:    0   Step:    2  A --3-> D R=-0.02 totalR= 0.10 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    3  D --1-> F R=-0.01 totalR= 0.09 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    4  F --0-> D R=-0.01 totalR= 0.08 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    5  D --3-> D R=-0.17 totalR=-0.08 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    6  D --3-> D R=-0.17 totalR=-0.25 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    7  D --3-> D R=-0.17 totalR=-0.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    8  D --3-> D R=-0.17 totalR=-0.58 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    9  D --1-> F R=-0.01 totalR=-0.59 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   10  F --3-> F R=-0.17 totalR=-0.76 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   11  F --1-> E R= 0.15 totalR=-0.61 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:   12  E --2-> H R=-0.02 totalR=-0.62 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   13  H --0-> E R=-0.02 totalR=-0.64 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   14  E --3-> E R=-0.17 totalR=-0.81 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   15  E --2-> H R=-0.02 totalR=-0.82 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   16  H --0-> E R=-0.02 totalR=-0.84 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   17  E --0-> A R=-0.02 totalR=-0.86 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   18  A --0-> S R=-0.05 totalR=-0.91 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   19  S --2-> C R=-0.03 totalR=-0.94 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   20  C --1-> B R=-0.01 totalR=-0.95 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   21  B --2-> C R=-0.01 totalR=-0.96 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   22  C --3-> L R=-0.03 totalR=-0.99 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   23  L --3-> L R=-0.17 totalR=-1.16 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   24  L --2-> L R=-0.17 totalR=-1.33 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   25  L --0-> C R=-0.03 totalR=-1.36 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   26  C --1-> B R=-0.01 totalR=-1.37 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   27  B --1-> A R=-0.02 totalR=-1.38 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   28  A --1-> B R=-0.02 totalR=-1.40 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   29  B --1-> A R=-0.02 totalR=-1.42 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   30  A --0-> S R=-0.05 totalR=-1.47 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   31  S --1-> B R=-0.02 totalR=-1.48 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   32  B --0-> S R=-0.02 totalR=-1.50 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   33  S --3-> S R=-0.17 totalR=-1.67 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   34  S --0-> A R=-0.05 totalR=-1.72 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   35  A --3-> D R=-0.02 totalR=-1.73 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   36  D --1-> F R=-0.01 totalR=-1.74 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   37  F --2-> G R= 0.13 totalR=-1.61 cost= 200 customerR=1000 optimum=6000
Episode:    0   Step:   38  G --3-> G R=-0.17 totalR=-1.78 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   39  G --3-> G R=-0.17 totalR=-1.94 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   40  G --0-> F R=-0.03 totalR=-1.98 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   41  F --2-> G R=-0.03 totalR=-2.01 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   42  G --3-> G R=-0.17 totalR=-2.18 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   43  G --0-> F R=-0.03 totalR=-2.21 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   44  F --1-> E R=-0.02 totalR=-2.23 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   45  E --3-> E R=-0.17 totalR=-2.39 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   46  E --1-> F R=-0.02 totalR=-2.41 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   47  F --3-> F R=-0.17 totalR=-2.57 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   48  F --3-> F R=-0.17 totalR=-2.74 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   49  F --2-> G R=-0.03 totalR=-2.77 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   50  G --3-> G R=-0.17 totalR=-2.94 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   51  G --0-> F R=-0.03 totalR=-2.97 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   52  F --1-> E R=-0.02 totalR=-2.99 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   53  E --1-> F R=-0.02 totalR=-3.01 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   54  F --1-> E R=-0.02 totalR=-3.02 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   55  E --3-> E R=-0.17 totalR=-3.19 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   56  E --0-> A R=-0.02 totalR=-3.21 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   57  A --3-> D R=-0.02 totalR=-3.22 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   58  D --2-> D R=-0.17 totalR=-3.39 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   59  D --0-> A R=-0.02 totalR=-3.41 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   60  A --3-> D R=-0.02 totalR=-3.42 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   61  D --3-> D R=-0.17 totalR=-3.59 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   62  D --2-> D R=-0.17 totalR=-3.76 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   63  D --3-> D R=-0.17 totalR=-3.92 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   64  D --2-> D R=-0.17 totalR=-4.09 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   65  D --3-> D R=-0.17 totalR=-4.26 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   66  D --0-> A R=-0.02 totalR=-4.27 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   67  A --2-> E R=-0.02 totalR=-4.29 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   68  E --0-> A R=-0.02 totalR=-4.31 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   69  A --0-> S R=-0.05 totalR=-4.36 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   70  S --0-> A R=-0.05 totalR=-4.41 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   71  A --1-> B R=-0.02 totalR=-4.42 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   72  B --1-> A R=-0.02 totalR=-4.44 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   73  A --2-> E R=-0.02 totalR=-4.46 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   74  E --0-> A R=-0.02 totalR=-4.47 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   75  A --0-> S R=-0.05 totalR=-4.52 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   76  S --1-> B R=-0.02 totalR=-4.54 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   77  B --3-> K R= 0.13 totalR=-4.41 cost= 200 customerR=1000 optimum=6000
Episode:    0   Step:   78  K --0-> B R=-0.03 totalR=-4.44 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   79  B --1-> A R=-0.02 totalR=-4.46 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   80  A --2-> E R=-0.02 totalR=-4.47 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   81  E --2-> H R=-0.02 totalR=-4.49 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   82  H --3-> H R=-0.17 totalR=-4.66 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   83  H --0-> E R=-0.02 totalR=-4.67 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   84  E --1-> F R=-0.02 totalR=-4.69 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   85  F --1-> E R=-0.02 totalR=-4.71 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   86  E --3-> E R=-0.17 totalR=-4.87 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   87  E --1-> F R=-0.02 totalR=-4.89 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   88  F --1-> E R=-0.02 totalR=-4.91 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   89  E --3-> E R=-0.17 totalR=-5.07 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   90  E --2-> H R=-0.02 totalR=-5.09 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   91  H --3-> H R=-0.17 totalR=-5.26 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   92  H --3-> H R=-0.17 totalR=-5.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   93  H --2-> H R=-0.17 totalR=-5.59 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   94  H --2-> H R=-0.17 totalR=-5.76 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   95  H --3-> H R=-0.17 totalR=-5.92 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   96  H --0-> E R=-0.02 totalR=-5.94 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   97  E --2-> H R=-0.02 totalR=-5.96 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   98  H --3-> H R=-0.17 totalR=-6.12 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   99  H --1-> K R=-0.05 totalR=-6.17 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  100  K --0-> B R=-0.03 totalR=-6.21 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  101  B --1-> A R=-0.02 totalR=-6.22 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  102  A --2-> E R=-0.02 totalR=-6.24 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  103  E --0-> A R=-0.02 totalR=-6.26 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  104  A --3-> D R=-0.02 totalR=-6.27 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  105  D --0-> A R=-0.02 totalR=-6.29 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  106  A --2-> E R=-0.02 totalR=-6.31 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  107  E --0-> A R=-0.02 totalR=-6.32 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  108  A --3-> D R=-0.02 totalR=-6.34 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  109  D --3-> D R=-0.17 totalR=-6.51 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  110  D --0-> A R=-0.02 totalR=-6.52 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  111  A --3-> D R=-0.02 totalR=-6.54 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  112  D --0-> A R=-0.02 totalR=-6.56 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  113  A --0-> S R=-0.05 totalR=-6.61 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  114  S --0-> A R=-0.05 totalR=-6.66 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  115  A --0-> S R=-0.05 totalR=-6.71 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  116  S --2-> C R=-0.03 totalR=-6.74 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  117  C --3-> L R=-0.03 totalR=-6.77 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  118  L --0-> C R=-0.03 totalR=-6.81 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  119  C --3-> L R=-0.03 totalR=-6.84 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  120  L --2-> L R=-0.17 totalR=-7.01 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  121  L --3-> L R=-0.17 totalR=-7.17 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  122  L --3-> L R=-0.17 totalR=-7.34 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  123  L --1-> M R= 0.16 totalR=-7.18 cost=  50 customerR=1000 optimum=6000
Episode:    0   Step:  124  M --1-> L R=-0.01 totalR=-7.19 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  125  L --1-> M R=-0.01 totalR=-7.20 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  126  M --0-> C R=-0.02 totalR=-7.22 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  127  C --1-> B R=-0.01 totalR=-7.22 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  128  B --1-> A R=-0.02 totalR=-7.24 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  129  A --1-> B R=-0.02 totalR=-7.26 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  130  B --3-> K R=-0.03 totalR=-7.29 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  131  K --0-> B R=-0.03 totalR=-7.32 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  132  B --3-> K R=-0.03 totalR=-7.36 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  133  K --1-> H R=-0.05 totalR=-7.41 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  134  H --2-> H R=-0.17 totalR=-7.57 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  135  H --0-> E R=-0.02 totalR=-7.59 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  136  E --1-> F R=-0.02 totalR=-7.61 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  137  F --2-> G R=-0.03 totalR=-7.64 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  138  G --0-> F R=-0.03 totalR=-7.67 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  139  F --2-> G R=-0.03 totalR=-7.71 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  140  G --0-> F R=-0.03 totalR=-7.74 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  141  F --1-> E R=-0.02 totalR=-7.76 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  142  E --3-> E R=-0.17 totalR=-7.92 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  143  E --2-> H R=-0.02 totalR=-7.94 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  144  H --2-> H R=-0.17 totalR=-8.11 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  145  H --1-> K R=-0.05 totalR=-8.16 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  146  K --0-> B R=-0.03 totalR=-8.19 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  147  B --3-> K R=-0.03 totalR=-8.22 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  148  K --1-> H R=-0.05 totalR=-8.28 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  149  H --1-> K R=-0.05 totalR=-8.33 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  150  K --3-> K R=-0.17 totalR=-8.49 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  151  K --0-> B R=-0.03 totalR=-8.53 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:  152  B --2-> C R=-0.01 totalR=-8.53 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:  153  C --2-> M R=-0.02 totalR=-8.55 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  154  M --3-> M R=-0.17 totalR=-8.72 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  155  M --2-> N R=-0.02 totalR=-8.73 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  156  N --3-> N R=-0.17 totalR=-8.90 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  157  N --3-> N R=-0.17 totalR=-9.07 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  158  N --3-> N R=-0.17 totalR=-9.23 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  159  N --2-> N R=-0.17 totalR=-9.40 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  160  N --1-> O R= 0.15 totalR=-9.25 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:  161  O --2-> O R=-0.17 totalR=-9.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  162  O --2-> O R=-0.17 totalR=-9.58 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  163  O --3-> O R=-0.17 totalR=-9.75 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  164  O --2-> O R=-0.17 totalR=-9.92 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  165  O --3-> O R=-0.17 totalR=-10.08 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  166  O --3-> O R=-0.17 totalR=-10.25 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  167  O --2-> O R=-0.17 totalR=-10.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  168  O --2-> O R=-0.17 totalR=-10.58 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  169  O --3-> O R=-0.17 totalR=-10.75 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  170  O --0-> N R=-0.02 totalR=-10.77 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  171  N --1-> O R=-0.02 totalR=-10.78 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  172  O --2-> O R=-0.17 totalR=-10.95 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  173  O --3-> O R=-0.17 totalR=-11.12 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  174  O --2-> O R=-0.17 totalR=-11.28 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  175  O --1-> G R=-0.05 totalR=-11.33 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  176  G --2-> G R=-0.17 totalR=-11.50 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  177  G --1-> O R=-0.05 totalR=-11.55 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:  178  O --0-> N R=-0.02 totalR=-11.57 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  179  N --2-> N R=-0.17 totalR=-11.73 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  180  N --2-> N R=-0.17 totalR=-11.90 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  181  N --3-> N R=-0.17 totalR=-12.07 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  182  N --0-> M R=-0.02 totalR=-12.08 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  183  M --3-> M R=-0.17 totalR=-12.25 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  184  M --2-> N R=-0.02 totalR=-12.27 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  185  N --3-> N R=-0.17 totalR=-12.43 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  186  N --0-> M R=-0.02 totalR=-12.45 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  187  M --0-> C R=-0.02 totalR=-12.47 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  188  C --2-> M R=-0.02 totalR=-12.48 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  189  M --0-> C R=-0.02 totalR=-12.50 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  190  C --2-> M R=-0.02 totalR=-12.52 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  191  M --3-> M R=-0.17 totalR=-12.68 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  192  M --2-> N R=-0.02 totalR=-12.70 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  193  N --2-> N R=-0.17 totalR=-12.87 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  194  N --3-> N R=-0.17 totalR=-13.03 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:  195  N --0-> M R=-0.02 totalR=-13.05 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  196  M --0-> C R=-0.02 totalR=-13.07 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:  197  C --0-> S R=-0.03 totalR=-13.10 cost= 200 customerR=   0 optimum=6000
Done: episodes=     1  avgSteps=197.00  avgTotalReward=-13.10
Episode finished after 197 timesteps
[    0.  -300.  -100.  -200. -1000.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.]

Train model

  • the random rollout has a lower total reward than in the version with dense customers
  • total cost when travelling all paths (back and forth): 2500
  • additional penalty for each illegal move: 1000
  • sum of all rewards: 6000
  • what would a perfect score be?
  • estimate: half the travel cost and no illegal moves: (6000 - 1250) / 6000 = .79 (see the sketch below)
  • but: rewards are much more sparse while the routes stay the same, so maybe expect less
  • additionally: the agent only sees very little of the whole scenario
    • the reward distribution changes with every episode
    • that was fine as long as the network could learn one fixed scenario
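
The estimate above, spelled out as a quick sanity check; the 2500 travel-cost figure and the 6000 reward total are taken from the notes above, not recomputed from the map.

all_rewards = 6000                # 6 visited customers x 1000
travel_cost = 2500 / 2            # assume a reasonable route needs about half the total travel cost
perfect_score_estimate = (all_rewards - travel_cost) / all_rewards
print(perfect_score_estimate)     # ~0.79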

In [0]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)


1.12.0

In [0]:
!rm -r logs
!mkdir logs
!mkdir logs/berater

In [0]:
# https://github.com/openai/baselines/blob/master/baselines/deepq/experiments/train_pong.py
# log_dir = logger.get_dir()
log_dir = '/content/logs/berater/'

import gym
from baselines import bench
from baselines import logger

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor
from baselines.ppo2 import ppo2

BeraterEnv.showStep = False
BeraterEnv.showDone = False

env = BeraterEnv()

wrapped_env = DummyVecEnv([lambda: BeraterEnv()])
monitored_env = VecMonitor(wrapped_env, log_dir)

# https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py
# https://github.com/openai/baselines/blob/master/baselines/common/models.py#L30
%time model = ppo2.learn(\
    env=monitored_env,\
    network='mlp',\
    num_hidden=5000,\
    num_layers=3,\
    ent_coef=0.01,\
    total_timesteps=500000)

# %time model = ppo2.learn(\
#     env=monitored_env,\
#     network='mlp',\
#     num_hidden=2000,\
#     num_layers=3,\
#     ent_coef=0.1,\
#     total_timesteps=500000)

# model = ppo2.learn(
#     env=monitored_env,\
#     layer_norm=True,\
#     network='mlp',\
#     num_hidden=2000,\
#     activation=tf.nn.relu,\
#     num_layers=3,\
#     ent_coef=0.03,\
#     total_timesteps=1000000)

# monitored_env = bench.Monitor(env, log_dir)
# https://en.wikipedia.org/wiki/Q-learning#Influence_of_variables
# %time model = deepq.learn(\
#         monitored_env,\
#         seed=42,\
#         network='mlp',\
#         lr=1e-3,\
#         gamma=0.99,\
#         total_timesteps=30000,\
#         buffer_size=50000,\
#         exploration_fraction=0.5,\
#         exploration_final_eps=0.02,\
#         print_freq=1000)

model.save('berater-ppo-v8.pkl')
monitored_env.close()


Logging to /tmp/openai-2019-01-05-09-49-26-745013
-------------------------------------
| approxkl           | 0.05127006   |
| clipfrac           | 0.4300537    |
| eplenmean          | 109          |
| eprewmean          | -6.393057    |
| explained_variance | -0.374       |
| fps                | 169          |
| nupdates           | 1            |
| policy_entropy     | 1.3456209    |
| policy_loss        | -0.017576873 |
| serial_timesteps   | 2048         |
| time_elapsed       | 12.1         |
| total_timesteps    | 2048         |
| value_loss         | 5.0013447    |
-------------------------------------
-------------------------------------
| approxkl           | 0.033649225  |
| clipfrac           | 0.31469727   |
| eplenmean          | 39.4         |
| eprewmean          | -0.12291664  |
| explained_variance | 0.191        |
| fps                | 193          |
| nupdates           | 10           |
| policy_entropy     | 0.9306614    |
| policy_loss        | -0.025409166 |
| serial_timesteps   | 20480        |
| time_elapsed       | 107          |
| total_timesteps    | 20480        |
| value_loss         | 0.17561111   |
-------------------------------------
------------------------------------
| approxkl           | 0.05542698  |
| clipfrac           | 0.31018066  |
| eplenmean          | 31.6        |
| eprewmean          | 0.37308335  |
| explained_variance | 0.0981      |
| fps                | 193         |
| nupdates           | 20          |
| policy_entropy     | 0.5296578   |
| policy_loss        | -0.02961483 |
| serial_timesteps   | 40960       |
| time_elapsed       | 213         |
| total_timesteps    | 40960       |
| value_loss         | 0.02592381  |
------------------------------------
-------------------------------------
| approxkl           | 0.04365254   |
| clipfrac           | 0.22741699   |
| eplenmean          | 20.2         |
| eprewmean          | 0.5805834    |
| explained_variance | 0.572        |
| fps                | 193          |
| nupdates           | 30           |
| policy_entropy     | 0.4309192    |
| policy_loss        | -0.018368207 |
| serial_timesteps   | 61440        |
| time_elapsed       | 319          |
| total_timesteps    | 61440        |
| value_loss         | 0.013548988  |
-------------------------------------
-------------------------------------
| approxkl           | 0.038945585  |
| clipfrac           | 0.20263672   |
| eplenmean          | 19.4         |
| eprewmean          | 0.6041667    |
| explained_variance | 0.624        |
| fps                | 191          |
| nupdates           | 40           |
| policy_entropy     | 0.43682644   |
| policy_loss        | -0.004864812 |
| serial_timesteps   | 81920        |
| time_elapsed       | 425          |
| total_timesteps    | 81920        |
| value_loss         | 0.03578509   |
-------------------------------------
-------------------------------------
| approxkl           | 0.041430943  |
| clipfrac           | 0.19946289   |
| eplenmean          | 18.9         |
| eprewmean          | 0.61275005   |
| explained_variance | 0.561        |
| fps                | 196          |
| nupdates           | 50           |
| policy_entropy     | 0.35651794   |
| policy_loss        | -0.016227707 |
| serial_timesteps   | 102400       |
| time_elapsed       | 531          |
| total_timesteps    | 102400       |
| value_loss         | 0.010343211  |
-------------------------------------
------------------------------------
| approxkl           | 0.11565823  |
| clipfrac           | 0.24047852  |
| eplenmean          | 21.1        |
| eprewmean          | 0.5963334   |
| explained_variance | 0.372       |
| fps                | 194         |
| nupdates           | 60          |
| policy_entropy     | 0.33400667  |
| policy_loss        | 0.013092534 |
| serial_timesteps   | 122880      |
| time_elapsed       | 636         |
| total_timesteps    | 122880      |
| value_loss         | 0.009124604 |
------------------------------------
--------------------------------------
| approxkl           | 0.008532841   |
| clipfrac           | 0.029174805   |
| eplenmean          | 16.9          |
| eprewmean          | 0.65024996    |
| explained_variance | 0.82          |
| fps                | 194           |
| nupdates           | 70            |
| policy_entropy     | 0.04408144    |
| policy_loss        | -0.0120493835 |
| serial_timesteps   | 143360        |
| time_elapsed       | 742           |
| total_timesteps    | 143360        |
| value_loss         | 0.023514796   |
--------------------------------------
-------------------------------------
| approxkl           | 0.100871064  |
| clipfrac           | 0.19384766   |
| eplenmean          | 19.5         |
| eprewmean          | 0.60025      |
| explained_variance | 0.722        |
| fps                | 193          |
| nupdates           | 80           |
| policy_entropy     | 0.2680549    |
| policy_loss        | -0.022070099 |
| serial_timesteps   | 163840       |
| time_elapsed       | 848          |
| total_timesteps    | 163840       |
| value_loss         | 0.007869679  |
-------------------------------------
-------------------------------------
| approxkl           | 0.034146316  |
| clipfrac           | 0.09362793   |
| eplenmean          | 16.3         |
| eprewmean          | 0.67475      |
| explained_variance | 0.807        |
| fps                | 193          |
| nupdates           | 90           |
| policy_entropy     | 0.15488099   |
| policy_loss        | -0.009867809 |
| serial_timesteps   | 184320       |
| time_elapsed       | 954          |
| total_timesteps    | 184320       |
| value_loss         | 0.0075420733 |
-------------------------------------
-------------------------------------
| approxkl           | 0.091742806  |
| clipfrac           | 0.15698242   |
| eplenmean          | 17           |
| eprewmean          | 0.65841657   |
| explained_variance | 0.901        |
| fps                | 192          |
| nupdates           | 100          |
| policy_entropy     | 0.21207516   |
| policy_loss        | -0.013321315 |
| serial_timesteps   | 204800       |
| time_elapsed       | 1.06e+03     |
| total_timesteps    | 204800       |
| value_loss         | 0.0020749767 |
-------------------------------------
-------------------------------------
| approxkl           | 0.11334531   |
| clipfrac           | 0.22814941   |
| eplenmean          | 19.6         |
| eprewmean          | 0.6185834    |
| explained_variance | 0.826        |
| fps                | 192          |
| nupdates           | 110          |
| policy_entropy     | 0.29120943   |
| policy_loss        | -0.023046596 |
| serial_timesteps   | 225280       |
| time_elapsed       | 1.17e+03     |
| total_timesteps    | 225280       |
| value_loss         | 0.0043909065 |
-------------------------------------
-------------------------------------
| approxkl           | 0.038772028  |
| clipfrac           | 0.107055664  |
| eplenmean          | 15.5         |
| eprewmean          | 0.68008333   |
| explained_variance | 0.894        |
| fps                | 192          |
| nupdates           | 120          |
| policy_entropy     | 0.16804786   |
| policy_loss        | -0.009299656 |
| serial_timesteps   | 245760       |
| time_elapsed       | 1.27e+03     |
| total_timesteps    | 245760       |
| value_loss         | 0.004024453  |
-------------------------------------
-------------------------------------
| approxkl           | 0.13149968   |
| clipfrac           | 0.15039062   |
| eplenmean          | 19.2         |
| eprewmean          | 0.6229167    |
| explained_variance | 0.76         |
| fps                | 189          |
| nupdates           | 130          |
| policy_entropy     | 0.17033865   |
| policy_loss        | -0.008782024 |
| serial_timesteps   | 266240       |
| time_elapsed       | 1.38e+03     |
| total_timesteps    | 266240       |
| value_loss         | 0.011089902  |
-------------------------------------
-------------------------------------
| approxkl           | 0.04462887   |
| clipfrac           | 0.10961914   |
| eplenmean          | 16.5         |
| eprewmean          | 0.6571667    |
| explained_variance | 0.545        |
| fps                | 193          |
| nupdates           | 140          |
| policy_entropy     | 0.14125988   |
| policy_loss        | -0.033947762 |
| serial_timesteps   | 286720       |
| time_elapsed       | 1.48e+03     |
| total_timesteps    | 286720       |
| value_loss         | 0.017571434  |
-------------------------------------
-------------------------------------
| approxkl           | 0.052347623  |
| clipfrac           | 0.091918945  |
| eplenmean          | 17.1         |
| eprewmean          | 0.6750834    |
| explained_variance | 0.802        |
| fps                | 192          |
| nupdates           | 150          |
| policy_entropy     | 0.12886694   |
| policy_loss        | 0.012757084  |
| serial_timesteps   | 307200       |
| time_elapsed       | 1.59e+03     |
| total_timesteps    | 307200       |
| value_loss         | 0.0059873643 |
-------------------------------------
-------------------------------------
| approxkl           | 0.24333608   |
| clipfrac           | 0.15649414   |
| eplenmean          | 18.1         |
| eprewmean          | 0.68291664   |
| explained_variance | 0.845        |
| fps                | 191          |
| nupdates           | 160          |
| policy_entropy     | 0.13740852   |
| policy_loss        | -0.018098805 |
| serial_timesteps   | 327680       |
| time_elapsed       | 1.7e+03      |
| total_timesteps    | 327680       |
| value_loss         | 0.0031910392 |
-------------------------------------
------------------------------------
| approxkl           | 0.095002    |
| clipfrac           | 0.12390137  |
| eplenmean          | 16.8        |
| eprewmean          | 0.68441683  |
| explained_variance | 0.607       |
| fps                | 193         |
| nupdates           | 170         |
| policy_entropy     | 0.1286461   |
| policy_loss        | -0.02305587 |
| serial_timesteps   | 348160      |
| time_elapsed       | 1.8e+03     |
| total_timesteps    | 348160      |
| value_loss         | 0.005933811 |
------------------------------------
-------------------------------------
| approxkl           | 0.010284234  |
| clipfrac           | 0.012451172  |
| eplenmean          | 28.9         |
| eprewmean          | 0.4749159    |
| explained_variance | 0.543        |
| fps                | 194          |
| nupdates           | 180          |
| policy_entropy     | 0.044512235  |
| policy_loss        | 0.0027946366 |
| serial_timesteps   | 368640       |
| time_elapsed       | 1.91e+03     |
| total_timesteps    | 368640       |
| value_loss         | 0.04808835   |
-------------------------------------
--------------------------------------
| approxkl           | 0.40270528    |
| clipfrac           | 0.13793945    |
| eplenmean          | 221           |
| eprewmean          | -3.2612076    |
| explained_variance | 0.863         |
| fps                | 193           |
| nupdates           | 190           |
| policy_entropy     | 0.13223392    |
| policy_loss        | -0.0011471274 |
| serial_timesteps   | 389120        |
| time_elapsed       | 2.02e+03      |
| total_timesteps    | 389120        |
| value_loss         | 0.02148403    |
--------------------------------------
-------------------------------------
| approxkl           | 0.12118763   |
| clipfrac           | 0.2479248    |
| eplenmean          | 23.7         |
| eprewmean          | 0.5844167    |
| explained_variance | 0.813        |
| fps                | 193          |
| nupdates           | 200          |
| policy_entropy     | 0.25758258   |
| policy_loss        | -0.030573342 |
| serial_timesteps   | 409600       |
| time_elapsed       | 2.12e+03     |
| total_timesteps    | 409600       |
| value_loss         | 0.0066235336 |
-------------------------------------
--------------------------------------
| approxkl           | 0.119006425   |
| clipfrac           | 0.2220459     |
| eplenmean          | 20.3          |
| eprewmean          | 0.5409167     |
| explained_variance | 0.558         |
| fps                | 190           |
| nupdates           | 210           |
| policy_entropy     | 0.257251      |
| policy_loss        | -0.0049227523 |
| serial_timesteps   | 430080        |
| time_elapsed       | 2.23e+03      |
| total_timesteps    | 430080        |
| value_loss         | 0.012221637   |
--------------------------------------
-------------------------------------
| approxkl           | 0.9420598    |
| clipfrac           | 0.13916016   |
| eplenmean          | 17.3         |
| eprewmean          | 0.6630833    |
| explained_variance | 0.859        |
| fps                | 192          |
| nupdates           | 220          |
| policy_entropy     | 0.09531072   |
| policy_loss        | -0.02849185  |
| serial_timesteps   | 450560       |
| time_elapsed       | 2.33e+03     |
| total_timesteps    | 450560       |
| value_loss         | 0.0070347553 |
-------------------------------------
-------------------------------------
| approxkl           | 0.02957901   |
| clipfrac           | 0.12182617   |
| eplenmean          | 23.4         |
| eprewmean          | 0.58225      |
| explained_variance | 0.78         |
| fps                | 193          |
| nupdates           | 230          |
| policy_entropy     | 0.19237123   |
| policy_loss        | 0.0049069906 |
| serial_timesteps   | 471040       |
| time_elapsed       | 2.44e+03     |
| total_timesteps    | 471040       |
| value_loss         | 0.015563551  |
-------------------------------------
-------------------------------------
| approxkl           | 0.10990309   |
| clipfrac           | 0.26428223   |
| eplenmean          | 17.2         |
| eprewmean          | 0.6840001    |
| explained_variance | -1.19        |
| fps                | 193          |
| nupdates           | 240          |
| policy_entropy     | 0.37644613   |
| policy_loss        | -0.023404025 |
| serial_timesteps   | 491520       |
| time_elapsed       | 2.54e+03     |
| total_timesteps    | 491520       |
| value_loss         | 0.024558686  |
-------------------------------------
CPU times: user 47min 34s, sys: 9min 45s, total: 57min 20s
Wall time: 47min

In [0]:
# !ls -l $log_dir

In [0]:
from baselines.common import plot_util as pu
results = pu.load_results(log_dir)

import matplotlib.pyplot as plt
import numpy as np
r = results[0]
plt.ylim(0, .75)
# plt.plot(np.cumsum(r.monitor.l), r.monitor.r)
plt.plot(np.cumsum(r.monitor.l), pu.smooth(r.monitor.r, radius=100))


/usr/local/lib/python3.6/dist-packages/baselines/bench/monitor.py:164: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  df.headers = headers # HACK to preserve backwards compatibility
Out[0]:
[<matplotlib.lines.Line2D at 0x7f2ba4ecec18>]

Enjoy model


In [0]:
import numpy as np 

observation = env.reset()
env.render()
state = np.zeros((1, 2*128))   # recurrent policy state; not used by the mlp network here
dones = np.zeros((1))

BeraterEnv.showStep = True
BeraterEnv.showDone = False

for t in range(1000):
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    observation, reward, done, info = env.step(actions[0])
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()


{'S': 0, 'A': 0, 'B': 0, 'C': 1000, 'D': 1000, 'E': 0, 'F': 0, 'G': 1000, 'H': 0, 'K': 1000, 'L': 1000, 'M': 1000, 'N': 0, 'O': 0}
Episode:    0   Step:    1  S --1-> B R=-0.02 totalR=-0.02 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    2  B --2-> C R= 0.16 totalR= 0.14 cost=  50 customerR=1000 optimum=6000
Episode:    0   Step:    3  C --2-> M R= 0.15 totalR= 0.29 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:    4  M --1-> L R= 0.16 totalR= 0.45 cost=  50 customerR=1000 optimum=6000
Episode:    0   Step:    5  L --1-> M R=-0.01 totalR= 0.44 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    6  M --1-> L R=-0.01 totalR= 0.43 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    7  L --1-> M R=-0.01 totalR= 0.42 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    8  M --2-> N R=-0.02 totalR= 0.41 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    9  N --1-> O R=-0.02 totalR= 0.39 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   10  O --1-> G R= 0.12 totalR= 0.51 cost= 300 customerR=1000 optimum=6000
Episode:    0   Step:   11  G --0-> F R=-0.03 totalR= 0.47 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   12  F --0-> D R= 0.16 totalR= 0.63 cost=  50 customerR=1000 optimum=6000
Episode:    0   Step:   13  D --0-> A R=-0.02 totalR= 0.62 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   14  A --1-> B R=-0.02 totalR= 0.60 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   15  B --3-> K R= 0.13 totalR= 0.73 cost= 200 customerR=1000 optimum=6000
Episode:    0   Step:   16  K --0-> B R=-0.03 totalR= 0.70 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   17  B --0-> S R=-0.02 totalR= 0.68 cost= 100 customerR=   0 optimum=6000
Episode finished after 17 timesteps

In [0]: