Berater Environment v6

Changes from v5

  1. use complex customer graph

next steps

  1. per episode set certain rewards to 0 to simulate different customers per consultant
    1. make sure things generalize well

Installation (required for colab)


In [0]:
# Install OpenAI baselines (directly from GitHub) and gym; stdout is redirected
# to /dev/null so only errors/warnings show up in the cell output.
!pip install git+https://github.com/openai/baselines >/dev/null
!pip install gym >/dev/null

Environment


In [0]:
import numpy
import gym
from gym.utils import seeding
from gym import spaces

def state_name_to_int(state):
    """Translate a node name of the customer graph into its integer index.

    The nodes are numbered in the fixed order S, A, B, C, D, E, F, G, H, K,
    L, M, N, O (so 'S' is 0 and 'O' is 13).

    Raises:
        KeyError: if ``state`` is not one of the known node names.
    """
    node_names = ('S', 'A', 'B', 'C', 'D', 'E', 'F',
                  'G', 'H', 'K', 'L', 'M', 'N', 'O')
    index_by_name = {name: index for index, name in enumerate(node_names)}
    return index_by_name[state]

def int_to_state_name(state_as_int):
    """Translate an integer node index back into its node name.

    Index 0 is 'S', followed by A, B, C, D, E, F, G, H, K, L, M, N and
    finally 'O' at index 13.

    Raises:
        KeyError: if ``state_as_int`` is not a known index (0..13).
    """
    node_names = ('S', 'A', 'B', 'C', 'D', 'E', 'F',
                  'G', 'H', 'K', 'L', 'M', 'N', 'O')
    name_by_index = dict(enumerate(node_names))
    return name_by_index[state_as_int]
    
class BeraterEnv(gym.Env):
    """
    The Berater Problem

    An agent starts at node 'S' of a weighted graph (``self.map``), has to
    visit every customer node once to collect its reward and finally return
    to 'S'.

    Actions:
    There are 4 discrete deterministic actions. Action ``i`` follows the
    i-th outgoing path of the current node. If the node has fewer paths the
    agent stays where it is and is charged a cost of 1000.

    Observation:
    ``[position, v0, v1, v2, v3]`` where ``position`` is the integer index of
    the current node and ``vi`` is ``customer reward - travel cost`` of the
    i-th outgoing path, or -1000 if that path does not exist.

    Reward:
    ``(customer reward at destination - travel cost) / optimum`` where
    ``optimum`` is the sum of all customer rewards of a fresh episode.

    Episode end:
    The episode is done when the agent arrives at 'S' and no customer reward
    is left.
    """
    metadata = {'render.modes': ['ansi']}

    # Class-level switches, so a notebook can change logging for all instances.
    showStep = False        # print one line per step
    showDone = True         # print a summary line when an episode finishes
    envEpisodeModulo = 100  # the running averages are restarted every N episodes

    def __init__(self):
        # Smaller alternative graph, kept for reference:
#         self.map = {
#             'S': [('A', 100), ('B', 400), ('C', 200 )],
#             'A': [('B', 250), ('C', 400), ('S', 100 )],
#             'B': [('A', 250), ('C', 250), ('S', 400 )],
#             'C': [('A', 400), ('B', 250), ('S', 200 )]
#         }
        # node -> list of (neighbour, travel cost); the list index is the action
        self.map = {
            'S': [('A', 300), ('B', 100), ('C', 200 )],
            'A': [('S', 300), ('B', 100), ('E', 100 ), ('D', 100 )],
            'B': [('S', 100), ('A', 100), ('C', 50 ), ('K', 200 )],
            'C': [('S', 200), ('B', 50), ('M', 100 ), ('L', 200 )],
            'D': [('A', 100), ('F', 50)],
            'E': [('A', 100), ('F', 100), ('H', 100)],
            'F': [('D', 50), ('E', 100), ('G', 200)],
            'G': [('F', 200), ('O', 300)],
            'H': [('E', 100), ('K', 300)],
            'K': [('B', 200), ('H', 300)],
            'L': [('C', 200), ('M', 50)],
            'M': [('C', 100), ('L', 50), ('N', 100)],
            'N': [('M', 100), ('O', 100)],
            'O': [('N', 100), ('G', 300)]
        }
        self.action_space = spaces.Discrete(4)
        # position, and up to 4 paths from that position, non existing path is -1000 and no position change
        self.observation_space = spaces.Box(low=numpy.array([0,-1000,-1000,-1000,-1000]),
                                             high=numpy.array([13,1000,1000,1000,1000]),
                                             dtype=numpy.float32)
        self.reward_range = (-1, 1)

        # per-episode statistics (also cleared by reset())
        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        # statistics accumulated over episodes, used for the "Done:" summary
        self.envReward = 0
        self.envEpisodeCount = 0
        self.envStepCount = 0

        self.reset()
        # Sum of all customer rewards of a fresh episode; used to scale rewards.
        self.optimum = self.calculate_customers_reward()

    def seed(self, seed=None):
        """Create the environment's random generator and return the seed used, wrapped in a list."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def iterate_path(self, state, action):
        """Return ``(destination, cost)`` for taking ``action`` at node ``state``."""
        paths = self.map[state]
        if action < len(paths):
            return paths[action]
        else:
            # sorry, no such action, stay where you are and pay a high penalty
            return (state, 1000)

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        ``info`` holds the node names the agent moved ``"from"`` and ``"to"``.
        """
        destination, cost = self.iterate_path(self.state, action)
        lastState = self.state
        # Read the reward before customer_visited() zeroes it below.
        customerReward = self.customer_reward[destination]
        reward = (customerReward - cost) / self.optimum

        self.state = destination
        self.customer_visited(destination)
        done = destination == 'S' and self.all_customers_visited()

        stateAsInt = state_name_to_int(self.state)
        self.totalReward += reward
        self.stepCount += 1
        self.envReward += reward
        self.envStepCount += 1

        if self.showStep:
            print( "Episode: " + ("%4.0f  " % self.envEpisodeCount) +
                   " Step: " + ("%4.0f  " % self.stepCount) +
                   lastState + ' --' + str(action) + '-> ' + self.state +
                   ' R=' + ("% 2.2f" % reward) + ' totalR=' + ("% 3.2f" % self.totalReward) +
                   ' cost=' + ("%4.0f" % cost) + ' customerR=' + ("%4.0f" % customerReward) + ' optimum=' + ("%4.0f" % self.optimum)
                   )

        # Count an episode only on the transition into the done state.
        if done and not self.isDone:
            self.envEpisodeCount += 1
            if BeraterEnv.showDone:
                # Number of episodes accumulated since the averages were last restarted.
                episodes = BeraterEnv.envEpisodeModulo
                if (self.envEpisodeCount % BeraterEnv.envEpisodeModulo != 0):
                    episodes = self.envEpisodeCount % BeraterEnv.envEpisodeModulo
                print( "Done: " +
                        ("episodes=%6.0f  " % self.envEpisodeCount) +
                        ("avgSteps=%6.2f  " % (self.envStepCount/episodes)) +
                        ("avgTotalReward=% 3.2f" % (self.envReward/episodes) )
                        )
                if (self.envEpisodeCount%BeraterEnv.envEpisodeModulo) == 0:
                    self.envReward = 0
                    self.envStepCount = 0

        self.isDone = done
        observation = self.getObservation(stateAsInt)
        # self.state already points at the destination here, so the origin
        # has to come from lastState.
        info = {"from": lastState, "to": destination}

        return observation, reward, done, info

    def getObservation(self, position):
        """Build the float32 observation ``[position, v0, v1, v2, v3]`` for node index ``position``."""
        result = numpy.array([ position,
                               self.getPathObservation(position, 0),
                               self.getPathObservation(position, 1),
                               self.getPathObservation(position, 2),
                               self.getPathObservation(position, 3)
                              ],
                             dtype=numpy.float32)
        return result

    def getPathObservation(self, position, path):
        """Return ``customer reward - cost`` of outgoing path number ``path`` of
        node index ``position``, or -1000 if the node has no such path."""
        source = int_to_state_name(position)
        # Look the paths up for the requested position rather than for whatever
        # self.state currently is.
        paths = self.map[source]
        if path < len(paths):
            target, cost = paths[path]
            reward = self.customer_reward[target]
            result = reward - cost
        else:
            result = -1000

        return result

    def customer_visited(self, customer):
        """Mark ``customer`` as visited by setting its remaining reward to 0."""
        self.customer_reward[customer] = 0

    def all_customers_visited(self):
        """Return True when no customer reward is left to collect."""
        return self.calculate_customers_reward() == 0

    def calculate_customers_reward(self):
        """Return the sum of the rewards still available at all nodes."""
        return sum(self.customer_reward.values())

    def reset(self):
        """Start a new episode at 'S' with all customer rewards restored and
        return the initial observation."""
        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False
        reward_per_customer = 1000
        # Every node of the graph except the start node 'S' is a customer.
        self.customer_reward = {
            node: (0 if node == 'S' else reward_per_customer)
            for node in self.map
        }

        self.state = 'S'
        return self.getObservation(state_name_to_int(self.state))

Try out Environment


In [3]:
# Manual smoke test: log every step and the end-of-episode summary.
BeraterEnv.showStep = True
BeraterEnv.showDone = True

env = BeraterEnv()
print(env)
observation = env.reset()
print(observation)

# Take randomly sampled actions until the episode is done (at most 1000 steps).
for t in range(1000):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()
# Last observation seen (the final one if the episode finished).
print(observation)


<BeraterEnv instance>
[    0.   700.   900.   800. -1000.]
Episode:    0   Step:    1  S --0-> A R= 0.05 totalR= 0.05 cost= 300 customerR=1000 optimum=13000
Episode:    0   Step:    2  A --3-> D R= 0.07 totalR= 0.12 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:    3  D --1-> F R= 0.07 totalR= 0.20 cost=  50 customerR=1000 optimum=13000
Episode:    0   Step:    4  F --0-> D R=-0.00 totalR= 0.19 cost=  50 customerR=   0 optimum=13000
Episode:    0   Step:    5  D --3-> D R=-0.08 totalR= 0.12 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:    6  D --3-> D R=-0.08 totalR= 0.04 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:    7  D --3-> D R=-0.08 totalR=-0.04 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:    8  D --3-> D R=-0.08 totalR=-0.12 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:    9  D --1-> F R=-0.00 totalR=-0.12 cost=  50 customerR=   0 optimum=13000
Episode:    0   Step:   10  F --3-> F R=-0.08 totalR=-0.20 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   11  F --1-> E R= 0.07 totalR=-0.13 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:   12  E --2-> H R= 0.07 totalR=-0.06 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:   13  H --0-> E R=-0.01 totalR=-0.07 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   14  E --3-> E R=-0.08 totalR=-0.14 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   15  E --2-> H R=-0.01 totalR=-0.15 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   16  H --0-> E R=-0.01 totalR=-0.16 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   17  E --0-> A R=-0.01 totalR=-0.17 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   18  A --0-> S R=-0.02 totalR=-0.19 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:   19  S --2-> C R= 0.06 totalR=-0.13 cost= 200 customerR=1000 optimum=13000
Episode:    0   Step:   20  C --1-> B R= 0.07 totalR=-0.05 cost=  50 customerR=1000 optimum=13000
Episode:    0   Step:   21  B --2-> C R=-0.00 totalR=-0.06 cost=  50 customerR=   0 optimum=13000
Episode:    0   Step:   22  C --3-> L R= 0.06 totalR= 0.00 cost= 200 customerR=1000 optimum=13000
Episode:    0   Step:   23  L --3-> L R=-0.08 totalR=-0.07 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   24  L --2-> L R=-0.08 totalR=-0.15 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   25  L --0-> C R=-0.02 totalR=-0.17 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:   26  C --1-> B R=-0.00 totalR=-0.17 cost=  50 customerR=   0 optimum=13000
Episode:    0   Step:   27  B --1-> A R=-0.01 totalR=-0.18 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   28  A --1-> B R=-0.01 totalR=-0.18 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   29  B --1-> A R=-0.01 totalR=-0.19 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   30  A --0-> S R=-0.02 totalR=-0.22 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:   31  S --1-> B R=-0.01 totalR=-0.22 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   32  B --0-> S R=-0.01 totalR=-0.23 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   33  S --3-> S R=-0.08 totalR=-0.31 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   34  S --0-> A R=-0.02 totalR=-0.33 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:   35  A --3-> D R=-0.01 totalR=-0.34 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   36  D --1-> F R=-0.00 totalR=-0.34 cost=  50 customerR=   0 optimum=13000
Episode:    0   Step:   37  F --2-> G R= 0.06 totalR=-0.28 cost= 200 customerR=1000 optimum=13000
Episode:    0   Step:   38  G --3-> G R=-0.08 totalR=-0.36 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   39  G --3-> G R=-0.08 totalR=-0.43 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   40  G --0-> F R=-0.02 totalR=-0.45 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:   41  F --2-> G R=-0.02 totalR=-0.47 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:   42  G --3-> G R=-0.08 totalR=-0.54 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   43  G --0-> F R=-0.02 totalR=-0.56 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:   44  F --1-> E R=-0.01 totalR=-0.57 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   45  E --3-> E R=-0.08 totalR=-0.64 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   46  E --1-> F R=-0.01 totalR=-0.65 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   47  F --3-> F R=-0.08 totalR=-0.73 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   48  F --3-> F R=-0.08 totalR=-0.80 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   49  F --2-> G R=-0.02 totalR=-0.82 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:   50  G --3-> G R=-0.08 totalR=-0.90 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   51  G --0-> F R=-0.02 totalR=-0.91 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:   52  F --1-> E R=-0.01 totalR=-0.92 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   53  E --1-> F R=-0.01 totalR=-0.93 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   54  F --1-> E R=-0.01 totalR=-0.93 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   55  E --3-> E R=-0.08 totalR=-1.01 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   56  E --0-> A R=-0.01 totalR=-1.02 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   57  A --3-> D R=-0.01 totalR=-1.03 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   58  D --2-> D R=-0.08 totalR=-1.10 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   59  D --0-> A R=-0.01 totalR=-1.11 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   60  A --3-> D R=-0.01 totalR=-1.12 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   61  D --3-> D R=-0.08 totalR=-1.20 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   62  D --2-> D R=-0.08 totalR=-1.27 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   63  D --3-> D R=-0.08 totalR=-1.35 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   64  D --2-> D R=-0.08 totalR=-1.43 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   65  D --3-> D R=-0.08 totalR=-1.50 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   66  D --0-> A R=-0.01 totalR=-1.51 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   67  A --2-> E R=-0.01 totalR=-1.52 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   68  E --0-> A R=-0.01 totalR=-1.53 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   69  A --0-> S R=-0.02 totalR=-1.55 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:   70  S --0-> A R=-0.02 totalR=-1.57 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:   71  A --1-> B R=-0.01 totalR=-1.58 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   72  B --1-> A R=-0.01 totalR=-1.59 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   73  A --2-> E R=-0.01 totalR=-1.60 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   74  E --0-> A R=-0.01 totalR=-1.60 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   75  A --0-> S R=-0.02 totalR=-1.63 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:   76  S --1-> B R=-0.01 totalR=-1.63 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   77  B --3-> K R= 0.06 totalR=-1.57 cost= 200 customerR=1000 optimum=13000
Episode:    0   Step:   78  K --0-> B R=-0.02 totalR=-1.59 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:   79  B --1-> A R=-0.01 totalR=-1.60 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   80  A --2-> E R=-0.01 totalR=-1.60 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   81  E --2-> H R=-0.01 totalR=-1.61 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   82  H --3-> H R=-0.08 totalR=-1.69 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   83  H --0-> E R=-0.01 totalR=-1.70 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   84  E --1-> F R=-0.01 totalR=-1.70 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   85  F --1-> E R=-0.01 totalR=-1.71 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   86  E --3-> E R=-0.08 totalR=-1.79 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   87  E --1-> F R=-0.01 totalR=-1.80 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   88  F --1-> E R=-0.01 totalR=-1.80 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   89  E --3-> E R=-0.08 totalR=-1.88 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   90  E --2-> H R=-0.01 totalR=-1.89 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   91  H --3-> H R=-0.08 totalR=-1.97 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   92  H --3-> H R=-0.08 totalR=-2.04 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   93  H --2-> H R=-0.08 totalR=-2.12 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   94  H --2-> H R=-0.08 totalR=-2.20 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   95  H --3-> H R=-0.08 totalR=-2.27 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   96  H --0-> E R=-0.01 totalR=-2.28 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   97  E --2-> H R=-0.01 totalR=-2.29 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:   98  H --3-> H R=-0.08 totalR=-2.37 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:   99  H --1-> K R=-0.02 totalR=-2.39 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:  100  K --0-> B R=-0.02 totalR=-2.40 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  101  B --1-> A R=-0.01 totalR=-2.41 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  102  A --2-> E R=-0.01 totalR=-2.42 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  103  E --0-> A R=-0.01 totalR=-2.43 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  104  A --3-> D R=-0.01 totalR=-2.43 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  105  D --0-> A R=-0.01 totalR=-2.44 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  106  A --2-> E R=-0.01 totalR=-2.45 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  107  E --0-> A R=-0.01 totalR=-2.46 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  108  A --3-> D R=-0.01 totalR=-2.47 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  109  D --3-> D R=-0.08 totalR=-2.54 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  110  D --0-> A R=-0.01 totalR=-2.55 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  111  A --3-> D R=-0.01 totalR=-2.56 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  112  D --0-> A R=-0.01 totalR=-2.57 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  113  A --0-> S R=-0.02 totalR=-2.59 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:  114  S --0-> A R=-0.02 totalR=-2.61 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:  115  A --0-> S R=-0.02 totalR=-2.63 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:  116  S --2-> C R=-0.02 totalR=-2.65 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  117  C --3-> L R=-0.02 totalR=-2.67 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  118  L --0-> C R=-0.02 totalR=-2.68 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  119  C --3-> L R=-0.02 totalR=-2.70 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  120  L --2-> L R=-0.08 totalR=-2.77 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  121  L --3-> L R=-0.08 totalR=-2.85 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  122  L --3-> L R=-0.08 totalR=-2.93 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  123  L --1-> M R= 0.07 totalR=-2.85 cost=  50 customerR=1000 optimum=13000
Episode:    0   Step:  124  M --1-> L R=-0.00 totalR=-2.86 cost=  50 customerR=   0 optimum=13000
Episode:    0   Step:  125  L --1-> M R=-0.00 totalR=-2.86 cost=  50 customerR=   0 optimum=13000
Episode:    0   Step:  126  M --0-> C R=-0.01 totalR=-2.87 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  127  C --1-> B R=-0.00 totalR=-2.87 cost=  50 customerR=   0 optimum=13000
Episode:    0   Step:  128  B --1-> A R=-0.01 totalR=-2.88 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  129  A --1-> B R=-0.01 totalR=-2.89 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  130  B --3-> K R=-0.02 totalR=-2.90 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  131  K --0-> B R=-0.02 totalR=-2.92 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  132  B --3-> K R=-0.02 totalR=-2.93 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  133  K --1-> H R=-0.02 totalR=-2.96 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:  134  H --2-> H R=-0.08 totalR=-3.03 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  135  H --0-> E R=-0.01 totalR=-3.04 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  136  E --1-> F R=-0.01 totalR=-3.05 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  137  F --2-> G R=-0.02 totalR=-3.07 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  138  G --0-> F R=-0.02 totalR=-3.08 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  139  F --2-> G R=-0.02 totalR=-3.10 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  140  G --0-> F R=-0.02 totalR=-3.11 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  141  F --1-> E R=-0.01 totalR=-3.12 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  142  E --3-> E R=-0.08 totalR=-3.20 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  143  E --2-> H R=-0.01 totalR=-3.20 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  144  H --2-> H R=-0.08 totalR=-3.28 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  145  H --1-> K R=-0.02 totalR=-3.30 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:  146  K --0-> B R=-0.02 totalR=-3.32 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  147  B --3-> K R=-0.02 totalR=-3.33 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  148  K --1-> H R=-0.02 totalR=-3.36 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:  149  H --1-> K R=-0.02 totalR=-3.38 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:  150  K --3-> K R=-0.08 totalR=-3.46 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  151  K --0-> B R=-0.02 totalR=-3.47 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:  152  B --2-> C R=-0.00 totalR=-3.48 cost=  50 customerR=   0 optimum=13000
Episode:    0   Step:  153  C --2-> M R=-0.01 totalR=-3.48 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  154  M --3-> M R=-0.08 totalR=-3.56 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  155  M --2-> N R= 0.07 totalR=-3.49 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:  156  N --3-> N R=-0.08 totalR=-3.57 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  157  N --3-> N R=-0.08 totalR=-3.65 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  158  N --3-> N R=-0.08 totalR=-3.72 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  159  N --2-> N R=-0.08 totalR=-3.80 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  160  N --1-> O R= 0.07 totalR=-3.73 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:  161  O --2-> O R=-0.08 totalR=-3.81 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  162  O --2-> O R=-0.08 totalR=-3.88 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  163  O --3-> O R=-0.08 totalR=-3.96 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  164  O --2-> O R=-0.08 totalR=-4.04 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  165  O --3-> O R=-0.08 totalR=-4.12 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  166  O --3-> O R=-0.08 totalR=-4.19 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  167  O --2-> O R=-0.08 totalR=-4.27 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  168  O --2-> O R=-0.08 totalR=-4.35 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  169  O --3-> O R=-0.08 totalR=-4.42 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  170  O --0-> N R=-0.01 totalR=-4.43 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  171  N --1-> O R=-0.01 totalR=-4.44 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  172  O --2-> O R=-0.08 totalR=-4.52 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  173  O --3-> O R=-0.08 totalR=-4.59 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  174  O --2-> O R=-0.08 totalR=-4.67 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  175  O --1-> G R=-0.02 totalR=-4.69 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:  176  G --2-> G R=-0.08 totalR=-4.77 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  177  G --1-> O R=-0.02 totalR=-4.79 cost= 300 customerR=   0 optimum=13000
Episode:    0   Step:  178  O --0-> N R=-0.01 totalR=-4.80 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  179  N --2-> N R=-0.08 totalR=-4.88 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  180  N --2-> N R=-0.08 totalR=-4.95 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  181  N --3-> N R=-0.08 totalR=-5.03 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  182  N --0-> M R=-0.01 totalR=-5.04 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  183  M --3-> M R=-0.08 totalR=-5.12 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  184  M --2-> N R=-0.01 totalR=-5.12 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  185  N --3-> N R=-0.08 totalR=-5.20 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  186  N --0-> M R=-0.01 totalR=-5.21 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  187  M --0-> C R=-0.01 totalR=-5.22 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  188  C --2-> M R=-0.01 totalR=-5.22 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  189  M --0-> C R=-0.01 totalR=-5.23 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  190  C --2-> M R=-0.01 totalR=-5.24 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  191  M --3-> M R=-0.08 totalR=-5.32 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  192  M --2-> N R=-0.01 totalR=-5.32 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  193  N --2-> N R=-0.08 totalR=-5.40 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  194  N --3-> N R=-0.08 totalR=-5.48 cost=1000 customerR=   0 optimum=13000
Episode:    0   Step:  195  N --0-> M R=-0.01 totalR=-5.48 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  196  M --0-> C R=-0.01 totalR=-5.49 cost= 100 customerR=   0 optimum=13000
Episode:    0   Step:  197  C --0-> S R=-0.02 totalR=-5.51 cost= 200 customerR=   0 optimum=13000
Done: episodes=     1  avgSteps=197.00  avgTotalReward=-5.51
Episode finished after 197 timesteps
[    0.  -300.  -100.  -200. -1000.]

Train model

  • random has reward of -5.51
  • total cost when travelling all paths (back and forth): 5000
  • additional penalty for illegal moves: 1000
  • all rewards: 13000
  • perfect score???
  • estimate: half the travel cost and no illegal moves: (13000 - 2500) / 13000 ≈ 0.81

In [4]:
import tensorflow as tf
# Restrict TensorFlow logging to ERROR level to keep the training output readable.
tf.logging.set_verbosity(tf.logging.ERROR)
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)


1.12.0

In [0]:
!rm -r logs
!mkdir logs
!mkdir logs/berater

In [6]:
# https://github.com/openai/baselines/blob/master/baselines/deepq/experiments/train_pong.py
# log_dir = logger.get_dir()
# Directory handed to VecMonitor below; it is also read back when plotting.
log_dir = '/content/logs/berater/'

import gym
from baselines import bench
from baselines import logger

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor
from baselines.ppo2 import ppo2

# Silence the environment's own printing while training.
BeraterEnv.showStep = False
BeraterEnv.showDone = False

# Separate instance that is not passed to the trainer in this cell.
env = BeraterEnv()

# The trainer gets its own environment instance, wrapped as a single-env
# vectorized environment and monitored into log_dir.
wrapped_env = DummyVecEnv([lambda: BeraterEnv()])
monitored_env = VecMonitor(wrapped_env, log_dir)

# https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py
model = ppo2.learn(network='mlp', env=monitored_env, total_timesteps=150000)

# Alternative deepq (DQN) training call, currently disabled:
# monitored_env = bench.Monitor(env, log_dir)
# https://en.wikipedia.org/wiki/Q-learning#Influence_of_variables
# %time model = deepq.learn(\
#         monitored_env,\
#         seed=42,\
#         network='mlp',\
#         lr=1e-3,\
#         gamma=0.99,\
#         total_timesteps=30000,\
#         buffer_size=50000,\
#         exploration_fraction=0.5,\
#         exploration_final_eps=0.02,\
#         print_freq=1000)

# Persist the trained model and release the monitored environment.
model.save('berater-ppo-v6.pkl')
monitored_env.close()


Logging to /tmp/openai-2019-01-03-11-42-54-618865
--------------------------------------
| approxkl           | 0.0008961776  |
| clipfrac           | 0.0           |
| eplenmean          | 134           |
| eprewmean          | -3.5052223    |
| explained_variance | -3.14         |
| fps                | 473           |
| nupdates           | 1             |
| policy_entropy     | 1.3854092     |
| policy_loss        | -0.0039864834 |
| serial_timesteps   | 2048          |
| time_elapsed       | 4.32          |
| total_timesteps    | 2048          |
| value_loss         | 0.13796656    |
--------------------------------------
--------------------------------------
| approxkl           | 0.00066269375 |
| clipfrac           | 0.0           |
| eplenmean          | 128           |
| eprewmean          | -1.1288865    |
| explained_variance | -0.0832       |
| fps                | 501           |
| nupdates           | 10            |
| policy_entropy     | 1.1806759     |
| policy_loss        | -0.0038700835 |
| serial_timesteps   | 20480         |
| time_elapsed       | 40.5          |
| total_timesteps    | 20480         |
| value_loss         | 0.028150184   |
--------------------------------------
--------------------------------------
| approxkl           | 0.00061270216 |
| clipfrac           | 0.0007324219  |
| eplenmean          | 36.2          |
| eprewmean          | 0.5741921     |
| explained_variance | 0.364         |
| fps                | 461           |
| nupdates           | 20            |
| policy_entropy     | 0.9146788     |
| policy_loss        | -0.0031393853 |
| serial_timesteps   | 40960         |
| time_elapsed       | 82.7          |
| total_timesteps    | 40960         |
| value_loss         | 0.0127601875  |
--------------------------------------
--------------------------------------
| approxkl           | 0.0005067946  |
| clipfrac           | 0.0           |
| eplenmean          | 26            |
| eprewmean          | 0.7328461     |
| explained_variance | 0.66          |
| fps                | 509           |
| nupdates           | 30            |
| policy_entropy     | 0.6833499     |
| policy_loss        | -0.0016387015 |
| serial_timesteps   | 61440         |
| time_elapsed       | 123           |
| total_timesteps    | 61440         |
| value_loss         | 0.010324567   |
--------------------------------------
--------------------------------------
| approxkl           | 0.00013729218 |
| clipfrac           | 0.0           |
| eplenmean          | 21.1          |
| eprewmean          | 0.7752693     |
| explained_variance | 0.687         |
| fps                | 502           |
| nupdates           | 40            |
| policy_entropy     | 0.4920636     |
| policy_loss        | -0.0004988579 |
| serial_timesteps   | 81920         |
| time_elapsed       | 163           |
| total_timesteps    | 81920         |
| value_loss         | 0.010660268   |
--------------------------------------

In [7]:
# List the contents of the log directory written during training.
!ls -l $log_dir


total 72
-rw-r--r-- 1 root root 70457 Jan  3 11:46 monitor.csv

In [12]:
from baselines.common import plot_util as pu
# Load the monitor output found under log_dir for plotting.
results = pu.load_results(log_dir)


/usr/local/lib/python3.6/dist-packages/baselines/bench/monitor.py:164: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  df.headers = headers # HACK to preserve backwards compatibility

In [0]:
import matplotlib.pyplot as plt
import numpy as np
# Use the first loaded result for the plots below.
r = results[0]
# Unsmoothed variant of the plot, currently disabled:
# plt.ylim(-1, 1)
# plt.plot(np.cumsum(r.monitor.l), r.monitor.r)

In [14]:
# Smoothed r.monitor.r (radius=100) plotted over the cumulative sum of r.monitor.l.
# NOTE(review): presumably `r` is the episode reward and `l` the episode length
# from the baselines monitor, i.e. reward over total timesteps - confirm.
plt.plot(np.cumsum(r.monitor.l), pu.smooth(r.monitor.r, radius=100))


Out[14]:
[<matplotlib.lines.Line2D at 0x7f14089faa20>]

Enjoy model


In [11]:
import numpy as np 

# Run the trained model on the stand-alone `env` instance created in the training cell.
observation = env.reset()
# Placeholders for the S (state) and M (mask) arguments of model.step.
# NOTE(review): presumably only needed for recurrent policies - confirm for 'mlp'.
state = np.zeros((1, 2*128))
dones = np.zeros((1))

# Log every step of this episode, but not the end-of-episode summary.
BeraterEnv.showStep = True
BeraterEnv.showDone = False

# Follow the model's actions until the episode is done (at most 1000 steps).
for t in range(1000):
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    observation, reward, done, info = env.step(actions[0])
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()


Episode:    0   Step:    1  S --1-> B R= 0.07 totalR= 0.07 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:    2  B --3-> K R= 0.06 totalR= 0.13 cost= 200 customerR=1000 optimum=13000
Episode:    0   Step:    3  K --1-> H R= 0.05 totalR= 0.18 cost= 300 customerR=1000 optimum=13000
Episode:    0   Step:    4  H --0-> E R= 0.07 totalR= 0.25 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:    5  E --0-> A R= 0.07 totalR= 0.32 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:    6  A --3-> D R= 0.07 totalR= 0.39 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:    7  D --1-> F R= 0.07 totalR= 0.47 cost=  50 customerR=1000 optimum=13000
Episode:    0   Step:    8  F --2-> G R= 0.06 totalR= 0.53 cost= 200 customerR=1000 optimum=13000
Episode:    0   Step:    9  G --1-> O R= 0.05 totalR= 0.58 cost= 300 customerR=1000 optimum=13000
Episode:    0   Step:   10  O --0-> N R= 0.07 totalR= 0.65 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:   11  N --0-> M R= 0.07 totalR= 0.72 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:   12  M --0-> C R= 0.07 totalR= 0.79 cost= 100 customerR=1000 optimum=13000
Episode:    0   Step:   13  C --3-> L R= 0.06 totalR= 0.85 cost= 200 customerR=1000 optimum=13000
Episode:    0   Step:   14  L --0-> C R=-0.02 totalR= 0.83 cost= 200 customerR=   0 optimum=13000
Episode:    0   Step:   15  C --1-> B R=-0.00 totalR= 0.83 cost=  50 customerR=   0 optimum=13000
Episode:    0   Step:   16  B --0-> S R=-0.01 totalR= 0.82 cost= 100 customerR=   0 optimum=13000
Episode finished after 16 timesteps

In [0]: