Berater Environment v10

Changes from v9

Next steps

  • configure a custom network
    • including L2 regularization / dropout
    • not possible to just configure these two on the built-in networks (see the sketch below)
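
As a possible starting point for that, here is a minimal sketch of a custom network builder with dropout and an L2 kernel regularizer. It assumes that this baselines version accepts a callable for the network argument of ppo2.learn (its build_policy does handle callables); layer sizes and rates are illustrative, not tuned. Note that the L2 terms only end up in tf.GraphKeys.REGULARIZATION_LOSSES and would still have to be added to the PPO loss by hand, which is exactly why these two cannot simply be configured.

import tensorflow as tf

def custom_mlp(X):
    # hypothetical network builder: 3 dense layers of 500 units with layer norm,
    # dropout and an L2 kernel regularizer (sizes/rates are illustrative)
    h = tf.layers.flatten(X)
    l2 = tf.contrib.layers.l2_regularizer(scale=1e-4)
    for i in range(3):
        h = tf.layers.dense(h, units=500, activation=None,
                            kernel_regularizer=l2,
                            name='custom_mlp_fc{}'.format(i))
        h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
        h = tf.nn.tanh(h)
        # training=True keeps dropout active even when acting; wiring a proper
        # training flag through baselines is part of what makes this non-trivial
        h = tf.layers.dropout(h, rate=0.2, training=True)
    return h

# usage sketch, replacing network='mlp' in the training cell further down:
# model = ppo2.learn(env=monitored_env, network=custom_mlp, lr=lr_range, ...)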

Installation (required for Colab)


In [0]:
!pip install git+https://github.com/openai/baselines >/dev/null
!pip install gym >/dev/null

Environment


In [0]:
import numpy as np
import random

import gym
from gym.utils import seeding
from gym import spaces

def state_name_to_int(state):
    state_name_map = {
        'S': 0,
        'A': 1,
        'B': 2,
        'C': 3,
        'D': 4,
        'E': 5,
        'F': 6,
        'G': 7,
        'H': 8,
        'K': 9,
        'L': 10,
        'M': 11,
        'N': 12,
        'O': 13
    }
    return state_name_map[state]

def int_to_state_name(state_as_int):
    state_map = {
        0: 'S',
        1: 'A',
        2: 'B',
        3: 'C',
        4: 'D',
        5: 'E',
        6: 'F',
        7: 'G',
        8: 'H',
        9: 'K',
        10: 'L',
        11: 'M',
        12: 'N',
        13: 'O'
    }
    return state_map[state_as_int]
    
class BeraterEnv(gym.Env):
    """
    The Berater Problem

    Actions: 
    There are 4 discrete deterministic actions, each choosing one direction
    """
    metadata = {'render.modes': ['ansi']}
    
    showStep = False
    showDone = True
    envEpisodeModulo = 100

    def __init__(self):
#         self.map = {
#             'S': [('A', 100), ('B', 400), ('C', 200 )],
#             'A': [('B', 250), ('C', 400), ('S', 100 )],
#             'B': [('A', 250), ('C', 250), ('S', 400 )],
#             'C': [('A', 400), ('B', 250), ('S', 200 )]
#         }
        self.map = {
            'S': [('A', 300), ('B', 100), ('C', 200 )],
            'A': [('S', 300), ('B', 100), ('E', 100 ), ('D', 100 )],
            'B': [('S', 100), ('A', 100), ('C', 50 ), ('K', 200 )],
            'C': [('S', 200), ('B', 50), ('M', 100 ), ('L', 200 )],
            'D': [('A', 100), ('F', 50)],
            'E': [('A', 100), ('F', 100), ('H', 100)],
            'F': [('D', 50), ('E', 100), ('G', 200)],
            'G': [('F', 200), ('O', 300)],
            'H': [('E', 100), ('K', 300)],
            'K': [('B', 200), ('H', 300)],
            'L': [('C', 200), ('M', 50)],
            'M': [('C', 100), ('L', 50), ('N', 100)],
            'N': [('M', 100), ('O', 100)],
            'O': [('N', 100), ('G', 300)]
        }
        max_paths = 4
        self.action_space = spaces.Discrete(max_paths)
      
        positions = len(self.map)
        # observation: current position, reward minus cost for each of the 4 local paths,
        # and the remaining reward of every location
        # a non-existing path is encoded as -1000 and does not change the position
        # look at what #getObservation returns if you are confused
        low = np.append(np.append([0], np.full(max_paths, -1000)), np.full(positions, 0))
        high = np.append(np.append([positions - 1], np.full(max_paths, 1000)), np.full(positions, 1000))
        self.observation_space = spaces.Box(low=low,
                                             high=high,
                                             dtype=np.float32)
        self.reward_range = (-1, 1)

        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        self.envReward = 0
        self.envEpisodeCount = 0
        self.envStepCount = 0

        self.reset()
        self.optimum = self.calculate_customers_reward()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def iterate_path(self, state, action):
        paths = self.map[state]
        if action < len(paths):
          return paths[action]
        else:
          # sorry, no such action, stay where you are and pay a high penalty
          return (state, 1000)
      
    def step(self, action):
        destination, cost = self.iterate_path(self.state, action)
        lastState = self.state
        customerReward = self.customer_reward[destination]
        reward = (customerReward - cost) / self.optimum

        self.state = destination
        self.customer_visited(destination)
        done = destination == 'S' and self.all_customers_visited()

        stateAsInt = state_name_to_int(self.state)
        self.totalReward += reward
        self.stepCount += 1
        self.envReward += reward
        self.envStepCount += 1

        if self.showStep:
            print( "Episode: " + ("%4.0f  " % self.envEpisodeCount) + 
                   " Step: " + ("%4.0f  " % self.stepCount) + 
                   lastState + ' --' + str(action) + '-> ' + self.state + 
                   ' R=' + ("% 2.2f" % reward) + ' totalR=' + ("% 3.2f" % self.totalReward) + 
                   ' cost=' + ("%4.0f" % cost) + ' customerR=' + ("%4.0f" % customerReward) + ' optimum=' + ("%4.0f" % self.optimum)      
                   )

        if done and not self.isDone:
            self.envEpisodeCount += 1
            if BeraterEnv.showDone:
                episodes = BeraterEnv.envEpisodeModulo
                if (self.envEpisodeCount % BeraterEnv.envEpisodeModulo != 0):
                    episodes = self.envEpisodeCount % BeraterEnv.envEpisodeModulo
                print( "Done: " + 
                        ("episodes=%6.0f  " % self.envEpisodeCount) + 
                        ("avgSteps=%6.2f  " % (self.envStepCount/episodes)) + 
                        ("avgTotalReward=% 3.2f" % (self.envReward/episodes) )
                        )
                if (self.envEpisodeCount%BeraterEnv.envEpisodeModulo) == 0:
                    self.envReward = 0
                    self.envStepCount = 0

        self.isDone = done
        observation = self.getObservation(stateAsInt)
        info = {"from": lastState, "to": destination}

        return observation, reward, done, info

    def getObservation(self, position):
        result = np.array([ position, 
                               self.getPathObservation(position, 0),
                               self.getPathObservation(position, 1),
                               self.getPathObservation(position, 2),
                               self.getPathObservation(position, 3)
                              ],
                             dtype=np.float32)
        all_rest_rewards = list(self.customer_reward.values())
        result = np.append(result, all_rest_rewards)
        return result

    def getPathObservation(self, position, path):
        source = int_to_state_name(position)
        paths = self.map[source]
        if path < len(paths):
          target, cost = paths[path]
          reward = self.customer_reward[target] 
          result = reward - cost
        else:
          result = -1000

        return result

    def customer_visited(self, customer):
        self.customer_reward[customer] = 0

    def all_customers_visited(self):
        return self.calculate_customers_reward() == 0

    def calculate_customers_reward(self):
        return sum(self.customer_reward.values())

      
    def modulate_reward(self):
      number_of_customers = len(self.map) - 1
      number_per_consultant = int(number_of_customers/2)
#       number_per_consultant = int(number_of_customers/1.5)
      self.customer_reward = {
          'S': 0
      }
      for customer_nr in range(1, number_of_customers + 1):
        self.customer_reward[int_to_state_name(customer_nr)] = 0
      
      # every consultant only visits a few random customers
      samples = random.sample(range(1, number_of_customers + 1), k=number_per_consultant)
      key_list = list(self.customer_reward.keys())
      for sample in samples:
        self.customer_reward[key_list[sample]] = 1000

      
    def reset(self):
        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        self.modulate_reward()
        self.state = 'S'
        return self.getObservation(state_name_to_int(self.state))
      
    def render(self):
      print(self.customer_reward)

In [3]:
env = BeraterEnv()
print(env.reset())
print(env.customer_reward)


[    0.  -300.   900.  -200. -1000.     0.     0.  1000.     0.     0.
  1000.     0.  1000.     0.  1000.     0.  1000.  1000.     0.]
{'S': 0, 'A': 0, 'B': 1000, 'C': 0, 'D': 0, 'E': 1000, 'F': 0, 'G': 1000, 'H': 0, 'K': 1000, 'L': 0, 'M': 1000, 'N': 1000, 'O': 0}
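
To make observation vectors like the one above easier to read, here is a small helper (purely illustrative, not part of the environment) that slices the 19 values back into the current position, the four local path observations, and the remaining reward per location:

def decode_observation(observation, max_paths=4):
    # layout produced by BeraterEnv.getObservation:
    # [position, path observation 0..3, remaining reward for S, A, ..., O]
    position = int(observation[0])
    path_obs = list(observation[1:1 + max_paths])
    rest = observation[1 + max_paths:]
    return {
        'position': int_to_state_name(position),
        'paths': path_obs,
        'rest_rewards': {int_to_state_name(i): float(r) for i, r in enumerate(rest)}
    }

decode_observation(env.reset())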

Try out Environment


In [4]:
BeraterEnv.showStep = True
BeraterEnv.showDone = True

env = BeraterEnv()
print(env)
observation = env.reset()
print(observation)

for t in range(1000):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()
print(observation)


<BeraterEnv instance>
[    0.   700.  -100.   800. -1000.     0.  1000.     0.  1000.  1000.
  1000.  1000.  1000.     0.     0.     0.     0.     0.     0.]
Episode:    0   Step:    1  S --0-> A R= 0.12 totalR= 0.12 cost= 300 customerR=1000 optimum=6000
Episode:    0   Step:    2  A --3-> D R= 0.15 totalR= 0.27 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:    3  D --1-> F R= 0.16 totalR= 0.42 cost=  50 customerR=1000 optimum=6000
Episode:    0   Step:    4  F --0-> D R=-0.01 totalR= 0.42 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    5  D --3-> D R=-0.17 totalR= 0.25 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    6  D --3-> D R=-0.17 totalR= 0.08 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    7  D --3-> D R=-0.17 totalR=-0.08 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    8  D --3-> D R=-0.17 totalR=-0.25 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:    9  D --1-> F R=-0.01 totalR=-0.26 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   10  F --3-> F R=-0.17 totalR=-0.43 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   11  F --1-> E R= 0.15 totalR=-0.28 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:   12  E --2-> H R=-0.02 totalR=-0.29 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   13  H --0-> E R=-0.02 totalR=-0.31 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   14  E --3-> E R=-0.17 totalR=-0.47 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   15  E --2-> H R=-0.02 totalR=-0.49 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   16  H --0-> E R=-0.02 totalR=-0.51 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   17  E --0-> A R=-0.02 totalR=-0.53 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   18  A --0-> S R=-0.05 totalR=-0.58 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   19  S --2-> C R= 0.13 totalR=-0.44 cost= 200 customerR=1000 optimum=6000
Episode:    0   Step:   20  C --1-> B R=-0.01 totalR=-0.45 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   21  B --2-> C R=-0.01 totalR=-0.46 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   22  C --3-> L R=-0.03 totalR=-0.49 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   23  L --3-> L R=-0.17 totalR=-0.66 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   24  L --2-> L R=-0.17 totalR=-0.83 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   25  L --0-> C R=-0.03 totalR=-0.86 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   26  C --1-> B R=-0.01 totalR=-0.87 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   27  B --1-> A R=-0.02 totalR=-0.88 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   28  A --1-> B R=-0.02 totalR=-0.90 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   29  B --1-> A R=-0.02 totalR=-0.92 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   30  A --0-> S R=-0.05 totalR=-0.97 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   31  S --1-> B R=-0.02 totalR=-0.98 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   32  B --0-> S R=-0.02 totalR=-1.00 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   33  S --3-> S R=-0.17 totalR=-1.17 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   34  S --0-> A R=-0.05 totalR=-1.22 cost= 300 customerR=   0 optimum=6000
Episode:    0   Step:   35  A --3-> D R=-0.02 totalR=-1.23 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   36  D --1-> F R=-0.01 totalR=-1.24 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:   37  F --2-> G R= 0.13 totalR=-1.11 cost= 200 customerR=1000 optimum=6000
Episode:    0   Step:   38  G --3-> G R=-0.17 totalR=-1.28 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   39  G --3-> G R=-0.17 totalR=-1.44 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   40  G --0-> F R=-0.03 totalR=-1.48 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   41  F --2-> G R=-0.03 totalR=-1.51 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   42  G --3-> G R=-0.17 totalR=-1.68 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   43  G --0-> F R=-0.03 totalR=-1.71 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   44  F --1-> E R=-0.02 totalR=-1.73 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   45  E --3-> E R=-0.17 totalR=-1.89 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   46  E --1-> F R=-0.02 totalR=-1.91 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   47  F --3-> F R=-0.17 totalR=-2.08 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   48  F --3-> F R=-0.17 totalR=-2.24 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   49  F --2-> G R=-0.03 totalR=-2.28 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   50  G --3-> G R=-0.17 totalR=-2.44 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   51  G --0-> F R=-0.03 totalR=-2.48 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   52  F --1-> E R=-0.02 totalR=-2.49 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   53  E --1-> F R=-0.02 totalR=-2.51 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   54  F --1-> E R=-0.02 totalR=-2.52 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   55  E --3-> E R=-0.17 totalR=-2.69 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   56  E --0-> A R=-0.02 totalR=-2.71 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   57  A --3-> D R=-0.02 totalR=-2.72 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   58  D --2-> D R=-0.17 totalR=-2.89 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   59  D --0-> A R=-0.02 totalR=-2.91 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   60  A --3-> D R=-0.02 totalR=-2.92 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   61  D --3-> D R=-0.17 totalR=-3.09 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   62  D --2-> D R=-0.17 totalR=-3.26 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   63  D --3-> D R=-0.17 totalR=-3.42 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   64  D --2-> D R=-0.17 totalR=-3.59 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   65  D --3-> D R=-0.17 totalR=-3.76 cost=1000 customerR=   0 optimum=6000
Episode:    0   Step:   66  D --0-> A R=-0.02 totalR=-3.77 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   67  A --2-> E R=-0.02 totalR=-3.79 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   68  E --0-> A R=-0.02 totalR=-3.81 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   69  A --0-> S R=-0.05 totalR=-3.86 cost= 300 customerR=   0 optimum=6000
Done: episodes=     1  avgSteps= 69.00  avgTotalReward=-3.86
Episode finished after 69 timesteps
[    0.  -300.  -100.  -200. -1000.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.]

Baseline


In [0]:
from copy import deepcopy
import json

class Baseline():

  def __init__(self, env, verbose=1):
    self.env = env
    self.verbose = verbose
    self.reset()

  def reset(self):
    self.map = self.env.map
    self.rewards = self.env.customer_reward.copy()
    
  def as_string(self, state):
    # reward/cost do not hurt but are useless for deduplication; path would obscure otherwise identical states
    new_state = {
        'rewards': state['rewards'],
        'position': state['position']
    }
    return json.dumps(new_state, sort_keys=True)
  
  def is_goal(self, state):
    if state['position'] != 'S': return False
    for reward in state['rewards'].values():
      if reward != 0: return False
    return True
    

  def expand(self, state):
    states = []
    for position, cost in self.map[state['position']]:
      new_state = deepcopy(state)
      new_state['position'] = position
      new_state['rewards'][position] = 0
      reward = state['rewards'][position]
      new_state['reward'] += reward
      new_state['cost'] += cost
      new_state['path'].append(position)
      states.append(new_state)
    return states

  def search(self, root, max_depth = 25):
      closed = set()
      open_list = [root]

      while open_list:
          state = open_list.pop(0)
          if self.as_string(state) in closed: continue

          closed.add(self.as_string(state))

          depth = len(state['path'])
          if depth > max_depth:
            if self.verbose > 0:
              print("Visited:", len(closed))
              print("Reached max depth without reaching goal")
            return None

          if self.is_goal(state):
            scaled_reward = (state['reward'] - state['cost']) / 6000
            state['scaled_reward'] = scaled_reward
            if self.verbose > 0:
              print("Scaled reward:", scaled_reward)
              print("Perfect path", state['path'])
            return state

          expanded = self.expand(state)
          open_list += expanded
          # make this best first: expand the cheapest state next
          open_list.sort(key=lambda state: state['cost'])
        
  def find_optimum(self):
    initial_state = {
        'rewards': self.rewards.copy(),
        'position': 'S',
        'reward': 0,
        'cost': 0,
        'path': ['S']
    }
    return self.search(initial_state)
  
  def benchmark(self, model, sample_runs=100):
    self.verbose = 0
    BeraterEnv.showStep = False
    BeraterEnv.showDone = False

    perfect_rewards = []
    model_rewards = []
    for run in range(sample_runs):
      observation = self.env.reset()
      self.reset()
      
      optimum_state = self.find_optimum()
      perfect_rewards.append(optimum_state['scaled_reward'])
      
      # recurrent state and done-mask placeholders (only relevant for recurrent policies)
      state = np.zeros((1, 2*128))
      dones = np.zeros((1))

      for t in range(1000):
        actions, _, state, _ = model.step(observation, S=state, M=dones)
        observation, reward, done, info = self.env.step(actions[0])
        if done:
          break
      model_rewards.append(self.env.totalReward)
    return perfect_rewards, model_rewards

Train model

Estimation

  • total cost when travelling all paths (back and forth): 2500
  • sum of all rewards: 6000
  • but: rewards are now much more sparse while the routes stay the same, so expect a bit less
  • estimate: assuming no illegal moves, somewhere between (see the quick check below)
    • half the travel cost: (6000 - 1250) / 6000 = 0.79
    • and the full travel cost: (6000 - 2500) / 6000 = 0.58
  • additionally: the agent only sees a small part of the whole scenario
    • which changes with every episode
    • this was fine as long as the network could memorize one fixed scenario
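
A quick check of the two bounds from the estimate above, using the same figures (plain arithmetic, nothing taken from the environment):

# reward estimate for a reasonable policy, figures taken from the list above
all_rewards = 6000            # sum of customer rewards per episode
full_travel_cost = 2500       # assumed cost of travelling all paths back and forth
half_travel_cost = full_travel_cost / 2

print((all_rewards - half_travel_cost) / all_rewards)  # ~0.79
print((all_rewards - full_travel_cost) / all_rewards)  # ~0.58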

In [6]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)


1.12.0

In [0]:
!rm -r logs
!mkdir logs
!mkdir logs/berater

In [8]:
# https://github.com/openai/baselines/blob/master/baselines/deepq/experiments/train_pong.py
# log_dir = logger.get_dir()
log_dir = '/content/logs/berater/'

import gym
from baselines import bench
from baselines import logger

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor
from baselines.ppo2 import ppo2

BeraterEnv.showStep = False
BeraterEnv.showDone = False

env = BeraterEnv()

wrapped_env = DummyVecEnv([lambda: BeraterEnv()])
monitored_env = VecMonitor(wrapped_env, log_dir)

# https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py
# https://github.com/openai/baselines/blob/master/baselines/common/models.py#L30
# https://arxiv.org/abs/1607.06450 for layer_norm

# lr linear from lr=1e-2 to lr=1e-4 (default lr=3e-4)
def lr_range(frac):
  # we get the remaining updates between 1 and 0
  start_lr = 1e-2
  end_lr = 1e-4
  diff_lr = start_lr - end_lr
  lr = end_lr + diff_lr * frac
  return lr
  
  
%time model = ppo2.learn(\
    env=monitored_env,\
    network='mlp',\
    num_hidden=500,\
    num_layers=3,\
    lr=lr_range,\
    gamma=1.0,\
    ent_coef=0.05,\
    layer_norm=True,\
    total_timesteps=500000)

# model.save('berater-ppo-v10.pkl')
monitored_env.close()


Logging to /tmp/openai-2019-01-20-12-42-29-316583
-----------------------------------
| approxkl           | 1.106912   |
| clipfrac           | 0.85180664 |
| eplenmean          | 112        |
| eprewmean          | -7.344446  |
| explained_variance | -0.338     |
| fps                | 454        |
| nupdates           | 1          |
| policy_entropy     | 0.95964    |
| policy_loss        | 0.19367278 |
| serial_timesteps   | 2048       |
| time_elapsed       | 4.5        |
| total_timesteps    | 2048       |
| value_loss         | 6.1471424  |
-----------------------------------
------------------------------------
| approxkl           | 0.04339493  |
| clipfrac           | 0.4329834   |
| eplenmean          | 416         |
| eprewmean          | -20.148624  |
| explained_variance | -0.723      |
| fps                | 489         |
| nupdates           | 10          |
| policy_entropy     | 0.65929854  |
| policy_loss        | 0.015227782 |
| serial_timesteps   | 20480       |
| time_elapsed       | 42.2        |
| total_timesteps    | 20480       |
| value_loss         | 0.12013262  |
------------------------------------
------------------------------------
| approxkl           | 0.034814283 |
| clipfrac           | 0.26574707  |
| eplenmean          | 393         |
| eprewmean          | -15.240261  |
| explained_variance | -0.342      |
| fps                | 490         |
| nupdates           | 20          |
| policy_entropy     | 0.90020245  |
| policy_loss        | 0.023324102 |
| serial_timesteps   | 40960       |
| time_elapsed       | 84.1        |
| total_timesteps    | 40960       |
| value_loss         | 1.227856    |
------------------------------------
------------------------------------
| approxkl           | 0.054020975 |
| clipfrac           | 0.36206055  |
| eplenmean          | 68.5        |
| eprewmean          | -1.2545832  |
| explained_variance | 0.563       |
| fps                | 487         |
| nupdates           | 30          |
| policy_entropy     | 1.0407382   |
| policy_loss        | 0.020006128 |
| serial_timesteps   | 61440       |
| time_elapsed       | 126         |
| total_timesteps    | 61440       |
| value_loss         | 2.0772023   |
------------------------------------
-------------------------------------
| approxkl           | 0.011550133  |
| clipfrac           | 0.14611816   |
| eplenmean          | 52.4         |
| eprewmean          | -0.6906667   |
| explained_variance | 0.54         |
| fps                | 486          |
| nupdates           | 40           |
| policy_entropy     | 1.0082229    |
| policy_loss        | -0.003875678 |
| serial_timesteps   | 81920        |
| time_elapsed       | 168          |
| total_timesteps    | 81920        |
| value_loss         | 1.0327008    |
-------------------------------------
--------------------------------------
| approxkl           | 0.019898184   |
| clipfrac           | 0.21374512    |
| eplenmean          | 28.8          |
| eprewmean          | 0.16991667    |
| explained_variance | -0.0189       |
| fps                | 485           |
| nupdates           | 50            |
| policy_entropy     | 0.83662224    |
| policy_loss        | -0.0059831487 |
| serial_timesteps   | 102400        |
| time_elapsed       | 210           |
| total_timesteps    | 102400        |
| value_loss         | 0.089379594   |
--------------------------------------
-------------------------------------
| approxkl           | 0.04559314   |
| clipfrac           | 0.1986084    |
| eplenmean          | 23           |
| eprewmean          | 0.49091664   |
| explained_variance | 0.491        |
| fps                | 485          |
| nupdates           | 60           |
| policy_entropy     | 0.5633248    |
| policy_loss        | -0.010887655 |
| serial_timesteps   | 122880       |
| time_elapsed       | 253          |
| total_timesteps    | 122880       |
| value_loss         | 0.010913096  |
-------------------------------------
--------------------------------------
| approxkl           | 0.016467288   |
| clipfrac           | 0.119140625   |
| eplenmean          | 17.8          |
| eprewmean          | 0.6400001     |
| explained_variance | 0.784         |
| fps                | 486           |
| nupdates           | 70            |
| policy_entropy     | 0.34218857    |
| policy_loss        | -0.0059060347 |
| serial_timesteps   | 143360        |
| time_elapsed       | 295           |
| total_timesteps    | 143360        |
| value_loss         | 0.0054553514  |
--------------------------------------
--------------------------------------
| approxkl           | 0.01945089    |
| clipfrac           | 0.10852051    |
| eplenmean          | 16.4          |
| eprewmean          | 0.6736666     |
| explained_variance | 0.836         |
| fps                | 485           |
| nupdates           | 80            |
| policy_entropy     | 0.26294127    |
| policy_loss        | -0.0045008133 |
| serial_timesteps   | 163840        |
| time_elapsed       | 338           |
| total_timesteps    | 163840        |
| value_loss         | 0.004712579   |
--------------------------------------
-------------------------------------
| approxkl           | 0.032737214  |
| clipfrac           | 0.13598633   |
| eplenmean          | 19.4         |
| eprewmean          | 0.6159167    |
| explained_variance | 0.691        |
| fps                | 488          |
| nupdates           | 90           |
| policy_entropy     | 0.3559902    |
| policy_loss        | -0.017158197 |
| serial_timesteps   | 184320       |
| time_elapsed       | 380          |
| total_timesteps    | 184320       |
| value_loss         | 0.006609668  |
-------------------------------------
-------------------------------------
| approxkl           | 0.03589167   |
| clipfrac           | 0.11401367   |
| eplenmean          | 15.6         |
| eprewmean          | 0.67225015   |
| explained_variance | 0.893        |
| fps                | 485          |
| nupdates           | 100          |
| policy_entropy     | 0.266416     |
| policy_loss        | -0.006536624 |
| serial_timesteps   | 204800       |
| time_elapsed       | 422          |
| total_timesteps    | 204800       |
| value_loss         | 0.0033179817 |
-------------------------------------
-------------------------------------
| approxkl           | 0.010546623  |
| clipfrac           | 0.07910156   |
| eplenmean          | 15.2         |
| eprewmean          | 0.69358337   |
| explained_variance | 0.908        |
| fps                | 488          |
| nupdates           | 110          |
| policy_entropy     | 0.28306586   |
| policy_loss        | -0.00791038  |
| serial_timesteps   | 225280       |
| time_elapsed       | 464          |
| total_timesteps    | 225280       |
| value_loss         | 0.0024299715 |
-------------------------------------
-------------------------------------
| approxkl           | 0.0128233405 |
| clipfrac           | 0.07409668   |
| eplenmean          | 16.2         |
| eprewmean          | 0.6847499    |
| explained_variance | 0.912        |
| fps                | 490          |
| nupdates           | 120          |
| policy_entropy     | 0.18715079   |
| policy_loss        | -0.008657055 |
| serial_timesteps   | 245760       |
| time_elapsed       | 506          |
| total_timesteps    | 245760       |
| value_loss         | 0.002223344  |
-------------------------------------
-------------------------------------
| approxkl           | 0.011322146  |
| clipfrac           | 0.06738281   |
| eplenmean          | 15.7         |
| eprewmean          | 0.6863335    |
| explained_variance | 0.902        |
| fps                | 485          |
| nupdates           | 130          |
| policy_entropy     | 0.2390564    |
| policy_loss        | -0.007650907 |
| serial_timesteps   | 266240       |
| time_elapsed       | 549          |
| total_timesteps    | 266240       |
| value_loss         | 0.0031219693 |
-------------------------------------
--------------------------------------
| approxkl           | 0.015042002   |
| clipfrac           | 0.08508301    |
| eplenmean          | 16.6          |
| eprewmean          | 0.6698334     |
| explained_variance | 0.905         |
| fps                | 488           |
| nupdates           | 140           |
| policy_entropy     | 0.23085533    |
| policy_loss        | -0.0080692535 |
| serial_timesteps   | 286720        |
| time_elapsed       | 591           |
| total_timesteps    | 286720        |
| value_loss         | 0.0028567512  |
--------------------------------------
-------------------------------------
| approxkl           | 0.014706541  |
| clipfrac           | 0.07287598   |
| eplenmean          | 15.9         |
| eprewmean          | 0.6889167    |
| explained_variance | 0.907        |
| fps                | 482          |
| nupdates           | 150          |
| policy_entropy     | 0.21100071   |
| policy_loss        | -0.008012    |
| serial_timesteps   | 307200       |
| time_elapsed       | 634          |
| total_timesteps    | 307200       |
| value_loss         | 0.0024616255 |
-------------------------------------
--------------------------------------
| approxkl           | 0.0069028493  |
| clipfrac           | 0.057373047   |
| eplenmean          | 15            |
| eprewmean          | 0.7115834     |
| explained_variance | 0.946         |
| fps                | 484           |
| nupdates           | 160           |
| policy_entropy     | 0.16892657    |
| policy_loss        | -0.0069142045 |
| serial_timesteps   | 327680        |
| time_elapsed       | 676           |
| total_timesteps    | 327680        |
| value_loss         | 0.0016335829  |
--------------------------------------
-------------------------------------
| approxkl           | 0.039367225  |
| clipfrac           | 0.07067871   |
| eplenmean          | 15.2         |
| eprewmean          | 0.6885834    |
| explained_variance | 0.825        |
| fps                | 490          |
| nupdates           | 170          |
| policy_entropy     | 0.15657976   |
| policy_loss        | -0.013493077 |
| serial_timesteps   | 348160       |
| time_elapsed       | 719          |
| total_timesteps    | 348160       |
| value_loss         | 0.0063659805 |
-------------------------------------
-------------------------------------
| approxkl           | 0.014341769  |
| clipfrac           | 0.056640625  |
| eplenmean          | 15.1         |
| eprewmean          | 0.70958346   |
| explained_variance | 0.952        |
| fps                | 489          |
| nupdates           | 180          |
| policy_entropy     | 0.16610754   |
| policy_loss        | -0.007942176 |
| serial_timesteps   | 368640       |
| time_elapsed       | 761          |
| total_timesteps    | 368640       |
| value_loss         | 0.001545281  |
-------------------------------------
-------------------------------------
| approxkl           | 0.042431876  |
| clipfrac           | 0.032958984  |
| eplenmean          | 14.2         |
| eprewmean          | 0.72108346   |
| explained_variance | 0.953        |
| fps                | 481          |
| nupdates           | 190          |
| policy_entropy     | 0.10738936   |
| policy_loss        | -0.008103901 |
| serial_timesteps   | 389120       |
| time_elapsed       | 803          |
| total_timesteps    | 389120       |
| value_loss         | 0.0015851058 |
-------------------------------------
--------------------------------------
| approxkl           | 0.0067297537  |
| clipfrac           | 0.027954102   |
| eplenmean          | 14.3          |
| eprewmean          | 0.72550005    |
| explained_variance | 0.958         |
| fps                | 486           |
| nupdates           | 200           |
| policy_entropy     | 0.105539836   |
| policy_loss        | -0.0072995224 |
| serial_timesteps   | 409600        |
| time_elapsed       | 846           |
| total_timesteps    | 409600        |
| value_loss         | 0.0012891506  |
--------------------------------------
--------------------------------------
| approxkl           | 0.0032799218  |
| clipfrac           | 0.032470703   |
| eplenmean          | 14.2          |
| eprewmean          | 0.71125       |
| explained_variance | 0.966         |
| fps                | 485           |
| nupdates           | 210           |
| policy_entropy     | 0.14588971    |
| policy_loss        | -0.0056610624 |
| serial_timesteps   | 430080        |
| time_elapsed       | 888           |
| total_timesteps    | 430080        |
| value_loss         | 0.0010298654  |
--------------------------------------
-------------------------------------
| approxkl           | 0.0039197803 |
| clipfrac           | 0.02319336   |
| eplenmean          | 14.3         |
| eprewmean          | 0.7120001    |
| explained_variance | 0.969        |
| fps                | 485          |
| nupdates           | 220          |
| policy_entropy     | 0.10374612   |
| policy_loss        | -0.004388118 |
| serial_timesteps   | 450560       |
| time_elapsed       | 931          |
| total_timesteps    | 450560       |
| value_loss         | 0.0009157567 |
-------------------------------------
-------------------------------------
| approxkl           | 0.0014609414 |
| clipfrac           | 0.014282227  |
| eplenmean          | 14.4         |
| eprewmean          | 0.7238334    |
| explained_variance | 0.97         |
| fps                | 482          |
| nupdates           | 230          |
| policy_entropy     | 0.09417924   |
| policy_loss        | -0.003142601 |
| serial_timesteps   | 471040       |
| time_elapsed       | 973          |
| total_timesteps    | 471040       |
| value_loss         | 0.0009445905 |
-------------------------------------
--------------------------------------
| approxkl           | 0.0007847682  |
| clipfrac           | 0.0072021484  |
| eplenmean          | 14.3          |
| eprewmean          | 0.72475004    |
| explained_variance | 0.963         |
| fps                | 487           |
| nupdates           | 240           |
| policy_entropy     | 0.09368164    |
| policy_loss        | -0.0042664674 |
| serial_timesteps   | 491520        |
| time_elapsed       | 1.02e+03      |
| total_timesteps    | 491520        |
| value_loss         | 0.0012038256  |
--------------------------------------
CPU times: user 21min 36s, sys: 3min 20s, total: 24min 56s
Wall time: 17min 15s

In [0]:
# !ls -l $log_dir

In [10]:
from baselines.common import plot_util as pu
results = pu.load_results(log_dir)

import matplotlib.pyplot as plt
import numpy as np
r = results[0]
plt.ylim(0, .75)
# plt.plot(np.cumsum(r.monitor.l), r.monitor.r)
plt.plot(np.cumsum(r.monitor.l), pu.smooth(r.monitor.r, radius=100))


/usr/local/lib/python3.6/dist-packages/baselines/bench/monitor.py:164: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  df.headers = headers # HACK to preserve backwards compatibility
Out[10]:
[<matplotlib.lines.Line2D at 0x7fbcca1dc400>]

Enjoy model


In [11]:
import numpy as np 

observation = env.reset()
env.render()
baseline = Baseline(env)


{'S': 0, 'A': 1000, 'B': 1000, 'C': 0, 'D': 0, 'E': 0, 'F': 0, 'G': 1000, 'H': 1000, 'K': 1000, 'L': 0, 'M': 0, 'N': 0, 'O': 1000}

In [12]:
state = np.zeros((1, 2*128))
dones = np.zeros((1))

BeraterEnv.showStep = True
BeraterEnv.showDone = False

for t in range(1000):
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    observation, reward, done, info = env.step(actions[0])
    if done:
        print("Episode finished after {} timesteps, reward={}".format(t+1, env.totalReward))
        break
env.close()


Episode:    0   Step:    1  S --1-> B R= 0.15 totalR= 0.15 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:    2  B --2-> C R=-0.01 totalR= 0.14 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    3  C --2-> M R=-0.02 totalR= 0.12 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    4  M --2-> N R=-0.02 totalR= 0.11 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:    5  N --1-> O R= 0.15 totalR= 0.26 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:    6  O --1-> G R= 0.12 totalR= 0.38 cost= 300 customerR=1000 optimum=6000
Episode:    0   Step:    7  G --0-> F R=-0.03 totalR= 0.34 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:    8  F --0-> D R=-0.01 totalR= 0.33 cost=  50 customerR=   0 optimum=6000
Episode:    0   Step:    9  D --0-> A R= 0.15 totalR= 0.48 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:   10  A --2-> E R=-0.02 totalR= 0.47 cost= 100 customerR=   0 optimum=6000
Episode:    0   Step:   11  E --2-> H R= 0.15 totalR= 0.62 cost= 100 customerR=1000 optimum=6000
Episode:    0   Step:   12  H --1-> K R= 0.12 totalR= 0.73 cost= 300 customerR=1000 optimum=6000
Episode:    0   Step:   13  K --0-> B R=-0.03 totalR= 0.70 cost= 200 customerR=   0 optimum=6000
Episode:    0   Step:   14  B --0-> S R=-0.02 totalR= 0.68 cost= 100 customerR=   0 optimum=6000
Episode finished after 14 timesteps, reward=0.6833333333333332

In [13]:
%time baseline.find_optimum()


Scaled reward: 0.6833333333333333
Perfect path ['S', 'B', 'C', 'M', 'N', 'O', 'G', 'F', 'D', 'A', 'E', 'H', 'K', 'B', 'S']
CPU times: user 91.9 ms, sys: 0 ns, total: 91.9 ms
Wall time: 95.6 ms
Out[13]:
{'cost': 1900,
 'path': ['S',
  'B',
  'C',
  'M',
  'N',
  'O',
  'G',
  'F',
  'D',
  'A',
  'E',
  'H',
  'K',
  'B',
  'S'],
 'position': 'S',
 'reward': 6000,
 'rewards': {'A': 0,
  'B': 0,
  'C': 0,
  'D': 0,
  'E': 0,
  'F': 0,
  'G': 0,
  'H': 0,
  'K': 0,
  'L': 0,
  'M': 0,
  'N': 0,
  'O': 0,
  'S': 0},
 'scaled_reward': 0.6833333333333333}

Evaluation


In [17]:
baseline = Baseline(env)
optimum_score, model_score = baseline.benchmark(model, sample_runs=100)
optimum_score, model_score


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-17-4713eac5c698> in <module>()
      1 baseline = Baseline(env)
----> 2 optimum, model = baseline.benchmark(model, sample_runs=100)
      3 optimum, model

<ipython-input-5-b46eb0f6385e> in benchmark(self, model, sample_runs)
     99 
    100       for t in range(1000):
--> 101         actions, _, state, _ = model.step(observation, S=state, M=dones)
    102         observation, reward, done, info = self.env.step(actions[0])
    103         if done:

AttributeError: 'list' object has no attribute 'step'
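
The traceback explains the failure: an earlier execution of this cell ran optimum, model = baseline.benchmark(model, sample_runs=100), which rebound model to the list of rewards returned by benchmark, so it no longer has a step method. Once model again refers to the trained PPO model (for example by re-running the training cell, or by restoring it from the commented-out pickle), the evaluation can be repeated with distinct result names, roughly as below; the two cells that follow show mean and standard deviation of both score lists from such a run.

# after `model` has been restored, evaluate again without clobbering it
baseline = Baseline(env)
optimum_score, model_score = baseline.benchmark(model, sample_runs=100)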

In [15]:
np.array(optimum_score).mean(), np.array(optimum_score).std()


Out[15]:
(0.7541666666666667, 0.043501277120460626)

In [16]:
np.array(model_score).mean(), np.array(model_score).std()


Out[16]:
(0.7416666666666666, 0.04958158260214501)

In [0]: