Berater Environment v5

Changes from v4

  1. encode the observation locally: the current position plus the net value (customer reward minus cost) of up to four outgoing connections (see the sketch below)
  2. a non-existing connection now carries the highest penalty: choosing it costs 1000 and the agent stays in place; in the observation it is encoded as -1000
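
To make the new local encoding concrete, here is a minimal sketch (not part of the environment code below) that rebuilds the initial observation at 'S' by hand; the numbers match the env output further down:

In [0]:
# illustrative only: reconstruct the observation for the start state 'S'
paths_from_S = [('A', 100), ('B', 400), ('C', 200)]  # connections of 'S' in the small map
customer_reward = 1000                               # every unvisited customer is worth 1000
observation = [0]                                    # position: 'S' is encoded as 0
observation += [customer_reward - cost for _, cost in paths_from_S]
observation += [-1000] * (4 - len(paths_from_S))     # pad non-existing connections with -1000
print(observation)                                   # [0, 900, 600, 800, -1000]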

Next steps

  1. use the more complex customer graph (currently commented out in __init__)
  2. per episode, set some customer rewards to 0 to simulate that each consultant serves a different set of customers (see the sketch below)
  3. make sure the learned policy generalizes well across these variations
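
A possible sketch for step 2 (the helper name and the probability are illustrative assumptions, based on the small S/A/B/C map); reset() could draw a fresh reward dictionary like this per episode:

In [0]:
import random

def sample_customer_rewards(reward_per_customer=1000, p_active=0.5):
    # illustrative sketch: per episode, keep each customer's reward with
    # probability p_active and set it to 0 otherwise, so every episode
    # simulates a consultant with a different set of customers
    rewards = {'S': 0}
    for customer in ['A', 'B', 'C']:
        rewards[customer] = reward_per_customer if random.random() < p_active else 0
    return rewards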

Installation (required for Colab)


In [0]:
!pip install git+https://github.com/openai/baselines >/dev/null
!pip install gym >/dev/null

Environment


In [0]:
import numpy
import gym
from gym.utils import seeding
from gym import spaces

def state_name_to_int(state):
    state_name_map = {
        'S': 0,
        'A': 1,
        'B': 2,
        'C': 3,
        'D': 4,
        'E': 5,
        'F': 6,
        'G': 7,
        'H': 8,
        'K': 9,
        'L': 10,
        'M': 11,
        'N': 12,
        'O': 13
    }
    return state_name_map[state]

def int_to_state_name(state_as_int):
    state_map = {
        0: 'S',
        1: 'A',
        2: 'B',
        3: 'C',
        4: 'D',
        5: 'E',
        6: 'F',
        7: 'G',
        8: 'H',
        9: 'K',
        10: 'L',
        11: 'M',
        12: 'N',
        13: 'O'
    }
    return state_map[state_as_int]
    
class BeraterEnv(gym.Env):
    """
    The Berater Problem

    Actions: 
    There are 4 discrete deterministic actions, each choosing one direction
    """
    metadata = {'render.modes': ['ansi']}
    
    showStep = False
    showDone = True
    envEpisodeModulo = 100

    def __init__(self):
        self.map = {
            'S': [('A', 100), ('B', 400), ('C', 200 )],
            'A': [('B', 250), ('C', 400), ('S', 100 )],
            'B': [('A', 250), ('C', 250), ('S', 400 )],
            'C': [('A', 400), ('B', 250), ('S', 200 )]
        }
#         self.map = {
#             'S': [('A', 300), ('B', 100), ('C', 200 )],
#             'A': [('S', 300), ('B', 100), ('E', 100 ), ('D', 100 )],
#             'B': [('S', 100), ('A', 100), ('C', 50 ), ('K', 200 )],
#             'C': [('S', 200), ('B', 50), ('M', 100 ), ('L', 200 )],
#             'D': [('A', 100), ('F', 50)],
#             'E': [('A', 100), ('F', 100), ('H', 100)],
#             'F': [('D', 50), ('E', 100), ('G', 200)],
#             'G': [('F', 200), ('O', 300)],
#             'H': [('E', 100), ('K', 300)],
#             'K': [('B', 200), ('H', 300)],
#             'L': [('C', 200), ('M', 50)],
#             'M': [('C', 100), ('L', 50), ('N', 100)],
#             'N': [('M', 100), ('O', 100)],
#             'O': [('N', 100), ('G', 300)]
#         }
        self.action_space = spaces.Discrete(4)
        # observation: current position plus up to 4 paths from that position;
        # a non-existing path is encoded as -1000 and choosing it does not change the position
        self.observation_space = spaces.Box(low=numpy.array([0,-1000,-1000,-1000,-1000]),
                                             high=numpy.array([13,1000,1000,1000,1000]),
                                             dtype=numpy.float32)
        self.reward_range = (-1, 1)

        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        self.envReward = 0
        self.envEpisodeCount = 0
        self.envStepCount = 0

        self.reset()
        self.optimum = self.calculate_customers_reward()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def iterate_path(self, state, action):
        paths = self.map[state]
        if action < len(paths):
            return paths[action]
        else:
            # sorry, no such action, stay where you are and pay a high penalty
            return (state, 1000)
      
    def step(self, action):
        destination, cost = self.iterate_path(self.state, action)
        lastState = self.state
        customerReward = self.customer_reward[destination]
        reward = (customerReward - cost) / self.optimum

        self.state = destination
        self.customer_visited(destination)
        done = destination == 'S' and self.all_customers_visited()

        stateAsInt = state_name_to_int(self.state)
        self.totalReward += reward
        self.stepCount += 1
        self.envReward += reward
        self.envStepCount += 1

        if self.showStep:
            print( "Episode: " + ("%4.0f  " % self.envEpisodeCount) + 
                   " Step: " + ("%4.0f  " % self.stepCount) + 
                   lastState + ' --' + str(action) + '-> ' + self.state + 
                   ' R=' + ("% 2.2f" % reward) + ' totalR=' + ("% 3.2f" % self.totalReward) + 
                   ' cost=' + ("%4.0f" % cost) + ' customerR=' + ("%4.0f" % customerReward) + ' optimum=' + ("%4.0f" % self.optimum)      
                   )

        if done and not self.isDone:
            self.envEpisodeCount += 1
            if BeraterEnv.showDone:
                episodes = BeraterEnv.envEpisodeModulo
                if (self.envEpisodeCount % BeraterEnv.envEpisodeModulo != 0):
                    episodes = self.envEpisodeCount % BeraterEnv.envEpisodeModulo
                print( "Done: " + 
                        ("episodes=%6.0f  " % self.envEpisodeCount) + 
                        ("avgSteps=%6.2f  " % (self.envStepCount/episodes)) + 
                        ("avgTotalReward=% 3.2f" % (self.envReward/episodes) )
                        )
                if (self.envEpisodeCount%BeraterEnv.envEpisodeModulo) == 0:
                    self.envReward = 0
                    self.envStepCount = 0

        self.isDone = done
        observation = self.getObservation(stateAsInt)
        info = {"from": lastState, "to": destination}

        return observation, reward, done, info

    def getObservation(self, position):
        result = numpy.array([ position, 
                               self.getPathObservation(position, 0),
                               self.getPathObservation(position, 1),
                               self.getPathObservation(position, 2),
                               self.getPathObservation(position, 3)
                              ],
                             dtype=numpy.float32)
        return result

    def getPathObservation(self, position, path):
        source = int_to_state_name(position)
        paths = self.map[source]
        if path < len(paths):
            target, cost = paths[path]
            reward = self.customer_reward[target]
            result = reward - cost
        else:
            result = -1000

        return result

    def customer_visited(self, customer):
        self.customer_reward[customer] = 0

    def all_customers_visited(self):
        return self.calculate_customers_reward() == 0

    def calculate_customers_reward(self):
        # total reward of all customers that still need to be visited
        return sum(self.customer_reward.values())

    def reset(self):
        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False
        reward_per_customer = 1000
        self.customer_reward = {
            'S': 0,
            'A': reward_per_customer,
            'B': reward_per_customer,
            'C': reward_per_customer,
#             'D': reward_per_customer,
#             'E': reward_per_customer,
#             'F': reward_per_customer,
#             'G': reward_per_customer,
#             'H': reward_per_customer,
#             'K': reward_per_customer,
#             'L': reward_per_customer,
#             'M': reward_per_customer,
#             'N': reward_per_customer,
#             'O': reward_per_customer
        }

        self.state = 'S'
        return self.getObservation(state_name_to_int(self.state))

Try out Environment


In [3]:
BeraterEnv.showStep = True
BeraterEnv.showDone = True

env = BeraterEnv()
print(env)
observation = env.reset()
print(observation)

for t in range(1000):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()
print(observation)


<BeraterEnv instance>
[    0.   900.   600.   800. -1000.]
Episode:    0   Step:    1  S --0-> A R= 0.30 totalR= 0.30 cost= 100 customerR=1000 optimum=3000
Episode:    0   Step:    2  A --3-> A R=-0.33 totalR=-0.03 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:    3  A --1-> C R= 0.20 totalR= 0.17 cost= 400 customerR=1000 optimum=3000
Episode:    0   Step:    4  C --0-> A R=-0.13 totalR= 0.03 cost= 400 customerR=   0 optimum=3000
Episode:    0   Step:    5  A --3-> A R=-0.33 totalR=-0.30 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:    6  A --3-> A R=-0.33 totalR=-0.63 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:    7  A --3-> A R=-0.33 totalR=-0.97 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:    8  A --3-> A R=-0.33 totalR=-1.30 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:    9  A --1-> C R=-0.13 totalR=-1.43 cost= 400 customerR=   0 optimum=3000
Episode:    0   Step:   10  C --3-> C R=-0.33 totalR=-1.77 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:   11  C --1-> B R= 0.25 totalR=-1.52 cost= 250 customerR=1000 optimum=3000
Episode:    0   Step:   12  B --2-> S R=-0.13 totalR=-1.65 cost= 400 customerR=   0 optimum=3000
Done: episodes=     1  avgSteps= 12.00  avgTotalReward=-1.65
Episode finished after 12 timesteps
[    0.  -100.  -400.  -200. -1000.]

Train model

  • 0.73 is the best achievable total reward on this map: the cheapest round trip S -> A -> B -> C -> S costs 100 + 250 + 250 + 200 = 800 while collecting 3 x 1000 in customer rewards, so (3000 - 800) / 3000 ≈ 0.73 (see the check below)
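
A quick back-of-the-envelope check of that number (pure arithmetic, independent of the training code):

In [0]:
# best round trip on the small map: S -> A -> B -> C -> S
costs = [100, 250, 250, 200]      # S->A, A->B, B->C, C->S
collected = 3 * 1000              # visiting A, B and C once
optimum = 3 * 1000                # normalization used in step()
print((collected - sum(costs)) / optimum)   # 0.7333...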

In [4]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)


1.12.0

In [0]:
!rm -r logs
!mkdir logs
!mkdir logs/berater

In [6]:
# https://github.com/openai/baselines/blob/master/baselines/deepq/experiments/train_pong.py
# log_dir = logger.get_dir()
log_dir = '/content/logs/berater/'

import gym
from baselines import bench
from baselines import logger

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor
from baselines.ppo2 import ppo2

BeraterEnv.showStep = False
BeraterEnv.showDone = False

env = BeraterEnv()

wrapped_env = DummyVecEnv([lambda: BeraterEnv()])
monitored_env = VecMonitor(wrapped_env, log_dir)

# https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py
model = ppo2.learn(network='mlp', env=monitored_env, total_timesteps=50000)

# monitored_env = bench.Monitor(env, log_dir)
# https://en.wikipedia.org/wiki/Q-learning#Influence_of_variables
# %time model = deepq.learn(\
#         monitored_env,\
#         seed=42,\
#         network='mlp',\
#         lr=1e-3,\
#         gamma=0.99,\
#         total_timesteps=30000,\
#         buffer_size=50000,\
#         exploration_fraction=0.5,\
#         exploration_final_eps=0.02,\
#         print_freq=1000)

model.save('berater-ppo-v4.pkl')
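# note: to reuse these weights later, ppo2.learn accepts a load_path argument
# (e.g. load_path='berater-ppo-v4.pkl' with total_timesteps=0) to restore the
# saved parameters instead of training from scratch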
monitored_env.close()


Logging to /tmp/openai-2019-01-03-11-18-00-870090
-------------------------------------
| approxkl           | 0.0015796605 |
| clipfrac           | 0.0          |
| eplenmean          | 11.2         |
| eprewmean          | -0.6935      |
| explained_variance | -0.785       |
| fps                | 159          |
| nupdates           | 1            |
| policy_entropy     | 1.3847406    |
| policy_loss        | -0.013725469 |
| serial_timesteps   | 2048         |
| time_elapsed       | 12.8         |
| total_timesteps    | 2048         |
| value_loss         | 0.28990766   |
-------------------------------------
-------------------------------------
| approxkl           | 0.0029727407 |
| clipfrac           | 0.012329102  |
| eplenmean          | 5.06         |
| eprewmean          | 0.55383337   |
| explained_variance | 0.689        |
| fps                | 417          |
| nupdates           | 10           |
| policy_entropy     | 0.90667784   |
| policy_loss        | -0.027320659 |
| serial_timesteps   | 20480        |
| time_elapsed       | 73.8         |
| total_timesteps    | 20480        |
| value_loss         | 0.011048271  |
-------------------------------------
--------------------------------------
| approxkl           | 0.00030720897 |
| clipfrac           | 0.005004883   |
| eplenmean          | 4.02          |
| eprewmean          | 0.7226666     |
| explained_variance | 0.982         |
| fps                | 438           |
| nupdates           | 20            |
| policy_entropy     | 0.11884171    |
| policy_loss        | -0.008551375  |
| serial_timesteps   | 40960         |
| time_elapsed       | 119           |
| total_timesteps    | 40960         |
| value_loss         | 0.00082263123 |
--------------------------------------

In [7]:
!ls -l $log_dir


total 228
-rw-r--r-- 1 root root 229608 Jan  3 11:20 monitor.csv

In [8]:
from baselines.common import plot_util as pu
results = pu.load_results(log_dir)


/usr/local/lib/python3.6/dist-packages/baselines/bench/monitor.py:164: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  df.headers = headers # HACK to preserve backwards compatibility

In [0]:
import matplotlib.pyplot as plt
import numpy as np
r = results[0]
# plt.ylim(-1, 1)
# plt.plot(np.cumsum(r.monitor.l), r.monitor.r)

In [10]:
plt.plot(np.cumsum(r.monitor.l), pu.smooth(r.monitor.r, radius=100))


Out[10]:
[<matplotlib.lines.Line2D at 0x7f9ae0d8ce80>]

Enjoy model


In [11]:
import numpy as np 

observation = env.reset()
# initial recurrent state and done mask; only relevant for recurrent policies,
# the mlp policy used here ignores them
state = np.zeros((1, 2*128))
dones = np.zeros((1))

BeraterEnv.showStep = True
BeraterEnv.showDone = False

for t in range(1000):
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    observation, reward, done, info = env.step(actions[0])
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()


Episode:    0   Step:    1  S --0-> A R= 0.30 totalR= 0.30 cost= 100 customerR=1000 optimum=3000
Episode:    0   Step:    2  A --0-> B R= 0.25 totalR= 0.55 cost= 250 customerR=1000 optimum=3000
Episode:    0   Step:    3  B --1-> C R= 0.25 totalR= 0.80 cost= 250 customerR=1000 optimum=3000
Episode:    0   Step:    4  C --2-> S R=-0.07 totalR= 0.73 cost= 200 customerR=   0 optimum=3000
Episode finished after 4 timesteps

In [0]: