Berater Environment v5

Changes from v4

  1. encode the observation locally: the current position plus the net value (customer reward minus cost) of up to four outgoing connections (see the sketch below)
  2. a non-existing connection now carries the highest penalty: choosing it costs 1000 and the agent stays in place; in the observation it is encoded as -1000
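
To make the new local encoding concrete, here is a minimal sketch (not part of the environment code below) that rebuilds the initial observation at 'S' by hand; the numbers match the env output further down:

In [0]:
# illustrative only: reconstruct the observation for the start state 'S'
paths_from_S = [('A', 100), ('B', 400), ('C', 200)]  # connections of 'S' in the small map
customer_reward = 1000                               # every unvisited customer is worth 1000
observation = [0]                                    # position: 'S' is encoded as 0
observation += [customer_reward - cost for _, cost in paths_from_S]
observation += [-1000] * (4 - len(paths_from_S))     # pad non-existing connections with -1000
print(observation)                                   # [0, 900, 600, 800, -1000]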

Next steps

  1. use the more complex customer graph (currently commented out in __init__)
  2. per episode, set some customer rewards to 0 to simulate that each consultant serves a different set of customers (see the sketch below)
  3. make sure the learned policy generalizes well across these variations
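
A possible sketch for step 2 (the helper name and the probability are illustrative assumptions, based on the small S/A/B/C map); reset() could draw a fresh reward dictionary like this per episode:

In [0]:
import random

def sample_customer_rewards(reward_per_customer=1000, p_active=0.5):
    # illustrative sketch: per episode, keep each customer's reward with
    # probability p_active and set it to 0 otherwise, so every episode
    # simulates a consultant with a different set of customers
    rewards = {'S': 0}
    for customer in ['A', 'B', 'C']:
        rewards[customer] = reward_per_customer if random.random() < p_active else 0
    return rewards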

Installation (required for Colab)


In [0]:
!pip install git+https://github.com/openai/baselines >/dev/null
!pip install gym >/dev/null

Environment


In [0]:
import numpy
import gym
from gym.utils import seeding
from gym import spaces

def state_name_to_int(state):
    state_name_map = {
        'S': 0,
        'A': 1,
        'B': 2,
        'C': 3,
        'D': 4,
        'E': 5,
        'F': 6,
        'G': 7,
        'H': 8,
        'K': 9,
        'L': 10,
        'M': 11,
        'N': 12,
        'O': 13
    }
    return state_name_map[state]

def int_to_state_name(state_as_int):
    state_map = {
        0: 'S',
        1: 'A',
        2: 'B',
        3: 'C',
        4: 'D',
        5: 'E',
        6: 'F',
        7: 'G',
        8: 'H',
        9: 'K',
        10: 'L',
        11: 'M',
        12: 'N',
        13: 'O'
    }
    return state_map[state_as_int]
    
class BeraterEnv(gym.Env):
    """
    The Berater Problem

    Actions: 
    There are 4 discrete deterministic actions, each choosing one direction
    """
    metadata = {'render.modes': ['ansi']}
    
    showStep = False
    showDone = True
    envEpisodeModulo = 100

    def __init__(self):
        self.map = {
            'S': [('A', 100), ('B', 400), ('C', 200 )],
            'A': [('B', 250), ('C', 400), ('S', 100 )],
            'B': [('A', 250), ('C', 250), ('S', 400 )],
            'C': [('A', 400), ('B', 250), ('S', 200 )]
        }
#         self.map = {
#             'S': [('A', 300), ('B', 100), ('C', 200 )],
#             'A': [('S', 300), ('B', 100), ('E', 100 ), ('D', 100 )],
#             'B': [('S', 100), ('A', 100), ('C', 50 ), ('K', 200 )],
#             'C': [('S', 200), ('B', 50), ('M', 100 ), ('L', 200 )],
#             'D': [('A', 100), ('F', 50)],
#             'E': [('A', 100), ('F', 100), ('H', 100)],
#             'F': [('D', 50), ('E', 100), ('G', 200)],
#             'G': [('F', 200), ('O', 300)],
#             'H': [('E', 100), ('K', 300)],
#             'K': [('B', 200), ('H', 300)],
#             'L': [('C', 200), ('M', 50)],
#             'M': [('C', 100), ('L', 50), ('N', 100)],
#             'N': [('M', 100), ('O', 100)],
#             'O': [('N', 100), ('G', 300)]
#         }
        self.action_space = spaces.Discrete(4)
        # observation: current position plus up to 4 paths from that position;
        # a non-existing path is encoded as -1000 and choosing it does not change the position
        self.observation_space = spaces.Box(low=numpy.array([0,-1000,-1000,-1000,-1000]),
                                             high=numpy.array([13,1000,1000,1000,1000]),
                                             dtype=numpy.float32)
        self.reward_range = (-1, 1)

        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        self.envReward = 0
        self.envEpisodeCount = 0
        self.envStepCount = 0

        self.reset()
        self.optimum = self.calculate_customers_reward()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def iterate_path(self, state, action):
        paths = self.map[state]
        if action < len(paths):
            return paths[action]
        else:
            # sorry, no such action, stay where you are and pay a high penalty
            return (state, 1000)
      
    def step(self, action):
        destination, cost = self.iterate_path(self.state, action)
        lastState = self.state
        customerReward = self.customer_reward[destination]
        reward = (customerReward - cost) / self.optimum

        self.state = destination
        self.customer_visited(destination)
        done = destination == 'S' and self.all_customers_visited()

        stateAsInt = state_name_to_int(self.state)
        self.totalReward += reward
        self.stepCount += 1
        self.envReward += reward
        self.envStepCount += 1

        if self.showStep:
            print( "Episode: " + ("%4.0f  " % self.envEpisodeCount) + 
                   " Step: " + ("%4.0f  " % self.stepCount) + 
                   lastState + ' --' + str(action) + '-> ' + self.state + 
                   ' R=' + ("% 2.2f" % reward) + ' totalR=' + ("% 3.2f" % self.totalReward) + 
                   ' cost=' + ("%4.0f" % cost) + ' customerR=' + ("%4.0f" % customerReward) + ' optimum=' + ("%4.0f" % self.optimum)      
                   )

        if done and not self.isDone:
            self.envEpisodeCount += 1
            if BeraterEnv.showDone:
                episodes = BeraterEnv.envEpisodeModulo
                if (self.envEpisodeCount % BeraterEnv.envEpisodeModulo != 0):
                    episodes = self.envEpisodeCount % BeraterEnv.envEpisodeModulo
                print( "Done: " + 
                        ("episodes=%6.0f  " % self.envEpisodeCount) + 
                        ("avgSteps=%6.2f  " % (self.envStepCount/episodes)) + 
                        ("avgTotalReward=% 3.2f" % (self.envReward/episodes) )
                        )
                if (self.envEpisodeCount%BeraterEnv.envEpisodeModulo) == 0:
                    self.envReward = 0
                    self.envStepCount = 0

        self.isDone = done
        observation = self.getObservation(stateAsInt)
        info = {"from": lastState, "to": destination}

        return observation, reward, done, info

    def getObservation(self, position):
        result = numpy.array([ position, 
                               self.getPathObservation(position, 0),
                               self.getPathObservation(position, 1),
                               self.getPathObservation(position, 2),
                               self.getPathObservation(position, 3)
                              ],
                             dtype=numpy.float32)
        return result

    def getPathObservation(self, position, path):
        source = int_to_state_name(position)
        paths = self.map[source]
        if path < len(paths):
            target, cost = paths[path]
            reward = self.customer_reward[target]
            result = reward - cost
        else:
            result = -1000

        return result

    def customer_visited(self, customer):
        self.customer_reward[customer] = 0

    def all_customers_visited(self):
        return self.calculate_customers_reward() == 0

    def calculate_customers_reward(self):
        # total reward of all customers that still need to be visited
        return sum(self.customer_reward.values())

    def reset(self):
        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False
        reward_per_customer = 1000
        self.customer_reward = {
            'S': 0,
            'A': reward_per_customer,
            'B': reward_per_customer,
            'C': reward_per_customer,
#             'D': reward_per_customer,
#             'E': reward_per_customer,
#             'F': reward_per_customer,
#             'G': reward_per_customer,
#             'H': reward_per_customer,
#             'K': reward_per_customer,
#             'L': reward_per_customer,
#             'M': reward_per_customer,
#             'N': reward_per_customer,
#             'O': reward_per_customer
        }

        self.state = 'S'
        return self.getObservation(state_name_to_int(self.state))

Try out Environment


In [3]:
BeraterEnv.showStep = True
BeraterEnv.showDone = True

env = BeraterEnv()
print(env)
observation = env.reset()
print(observation)

for t in range(1000):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()
print(observation)


<BeraterEnv instance>
[    0.   900.   600.   800. -1000.]
Episode:    0   Step:    1  S --0-> A R= 0.30 totalR= 0.30 cost= 100 customerR=1000 optimum=3000
Episode:    0   Step:    2  A --3-> A R=-0.33 totalR=-0.03 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:    3  A --1-> C R= 0.20 totalR= 0.17 cost= 400 customerR=1000 optimum=3000
Episode:    0   Step:    4  C --0-> A R=-0.13 totalR= 0.03 cost= 400 customerR=   0 optimum=3000
Episode:    0   Step:    5  A --3-> A R=-0.33 totalR=-0.30 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:    6  A --3-> A R=-0.33 totalR=-0.63 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:    7  A --3-> A R=-0.33 totalR=-0.97 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:    8  A --3-> A R=-0.33 totalR=-1.30 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:    9  A --1-> C R=-0.13 totalR=-1.43 cost= 400 customerR=   0 optimum=3000
Episode:    0   Step:   10  C --3-> C R=-0.33 totalR=-1.77 cost=1000 customerR=   0 optimum=3000
Episode:    0   Step:   11  C --1-> B R= 0.25 totalR=-1.52 cost= 250 customerR=1000 optimum=3000
Episode:    0   Step:   12  B --2-> S R=-0.13 totalR=-1.65 cost= 400 customerR=   0 optimum=3000
Done: episodes=     1  avgSteps= 12.00  avgTotalReward=-1.65
Episode finished after 12 timesteps
[    0.  -100.  -400.  -200. -1000.]

Train model

  • 0.73 is the best achievable total reward on this map: the cheapest round trip S -> A -> B -> C -> S costs 100 + 250 + 250 + 200 = 800 while collecting 3 x 1000 in customer rewards, so (3000 - 800) / 3000 ≈ 0.73 (see the check below)
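
A quick back-of-the-envelope check of that number (pure arithmetic, independent of the training code):

In [0]:
# best round trip on the small map: S -> A -> B -> C -> S
costs = [100, 250, 250, 200]      # S->A, A->B, B->C, C->S
collected = 3 * 1000              # visiting A, B and C once
optimum = 3 * 1000                # normalization used in step()
print((collected - sum(costs)) / optimum)   # 0.7333...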

In [4]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)


1.12.0

In [0]:
!rm -r logs
!mkdir logs
!mkdir logs/berater

In [6]:
# https://github.com/openai/baselines/blob/master/baselines/deepq/experiments/train_pong.py
# log_dir = logger.get_dir()
log_dir = '/content/logs/berater/'

import gym
from baselines import bench
from baselines import logger

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor
from baselines.ppo2 import ppo2

BeraterEnv.showStep = False
BeraterEnv.showDone = False

env = BeraterEnv()

wrapped_env = DummyVecEnv([lambda: BeraterEnv()])
monitored_env = VecMonitor(wrapped_env, log_dir)

# https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py
model = ppo2.learn(network='mlp', env=monitored_env, total_timesteps=50000)

# monitored_env = bench.Monitor(env, log_dir)
# https://en.wikipedia.org/wiki/Q-learning#Influence_of_variables
# %time model = deepq.learn(\
#         monitored_env,\
#         seed=42,\
#         network='mlp',\
#         lr=1e-3,\
#         gamma=0.99,\
#         total_timesteps=30000,\
#         buffer_size=50000,\
#         exploration_fraction=0.5,\
#         exploration_final_eps=0.02,\
#         print_freq=1000)

model.save('berater-ppo-v4.pkl')
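# note: to reuse these weights later, ppo2.learn accepts a load_path argument
# (e.g. load_path='berater-ppo-v4.pkl' with total_timesteps=0) to restore the
# saved parameters instead of training from scratch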
monitored_env.close()


Logging to /tmp/openai-2019-01-03-11-18-00-870090
-------------------------------------
| approxkl           | 0.0015796605 |
| clipfrac           | 0.0          |
| eplenmean          | 11.2         |
| eprewmean          | -0.6935      |
| explained_variance | -0.785       |
| fps                | 159          |
| nupdates           | 1            |
| policy_entropy     | 1.3847406    |
| policy_loss        | -0.013725469 |
| serial_timesteps   | 2048         |
| time_elapsed       | 12.8         |
| total_timesteps    | 2048         |
| value_loss         | 0.28990766   |
-------------------------------------
-------------------------------------
| approxkl           | 0.0029727407 |
| clipfrac           | 0.012329102  |
| eplenmean          | 5.06         |
| eprewmean          | 0.55383337   |
| explained_variance | 0.689        |
| fps                | 417          |
| nupdates           | 10           |
| policy_entropy     | 0.90667784   |
| policy_loss        | -0.027320659 |
| serial_timesteps   | 20480        |
| time_elapsed       | 73.8         |
| total_timesteps    | 20480        |
| value_loss         | 0.011048271  |
-------------------------------------
--------------------------------------
| approxkl           | 0.00030720897 |
| clipfrac           | 0.005004883   |
| eplenmean          | 4.02          |
| eprewmean          | 0.7226666     |
| explained_variance | 0.982         |
| fps                | 438           |
| nupdates           | 20            |
| policy_entropy     | 0.11884171    |
| policy_loss        | -0.008551375  |
| serial_timesteps   | 40960         |
| time_elapsed       | 119           |
| total_timesteps    | 40960         |
| value_loss         | 0.00082263123 |
--------------------------------------

In [7]:
!ls -l $log_dir


total 228
-rw-r--r-- 1 root root 229608 Jan  3 11:20 monitor.csv

In [8]:
from baselines.common import plot_util as pu
results = pu.load_results(log_dir)


/usr/local/lib/python3.6/dist-packages/baselines/bench/monitor.py:164: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  df.headers = headers # HACK to preserve backwards compatibility

In [0]:
import matplotlib.pyplot as plt
import numpy as np
r = results[0]
# plt.ylim(-1, 1)
# plt.plot(np.cumsum(r.monitor.l), r.monitor.r)

In [10]:
plt.plot(np.cumsum(r.monitor.l), pu.smooth(r.monitor.r, radius=100))


Out[10]:
[<matplotlib.lines.Line2D at 0x7f9ae0d8ce80>]

Enjoy model


In [11]:
import numpy as np 

observation = env.reset()
# initial recurrent state and done mask; only relevant for recurrent policies,
# the mlp policy used here ignores them
state = np.zeros((1, 2*128))
dones = np.zeros((1))

BeraterEnv.showStep = True
BeraterEnv.showDone = False

for t in range(1000):
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    observation, reward, done, info = env.step(actions[0])
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()


Episode:    0   Step:    1  S --0-> A R= 0.30 totalR= 0.30 cost= 100 customerR=1000 optimum=3000
Episode:    0   Step:    2  A --0-> B R= 0.25 totalR= 0.55 cost= 250 customerR=1000 optimum=3000
Episode:    0   Step:    3  B --1-> C R= 0.25 totalR= 0.80 cost= 250 customerR=1000 optimum=3000
Episode:    0   Step:    4  C --2-> S R=-0.07 totalR= 0.73 cost= 200 customerR=   0 optimum=3000
Episode finished after 4 timesteps

In [0]: