Berater Environment v3

Changes from v2

  1. Changed the learning strategy from PPO to DQN (it seems more intuitive)
  2. Fixed a bug where env#reset still returned the scalar position instead of the complete state

Next Steps

  1. Make monitor files work and plot performance
  2. Choose traversal costs randomly for each episode
    • Aim: the agent will (hopefully) be able to handle arbitrary costs (a sketch follows this list)
  3. Train on a different graph each episode
    • Aim: the agent can work on any graph
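
For step 2, one possible shape is sketched below: sample a fresh symmetric cost map on every reset instead of reusing the fixed map defined in the environment further down. The helper name, node set, and cost range are placeholders and are not wired into the environment yet.


In [0]:
# Sketch for next step 2 (not wired into BeraterEnv yet): sample a random,
# symmetric cost map that reset() could install instead of the fixed one.
import random

def sample_random_map(nodes=('S', 'A', 'B', 'C'), cost_choices=(100, 200, 250, 400)):
    new_map = {node: [] for node in nodes}
    for i, source in enumerate(nodes):
        for target in nodes[i + 1:]:
            cost = random.choice(cost_choices)
            new_map[source].append((target, cost))
            new_map[target].append((source, cost))
    return new_map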

Installation (required for Colab)


In [0]:
# !pip install git+https://github.com/openai/baselines >/dev/null
# !pip install gym >/dev/null

In [0]:
cnt=0

In [0]:
import numpy
import gym
from gym.utils import seeding
from gym import spaces

def state_name_to_int(state):
    state_name_map = {
        'S': 0,
        'A': 1,
        'B': 2,
        'C': 3,
    }
    return state_name_map[state]

def int_to_state_name(state_as_int):
    state_map = {
        0: 'S',
        1: 'A',
        2: 'B',
        3: 'C'
    }
    return state_map[state_as_int]
    
class BeraterEnv(gym.Env):
    """
    The Berater Problem

    Actions: 
    There are 3 discrete deterministic actions:
    - 0: First Direction
    - 1: Second Direction
    - 2: Third Direction / Go home
    """
    metadata = {'render.modes': ['ansi']}
    
    num_envs = 1
    showStep = False
    showDone = True
    showRender = False
    envEpisodeModulo = 100

    def __init__(self):
        self.map = {
            'S': [('A', 100), ('B', 400), ('C', 200 )],
            'A': [('B', 250), ('C', 400), ('S', 100 )],
            'B': [('A', 250), ('C', 250), ('S', 400 )],
            'C': [('A', 400), ('B', 250), ('S', 200 )]
        }
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(low=numpy.array([0,-1000,-1000,-1000,-1000,-1000,-1000]),
                                             high=numpy.array([3,1000,1000,1000,1000,1000,1000]),
                                             dtype=numpy.float32)


        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False

        self.envReward = 0
        self.envEpisodeCount = 0
        self.envStepCount = 0

        self.reset()
        self.optimum = self.calculate_customers_reward()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, actionArg):
        paths = self.map[self.state]
        action = actionArg
        destination, cost = paths[action]
        lastState = self.state
        lastObState = state_name_to_int(lastState)
        customerReward = self.customer_reward[destination]

        info = {"from": self.state, "to": destination}

        self.state = destination
        # reward = collected customer reward minus traversal cost,
        # normalized by the total customer reward available per episode
        reward = (-cost + self.customer_reward[destination]) / self.optimum
        self.customer_visited(destination)
        done = destination == 'S' and self.all_customers_visited()

        stateAsInt = state_name_to_int(self.state)
        self.totalReward += reward
        self.stepCount += 1
        self.envReward += reward
        self.envStepCount += 1

        if self.showStep:
            print( "Episode: " + ("%4.0f  " % self.envEpisodeCount) + 
                   " Step: " + ("%4.0f  " % self.stepCount) + 
                   #lastState + ':' + str(lastObState) + ' --' + str(action) + '-> ' + self.state + ':' + str(stateAsInt) +
                   lastState + ' --' + str(action) + '-> ' + self.state + 
                   ' R=' + ("% 2.2f" % reward) + ' totalR=' + ("% 3.2f" % self.totalReward) + 
                   ' cost=' + ("%4.0f" % cost) + ' customerR=' + ("%4.0f" % customerReward) + ' optimum=' + ("%4.0f" % self.optimum)      
                   )

        if done and not self.isDone:
            self.envEpisodeCount += 1
            if BeraterEnv.showDone or (self.envEpisodeCount%BeraterEnv.envEpisodeModulo) == 0:
                episodes = BeraterEnv.envEpisodeModulo
                if (self.envEpisodeCount % BeraterEnv.envEpisodeModulo != 0):
                    episodes = self.envEpisodeCount % BeraterEnv.envEpisodeModulo
                print( "Done: " + 
                        ("episodes=%6.0f  " % self.envEpisodeCount) + 
                        ("avgSteps=%6.2f  " % (self.envStepCount/episodes)) + 
                        ("avgTotalReward=% 3.2f" % (self.envReward/episodes) )
                        )
                if (self.envEpisodeCount%BeraterEnv.envEpisodeModulo) == 0:
                    self.envReward = 0
                    self.envStepCount = 0

        self.isDone = done
        observation = self.getObservation(stateAsInt)

        return observation, reward, done, info

    def getObservation(self, position):
        # observation = current position (as int) plus, for every edge,
        # the remaining customer reward at the target minus the traversal cost
        result = numpy.array([ position, 
                               self.getEdgeObservation('S','A'),
                               self.getEdgeObservation('S','B'),
                               self.getEdgeObservation('S','C'),
                               self.getEdgeObservation('A','B'),
                               self.getEdgeObservation('A','C'),
                               self.getEdgeObservation('B','C'),
                              ],
                             dtype=numpy.float32)
        return result

    def getEdgeObservation(self, source, target):
        reward = self.customer_reward[target] 
        cost = self.getCost(source,target)
        result = reward - cost

        return result

    def getCost(self, source, target):
        paths = self.map[source]
        for destination, cost in paths:
            if destination == target:
                result = cost
                break

        return result

    def customer_visited(self, customer):
        self.customer_reward[customer] = 0

    def all_customers_visited(self):
        return self.calculate_customers_reward() == 0

    def calculate_customers_reward(self):
        # total reward still to be collected from unvisited customers
        return sum(self.customer_reward.values())

    def reset(self):
        # print("Reset")
        
        self.totalReward = 0
        self.stepCount = 0
        self.isDone = False
        reward_per_customer = 1000
        self.customer_reward = {
            'S': 0,
            'A': reward_per_customer,
            'B': reward_per_customer,
            'C': reward_per_customer,
        }

        self.state = 'S'
        return self.getObservation(state_name_to_int(self.state))

    def render(self, mode='human'):
        if BeraterEnv.showRender:
            print( ("steps=%4.0f  " % self.stepCount) + ' totalR=' + ("% 3.2f" % self.totalReward) + ' done=' + str(self.isDone))

Register Environment


In [4]:
from gym.envs.registration import register

cnt += 1
id = "Berater-v{}".format(cnt)
register(
    id=id,
    entry_point=BeraterEnv
)   

print(id)


Berater-v1

Try out Environment


In [5]:
BeraterEnv.showStep = True
BeraterEnv.showDone = True

env = gym.make(id)
observation = env.reset()
print(env)

for t in range(1000):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        env.render()
        break
env.close()


<BeraterEnv<Berater-v1>>
Episode:    0   Step:    1  S --0-> A R= 0.30 totalR= 0.30 cost= 100 customerR=1000 optimum=3000
Episode:    0   Step:    2  A --1-> C R= 0.20 totalR= 0.50 cost= 400 customerR=1000 optimum=3000
Episode:    0   Step:    3  C --0-> A R=-0.13 totalR= 0.37 cost= 400 customerR=   0 optimum=3000
Episode:    0   Step:    4  A --1-> C R=-0.13 totalR= 0.23 cost= 400 customerR=   0 optimum=3000
Episode:    0   Step:    5  C --1-> B R= 0.25 totalR= 0.48 cost= 250 customerR=1000 optimum=3000
Episode:    0   Step:    6  B --2-> S R=-0.13 totalR= 0.35 cost= 400 customerR=   0 optimum=3000
Done: episodes=     1  avgSteps=  6.00  avgTotalReward= 0.35

Train model

  • A total reward of about 0.73 would be optimal for this graph (see the calculation below)
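
The 0.73 comes from the cheapest round trip that visits all customers, S -> A -> B -> C -> S (or its reverse): cost 100 + 250 + 250 + 200 = 800 against 3 * 1000 = 3000 of customer reward, so the optimal total reward is (3000 - 800) / 3000 ≈ 0.73. A quick check:


In [0]:
# Quick check of the optimal total reward for the fixed map:
# the cheapest round trip visiting all customers is S -> A -> B -> C -> S.
best_cost = 100 + 250 + 250 + 200
total_customer_reward = 3 * 1000
print((total_customer_reward - best_cost) / total_customer_reward)  # ~0.733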

In [6]:
%env OPENAI_LOGDIR=/content/logs/berater
# %env OPENAI_LOG_FORMAT=csv


env: OPENAI_LOGDIR=/content/logs/berater
env: OPENAI_LOG_FORMAT=csv

In [7]:
%env


Out[7]:
{'CLICOLOR': '1',
 'CLOUDSDK_CONFIG': '/content/.config',
 'COLAB_GPU': '1',
 'CUDA_PKG_VERSION': '9-2=9.2.148-1',
 'CUDA_VERSION': '9.2.148',
 'CUDNN_VERSION': '7.4.1.5',
 'DATALAB_SETTINGS_OVERRIDES': '{"kernelManagerProxyPort":6000,"kernelManagerProxyHost":"172.28.0.3","jupyterArgs":["--ip=\\"172.28.0.2\\""]}',
 'DEBIAN_FRONTEND': 'noninteractive',
 'ENV': '/root/.bashrc',
 'GIT_PAGER': 'cat',
 'GLIBCPP_FORCE_NEW': '1',
 'GLIBCXX_FORCE_NEW': '1',
 'HOME': '/root',
 'HOSTNAME': '4664b721f1d5',
 'JPY_PARENT_PID': '68',
 'LANG': 'en_US.UTF-8',
 'LD_LIBRARY_PATH': '/usr/lib64-nvidia',
 'LD_PRELOAD': '/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4',
 'MPLBACKEND': 'module://ipykernel.pylab.backend_inline',
 'NCCL_VERSION': '2.3.7',
 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility',
 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.2',
 'NVIDIA_VISIBLE_DEVICES': 'all',
 'OLDPWD': '/',
 'OPENAI_LOGDIR': '/content/logs/berater',
 'OPENAI_LOG_FORMAT': 'csv',
 'PAGER': 'cat',
 'PATH': '/usr/local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/opt/bin',
 'PWD': '/',
 'PYTHONPATH': '/env/python',
 'SHELL': '/bin/bash',
 'SHLVL': '1',
 'TERM': 'xterm-color',
 'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
 '_': '/tools/node/bin/forever',
 '__EGL_VENDOR_LIBRARY_DIRS': '/usr/lib64-nvidia:/usr/share/glvnd/egl_vendor.d/'}

In [8]:
import gym
from baselines import deepq

# 

BeraterEnv.showStep = False
BeraterEnv.showDone = False

env = gym.make(id)

# https://en.wikipedia.org/wiki/Q-learning#Influence_of_variables
%time model = deepq.learn(\
        env,\
        seed=42,\
        network='mlp',\
        lr=1e-3,\
        total_timesteps=30000,\
        buffer_size=50000,\
        exploration_fraction=0.5,\
        exploration_final_eps=0.02,\
        print_freq=1000)


/usr/local/lib/python3.6/dist-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/usr/local/lib/python3.6/dist-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
Done: episodes=   100  avgSteps=  8.88  avgTotalReward= 0.21
Done: episodes=   200  avgSteps=  8.21  avgTotalReward= 0.27
Done: episodes=   300  avgSteps=  7.92  avgTotalReward= 0.30
Done: episodes=   400  avgSteps=  7.50  avgTotalReward= 0.36
Done: episodes=   500  avgSteps=  6.92  avgTotalReward= 0.39
Done: episodes=   600  avgSteps=  6.53  avgTotalReward= 0.43
Done: episodes=   700  avgSteps=  6.28  avgTotalReward= 0.45
Done: episodes=   800  avgSteps=  6.36  avgTotalReward= 0.46
Done: episodes=   900  avgSteps=  6.02  avgTotalReward= 0.48
Done: episodes=  1000  avgSteps=  5.73  avgTotalReward= 0.50
Done: episodes=  1100  avgSteps=  5.69  avgTotalReward= 0.52
Done: episodes=  1200  avgSteps=  5.30  avgTotalReward= 0.55
Done: episodes=  1300  avgSteps=  5.15  avgTotalReward= 0.57
Done: episodes=  1400  avgSteps=  5.03  avgTotalReward= 0.58
Done: episodes=  1500  avgSteps=  4.93  avgTotalReward= 0.60
Done: episodes=  1600  avgSteps=  4.87  avgTotalReward= 0.60
Done: episodes=  1700  avgSteps=  4.94  avgTotalReward= 0.60
Done: episodes=  1800  avgSteps=  4.62  avgTotalReward= 0.65
Done: episodes=  1900  avgSteps=  4.81  avgTotalReward= 0.62
Done: episodes=  2000  avgSteps=  4.58  avgTotalReward= 0.65
Done: episodes=  2100  avgSteps=  4.58  avgTotalReward= 0.65
Done: episodes=  2200  avgSteps=  4.52  avgTotalReward= 0.66
Done: episodes=  2300  avgSteps=  4.36  avgTotalReward= 0.69
Done: episodes=  2400  avgSteps=  4.17  avgTotalReward= 0.70
Done: episodes=  2500  avgSteps=  4.13  avgTotalReward= 0.71
Done: episodes=  2600  avgSteps=  4.17  avgTotalReward= 0.71
Done: episodes=  2700  avgSteps=  4.05  avgTotalReward= 0.73
Done: episodes=  2800  avgSteps=  4.01  avgTotalReward= 0.73
Done: episodes=  2900  avgSteps=  4.05  avgTotalReward= 0.73
Done: episodes=  3000  avgSteps=  4.06  avgTotalReward= 0.73
Done: episodes=  3100  avgSteps=  4.05  avgTotalReward= 0.73
Done: episodes=  3200  avgSteps=  4.03  avgTotalReward= 0.73
Done: episodes=  3300  avgSteps=  4.10  avgTotalReward= 0.72
Done: episodes=  3400  avgSteps=  4.02  avgTotalReward= 0.73
Done: episodes=  3500  avgSteps=  4.04  avgTotalReward= 0.73
Done: episodes=  3600  avgSteps=  4.05  avgTotalReward= 0.73
Done: episodes=  3700  avgSteps=  4.04  avgTotalReward= 0.73
Done: episodes=  3800  avgSteps=  4.05  avgTotalReward= 0.73
Done: episodes=  3900  avgSteps=  4.04  avgTotalReward= 0.73
Done: episodes=  4000  avgSteps=  4.02  avgTotalReward= 0.73
Done: episodes=  4100  avgSteps=  4.04  avgTotalReward= 0.73
Done: episodes=  4200  avgSteps=  4.05  avgTotalReward= 0.73
Done: episodes=  4300  avgSteps=  4.09  avgTotalReward= 0.72
Done: episodes=  4400  avgSteps=  4.03  avgTotalReward= 0.73
Done: episodes=  4500  avgSteps=  4.04  avgTotalReward= 0.73
Done: episodes=  4600  avgSteps=  4.04  avgTotalReward= 0.73
Done: episodes=  4700  avgSteps=  4.05  avgTotalReward= 0.72
Done: episodes=  4800  avgSteps=  4.06  avgTotalReward= 0.72
Done: episodes=  4900  avgSteps=  4.07  avgTotalReward= 0.72
Done: episodes=  5000  avgSteps=  4.06  avgTotalReward= 0.72
Done: episodes=  5100  avgSteps=  4.03  avgTotalReward= 0.73
Done: episodes=  5200  avgSteps=  4.09  avgTotalReward= 0.72
Done: episodes=  5300  avgSteps=  4.01  avgTotalReward= 0.73
Done: episodes=  5400  avgSteps=  4.07  avgTotalReward= 0.72
Done: episodes=  5500  avgSteps=  4.10  avgTotalReward= 0.72
Done: episodes=  5600  avgSteps=  4.02  avgTotalReward= 0.73
Done: episodes=  5700  avgSteps=  4.04  avgTotalReward= 0.73
Done: episodes=  5800  avgSteps=  4.04  avgTotalReward= 0.73
Done: episodes=  5900  avgSteps=  4.02  avgTotalReward= 0.73
Done: episodes=  6000  avgSteps=  4.05  avgTotalReward= 0.73
Done: episodes=  6100  avgSteps=  4.05  avgTotalReward= 0.73
Done: episodes=  6200  avgSteps=  4.01  avgTotalReward= 0.73
Done: episodes=  6300  avgSteps=  4.03  avgTotalReward= 0.73
Done: episodes=  6400  avgSteps=  4.06  avgTotalReward= 0.72
CPU times: user 6min 29s, sys: 56.8 s, total: 7min 26s
Wall time: 5min 11s

In [15]:
# !cat logs/berater/progress.csv


% time spent exploring,episodes,mean 100 episode reward,steps
54,1000,0.5,7030
21,2000,0.6,12021
2,3000,0.7,16232
2,4000,0.7,20276
2,5000,0.7,24329
2,6000,0.7,28376

In [11]:
# from baselines.common import plot_util as pu
# results = pu.load_results('/content/logs/berater')


skipping /content/logs/berater: no monitor files

In [0]:
# import matplotlib.pyplot as plt
# import numpy as np
# r = results[0]
# plt.plot(r.progress.total_timesteps, r.progress.eprewmean)

Enjoy model


In [16]:
import numpy as np 

observation = env.reset()
state = np.zeros((1, 2*128))  # recurrent-state placeholder for the generic model.step interface
dones = np.zeros((1))         # done-mask placeholder

BeraterEnv.showStep = True
BeraterEnv.showDone = False

for t in range(1000):
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    observation, reward, done, info = env.step(actions[0])
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()


Episode: 6402   Step:    1  S --0-> A R= 0.30 totalR= 0.30 cost= 100 customerR=1000 optimum=3000
Episode: 6402   Step:    2  A --0-> B R= 0.25 totalR= 0.55 cost= 250 customerR=1000 optimum=3000
Episode: 6402   Step:    3  B --1-> C R= 0.25 totalR= 0.80 cost= 250 customerR=1000 optimum=3000
Episode: 6402   Step:    4  C --2-> S R=-0.07 totalR= 0.73 cost= 200 customerR=   0 optimum=3000
Episode finished after 4 timesteps

In [0]: