Installation (required for Colab)


In [1]:
# !pip install -e git+https://github.com/openai/baselines#egg=baselines


Obtaining berater from git+https://github.com/openai/baselines#egg=berater
  Cloning https://github.com/openai/baselines to c:\users\olive\development\ai\notebooks\rl\src\berater
Requirement already satisfied: gym in c:\users\olive\development\rl\gym (from baselines) (0.10.8)
Requirement already satisfied: scipy in c:\programdata\anaconda3\lib\site-packages (from baselines) (1.1.0)
Requirement already satisfied: tqdm in c:\programdata\anaconda3\lib\site-packages (from baselines) (4.26.0)
Requirement already satisfied: joblib in c:\programdata\anaconda3\lib\site-packages (from baselines) (0.12.5)
Requirement already satisfied: dill in c:\programdata\anaconda3\lib\site-packages (from baselines) (0.2.8.2)
Requirement already satisfied: progressbar2 in c:\programdata\anaconda3\lib\site-packages (from baselines) (3.38.0)
Requirement already satisfied: cloudpickle in c:\programdata\anaconda3\lib\site-packages (from baselines) (0.6.1)
Requirement already satisfied: click in c:\programdata\anaconda3\lib\site-packages (from baselines) (7.0)
Requirement already satisfied: opencv-python in c:\programdata\anaconda3\lib\site-packages (from baselines) (3.4.0.12)
Requirement already satisfied: numpy>=1.10.4 in c:\programdata\anaconda3\lib\site-packages (from gym->baselines) (1.15.3)
Requirement already satisfied: requests>=2.0 in c:\programdata\anaconda3\lib\site-packages (from gym->baselines) (2.19.1)
Requirement already satisfied: six in c:\programdata\anaconda3\lib\site-packages (from gym->baselines) (1.11.0)
Requirement already satisfied: pyglet>=1.2.0 in c:\programdata\anaconda3\lib\site-packages (from gym->baselines) (1.3.2)
Requirement already satisfied: pyreadline>=1.7.1 in c:\programdata\anaconda3\lib\site-packages (from dill->baselines) (2.1)
Requirement already satisfied: python-utils>=2.3.0 in c:\programdata\anaconda3\lib\site-packages (from progressbar2->baselines) (2.3.0)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.0->gym->baselines) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.0->gym->baselines) (2018.10.15)
Requirement already satisfied: urllib3<1.24,>=1.21.1 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.0->gym->baselines) (1.23)
Requirement already satisfied: idna<2.8,>=2.5 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.0->gym->baselines) (2.7)
Requirement already satisfied: future in c:\programdata\anaconda3\lib\site-packages (from pyglet>=1.2.0->gym->baselines) (0.16.0)
Installing collected packages: baselines
  Found existing installation: baselines 0.1.5
    Uninstalling baselines-0.1.5:
  Running setup.py (path:C:\Users\olive\Development\ai\notebooks\rl\src\berater\setup.py) egg_info for package berater produced metadata for project name baselines. Fix your #egg=berater fragments.
tensorflow 1.10.0 has requirement numpy<=1.14.5,>=1.13.3, but you'll have numpy 1.15.3 which is incompatible.
tensorflow 1.10.0 has requirement setuptools<=39.1.0, but you'll have setuptools 40.4.3 which is incompatible.
tensorflow-tensorboard 1.5.1 has requirement bleach==1.5.0, but you'll have bleach 3.0.2 which is incompatible.
tensorflow-tensorboard 1.5.1 has requirement html5lib==0.9999999, but you'll have html5lib 1.0.1 which is incompatible.
tensorflow-gpu 1.10.0 has requirement numpy<=1.14.5,>=1.13.3, but you'll have numpy 1.15.3 which is incompatible.
tensorflow-gpu 1.10.0 has requirement setuptools<=39.1.0, but you'll have setuptools 40.4.3 which is incompatible.
spacy 2.0.16 has requirement regex==2018.01.10, but you'll have regex 2018.8.29 which is incompatible.
Could not install packages due to an EnvironmentError: [WinError 5] Access is denied: 'c:\\programdata\\anaconda3\\lib\\site-packages\\baselines.egg-link'
Consider using the `--user` option or check the permissions.

You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.

Important for Colab: restart the runtime after installation.
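
Optional sanity check after the restart (a minimal sketch, assuming the editable install above succeeded): verify which baselines and gym installation Python actually picks up.


In [0]:
# Sanity check: print where baselines is imported from and the gym version
import baselines
import gym
print(baselines.__file__)
print(gym.__version__)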

Defining our Environment, version using cnt


In [0]:
# counter used to create a fresh environment id each time the registration cell below is run
cnt = 0

In [0]:
import gym
from gym.utils import seeding
from gym import spaces

def state_name_to_int(state):
    state_name_map = {
        'S/Z': 0,
        'A': 1,
        'B': 2,
        'C': 3,
    }
    return state_name_map[state]

def int_to_state_name(state_as_int):
    state_map = {
        0: 'S/Z',
        1: 'A',
        2: 'B',
        3: 'C'
    }
    return state_map[state_as_int]
    
class BeraterEnv(gym.Env):
    """
    The Berater Problem

    Actions: 
    There are 3 discrete deterministic actions:
    - 0: First Direction
    - 1: Second Direction
    - 2: Third Direction / Go home
    """
    metadata = {'render.modes': ['ansi']}
    
    num_envs = 1

    def __init__(self):
        self.map = {
            'S/Z': [('A', 100), ('B', 400), ('C', 200 )],
            'A': [('B', 250), ('C', 400), ('S/Z', 100 )],
            'B': [('A', 250), ('C', 250), ('S/Z', 400 )],
            'C': [('A', 400), ('B', 250), ('S/Z', 200 )]
        }
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Discrete(4)  # states 'S/Z', 'A', 'B', 'C' encoded as 0..3

        self.reset()
        self.optimum = self.calculate_customers_reward()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        paths = self.map[self.state]
        destination, cost = paths[action]

        info = {"from": self.state, "to": destination}

        self.state = destination
        reward = (-cost + self.customer_reward[destination]) / self.optimum
        self.customer_visited(destination)
        done = destination == 'S/Z' and self.all_customers_visited()
        return state_name_to_int(self.state), reward, done, info

    def customer_visited(self, customer):
        self.customer_reward[customer] = 0

    def all_customers_visited(self):
        return self.calculate_customers_reward() == 0

    def calculate_customers_reward(self):
        # total customer reward that is still uncollected
        return sum(self.customer_reward.values())

    def reset(self):
        # print("Reset")
        reward_per_customer = 1000
        self.customer_reward = {
            'S/Z': 0,
            'A': reward_per_customer,
            'B': reward_per_customer,
            'C': reward_per_customer,
        }

        self.state = 'S/Z'
        return state_name_to_int(self.state)

    def render(self, mode='human'):
        print(self.state)
        print(self.customer_reward)

from gym.envs.registration import register

# gym raises an error when an id is registered twice, so bump cnt to get a
# fresh id every time this cell is re-executed
cnt += 1
id = "Berater-v{}".format(cnt)
register(
    id=id,
    entry_point=BeraterEnv
)
env = gym.make(id)

Try out the Environment


In [3]:
# Run a demo of the environment
observation = env.reset()
for t in range(1000):
    env.render()
    action = env.action_space.sample()
    print("Action: {}".format(action))
    observation, reward, done, info = env.step(action)
    print(reward)
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
env.close()


S/Z
{'S/Z': 0, 'A': 1000, 'B': 1000, 'C': 1000}
Action: 0
0.3
A
{'S/Z': 0, 'A': 0, 'B': 1000, 'C': 1000}
Action: 1
0.2
C
{'S/Z': 0, 'A': 0, 'B': 1000, 'C': 0}
Action: 0
-0.13333333333333333
A
{'S/Z': 0, 'A': 0, 'B': 1000, 'C': 0}
Action: 1
-0.13333333333333333
C
{'S/Z': 0, 'A': 0, 'B': 1000, 'C': 0}
Action: 1
0.25
B
{'S/Z': 0, 'A': 0, 'B': 0, 'C': 0}
Action: 2
-0.13333333333333333
Episode finished after 6 timesteps
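
For reference, the optimal plan can be found by brute force: visit every customer exactly once and return to 'S/Z' with minimal travel cost. The sketch below enumerates all visiting orders using the travel costs from the environment's map; the cheapest tour costs 800, which corresponds to a normalized return of 2200/3000 ≈ 0.73.


In [0]:
from itertools import permutations

# Brute-force check of the best achievable episode return:
# total customer reward (3 * 1000) minus travel cost, normalized by 3000.
costs = {
    ('S/Z', 'A'): 100, ('S/Z', 'B'): 400, ('S/Z', 'C'): 200,
    ('A', 'B'): 250, ('A', 'C'): 400, ('A', 'S/Z'): 100,
    ('B', 'A'): 250, ('B', 'C'): 250, ('B', 'S/Z'): 400,
    ('C', 'A'): 400, ('C', 'B'): 250, ('C', 'S/Z'): 200,
}
best_route, best_return = None, None
for order in permutations(['A', 'B', 'C']):
    route = ['S/Z'] + list(order) + ['S/Z']
    cost = sum(costs[leg] for leg in zip(route, route[1:]))
    episode_return = (3 * 1000 - cost) / 3000
    if best_return is None or episode_return > best_return:
        best_route, best_return = route, episode_return
print(best_route, best_return)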

Train PPO2 using an MLP Neural Network Architecture


In [4]:
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2 import ppo2

wrapped_env = DummyVecEnv([lambda: gym.make(id)])

model = ppo2.learn(network='mlp', env=wrapped_env, total_timesteps=20000)


Logging to /tmp/openai-2018-11-08-16-33-12-360900
---------------------------------------
| approxkl           | 4.212758e-05   |
| clipfrac           | 0.0            |
| eplenmean          | nan            |
| eprewmean          | nan            |
| explained_variance | -1             |
| fps                | 404            |
| nupdates           | 1              |
| policy_entropy     | 1.0985694      |
| policy_loss        | -0.00033217986 |
| serial_timesteps   | 2048           |
| time_elapsed       | 5.06           |
| total_timesteps    | 2048           |
| value_loss         | 0.06086097     |
---------------------------------------
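
The model returned by ppo2.learn can be persisted so the trained policy survives a Colab runtime restart. A minimal sketch, assuming this baselines revision exposes model.save and the load_path argument of ppo2.learn:


In [0]:
# Save the trained weights to a file of our choosing
model.save('berater_ppo2_weights')

# In a fresh session the weights could be restored without retraining, e.g.:
# model = ppo2.learn(network='mlp', env=wrapped_env, total_timesteps=0,
#                    load_path='berater_ppo2_weights')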

Use the trained model to do actual planning


In [5]:
observation = wrapped_env.reset()

import numpy as np

# recurrent policy state and done mask; only needed by recurrent networks,
# passing them to the MLP policy trained above is harmless
state = np.zeros((1, 2*128))
dones = np.zeros((1))

for t in range(1000):
    wrapped_env.render()
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    print("Action: {}".format(actions))
    observation, reward, done, info = wrapped_env.step(actions)
    print(reward)
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
wrapped_env.close()


S/Z
{'S/Z': 0, 'A': 1000, 'B': 1000, 'C': 1000}
Action: [0]
[0.3]
A
{'S/Z': 0, 'A': 0, 'B': 1000, 'C': 1000}
Action: [2]
[-0.03333334]
S/Z
{'S/Z': 0, 'A': 0, 'B': 1000, 'C': 1000}
Action: [2]
[0.26666668]
C
{'S/Z': 0, 'A': 0, 'B': 1000, 'C': 0}
Action: [0]
[-0.13333334]
A
{'S/Z': 0, 'A': 0, 'B': 1000, 'C': 0}
Action: [0]
[0.25]
B
{'S/Z': 0, 'A': 0, 'B': 0, 'C': 0}
Action: [2]
[-0.13333334]
Episode finished after 6 timesteps
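
Because model.step samples from the policy distribution, a single rollout can still contain detours (like the extra trip back to 'S/Z' above). A short sketch that rolls out a few more episodes and reports each normalized return makes it easier to judge how close the policy gets to the brute-force optimum of about 0.73:


In [0]:
# Evaluate the trained (stochastic) policy over several episodes
for episode in range(5):
    obs = wrapped_env.reset()
    total_reward = 0.0
    for t in range(1000):
        actions, _, _, _ = model.step(obs)
        obs, rewards, dones, infos = wrapped_env.step(actions)
        total_reward += rewards[0]
        if dones[0]:
            break
    print("Episode {}: return {:.2f} after {} steps".format(episode, total_reward, t + 1))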

In [0]: