Introduction to OpenAI on Colab

# !apt-get install python-opengl -y  >/dev/null
# !apt install xvfb -y >/dev/null

# !pip install pyvirtualdisplay >/dev/null
# !pip install piglet >/dev/null

# from pyvirtualdisplay import Display
# display = Display(visible=0, size=(1400, 900))
# display.start()

!pip install git+ >/dev/null
!pip install gym >/dev/null

!pip install JSAnimation >/dev/null

%matplotlib inline
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display

def display_frames_as_gif(frames):
    """Display a list of frames as a gif, with controls.

    Args:
        frames: Non-empty sequence of image arrays (for example the arrays
            returned by ``env.render(mode='rgb_array')``). The figure is
            sized from ``frames[0]``, so all frames are expected to share
            its shape.
    """
    # Derive the figure size from the first frame's width and height.
    plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 144)
    patch = plt.imshow(frames[0])

    def animate(i):
        # Swap the pixel data of the existing image for frame i.
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
    display(display_animation(anim, default_mode='once'))

Step 2: Understanding Standard Environments

The OpenAI gym provides us with a number of environments. You can view them as simulators that

  1. deliver observations and rewards based on
  2. actions executed by an agent who learns to behave well in the simulated world

The environment also defines which actions are allowed (action space) and what observations look like (observation space). The agent should try to maximize the cumulative reward.


import gym
# Alternative environments; to try one, uncomment it and comment out the
# active gym.make line below so that only one assignment to env remains.
# env = gym.make('CartPole-v0')
# env = gym.make('MountainCar-v0')
# env = gym.make('Pendulum-v0')


# Environment actually used by the random-agent demo below.
env = gym.make('MsPacman-v0')
# env = gym.make('SpaceInvaders-v0')

Box(210, 160, 3)

# Run a demo of the environment: play at most 1000 steps of a single episode
# with random actions, recording every rendered frame and summing the reward.
observation = env.reset()
cumulated_reward = 0

frames = []
for t in range(1000):
    # print(observation)
    frames.append(env.render(mode = 'rgb_array'))
    # very stupid agent, just takes a random action within the allowed action space
    action = env.action_space.sample()
    # print("Action: {}".format(t+1))
    observation, reward, done, info = env.step(action)
    # print(reward)
    cumulated_reward += reward
    if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        # Stop here: the episode is over, and stepping the environment again
        # without reset() would keep adding to the already reported score.
        break

Episode finished after 640 timesteps, accumulated reward = 260.0

Once Loop Reflect

!rm -r logs
!mkdir logs
!mkdir logs/pacman

import tensorflow as tf


import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor

from baselines.ppo2 import ppo2

# DummyVecEnv receives a list of environment factories; here a single lambda
# that creates one MsPacmanDeterministic-v4 environment.
wrapped_env = DummyVecEnv([lambda: gym.make("MsPacmanDeterministic-v4")])
# Absolute path; assumes the notebook runs with /content as working directory,
# so that it matches the relative logs/pacman directory created earlier -- TODO confirm.
log_dir = '/content/logs/pacman/'
# VecMonitor is handed log_dir; the same directory is later passed to
# plot_util.load_results to read the recorded episodes back.
monitored_env = VecMonitor(wrapped_env, log_dir)

# %time reports the CPU and wall-clock time of the training call.
# NOTE(review): lr=0.1 looks very high for PPO -- the first logged update shows
# approxkl ~2.1e6 and value_loss ~9.7e8, and explained_variance stays ~0 for the
# whole run. Confirm this learning rate is intentional.
%time model = ppo2.learn(network='cnn', env=monitored_env, total_timesteps=500000, gamma=1.0, lr=0.1, ent_coef=0.01)

| approxkl           | 2116830.8   |
| clipfrac           | 0.52038574  |
| eplenmean          | 663         |
| eprewmean          | 366.66666   |
| explained_variance | 0.000607    |
| fps                | 139         |
| nupdates           | 1           |
| policy_entropy     | 1.9248302   |
| policy_loss        | 0.1261745   |
| serial_timesteps   | 2048        |
| time_elapsed       | 14.7        |
| total_timesteps    | 2048        |
| value_loss         | 974639200.0 |
| approxkl           | 0.0051047094  |
| clipfrac           | 0.052246094   |
| eplenmean          | 587           |
| eprewmean          | 367.05884     |
| explained_variance | 0             |
| fps                | 160           |
| nupdates           | 10            |
| policy_entropy     | 1.9745653     |
| policy_loss        | -0.0024337347 |
| serial_timesteps   | 20480         |
| time_elapsed       | 125           |
| total_timesteps    | 20480         |
| value_loss         | 296.83646     |
| approxkl           | 0.005474476    |
| clipfrac           | 0.051635742    |
| eplenmean          | 577            |
| eprewmean          | 427.0          |
| explained_variance | 0              |
| fps                | 170            |
| nupdates           | 20             |
| policy_entropy     | 1.7953465      |
| policy_loss        | -0.00045011763 |
| serial_timesteps   | 40960          |
| time_elapsed       | 247            |
| total_timesteps    | 40960          |
| value_loss         | 1059.5631      |
| approxkl           | 0.003638208  |
| clipfrac           | 0.030273438  |
| eplenmean          | 575          |
| eprewmean          | 468.2        |
| explained_variance | 5.96e-08     |
| fps                | 160          |
| nupdates           | 30           |
| policy_entropy     | 1.5470023    |
| policy_loss        | 0.0002661368 |
| serial_timesteps   | 61440        |
| time_elapsed       | 370          |
| total_timesteps    | 61440        |
| value_loss         | 358.00836    |
| approxkl           | 0.013322699 |
| clipfrac           | 0.23120117  |
| eplenmean          | 583         |
| eprewmean          | 536.7       |
| explained_variance | -1.19e-07   |
| fps                | 169         |
| nupdates           | 40          |
| policy_entropy     | 1.4728658   |
| policy_loss        | 0.001159542 |
| serial_timesteps   | 81920       |
| time_elapsed       | 493         |
| total_timesteps    | 81920       |
| value_loss         | 1149.9407   |
| approxkl           | 0.0026441598  |
| clipfrac           | 0.025512695   |
| eplenmean          | 578           |
| eprewmean          | 558.9         |
| explained_variance | 1.19e-07      |
| fps                | 159           |
| nupdates           | 50            |
| policy_entropy     | 1.4515109     |
| policy_loss        | 0.00060597877 |
| serial_timesteps   | 102400        |
| time_elapsed       | 616           |
| total_timesteps    | 102400        |
| value_loss         | 243.38382     |
| approxkl           | 0.0020198638 |
| clipfrac           | 0.025756836  |
| eplenmean          | 553          |
| eprewmean          | 556.6        |
| explained_variance | -1.19e-07    |
| fps                | 169          |
| nupdates           | 60           |
| policy_entropy     | 1.2929955    |
| policy_loss        | 0.0017362388 |
| serial_timesteps   | 122880       |
| time_elapsed       | 738          |
| total_timesteps    | 122880       |
| value_loss         | 460.94705    |
| approxkl           | 0.00079879456 |
| clipfrac           | 0.0068359375  |
| eplenmean          | 526           |
| eprewmean          | 519.0         |
| explained_variance | 0             |
| fps                | 160           |
| nupdates           | 70            |
| policy_entropy     | 0.99663067    |
| policy_loss        | -0.0007851219 |
| serial_timesteps   | 143360        |
| time_elapsed       | 861           |
| total_timesteps    | 143360        |
| value_loss         | 312.00696     |
| approxkl           | 0.0036455148  |
| clipfrac           | 0.04736328    |
| eplenmean          | 521           |
| eprewmean          | 538.6         |
| explained_variance | 5.96e-08      |
| fps                | 169           |
| nupdates           | 80            |
| policy_entropy     | 1.2872566     |
| policy_loss        | -0.0040710364 |
| serial_timesteps   | 163840        |
| time_elapsed       | 983           |
| total_timesteps    | 163840        |
| value_loss         | 383.50687     |
| approxkl           | 0.0018113212   |
| clipfrac           | 0.004760742    |
| eplenmean          | 552            |
| eprewmean          | 573.0          |
| explained_variance | -1.19e-07      |
| fps                | 160            |
| nupdates           | 90             |
| policy_entropy     | 1.3603601      |
| policy_loss        | -8.4773026e-05 |
| serial_timesteps   | 184320         |
| time_elapsed       | 1.11e+03       |
| total_timesteps    | 184320         |
| value_loss         | 307.74698      |
| approxkl           | 0.00407336    |
| clipfrac           | 0.031982422   |
| eplenmean          | 575           |
| eprewmean          | 592.1         |
| explained_variance | 0             |
| fps                | 165           |
| nupdates           | 100           |
| policy_entropy     | 1.3091214     |
| policy_loss        | 0.00025795706 |
| serial_timesteps   | 204800        |
| time_elapsed       | 1.23e+03      |
| total_timesteps    | 204800        |
| value_loss         | 869.22296     |
| approxkl           | 0.0052387556  |
| clipfrac           | 0.028198242   |
| eplenmean          | 606           |
| eprewmean          | 616.8         |
| explained_variance | 1.19e-07      |
| fps                | 160           |
| nupdates           | 110           |
| policy_entropy     | 1.2571713     |
| policy_loss        | -0.0009592957 |
| serial_timesteps   | 225280        |
| time_elapsed       | 1.35e+03      |
| total_timesteps    | 225280        |
| value_loss         | 385.84845     |
| approxkl           | 0.0007341085   |
| clipfrac           | 0.011108398    |
| eplenmean          | 624            |
| eprewmean          | 617.8          |
| explained_variance | 0              |
| fps                | 169            |
| nupdates           | 120            |
| policy_entropy     | 1.1074623      |
| policy_loss        | -0.00054699735 |
| serial_timesteps   | 245760         |
| time_elapsed       | 1.48e+03       |
| total_timesteps    | 245760         |
| value_loss         | 3705.3557      |
| approxkl           | 0.0030282184   |
| clipfrac           | 0.010009766    |
| eplenmean          | 633            |
| eprewmean          | 640.7          |
| explained_variance | 0              |
| fps                | 159            |
| nupdates           | 130            |
| policy_entropy     | 0.9837659      |
| policy_loss        | -0.00048666942 |
| serial_timesteps   | 266240         |
| time_elapsed       | 1.6e+03        |
| total_timesteps    | 266240         |
| value_loss         | 585.3759       |
| approxkl           | 0.0070445905 |
| clipfrac           | 0.099487305  |
| eplenmean          | 648          |
| eprewmean          | 653.0        |
| explained_variance | 5.96e-08     |
| fps                | 167          |
| nupdates           | 140          |
| policy_entropy     | 1.0342404    |
| policy_loss        | 0.004960365  |
| serial_timesteps   | 286720       |
| time_elapsed       | 1.72e+03     |
| total_timesteps    | 286720       |
| value_loss         | 423.20764    |
| approxkl           | 0.0027373526 |
| clipfrac           | 0.052734375  |
| eplenmean          | 636          |
| eprewmean          | 640.5        |
| explained_variance | 0            |
| fps                | 159          |
| nupdates           | 150          |
| policy_entropy     | 1.154133     |
| policy_loss        | 0.0011678651 |
| serial_timesteps   | 307200       |
| time_elapsed       | 1.85e+03     |
| total_timesteps    | 307200       |
| value_loss         | 398.85202    |
| approxkl           | 0.00037935114 |
| clipfrac           | 0.0           |
| eplenmean          | 620           |
| eprewmean          | 605.0         |
| explained_variance | 0             |
| fps                | 167           |
| nupdates           | 160           |
| policy_entropy     | 1.0728668     |
| policy_loss        | 0.00062472606 |
| serial_timesteps   | 327680        |
| time_elapsed       | 1.97e+03      |
| total_timesteps    | 327680        |
| value_loss         | 405.73245     |
| approxkl           | 0.0035332744  |
| clipfrac           | 0.020629883   |
| eplenmean          | 608           |
| eprewmean          | 604.8         |
| explained_variance | -1.19e-07     |
| fps                | 161           |
| nupdates           | 170           |
| policy_entropy     | 1.0658996     |
| policy_loss        | -0.0020019226 |
| serial_timesteps   | 348160        |
| time_elapsed       | 2.09e+03      |
| total_timesteps    | 348160        |
| value_loss         | 941.761       |
| approxkl           | 0.0019810013  |
| clipfrac           | 0.01965332    |
| eplenmean          | 635           |
| eprewmean          | 621.0         |
| explained_variance | 0             |
| fps                | 169           |
| nupdates           | 180           |
| policy_entropy     | 0.994569      |
| policy_loss        | -0.0007587494 |
| serial_timesteps   | 368640        |
| time_elapsed       | 2.22e+03      |
| total_timesteps    | 368640        |
| value_loss         | 1037.8472     |
| approxkl           | 0.0043497547  |
| clipfrac           | 0.03137207    |
| eplenmean          | 641           |
| eprewmean          | 639.3         |
| explained_variance | 5.96e-08      |
| fps                | 160           |
| nupdates           | 190           |
| policy_entropy     | 1.1119797     |
| policy_loss        | 0.00058483414 |
| serial_timesteps   | 389120        |
| time_elapsed       | 2.34e+03      |
| total_timesteps    | 389120        |
| value_loss         | 463.77753     |
| approxkl           | 0.004697526    |
| clipfrac           | 0.02331543     |
| eplenmean          | 656            |
| eprewmean          | 671.4          |
| explained_variance | 5.96e-08       |
| fps                | 165            |
| nupdates           | 200            |
| policy_entropy     | 1.2276391      |
| policy_loss        | -0.00052699965 |
| serial_timesteps   | 409600         |
| time_elapsed       | 2.46e+03       |
| total_timesteps    | 409600         |
| value_loss         | 1049.8129      |
| approxkl           | 0.008425733  |
| clipfrac           | 0.091796875  |
| eplenmean          | 634          |
| eprewmean          | 651.7        |
| explained_variance | 0            |
| fps                | 160          |
| nupdates           | 210          |
| policy_entropy     | 1.2020785    |
| policy_loss        | 0.0008530125 |
| serial_timesteps   | 430080       |
| time_elapsed       | 2.58e+03     |
| total_timesteps    | 430080       |
| value_loss         | 451.70117    |
| approxkl           | 0.006313326   |
| clipfrac           | 0.10559082    |
| eplenmean          | 625           |
| eprewmean          | 635.1         |
| explained_variance | 5.96e-08      |
| fps                | 168           |
| nupdates           | 220           |
| policy_entropy     | 1.1612798     |
| policy_loss        | -0.0024226017 |
| serial_timesteps   | 450560        |
| time_elapsed       | 2.71e+03      |
| total_timesteps    | 450560        |
| value_loss         | 1187.2731     |
| approxkl           | 0.0010650215  |
| clipfrac           | 0.0           |
| eplenmean          | 619           |
| eprewmean          | 615.9         |
| explained_variance | 0             |
| fps                | 159           |
| nupdates           | 230           |
| policy_entropy     | 1.2587514     |
| policy_loss        | 0.00029374912 |
| serial_timesteps   | 471040        |
| time_elapsed       | 2.83e+03      |
| total_timesteps    | 471040        |
| value_loss         | 808.3304      |
| approxkl           | 0.0009994785  |
| clipfrac           | 0.019042969   |
| eplenmean          | 633           |
| eprewmean          | 614.9         |
| explained_variance | 0             |
| fps                | 169           |
| nupdates           | 240           |
| policy_entropy     | 1.3081067     |
| policy_loss        | -0.0005007773 |
| serial_timesteps   | 491520        |
| time_elapsed       | 2.95e+03      |
| total_timesteps    | 491520        |
| value_loss         | 506.26608     |
CPU times: user 43min 49s, sys: 6min 48s, total: 50min 38s
Wall time: 50min 8s

from baselines.common import plot_util as pu
# Load whatever was recorded under log_dir during training.
results = pu.load_results(log_dir)

import matplotlib.pyplot as plt
import numpy as np
# Only the first loaded result is plotted.
r = results[0]
# plt.ylim(0, .75)
# x axis: running total of r.monitor.l (presumably episode lengths, i.e. timesteps so far -- verify)
# y axis: r.monitor.r (presumably the reward of each episode -- verify)
plt.plot(np.cumsum(r.monitor.l), r.monitor.r)
# plt.plot(np.cumsum(r.monitor.l), pu.smooth(r.monitor.r, radius=100))

[<matplotlib.lines.Line2D at 0x7fc7195515c0>]

import numpy as np 

# Replay a single episode (at most 1000 steps) with the trained model,
# recording every rendered frame and summing the reward.
observation = wrapped_env.reset()
# Initial values for the S (state) and M (done mask) inputs of model.step;
# presumably only consumed by recurrent policies -- verify for network='cnn'.
state = np.zeros((1, 2*128))
dones = np.zeros((1))

frames = []
cumulated_reward = 0

for t in range(1000):
    frames.append(wrapped_env.render(mode = 'rgb_array'))
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    observation, reward, done, info = wrapped_env.step(actions)
    cumulated_reward += reward
    if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        # Stop after the first finished episode so the reported score and the
        # recorded frames belong to exactly one episode.
        break

Episode finished after 563 timesteps, accumulated reward = [580.]

Once Loop Reflect

