Introduction to OpenAI on Colab


In [0]:
# !apt-get install python-opengl -y  >/dev/null
# !apt install xvfb -y >/dev/null

In [0]:
# !pip install pyvirtualdisplay >/dev/null
# !pip install pyglet >/dev/null

In [0]:
# from pyvirtualdisplay import Display
# display = Display(visible=0, size=(1400, 900))
# display.start()

In [0]:
!pip install git+https://github.com/openai/baselines >/dev/null
!pip install gym >/dev/null

In [0]:
!pip install JSAnimation >/dev/null

In [0]:
%matplotlib inline
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display

def display_frames_as_gif(frames):
    """
    Displays a list of frames as an animation, with playback controls.

    frames: non-empty sequence of image arrays; frames[i] is drawn at animation step i.
        The figure is sized from frames[0].shape (index 0 = height, index 1 = width),
        so all frames are presumably expected to share that shape.

    Raises ValueError when frames is empty.
    """
    # len() instead of truthiness so that a numpy array of frames is accepted as well
    if len(frames) == 0:
        raise ValueError("display_frames_as_gif needs at least one frame")

    height, width = frames[0].shape[0], frames[0].shape[1]
    # figsize is (width, height) in inches
    fig = plt.figure(figsize=(width / 72.0, height / 72.0), dpi=144)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        # Swap the pixel data of the single image artist instead of redrawing the axes
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(fig, animate, frames=len(frames), interval=50)
    try:
        display(display_animation(anim, default_mode='once'))
    finally:
        # The animation has been handed to display() at this point. Closing the figure
        # frees its memory and keeps the inline backend from additionally showing it
        # as a static image below the animation.
        plt.close(fig)

Step 2: Understanding Standard Environments

The OpenAI gym provides us with a number of environments. You could view them as simulators that

  1. deliver observations and rewards based on
  2. actions executed by an agent who learns to behave well in the simulated world

The environment also defines which actions are allowed (action space) and what observations look like (observation space). The agent should try to maximize the cumulative reward.

Links


In [7]:
import gym

# Create the environment to explore. Exactly one gym.make(...) line should be active;
# uncomment one of the alternatives to try a different task.
# env = gym.make('CartPole-v0')
# env = gym.make('MountainCar-v0')
# env = gym.make('Pendulum-v0')

# Atari environments, see:
# https://github.com/openai/gym#atari
# http://gym.openai.com/envs/MsPacman-v0/

env = gym.make('MsPacman-v0')
# env = gym.make('SpaceInvaders-v0')


/usr/local/lib/python3.6/dist-packages/gym/envs/registration.py:14: PkgResourcesDeprecationWarning: Parameters to load are deprecated.  Call .resolve and .require separately.
  result = entry_point.load(False)

In [8]:
# Inspect the action space: the set of actions the agent is allowed to execute.
env.action_space


Out[8]:
Discrete(9)

In [9]:
# Inspect the observation space: the format of what the environment returns to the agent.
env.observation_space


Out[9]:
Box(210, 160, 3)

In [10]:
# Run a demo of the environment: one episode played by a random agent,
# recording every frame so the episode can be replayed as an animation.
observation = env.reset()
cumulated_reward = 0

frames = []
try:
    # Hard cap on the number of steps so the demo terminates even if the episode does not
    for t in range(1000):
        # Grab the current screen as an RGB array before acting
        frames.append(env.render(mode='rgb_array'))
        # very stupid agent, just takes a random action within the allowed action space
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        cumulated_reward += reward
        if done:
            print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
            break
finally:
    # Release the environment even when the loop raises or the cell is interrupted
    env.close()


Episode finished after 640 timesteps, accumulated reward = 260.0

In [11]:
# Replay the frames collected in `frames` as an animation.
display_frames_as_gif(frames)




Once Loop Reflect

In [0]:
!rm -r logs
!mkdir logs
!mkdir logs/pacman

In [13]:
import tensorflow as tf
# Only let TensorFlow log errors, so info/warning messages do not flood the cell outputs.
# NOTE(review): tf.logging is the TF 1.x API (the recorded run printed 1.12.0) — confirm
# the runtime still provides a 1.x version.
tf.logging.set_verbosity(tf.logging.ERROR)
# Print the TensorFlow version in use, for reproducibility
print(tf.__version__)


1.12.0

In [14]:
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor

from baselines.ppo2 import ppo2

wrapped_env = DummyVecEnv([lambda: gym.make("MsPacmanDeterministic-v4")])
log_dir = '/content/logs/pacman/'
monitored_env = VecMonitor(wrapped_env, log_dir)

%time model = ppo2.learn(network='cnn', env=monitored_env, total_timesteps=500000, gamma=1.0, lr=0.1, ent_coef=0.01)


Logging to /tmp/openai-2019-01-07-17-55-24-152288
/usr/local/lib/python3.6/dist-packages/gym/envs/registration.py:14: PkgResourcesDeprecationWarning: Parameters to load are deprecated.  Call .resolve and .require separately.
  result = entry_point.load(False)
------------------------------------
| approxkl           | 2116830.8   |
| clipfrac           | 0.52038574  |
| eplenmean          | 663         |
| eprewmean          | 366.66666   |
| explained_variance | 0.000607    |
| fps                | 139         |
| nupdates           | 1           |
| policy_entropy     | 1.9248302   |
| policy_loss        | 0.1261745   |
| serial_timesteps   | 2048        |
| time_elapsed       | 14.7        |
| total_timesteps    | 2048        |
| value_loss         | 974639200.0 |
------------------------------------
--------------------------------------
| approxkl           | 0.0051047094  |
| clipfrac           | 0.052246094   |
| eplenmean          | 587           |
| eprewmean          | 367.05884     |
| explained_variance | 0             |
| fps                | 160           |
| nupdates           | 10            |
| policy_entropy     | 1.9745653     |
| policy_loss        | -0.0024337347 |
| serial_timesteps   | 20480         |
| time_elapsed       | 125           |
| total_timesteps    | 20480         |
| value_loss         | 296.83646     |
--------------------------------------
---------------------------------------
| approxkl           | 0.005474476    |
| clipfrac           | 0.051635742    |
| eplenmean          | 577            |
| eprewmean          | 427.0          |
| explained_variance | 0              |
| fps                | 170            |
| nupdates           | 20             |
| policy_entropy     | 1.7953465      |
| policy_loss        | -0.00045011763 |
| serial_timesteps   | 40960          |
| time_elapsed       | 247            |
| total_timesteps    | 40960          |
| value_loss         | 1059.5631      |
---------------------------------------
-------------------------------------
| approxkl           | 0.003638208  |
| clipfrac           | 0.030273438  |
| eplenmean          | 575          |
| eprewmean          | 468.2        |
| explained_variance | 5.96e-08     |
| fps                | 160          |
| nupdates           | 30           |
| policy_entropy     | 1.5470023    |
| policy_loss        | 0.0002661368 |
| serial_timesteps   | 61440        |
| time_elapsed       | 370          |
| total_timesteps    | 61440        |
| value_loss         | 358.00836    |
-------------------------------------
------------------------------------
| approxkl           | 0.013322699 |
| clipfrac           | 0.23120117  |
| eplenmean          | 583         |
| eprewmean          | 536.7       |
| explained_variance | -1.19e-07   |
| fps                | 169         |
| nupdates           | 40          |
| policy_entropy     | 1.4728658   |
| policy_loss        | 0.001159542 |
| serial_timesteps   | 81920       |
| time_elapsed       | 493         |
| total_timesteps    | 81920       |
| value_loss         | 1149.9407   |
------------------------------------
--------------------------------------
| approxkl           | 0.0026441598  |
| clipfrac           | 0.025512695   |
| eplenmean          | 578           |
| eprewmean          | 558.9         |
| explained_variance | 1.19e-07      |
| fps                | 159           |
| nupdates           | 50            |
| policy_entropy     | 1.4515109     |
| policy_loss        | 0.00060597877 |
| serial_timesteps   | 102400        |
| time_elapsed       | 616           |
| total_timesteps    | 102400        |
| value_loss         | 243.38382     |
--------------------------------------
-------------------------------------
| approxkl           | 0.0020198638 |
| clipfrac           | 0.025756836  |
| eplenmean          | 553          |
| eprewmean          | 556.6        |
| explained_variance | -1.19e-07    |
| fps                | 169          |
| nupdates           | 60           |
| policy_entropy     | 1.2929955    |
| policy_loss        | 0.0017362388 |
| serial_timesteps   | 122880       |
| time_elapsed       | 738          |
| total_timesteps    | 122880       |
| value_loss         | 460.94705    |
-------------------------------------
--------------------------------------
| approxkl           | 0.00079879456 |
| clipfrac           | 0.0068359375  |
| eplenmean          | 526           |
| eprewmean          | 519.0         |
| explained_variance | 0             |
| fps                | 160           |
| nupdates           | 70            |
| policy_entropy     | 0.99663067    |
| policy_loss        | -0.0007851219 |
| serial_timesteps   | 143360        |
| time_elapsed       | 861           |
| total_timesteps    | 143360        |
| value_loss         | 312.00696     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0036455148  |
| clipfrac           | 0.04736328    |
| eplenmean          | 521           |
| eprewmean          | 538.6         |
| explained_variance | 5.96e-08      |
| fps                | 169           |
| nupdates           | 80            |
| policy_entropy     | 1.2872566     |
| policy_loss        | -0.0040710364 |
| serial_timesteps   | 163840        |
| time_elapsed       | 983           |
| total_timesteps    | 163840        |
| value_loss         | 383.50687     |
--------------------------------------
---------------------------------------
| approxkl           | 0.0018113212   |
| clipfrac           | 0.004760742    |
| eplenmean          | 552            |
| eprewmean          | 573.0          |
| explained_variance | -1.19e-07      |
| fps                | 160            |
| nupdates           | 90             |
| policy_entropy     | 1.3603601      |
| policy_loss        | -8.4773026e-05 |
| serial_timesteps   | 184320         |
| time_elapsed       | 1.11e+03       |
| total_timesteps    | 184320         |
| value_loss         | 307.74698      |
---------------------------------------
--------------------------------------
| approxkl           | 0.00407336    |
| clipfrac           | 0.031982422   |
| eplenmean          | 575           |
| eprewmean          | 592.1         |
| explained_variance | 0             |
| fps                | 165           |
| nupdates           | 100           |
| policy_entropy     | 1.3091214     |
| policy_loss        | 0.00025795706 |
| serial_timesteps   | 204800        |
| time_elapsed       | 1.23e+03      |
| total_timesteps    | 204800        |
| value_loss         | 869.22296     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0052387556  |
| clipfrac           | 0.028198242   |
| eplenmean          | 606           |
| eprewmean          | 616.8         |
| explained_variance | 1.19e-07      |
| fps                | 160           |
| nupdates           | 110           |
| policy_entropy     | 1.2571713     |
| policy_loss        | -0.0009592957 |
| serial_timesteps   | 225280        |
| time_elapsed       | 1.35e+03      |
| total_timesteps    | 225280        |
| value_loss         | 385.84845     |
--------------------------------------
---------------------------------------
| approxkl           | 0.0007341085   |
| clipfrac           | 0.011108398    |
| eplenmean          | 624            |
| eprewmean          | 617.8          |
| explained_variance | 0              |
| fps                | 169            |
| nupdates           | 120            |
| policy_entropy     | 1.1074623      |
| policy_loss        | -0.00054699735 |
| serial_timesteps   | 245760         |
| time_elapsed       | 1.48e+03       |
| total_timesteps    | 245760         |
| value_loss         | 3705.3557      |
---------------------------------------
---------------------------------------
| approxkl           | 0.0030282184   |
| clipfrac           | 0.010009766    |
| eplenmean          | 633            |
| eprewmean          | 640.7          |
| explained_variance | 0              |
| fps                | 159            |
| nupdates           | 130            |
| policy_entropy     | 0.9837659      |
| policy_loss        | -0.00048666942 |
| serial_timesteps   | 266240         |
| time_elapsed       | 1.6e+03        |
| total_timesteps    | 266240         |
| value_loss         | 585.3759       |
---------------------------------------
-------------------------------------
| approxkl           | 0.0070445905 |
| clipfrac           | 0.099487305  |
| eplenmean          | 648          |
| eprewmean          | 653.0        |
| explained_variance | 5.96e-08     |
| fps                | 167          |
| nupdates           | 140          |
| policy_entropy     | 1.0342404    |
| policy_loss        | 0.004960365  |
| serial_timesteps   | 286720       |
| time_elapsed       | 1.72e+03     |
| total_timesteps    | 286720       |
| value_loss         | 423.20764    |
-------------------------------------
-------------------------------------
| approxkl           | 0.0027373526 |
| clipfrac           | 0.052734375  |
| eplenmean          | 636          |
| eprewmean          | 640.5        |
| explained_variance | 0            |
| fps                | 159          |
| nupdates           | 150          |
| policy_entropy     | 1.154133     |
| policy_loss        | 0.0011678651 |
| serial_timesteps   | 307200       |
| time_elapsed       | 1.85e+03     |
| total_timesteps    | 307200       |
| value_loss         | 398.85202    |
-------------------------------------
--------------------------------------
| approxkl           | 0.00037935114 |
| clipfrac           | 0.0           |
| eplenmean          | 620           |
| eprewmean          | 605.0         |
| explained_variance | 0             |
| fps                | 167           |
| nupdates           | 160           |
| policy_entropy     | 1.0728668     |
| policy_loss        | 0.00062472606 |
| serial_timesteps   | 327680        |
| time_elapsed       | 1.97e+03      |
| total_timesteps    | 327680        |
| value_loss         | 405.73245     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0035332744  |
| clipfrac           | 0.020629883   |
| eplenmean          | 608           |
| eprewmean          | 604.8         |
| explained_variance | -1.19e-07     |
| fps                | 161           |
| nupdates           | 170           |
| policy_entropy     | 1.0658996     |
| policy_loss        | -0.0020019226 |
| serial_timesteps   | 348160        |
| time_elapsed       | 2.09e+03      |
| total_timesteps    | 348160        |
| value_loss         | 941.761       |
--------------------------------------
--------------------------------------
| approxkl           | 0.0019810013  |
| clipfrac           | 0.01965332    |
| eplenmean          | 635           |
| eprewmean          | 621.0         |
| explained_variance | 0             |
| fps                | 169           |
| nupdates           | 180           |
| policy_entropy     | 0.994569      |
| policy_loss        | -0.0007587494 |
| serial_timesteps   | 368640        |
| time_elapsed       | 2.22e+03      |
| total_timesteps    | 368640        |
| value_loss         | 1037.8472     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0043497547  |
| clipfrac           | 0.03137207    |
| eplenmean          | 641           |
| eprewmean          | 639.3         |
| explained_variance | 5.96e-08      |
| fps                | 160           |
| nupdates           | 190           |
| policy_entropy     | 1.1119797     |
| policy_loss        | 0.00058483414 |
| serial_timesteps   | 389120        |
| time_elapsed       | 2.34e+03      |
| total_timesteps    | 389120        |
| value_loss         | 463.77753     |
--------------------------------------
---------------------------------------
| approxkl           | 0.004697526    |
| clipfrac           | 0.02331543     |
| eplenmean          | 656            |
| eprewmean          | 671.4          |
| explained_variance | 5.96e-08       |
| fps                | 165            |
| nupdates           | 200            |
| policy_entropy     | 1.2276391      |
| policy_loss        | -0.00052699965 |
| serial_timesteps   | 409600         |
| time_elapsed       | 2.46e+03       |
| total_timesteps    | 409600         |
| value_loss         | 1049.8129      |
---------------------------------------
-------------------------------------
| approxkl           | 0.008425733  |
| clipfrac           | 0.091796875  |
| eplenmean          | 634          |
| eprewmean          | 651.7        |
| explained_variance | 0            |
| fps                | 160          |
| nupdates           | 210          |
| policy_entropy     | 1.2020785    |
| policy_loss        | 0.0008530125 |
| serial_timesteps   | 430080       |
| time_elapsed       | 2.58e+03     |
| total_timesteps    | 430080       |
| value_loss         | 451.70117    |
-------------------------------------
--------------------------------------
| approxkl           | 0.006313326   |
| clipfrac           | 0.10559082    |
| eplenmean          | 625           |
| eprewmean          | 635.1         |
| explained_variance | 5.96e-08      |
| fps                | 168           |
| nupdates           | 220           |
| policy_entropy     | 1.1612798     |
| policy_loss        | -0.0024226017 |
| serial_timesteps   | 450560        |
| time_elapsed       | 2.71e+03      |
| total_timesteps    | 450560        |
| value_loss         | 1187.2731     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0010650215  |
| clipfrac           | 0.0           |
| eplenmean          | 619           |
| eprewmean          | 615.9         |
| explained_variance | 0             |
| fps                | 159           |
| nupdates           | 230           |
| policy_entropy     | 1.2587514     |
| policy_loss        | 0.00029374912 |
| serial_timesteps   | 471040        |
| time_elapsed       | 2.83e+03      |
| total_timesteps    | 471040        |
| value_loss         | 808.3304      |
--------------------------------------
--------------------------------------
| approxkl           | 0.0009994785  |
| clipfrac           | 0.019042969   |
| eplenmean          | 633           |
| eprewmean          | 614.9         |
| explained_variance | 0             |
| fps                | 169           |
| nupdates           | 240           |
| policy_entropy     | 1.3081067     |
| policy_loss        | -0.0005007773 |
| serial_timesteps   | 491520        |
| time_elapsed       | 2.95e+03      |
| total_timesteps    | 491520        |
| value_loss         | 506.26608     |
--------------------------------------
CPU times: user 43min 49s, sys: 6min 48s, total: 50min 38s
Wall time: 50min 8s

In [15]:
from baselines.common import plot_util as pu
results = pu.load_results(log_dir)

import matplotlib.pyplot as plt
import numpy as np
# Only one monitored run was written to log_dir, so take the first (and only) result
r = results[0]

# Learning curve: reward of each finished episode (monitor column `r`) plotted against
# the number of environment steps played so far (running sum of episode lengths, column `l`).
fig, ax = plt.subplots()
# ax.set_ylim(0, .75)
ax.plot(np.cumsum(r.monitor.l), r.monitor.r)
# Smoothed variant:
# ax.plot(np.cumsum(r.monitor.l), pu.smooth(r.monitor.r, radius=100))
ax.set_xlabel('timesteps')
ax.set_ylabel('episode reward')
ax.set_title('PPO2 training: reward per episode')
# plt.show() renders the figure and keeps the Line2D repr out of the cell output
plt.show()


/usr/local/lib/python3.6/dist-packages/baselines/bench/monitor.py:164: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  df.headers = headers # HACK to preserve backwards compatibility
Out[15]:
[<matplotlib.lines.Line2D at 0x7fc7195515c0>]

In [16]:
import numpy as np

# Play one episode with the trained model and record every frame for replay.
observation = wrapped_env.reset()
# Initial recurrent state and episode-start mask handed to model.step().
# NOTE(review): presumably only relevant for recurrent policies; with network='cnn'
# they are likely ignored — confirm against the Baselines policy implementation.
state = np.zeros((1, 2*128))
dones = np.zeros((1))

frames = []
cumulated_reward = 0

try:
    # Hard cap on the number of steps so the cell terminates even if the episode does not
    for t in range(1000):
        frames.append(wrapped_env.render(mode='rgb_array'))
        actions, _, state, _ = model.step(observation, S=state, M=dones)
        # The vectorized env returns one entry per wrapped environment (here exactly one)
        observation, reward, done, info = wrapped_env.step(actions)
        # Hand the latest done flags to the next model.step() call
        dones = done
        # Accumulate a plain number instead of a 1-element array (which printed as "[580.]")
        cumulated_reward += float(reward[0])
        if done[0]:
            print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
            break
finally:
    # Release the environment even when the loop raises or the cell is interrupted
    wrapped_env.close()


Episode finished after 563 timesteps, accumulated reward = [580.]

In [17]:
# Replay the frames collected in `frames` as an animation.
display_frames_as_gif(frames)




Once Loop Reflect

In [0]: