Introduction to OpenAI on Colab

Step 1: Installation for Colab - just execute these cells and do not worry too much



In [0]:

    
# !apt-get install python-opengl -y  >/dev/null
# !apt install xvfb -y >/dev/null



In [0]:

    
# !pip install pyvirtualdisplay >/dev/null
# !pip install piglet >/dev/null



In [0]:

    
# from pyvirtualdisplay import Display
# display = Display(visible=0, size=(1400, 900))
# display.start()



In [0]:

    
!pip install git+https://github.com/openai/baselines >/dev/null
!pip install gym >/dev/null



In [0]:

    
!pip install JSAnimation >/dev/null



In [0]:

    
%matplotlib inline
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display

def display_frames_as_gif(frames):
    """
    Displays a list of frames as a gif, with controls
    """
    plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 144)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
    display(display_animation(anim, default_mode='once'))

Step 2: Understanding Standard Environments

The OpenAI gym provides us with a number of environments. You could view then as simulators that

deiliver observations and rewards based on
actions executed by an agent who learns to behave well in the simulated world

The environment also defines which actions are allowed (actions space) and how observations look like (observation space). The agent should try to maximize the cumulated reward.

Links



In [7]:

    
import gym
# env = gym.make('CartPole-v0')
# env = gym.make('MountainCar-v0')
# env = gym.make('Pendulum-v0')

# https://github.com/openai/gym#atari
# http://gym.openai.com/envs/MsPacman-v0/

env = gym.make('MsPacman-v0')
# env = gym.make('SpaceInvaders-v0')









    



/usr/local/lib/python3.6/dist-packages/gym/envs/registration.py:14: PkgResourcesDeprecationWarning: Parameters to load are deprecated.  Call .resolve and .require separately.
  result = entry_point.load(False)



In [8]:

    
env.action_space









    Out[8]:





Discrete(9)



In [9]:

    
env.observation_space









    Out[9]:





Box(210, 160, 3)



In [10]:

    
# Run a demo of the environment
observation = env.reset()
cumulated_reward = 0

frames = []
for t in range(1000):
#     print(observation)
    frames.append(env.render(mode = 'rgb_array'))
    # very stupid agent, just makes a random action within the allowd action space
    action = env.action_space.sample()
#     print("Action: {}".format(t+1))    
    observation, reward, done, info = env.step(action)
#     print(reward)
    cumulated_reward += reward
    if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        break
env.close()









    



Episode finished after 640 timesteps, accumulated reward = 260.0



In [11]:

    
display_frames_as_gif(frames)

Step 3: Baselines



In [0]:

    
!rm -r logs
!mkdir logs
!mkdir logs/pacman



In [13]:

    
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)



In [14]:

    
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor

from baselines.ppo2 import ppo2

wrapped_env = DummyVecEnv([lambda: gym.make("MsPacmanDeterministic-v4")])
log_dir = '/content/logs/pacman/'
monitored_env = VecMonitor(wrapped_env, log_dir)

%time model = ppo2.learn(network='cnn', env=monitored_env, total_timesteps=500000, gamma=1.0, lr=0.1, ent_coef=0.01)









    



Logging to /tmp/openai-2019-01-07-17-55-24-152288






    



/usr/local/lib/python3.6/dist-packages/gym/envs/registration.py:14: PkgResourcesDeprecationWarning: Parameters to load are deprecated.  Call .resolve and .require separately.
  result = entry_point.load(False)






    



------------------------------------
| approxkl           | 2116830.8   |
| clipfrac           | 0.52038574  |
| eplenmean          | 663         |
| eprewmean          | 366.66666   |
| explained_variance | 0.000607    |
| fps                | 139         |
| nupdates           | 1           |
| policy_entropy     | 1.9248302   |
| policy_loss        | 0.1261745   |
| serial_timesteps   | 2048        |
| time_elapsed       | 14.7        |
| total_timesteps    | 2048        |
| value_loss         | 974639200.0 |
------------------------------------
--------------------------------------
| approxkl           | 0.0051047094  |
| clipfrac           | 0.052246094   |
| eplenmean          | 587           |
| eprewmean          | 367.05884     |
| explained_variance | 0             |
| fps                | 160           |
| nupdates           | 10            |
| policy_entropy     | 1.9745653     |
| policy_loss        | -0.0024337347 |
| serial_timesteps   | 20480         |
| time_elapsed       | 125           |
| total_timesteps    | 20480         |
| value_loss         | 296.83646     |
--------------------------------------
---------------------------------------
| approxkl           | 0.005474476    |
| clipfrac           | 0.051635742    |
| eplenmean          | 577            |
| eprewmean          | 427.0          |
| explained_variance | 0              |
| fps                | 170            |
| nupdates           | 20             |
| policy_entropy     | 1.7953465      |
| policy_loss        | -0.00045011763 |
| serial_timesteps   | 40960          |
| time_elapsed       | 247            |
| total_timesteps    | 40960          |
| value_loss         | 1059.5631      |
---------------------------------------
-------------------------------------
| approxkl           | 0.003638208  |
| clipfrac           | 0.030273438  |
| eplenmean          | 575          |
| eprewmean          | 468.2        |
| explained_variance | 5.96e-08     |
| fps                | 160          |
| nupdates           | 30           |
| policy_entropy     | 1.5470023    |
| policy_loss        | 0.0002661368 |
| serial_timesteps   | 61440        |
| time_elapsed       | 370          |
| total_timesteps    | 61440        |
| value_loss         | 358.00836    |
-------------------------------------
------------------------------------
| approxkl           | 0.013322699 |
| clipfrac           | 0.23120117  |
| eplenmean          | 583         |
| eprewmean          | 536.7       |
| explained_variance | -1.19e-07   |
| fps                | 169         |
| nupdates           | 40          |
| policy_entropy     | 1.4728658   |
| policy_loss        | 0.001159542 |
| serial_timesteps   | 81920       |
| time_elapsed       | 493         |
| total_timesteps    | 81920       |
| value_loss         | 1149.9407   |
------------------------------------
--------------------------------------
| approxkl           | 0.0026441598  |
| clipfrac           | 0.025512695   |
| eplenmean          | 578           |
| eprewmean          | 558.9         |
| explained_variance | 1.19e-07      |
| fps                | 159           |
| nupdates           | 50            |
| policy_entropy     | 1.4515109     |
| policy_loss        | 0.00060597877 |
| serial_timesteps   | 102400        |
| time_elapsed       | 616           |
| total_timesteps    | 102400        |
| value_loss         | 243.38382     |
--------------------------------------
-------------------------------------
| approxkl           | 0.0020198638 |
| clipfrac           | 0.025756836  |
| eplenmean          | 553          |
| eprewmean          | 556.6        |
| explained_variance | -1.19e-07    |
| fps                | 169          |
| nupdates           | 60           |
| policy_entropy     | 1.2929955    |
| policy_loss        | 0.0017362388 |
| serial_timesteps   | 122880       |
| time_elapsed       | 738          |
| total_timesteps    | 122880       |
| value_loss         | 460.94705    |
-------------------------------------
--------------------------------------
| approxkl           | 0.00079879456 |
| clipfrac           | 0.0068359375  |
| eplenmean          | 526           |
| eprewmean          | 519.0         |
| explained_variance | 0             |
| fps                | 160           |
| nupdates           | 70            |
| policy_entropy     | 0.99663067    |
| policy_loss        | -0.0007851219 |
| serial_timesteps   | 143360        |
| time_elapsed       | 861           |
| total_timesteps    | 143360        |
| value_loss         | 312.00696     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0036455148  |
| clipfrac           | 0.04736328    |
| eplenmean          | 521           |
| eprewmean          | 538.6         |
| explained_variance | 5.96e-08      |
| fps                | 169           |
| nupdates           | 80            |
| policy_entropy     | 1.2872566     |
| policy_loss        | -0.0040710364 |
| serial_timesteps   | 163840        |
| time_elapsed       | 983           |
| total_timesteps    | 163840        |
| value_loss         | 383.50687     |
--------------------------------------
---------------------------------------
| approxkl           | 0.0018113212   |
| clipfrac           | 0.004760742    |
| eplenmean          | 552            |
| eprewmean          | 573.0          |
| explained_variance | -1.19e-07      |
| fps                | 160            |
| nupdates           | 90             |
| policy_entropy     | 1.3603601      |
| policy_loss        | -8.4773026e-05 |
| serial_timesteps   | 184320         |
| time_elapsed       | 1.11e+03       |
| total_timesteps    | 184320         |
| value_loss         | 307.74698      |
---------------------------------------
--------------------------------------
| approxkl           | 0.00407336    |
| clipfrac           | 0.031982422   |
| eplenmean          | 575           |
| eprewmean          | 592.1         |
| explained_variance | 0             |
| fps                | 165           |
| nupdates           | 100           |
| policy_entropy     | 1.3091214     |
| policy_loss        | 0.00025795706 |
| serial_timesteps   | 204800        |
| time_elapsed       | 1.23e+03      |
| total_timesteps    | 204800        |
| value_loss         | 869.22296     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0052387556  |
| clipfrac           | 0.028198242   |
| eplenmean          | 606           |
| eprewmean          | 616.8         |
| explained_variance | 1.19e-07      |
| fps                | 160           |
| nupdates           | 110           |
| policy_entropy     | 1.2571713     |
| policy_loss        | -0.0009592957 |
| serial_timesteps   | 225280        |
| time_elapsed       | 1.35e+03      |
| total_timesteps    | 225280        |
| value_loss         | 385.84845     |
--------------------------------------
---------------------------------------
| approxkl           | 0.0007341085   |
| clipfrac           | 0.011108398    |
| eplenmean          | 624            |
| eprewmean          | 617.8          |
| explained_variance | 0              |
| fps                | 169            |
| nupdates           | 120            |
| policy_entropy     | 1.1074623      |
| policy_loss        | -0.00054699735 |
| serial_timesteps   | 245760         |
| time_elapsed       | 1.48e+03       |
| total_timesteps    | 245760         |
| value_loss         | 3705.3557      |
---------------------------------------
---------------------------------------
| approxkl           | 0.0030282184   |
| clipfrac           | 0.010009766    |
| eplenmean          | 633            |
| eprewmean          | 640.7          |
| explained_variance | 0              |
| fps                | 159            |
| nupdates           | 130            |
| policy_entropy     | 0.9837659      |
| policy_loss        | -0.00048666942 |
| serial_timesteps   | 266240         |
| time_elapsed       | 1.6e+03        |
| total_timesteps    | 266240         |
| value_loss         | 585.3759       |
---------------------------------------
-------------------------------------
| approxkl           | 0.0070445905 |
| clipfrac           | 0.099487305  |
| eplenmean          | 648          |
| eprewmean          | 653.0        |
| explained_variance | 5.96e-08     |
| fps                | 167          |
| nupdates           | 140          |
| policy_entropy     | 1.0342404    |
| policy_loss        | 0.004960365  |
| serial_timesteps   | 286720       |
| time_elapsed       | 1.72e+03     |
| total_timesteps    | 286720       |
| value_loss         | 423.20764    |
-------------------------------------
-------------------------------------
| approxkl           | 0.0027373526 |
| clipfrac           | 0.052734375  |
| eplenmean          | 636          |
| eprewmean          | 640.5        |
| explained_variance | 0            |
| fps                | 159          |
| nupdates           | 150          |
| policy_entropy     | 1.154133     |
| policy_loss        | 0.0011678651 |
| serial_timesteps   | 307200       |
| time_elapsed       | 1.85e+03     |
| total_timesteps    | 307200       |
| value_loss         | 398.85202    |
-------------------------------------
--------------------------------------
| approxkl           | 0.00037935114 |
| clipfrac           | 0.0           |
| eplenmean          | 620           |
| eprewmean          | 605.0         |
| explained_variance | 0             |
| fps                | 167           |
| nupdates           | 160           |
| policy_entropy     | 1.0728668     |
| policy_loss        | 0.00062472606 |
| serial_timesteps   | 327680        |
| time_elapsed       | 1.97e+03      |
| total_timesteps    | 327680        |
| value_loss         | 405.73245     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0035332744  |
| clipfrac           | 0.020629883   |
| eplenmean          | 608           |
| eprewmean          | 604.8         |
| explained_variance | -1.19e-07     |
| fps                | 161           |
| nupdates           | 170           |
| policy_entropy     | 1.0658996     |
| policy_loss        | -0.0020019226 |
| serial_timesteps   | 348160        |
| time_elapsed       | 2.09e+03      |
| total_timesteps    | 348160        |
| value_loss         | 941.761       |
--------------------------------------
--------------------------------------
| approxkl           | 0.0019810013  |
| clipfrac           | 0.01965332    |
| eplenmean          | 635           |
| eprewmean          | 621.0         |
| explained_variance | 0             |
| fps                | 169           |
| nupdates           | 180           |
| policy_entropy     | 0.994569      |
| policy_loss        | -0.0007587494 |
| serial_timesteps   | 368640        |
| time_elapsed       | 2.22e+03      |
| total_timesteps    | 368640        |
| value_loss         | 1037.8472     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0043497547  |
| clipfrac           | 0.03137207    |
| eplenmean          | 641           |
| eprewmean          | 639.3         |
| explained_variance | 5.96e-08      |
| fps                | 160           |
| nupdates           | 190           |
| policy_entropy     | 1.1119797     |
| policy_loss        | 0.00058483414 |
| serial_timesteps   | 389120        |
| time_elapsed       | 2.34e+03      |
| total_timesteps    | 389120        |
| value_loss         | 463.77753     |
--------------------------------------
---------------------------------------
| approxkl           | 0.004697526    |
| clipfrac           | 0.02331543     |
| eplenmean          | 656            |
| eprewmean          | 671.4          |
| explained_variance | 5.96e-08       |
| fps                | 165            |
| nupdates           | 200            |
| policy_entropy     | 1.2276391      |
| policy_loss        | -0.00052699965 |
| serial_timesteps   | 409600         |
| time_elapsed       | 2.46e+03       |
| total_timesteps    | 409600         |
| value_loss         | 1049.8129      |
---------------------------------------
-------------------------------------
| approxkl           | 0.008425733  |
| clipfrac           | 0.091796875  |
| eplenmean          | 634          |
| eprewmean          | 651.7        |
| explained_variance | 0            |
| fps                | 160          |
| nupdates           | 210          |
| policy_entropy     | 1.2020785    |
| policy_loss        | 0.0008530125 |
| serial_timesteps   | 430080       |
| time_elapsed       | 2.58e+03     |
| total_timesteps    | 430080       |
| value_loss         | 451.70117    |
-------------------------------------
--------------------------------------
| approxkl           | 0.006313326   |
| clipfrac           | 0.10559082    |
| eplenmean          | 625           |
| eprewmean          | 635.1         |
| explained_variance | 5.96e-08      |
| fps                | 168           |
| nupdates           | 220           |
| policy_entropy     | 1.1612798     |
| policy_loss        | -0.0024226017 |
| serial_timesteps   | 450560        |
| time_elapsed       | 2.71e+03      |
| total_timesteps    | 450560        |
| value_loss         | 1187.2731     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0010650215  |
| clipfrac           | 0.0           |
| eplenmean          | 619           |
| eprewmean          | 615.9         |
| explained_variance | 0             |
| fps                | 159           |
| nupdates           | 230           |
| policy_entropy     | 1.2587514     |
| policy_loss        | 0.00029374912 |
| serial_timesteps   | 471040        |
| time_elapsed       | 2.83e+03      |
| total_timesteps    | 471040        |
| value_loss         | 808.3304      |
--------------------------------------
--------------------------------------
| approxkl           | 0.0009994785  |
| clipfrac           | 0.019042969   |
| eplenmean          | 633           |
| eprewmean          | 614.9         |
| explained_variance | 0             |
| fps                | 169           |
| nupdates           | 240           |
| policy_entropy     | 1.3081067     |
| policy_loss        | -0.0005007773 |
| serial_timesteps   | 491520        |
| time_elapsed       | 2.95e+03      |
| total_timesteps    | 491520        |
| value_loss         | 506.26608     |
--------------------------------------
CPU times: user 43min 49s, sys: 6min 48s, total: 50min 38s
Wall time: 50min 8s



In [15]:

    
from baselines.common import plot_util as pu
results = pu.load_results(log_dir)

import matplotlib.pyplot as plt
import numpy as np
r = results[0]
# plt.ylim(0, .75)
plt.plot(np.cumsum(r.monitor.l), r.monitor.r)
# plt.plot(np.cumsum(r.monitor.l), pu.smooth(r.monitor.r, radius=100))









    



/usr/local/lib/python3.6/dist-packages/baselines/bench/monitor.py:164: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  df.headers = headers # HACK to preserve backwards compatibility






    Out[15]:





[<matplotlib.lines.Line2D at 0x7fc7195515c0>]



In [16]:

    
import numpy as np 

observation = wrapped_env.reset()
state = np.zeros((1, 2*128))
dones = np.zeros((1))

frames = []
cumulated_reward = 0

for t in range(1000):
    frames.append(wrapped_env.render(mode = 'rgb_array'))
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    observation, reward, done, info = wrapped_env.step(actions)
    cumulated_reward += reward
    if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        break
wrapped_env.close()









    



Episode finished after 563 timesteps, accumulated reward = [580.]



In [17]:

    
display_frames_as_gif(frames)



In [0]: