In [0]:
    
# !apt-get install python-opengl -y  >/dev/null
# !apt install xvfb -y >/dev/null
    
In [0]:
    
# !pip install pyvirtualdisplay >/dev/null
# !pip install pyglet >/dev/null
    
In [3]:
    
from pyvirtualdisplay import Display
# Start a headless X display (backed by Xvfb, installed above) so that
# gym's env.render() can produce frames on a server/Colab with no monitor.
# NOTE(review): the name `display` is shadowed later by
# `from IPython.display import display`; the Display object itself is no
# longer reachable after that cell runs, though the virtual screen stays up.
display = Display(visible=0, size=(1400, 900))
display.start()
    
    Out[3]:
In [0]:
    
# !pip install git+https://github.com/openai/baselines >/dev/null
# !pip install gym >/dev/null
    
In [0]:
    
# !pip install JSAnimation >/dev/null
    
In [0]:
    
%matplotlib inline
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display
def display_frames_as_gif(frames):
    """
    Display a list of RGB frames (H x W x 3 arrays) as an inline animation
    with playback controls.

    Parameters
    ----------
    frames : list of ndarray
        Frames as returned by env.render(mode='rgb_array').

    Raises
    ------
    ValueError
        If `frames` is empty (previously this failed with a bare IndexError).
    """
    if not frames:
        raise ValueError("frames is empty; nothing to display")
    # Size the figure from the first frame; 72 px-per-inch sizing with dpi=144
    # renders the animation at roughly 2x resolution.
    fig = plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=144)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        # Update the image in place; return the artist tuple so the
        # animation also works with blitting enabled.
        patch.set_data(frames[i])
        return (patch,)

    # Pass the explicit figure handle instead of relying on plt.gcf().
    anim = animation.FuncAnimation(fig, animate, frames=len(frames), interval=50)
    display(display_animation(anim, default_mode='once'))
    # Close the figure so %matplotlib inline does not render a duplicate
    # static image of the first frame below the animation.
    plt.close(fig)
    
In [0]:
    
!rm -r /content/logs/dqn
    
In [8]:
    
# Point baselines' logger at a fixed directory so the training progress
# can be loaded afterwards (see the plot_util cells below).
%env OPENAI_LOGDIR=/content/logs/dqn
# %env OPENAI_LOG_FORMAT=csv
    
    
In [9]:
    
%env
    
    Out[9]:
In [10]:
    
import gym
from baselines import deepq
# Atari Ms. Pac-Man; observations are raw RGB frames, hence the 'cnn'
# network in the active call below.
env = gym.make("MsPacman-v0")
# https://en.wikipedia.org/wiki/Q-learning#Influence_of_variables
# Earlier MLP experiment kept for reference (commented out):
# %time model = deepq.learn(\
#         env,\
#         seed=42,\                          
#         network='mlp',\
#         lr=1e-3,\
#         total_timesteps=100000,\
#         buffer_size=50000,\
#         exploration_fraction=0.1,\
#         exploration_final_eps=0.02,\
#         print_freq=10)
# Train DQN with a convolutional policy; %time reports the (long) wall time.
# exploration_fraction=0.5 anneals epsilon from 1.0 to 0.02 over the first
# half of the 50k timesteps.
# NOTE(review): 50k timesteps is very short for Atari -- published DQN
# results use millions of frames, so expect a weak policy.
%time model = deepq.learn(\
        env,\
        seed=42,\
        network='cnn',\
        lr=1e-3,\
        total_timesteps=50000,\
        buffer_size=50000,\
        exploration_fraction=0.5,\
        exploration_final_eps=0.02,\
        print_freq=10)
    
    
    
    
In [0]:
    
# from baselines.common import plot_util as pu
# results = pu.load_results('/content/logs/dqn')
    
In [0]:
    
# import matplotlib.pyplot as plt
# import numpy as np
# r = results[0]
# plt.plot(r.progress.total_timesteps, r.progress.eprewmean)
    
In [13]:
    
import numpy as np
# Roll out the trained policy for one episode (up to 1000 steps),
# capturing rendered frames for the animation cell below.
observation = env.reset()
# Recurrent-state placeholder passed to model.step; the 2*128 width
# assumes the policy's LSTM state size -- TODO confirm against the network.
state = np.zeros((1, 2*128))
# "Masks" argument: zeros mean "episode not done" for the single env.
dones = np.zeros((1))
frames = []
cumulated_reward = 0
for t in range(1000):
    frames.append(env.render(mode = 'rgb_array'))
    actions, _, state, _ = model.step(observation, S=state, M=dones)
    # NOTE(review): `actions` is a batch-shaped array from model.step;
    # some gym envs expect a scalar action (actions[0]) -- verify.
    observation, reward, done, info = env.step(actions)
    cumulated_reward += reward
    if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        break
else:
    # Fix: previously nothing was printed when the episode did not
    # terminate within 1000 steps, silently dropping the reward total.
    print("Episode did not finish within 1000 timesteps, accumulated reward = {}".format(cumulated_reward))
env.close()
    
    
In [14]:
    
display_frames_as_gif(frames)
    
    
In [0]: