In [ ]:
#@title Agreement
# Copyright (c) 2021 Kevin P. Murphy (murphyk@gmail.com) and Mahmoud Soliman (mjs@aucegypt.edu)
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
In [ ]:
#@title Attribution
# This notebook is based on the following:
# https://github.com/mjsML/VizDoom-Keras-RL
# https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/rl/ipynb/actor_critic_cartpole.ipynb
In [ ]:
#@title Imports
# The %tensorflow_version magic must run before TensorFlow is imported.
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow.python.client import device_lib
from psutil import virtual_memory
import cv2
from google.colab.patches import cv2_imshow
import os
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in scikit-learn 0.24
from IPython import display
from matplotlib import pyplot as plt
import numpy as np
import pathlib
import shutil
import tempfile
from tqdm import tqdm
In [ ]:
#@title Hardware check
def find_accelerator():
    mem = virtual_memory()
    devices = device_lib.list_local_devices()
    RAM = "Physical RAM: {:.2f} GB".format(mem.total / (1024 * 1024 * 1024))
    try:
        # Prefer a TPU if one is attached to the runtime.
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        device = ["TPU at " + str(tpu.cluster_spec().as_dict()['worker'])]
    except ValueError:
        # Otherwise fall back to any visible GPU(s).
        device = [d.physical_device_desc for d in devices if d.device_type == "GPU"]
    if not device:
        return None, RAM
    return device, RAM

a, r = find_accelerator()
print("Please make sure that the statement below says Accelerator found")
print("Accelerator found:",a,r)
In [ ]:
%%bash
#@title Install the extra required packages if any
# Installation of libs as per
# https://stackoverflow.com/questions/50667565/how-to-install-vizdoom-using-google-colab
# Install ViZDoom build deps from
# https://github.com/mwydmuch/ViZDoom/blob/master/doc/Building.md#-linux
# Note: %%bash must be the first line of the cell, and -y is needed because
# apt-get cannot prompt for confirmation in a non-interactive cell.
apt-get install -y build-essential zlib1g-dev libsdl2-dev libjpeg-dev \
  nasm tar libbz2-dev libgtk2.0-dev cmake git libfluidsynth-dev libgme-dev \
  libopenal-dev timidity libwildmidi-dev unzip
apt-get install -y libboost-all-dev
apt-get install -y liblua5.1-dev
We will start by exploring POMDPs (partially observed Markov decision processes). In a POMDP the states of the environment, $z_{t}$, are hidden from the agent. Instead, the agent receives partial observations derived from the hidden state, which we denote by $s_{t} \in \mathcal{S}$; these are sampled from the observation model $p(s_{t}|z_{t})$.
In this example we will work with ViZDoom and a Deep Recurrent Q-Network (DRQN).
Note that this is only a quick overview; the details will be discussed later.
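To make the notation concrete, the optional cell below samples a short trajectory from a toy POMDP with three hidden states and two possible observations. The transition and observation matrices are made up purely for illustration; the point is only that the agent never sees $z_{t}$, it only receives $s_{t} \sim p(s_{t}|z_{t})$.
In [ ]:
#@title (Optional) Toy POMDP observation sampler
import numpy as np

rng = np.random.default_rng(0)

# Made-up transition model p(z_t | z_{t-1}) over 3 hidden states
# (kept action-independent to keep the illustration short).
T = np.array([[0.9, 0.1, 0.0],
              [0.1, 0.8, 0.1],
              [0.0, 0.1, 0.9]])
# Made-up observation model p(s_t | z_t) over 2 possible observations.
O = np.array([[0.8, 0.2],
              [0.5, 0.5],
              [0.2, 0.8]])

z = 0  # hidden state z_t: never shown to the agent
for t in range(5):
    s = rng.choice(2, p=O[z])  # the agent only sees s_t ~ p(s_t | z_t)
    print("t={}: observation s_t={} (hidden state z_t={})".format(t, s, z))
    z = rng.choice(3, p=T[z])  # environment moves to the next hidden state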
In [ ]:
#@title Install ViZDoom... takes few mins
!pip install vizdoom
In [ ]:
#@title Clone PyProbML repo and set environment variables
!git clone https://github.com/probml/pyprobml/ -q
os.environ["PYPROBML"]='/content/pyprobml/'
In [ ]:
#@title Clone ViZDoom-Keras-RL repo and imports
# Clone VizDoom-Keras-RL
!git clone https://github.com/mjsML/VizDoom-Keras-RL.git
%cd /content/VizDoom-Keras-RL
from __future__ import print_function
import skimage as skimage
from skimage import transform, color, exposure
from skimage.viewer import ImageViewer
import random
from random import choice
import numpy as np
from collections import deque
import time
import json
from keras.models import model_from_json
from keras.models import Sequential, load_model, Model
from keras.layers.wrappers import TimeDistributed
from keras.layers.core import Dense, Dropout, Activation, Flatten, RepeatVector, Masking
from keras.layers import Convolution2D, Dense, Flatten, MaxPooling2D, Input, AveragePooling2D, Lambda, Activation, Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.optimizers import SGD, Adam, rmsprop
from keras import backend as K
from vizdoom import DoomGame, ScreenResolution
from vizdoom import *
import itertools as it
from time import sleep
import tensorflow as tf
from networks import Networks
In [ ]:
#@title Setup ViZDoom with defend the center scenario
# TF1-style session setup for TF2 compatibility: allow GPU memory to grow on
# demand instead of being allocated up front.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)
from drqn import ReplayMemory, DoubleDQNAgent, preprocessImg
game = DoomGame()
game.load_config("/content/VizDoom-Keras-RL/defend_the_center.cfg")
game.set_sound_enabled(True)
game.set_screen_resolution(ScreenResolution.RES_640X480)
game.set_window_visible(False)
game.init()
game.new_episode()
game_state = game.get_state()
misc = game_state.game_variables # [KILLCOUNT, AMMO, HEALTH]
prev_misc = misc
action_size = game.get_available_buttons_size()
img_rows, img_cols = 64, 64
img_channels = 3 # Color channel
trace_length = 4 # Temporal Dimension
state_size = (trace_length, img_rows, img_cols, img_channels)
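# The DRQN takes a trace of trace_length consecutive frames as its state, so
# the recurrent layer can integrate information over time and partially
# compensate for the hidden state of the POMDP.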
agent = DoubleDQNAgent(state_size, action_size, trace_length)
agent.model = Networks.drqn(state_size, action_size, agent.learning_rate)
agent.target_model = Networks.drqn(
    state_size, action_size, agent.learning_rate)
s_t = game_state.screen_buffer # 480 x 640
s_t = preprocessImg(s_t, size=(img_rows, img_cols))
is_terminated = game.is_episode_finished()
In [ ]:
#@title Start training DRQN Agent
epsilon = agent.initial_epsilon
GAME = 0
t = 0
max_life = 0 # Maximum episode life (Proxy for agent performance)
life = 0
episode_buf = [] # Save entire episode
# Buffer to compute rolling statistics
life_buffer, ammo_buffer, kills_buffer = [], [], []
while not game.is_episode_finished():

    loss = 0
    Q_max = 0
    r_t = 0
    a_t = np.zeros([action_size])

    # Epsilon Greedy
    if len(episode_buf) > agent.trace_length:
        # 1x8x64x64x3
        state_series = np.array(
            [trace[-1] for trace in episode_buf[-agent.trace_length:]])
        state_series = np.expand_dims(state_series, axis=0)
        action_idx = agent.get_action(state_series)
    else:
        action_idx = random.randrange(agent.action_size)
    a_t[action_idx] = 1

    a_t = a_t.astype(int)
    game.set_action(a_t.tolist())
    skiprate = agent.frame_per_action
    game.advance_action(skiprate)

    game_state = game.get_state()  # Observe again after we take the action
    is_terminated = game.is_episode_finished()

    # each frame we get reward of 0.1, so 4 frames will be 0.4
    r_t = game.get_last_reward()

    if (is_terminated):
        if (life > max_life):
            max_life = life
        GAME += 1
        life_buffer.append(life)
        ammo_buffer.append(misc[1])
        kills_buffer.append(misc[0])
        print("Episode Finish ", misc)
        game.new_episode()
        game_state = game.get_state()
        misc = game_state.game_variables
        s_t1 = game_state.screen_buffer

    s_t1 = game_state.screen_buffer
    misc = game_state.game_variables
    s_t1 = preprocessImg(s_t1, size=(img_rows, img_cols))

    r_t = agent.shape_reward(r_t, misc, prev_misc, t)

    if (is_terminated):
        life = 0
    else:
        life += 1

    # update the cache
    prev_misc = misc

    # Update epsilon
    if agent.epsilon > agent.final_epsilon and t > agent.observe:
        agent.epsilon -= (agent.initial_epsilon -
                          agent.final_epsilon) / agent.explore

    # Do the training
    if t > agent.observe:
        Q_max, loss = agent.train_replay()

    # save the sample <s, a, r, s'> to episode buffer
    episode_buf.append([s_t, action_idx, r_t, s_t1])

    if (is_terminated):
        agent.memory.add(episode_buf)
        episode_buf = []  # Reset Episode Buf

    s_t = s_t1
    t += 1

    # save progress every 10000 iterations
    if t % 10000 == 0:
        print("Now we save model")
        agent.model.save_weights("./models/drqn.h5", overwrite=True)

    # print info
    state = ""
    if t <= agent.observe:
        state = "observe"
    elif t > agent.observe and t <= agent.observe + agent.explore:
        state = "explore"
    else:
        state = "train"

    if (is_terminated):
        print("TIME", t, "/ GAME", GAME, "/ STATE", state,
              "/ EPSILON", agent.epsilon, "/ ACTION", action_idx, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(Q_max), "/ LIFE", max_life, "/ LOSS", loss)

        # Save Agent's Performance Statistics
        if GAME % agent.stats_window_size == 0 and t > agent.observe:
            print("Update Rolling Statistics")
            agent.mavg_score.append(np.mean(np.array(life_buffer)))
            agent.var_score.append(np.var(np.array(life_buffer)))
            agent.mavg_ammo_left.append(np.mean(np.array(ammo_buffer)))
            agent.mavg_kill_counts.append(np.mean(np.array(kills_buffer)))

            # Reset rolling stats buffer
            life_buffer, ammo_buffer, kills_buffer = [], [], []

            # Write Rolling Statistics to file
            with open("statistics/drqn_stats.txt", "w") as stats_file:
                stats_file.write('Game: ' + str(GAME) + '\n')
                stats_file.write('Max Score: ' + str(max_life) + '\n')
                stats_file.write('mavg_score: ' + str(agent.mavg_score) + '\n')
                stats_file.write('var_score: ' + str(agent.var_score) + '\n')
                stats_file.write('mavg_ammo_left: ' + str(agent.mavg_ammo_left) + '\n')
                stats_file.write('mavg_kill_counts: ' + str(agent.mavg_kill_counts) + '\n')
As the agent takes actions and moves through the environment, it learns to map the observed state of the environment to two outputs:
Recommended action:
A probability value for each action in the action space. The part of the agent responsible for this output is called the actor.
Estimated future rewards:
The sum of all rewards it expects to receive in the future. The part of the agent responsible for this output is called the critic.
The actor and the critic learn to perform their tasks so that the recommended actions from the actor maximize the rewards.
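To make this division of labor concrete before the full CartPole example, the optional cell below computes the actor and critic loss terms for a single step on made-up numbers: the critic's value estimate acts as a baseline, and the actor is pushed towards actions whose observed return exceeds that estimate. The numbers are arbitrary, and a plain squared error is used for the critic here, whereas the training loop below uses a Huber loss.
In [ ]:
#@title (Optional) Actor-critic losses on made-up numbers
import numpy as np

log_prob = np.log(0.6)  # log-probability the actor assigned to the chosen action
value = 5.0             # critic's estimate of the discounted return from this state
ret = 7.5               # discounted return that was actually observed

advantage = ret - value                 # better (or worse) than the critic expected
actor_loss = -log_prob * advantage      # raise the probability of better-than-expected actions
critic_loss = 0.5 * (ret - value) ** 2  # move the critic's estimate towards the observed return

print("actor loss:", actor_loss, "critic loss:", critic_loss)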
CartPole-v0
A pole is attached to a cart placed on a frictionless track. The agent has to apply force to move the cart. It is rewarded for every time step the pole remains upright. The agent, therefore, must learn to keep the pole from falling over.
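As a quick sanity check of the environment interface (using the same classic gym API as the training code below, where env.reset() returns the state and env.step() returns a 4-tuple), the optional cell below rolls out a random policy for one episode; it is independent of the actor-critic code that follows.
In [ ]:
#@title (Optional) Random-policy rollout in CartPole-v0
import gym

env = gym.make("CartPole-v0")
state = env.reset()  # 4-dim observation: cart position, cart velocity, pole angle, pole angular velocity
total_reward, done = 0.0, False
while not done:
    action = env.action_space.sample()          # random action: push cart left (0) or right (1)
    state, reward, done, _ = env.step(action)   # +1 reward for every step the pole stays upright
    total_reward += reward
print("Random-policy episode return:", total_reward)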
In [ ]:
#@title Imports
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Configuration parameters for the whole setup
seed = 42
gamma = 0.99 # Discount factor for past rewards
max_steps_per_episode = 10000
env = gym.make("CartPole-v0") # Create the environment
env.seed(seed)
eps = np.finfo(np.float32).eps.item() # Smallest number such that 1.0 + eps != 1.0
In [ ]:
#@title Define Model
num_inputs = 4
num_actions = 2
num_hidden = 128
inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="softmax")(common)
critic = layers.Dense(1)(common)
model = keras.Model(inputs=inputs, outputs=[action, critic])
In [ ]:
#@title Train model
optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
while True:  # Run until solved
    state = env.reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            # env.render(); Adding this line would show the attempts
            # of the agent in a pop up window.

            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            # Predict action probabilities and estimated future rewards
            # from environment state
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])

            # Sample action from action probability distribution
            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            # Apply the sampled action in our environment
            state, reward, done, _ = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # Calculating loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log probability
            # of `log_prob` and ended up receiving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to critic's estimate) with high probability.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        # Backpropagation
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))

    if running_reward > 195:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break