In [0]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import deque
from time import time
In [2]:
seed = 23041974
random.seed(seed)
print('Seed: {}'.format(seed))
The game the Q-learning agents will need to learn consists of a board with 4 cells. The agent receives a reward of +1 every time it fills a vacant cell, and a penalty of -1 when it tries to fill an already occupied cell. The game ends when the board is full.
In [0]:
class Game:
    board = None
    board_size = 0

    def __init__(self, board_size = 4):
        self.board_size = board_size
        self.reset()

    def reset(self):
        self.board = np.zeros(self.board_size)

    def play(self, cell):
        # Returns a tuple: (reward, game_over?)
        if self.board[cell] == 0:
            self.board[cell] = 1
            game_over = len(np.where(self.board == 0)[0]) == 0
            return (1, game_over)
        else:
            return (-1, False)
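To make the reward scheme concrete, here is a minimal usage sketch of the Game class above (not part of the notebook run; the moves are chosen only for illustration):

demo = Game()
print(demo.play(2))  ## (1, False)  -- filling a vacant cell earns +1
print(demo.play(2))  ## (-1, False) -- playing an already occupied cell costs -1
print(demo.play(0))  ## (1, False)
print(demo.play(1))  ## (1, False)
print(demo.play(3))  ## (1, True)   -- the board is now full, so the game is over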
In [4]:
def state_to_str(state):
    return str(list(map(int, state.tolist())))

all_states = list()
for i in range(2):
    for j in range(2):
        for k in range(2):
            for l in range(2):
                s = np.array([i, j, k, l])
                all_states.append(state_to_str(s))

print('All possible states:')
for s in all_states:
    print(s)
Initialize the game:
In [0]:
game = Game()
Starting with a table-based Q-learning algorithm
In [0]:
num_of_games = 2000
epsilon = 0.1
gamma = 1
Initializing the Q-table:
In [0]:
q_table = pd.DataFrame(0,
                       index = np.arange(4),    ## Rows: the 4 possible actions
                       columns = all_states)    ## Columns: the 16 possible board states
Letting the agent play and learn:
In [8]:
r_list = []  ## Store the total reward of each game so we could plot it later
for g in range(num_of_games):
    game_over = False
    game.reset()
    total_reward = 0
    while not game_over:
        state = np.copy(game.board)
        if random.random() < epsilon:
            action = random.randint(0, 3)
        else:
            action = q_table[state_to_str(state)].idxmax()
        reward, game_over = game.play(action)
        total_reward += reward
        if np.sum(game.board) == 4:  ## Terminal state
            next_state_max_q_value = 0
        else:
            next_state = np.copy(game.board)
            next_state_max_q_value = q_table[state_to_str(next_state)].max()
        q_table.loc[action, state_to_str(state)] = reward + gamma * next_state_max_q_value
    r_list.append(total_reward)
q_table
Out[8]:
Let's verify that the agent indeed learned a correct strategy by seeing what action it will choose in each one of the possible states:
In [9]:
for i in range(2):
    for j in range(2):
        for k in range(2):
            for l in range(2):
                b = np.array([i, j, k, l])
                if len(np.where(b == 0)[0]) != 0:
                    action = q_table[state_to_str(b)].idxmax()
                    pred = q_table[state_to_str(b)].tolist()
                    print('board: {b}\tpredicted Q values: {p}\tbest action: {a}\tcorrect action? {s}'
                          .format(b = b, p = pred, a = action, s = b[action] == 0))
We can see that the agent indeed picked up the right way to play the game. Still, looking at the predicted Q values, we see that in some states it didn't learn the correct values.
Q-learning is a greedy algorithm: it prefers choosing the best-known action at each state rather than exploring. We can address this by increasing epsilon, which controls the amount of exploration and was set to 0.1, or by letting the agent play more games.
Let's plot the total reward the agent received per game:
In [10]:
plt.figure(figsize = (14, 7))
plt.plot(range(len(r_list)), r_list)
plt.xlabel('Games played')
plt.ylabel('Reward')
plt.show()
Let's move on to neural network-based modeling. We'll design the Q network first.
Remember that the output of the network, self.output, is an array of predicted Q values, one for each action taken from the input state self.states. Compared with the Q-table algorithm, the output is the equivalent of an entire column of the table.
In [0]:
class QNetwork:
    def __init__(self, hidden_layers_size, gamma, learning_rate,
                 input_size = 4, output_size = 4):
        self.q_target = tf.placeholder(shape = (None, output_size), dtype = tf.float32)
        self.r = tf.placeholder(shape = None, dtype = tf.float32)
        self.states = tf.placeholder(shape = (None, input_size), dtype = tf.float32)
        self.enum_actions = tf.placeholder(shape = (None, 2), dtype = tf.int32)
        layer = self.states
        for l in hidden_layers_size:
            layer = tf.layers.dense(inputs = layer, units = l,
                                    activation = tf.nn.relu,
                                    kernel_initializer = tf.contrib.layers.xavier_initializer(seed = seed))
        self.output = tf.layers.dense(inputs = layer, units = output_size,
                                      kernel_initializer = tf.contrib.layers.xavier_initializer(seed = seed))
        self.predictions = tf.gather_nd(self.output, indices = self.enum_actions)
        self.labels = self.r + gamma * tf.reduce_max(self.q_target, axis = 1)
        self.cost = tf.reduce_mean(tf.losses.mean_squared_error(labels = self.labels,
                                                                predictions = self.predictions))
        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(self.cost)
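The enum_actions placeholder deserves a brief note. It holds (batch row, action) pairs so that tf.gather_nd can pick, for each sample in the batch, the predicted Q value of the action that was actually taken. A small hypothetical numpy illustration (not part of the notebook run):

## Suppose the network predicted these Q values for a batch of 3 states (4 actions each):
batch_output = np.array([[0.1, 0.4, 0.2, 0.0],
                         [0.5, 0.3, 0.9, 0.1],
                         [0.2, 0.2, 0.2, 0.8]])
## And the actions taken were 1, 2 and 3 respectively:
enum_actions = np.array(list(enumerate([1, 2, 3])))  ## [[0, 1], [1, 2], [2, 3]]
## tf.gather_nd(batch_output, enum_actions) would return [0.4, 0.9, 0.8];
## the numpy equivalent of that lookup:
print(batch_output[enum_actions[:, 0], enum_actions[:, 1]])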
Designing the Experience Replay memory which will be used, using a cyclic memory buffer:
In [0]:
class ReplayMemory:
    memory = None
    counter = 0

    def __init__(self, size):
        self.memory = deque(maxlen = size)

    def append(self, element):
        self.memory.append(element)
        self.counter += 1

    def sample(self, n):
        return random.sample(self.memory, n)
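A quick sketch of the cyclic behaviour (using a hypothetical size of 3, not part of the notebook run): once the deque is full, appending a new element silently drops the oldest one, so the memory always holds the most recent experiences:

tiny_memory = ReplayMemory(size = 3)
for i in range(5):
    tiny_memory.append({'step': i})
print(list(tiny_memory.memory))  ## Only the last 3 elements remain: steps 2, 3 and 4
print(tiny_memory.counter)       ## counter keeps counting all appends (5), even though only 3 are stored
print(tiny_memory.sample(2))     ## A random batch of 2 stored experiences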
Setting up the parameters. Note that here we use gamma = 0.99 and not 1 as in the Q-table algorithm, since the literature recommends working with a discount factor smaller than 1. It probably won't matter much in this specific case, but it's good to get used to this.
In [0]:
num_of_games = 2000
epsilon = 0.1
gamma = 0.99
batch_size = 10
memory_size = 2000
Initializing the Q-network:
In [16]:
tf.reset_default_graph()
tf.set_random_seed(seed)
qnn = QNetwork(hidden_layers_size = [20, 20],
gamma = gamma,
learning_rate = 0.001)
memory = ReplayMemory(memory_size)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
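Before training, it may help to see how the network's output mirrors a Q-table column lookup. A minimal sketch using the untrained qnn and sess just created (the values are meaningless until training, and this cell is illustrative only):

sample_state = np.array([1, 0, 0, 1])
q_values = np.squeeze(sess.run(qnn.output,
                               feed_dict = {qnn.states: np.expand_dims(sample_state, axis = 0)}))
print(q_values)  ## One predicted Q value per action -- the analogue of q_table[state_to_str(sample_state)]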
Training the network. Compare this code to the above Q-table training.
In [17]:
r_list = []
c_list = []  ## Same as r_list, but for storing the cost
counter = 0  ## Will be used to trigger network training
for g in range(num_of_games):
    game_over = False
    game.reset()
    total_reward = 0
    while not game_over:
        counter += 1
        state = np.copy(game.board)
        if random.random() < epsilon:
            action = random.randint(0, 3)
        else:
            pred = np.squeeze(sess.run(qnn.output,
                                       feed_dict = {qnn.states: np.expand_dims(game.board, axis = 0)}))
            action = np.argmax(pred)
        reward, game_over = game.play(action)
        total_reward += reward
        next_state = np.copy(game.board)
        memory.append({'state': state, 'action': action,
                       'reward': reward, 'next_state': next_state,
                       'game_over': game_over})
        if counter % batch_size == 0:
            ## Network training
            batch = memory.sample(batch_size)
            q_target = sess.run(qnn.output,
                                feed_dict = {qnn.states: np.array(list(map(lambda x: x['next_state'], batch)))})
            terminals = np.array(list(map(lambda x: x['game_over'], batch)))
            for i in range(terminals.size):
                if terminals[i]:
                    ## Remember we use the network's own predictions for the next state while calculating loss.
                    ## Terminal states have no Q value, so we manually set them to 0, as the network's predictions
                    ## for these states are meaningless
                    q_target[i] = np.zeros(game.board_size)
            _, cost = sess.run([qnn.optimizer, qnn.cost],
                               feed_dict = {qnn.states: np.array(list(map(lambda x: x['state'], batch))),
                                            qnn.r: np.array(list(map(lambda x: x['reward'], batch))),
                                            qnn.enum_actions: np.array(list(enumerate(map(lambda x: x['action'], batch)))),
                                            qnn.q_target: q_target})
            c_list.append(cost)
    r_list.append(total_reward)
print('Final cost: {}'.format(c_list[-1]))
Again, let's verify that the agent indeed learned a correct strategy:
In [18]:
for i in range(2):
    for j in range(2):
        for k in range(2):
            for l in range(2):
                b = np.array([i, j, k, l])
                if len(np.where(b == 0)[0]) != 0:
                    pred = np.squeeze(sess.run(qnn.output,
                                               feed_dict = {qnn.states: np.expand_dims(b, axis = 0)}))
                    pred = list(map(lambda x: round(x, 3), pred))
                    action = np.argmax(pred)
                    print('board: {b}\tpredicted Q values: {p}\tbest action: {a}\tcorrect action? {s}'
                          .format(b = b, p = pred, a = action, s = b[action] == 0))
Here too, we see that the agent learned a correct strategy, and again, the Q values are not exactly what we would've expected.
Let's plot the rewards the agent received:
In [19]:
plt.figure(figsize = (14, 7))
plt.plot(range(len(r_list)), r_list)
plt.xlabel('Games played')
plt.ylabel('Reward')
plt.show()
Zooming in, so we can compare with the Q-table algorithm:
In [20]:
plt.figure(figsize = (14, 7))
plt.plot(range(len(r_list)), r_list)
plt.xlabel('Games played')
plt.ylabel('Reward')
plt.ylim(-2, 4.5)
plt.show()
Let's plot the cost too. Remember that here the x-axis reflects the number of training iterations, not the number of games. The number of training iterations depends on the number of actions taken, and the agent can take any number of actions during each game.
In [21]:
plt.figure(figsize = (14, 7))
plt.plot(range(len(c_list)), c_list)
plt.xlabel('Trainings')
plt.ylabel('Cost')
plt.show()