In [ ]:
%matplotlib inline
In [ ]:
from windy import WindyGridWorld
import numpy as np
from collections import deque
import pickle
import matplotlib.pyplot as plt
In [ ]:
def epsilon_greedy_policy(q_vector, epsilon):
    is_greedy_action = np.random.uniform() > epsilon
    if is_greedy_action:
        # Break ties randomly if several actions share the maximal Q value
        # TODO: check what would happen if lookup_action_value() initialized the Q values with small random numbers around 0
        max_q_args = np.argwhere(q_vector == np.amax(q_vector))
        if len(max_q_args) > 1:
            action = np.random.choice(max_q_args.ravel(), 1)[0] + 1
        else:
            action = np.argmax(q_vector) + 1
    else:
        action = np.random.randint(1, 9)
    return action
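In [ ]:
# Quick illustrative check of epsilon_greedy_policy (not part of the original run, values are made up):
# with epsilon=0 the choice is purely greedy (ties broken at random), with epsilon=1 it is uniform over 1..8.
q_example = np.array([0.1, 0.5, 0.2, 0.5, 0.0, 0.0, 0.0, 0.0])
print(epsilon_greedy_policy(q_example, epsilon=0.0))  # 2 or 4: 1-based index of a maximal Q value
print(epsilon_greedy_policy(q_example, epsilon=1.0))  # a random action in 1..8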
In [ ]:
def td0_update_action_value(current_action_value, next_action_value, reward, alpha=0.25, gamma=0.95):
    return current_action_value + alpha * (reward + gamma * next_action_value - current_action_value)
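In [ ]:
# Illustrative check of the TD(0) update Q <- Q + alpha * (r + gamma * Q' - Q), with made-up numbers:
# Q = 0, Q' = 1, r = -1, alpha = 0.25, gamma = 0.95 gives 0.25 * (-1 + 0.95) = -0.0125.
td0_update_action_value(0.0, 1.0, -1.0)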
In [ ]:
def td0_update(next_state, action_value_vect, action, reward, epsilon):
    current = action_value_vect[action - 1]
    n_action_value = lookup_action_value_wo_update(next_state)
    # The bootstrap action is drawn from the same epsilon-greedy policy, so this is an on-policy (SARSA-style) update
    n_action = epsilon_greedy_policy(n_action_value, epsilon)
    new_q = td0_update_action_value(current, n_action_value[n_action - 1], reward)
    action_value_vect[action - 1] = new_q
    return action_value_vect
In [ ]:
def random_policy():
    return np.random.randint(1, 5)
In [ ]:
def action_value_update(q, discounted_reward, n):
    return q + (discounted_reward - q) / n
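In [ ]:
# action_value_update is the incremental-mean rule q_n = q_{n-1} + (G_n - q_{n-1}) / n.
# A quick illustrative check against np.mean (values are made up):
q_check = 0.0
for n, g in enumerate([2.0, 4.0, 9.0], start=1):
    q_check = action_value_update(q_check, g, n)
q_check, np.mean([2.0, 4.0, 9.0])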
In [ ]:
def lookup_action_value(state):
    if state not in action_values_table:
        action_values_table[state] = [0, 0, 0, 0, 0, 0, 0, 0]
    return action_values_table[state]
In [ ]:
def lookup_action_value_wo_update(state):
    av = action_values_table.get(state)
    return av if av is not None else [0, 0, 0, 0, 0, 0, 0, 0]
In [ ]:
def try_table():
    game = WindyGridWorld(GRID_SIZE, WINNER_TILE, WINDY_ARRAY, START_TILE, False)
    current_pos = game.current_pos()
    is_ended = False
    agent_positions = deque()
    states_reward_list = []
    actions_deque = deque()
    epsilon = 0.005
    while not is_ended:
        # Append current agent position
        agent_positions.append(current_pos)
        # Look up the action values belonging to the current state
        action_value_vector = lookup_action_value(current_pos)
        # Choose an action based on the current action-value vector and epsilon
        action = epsilon_greedy_policy(action_value_vector, epsilon)
        # Append action
        actions_deque.append(action)
        # Get next state, reward
        current_pos, reward, is_ended = game.step(action)
    return agent_positions, actions_deque, current_pos
In [ ]:
def show_moves(visited_states, grid_size):
    arr = np.asarray(visited_states).T
    range_x = (0.5, grid_size[1] + 0.5)
    range_y = (0.5, grid_size[0] + 0.5)
    ax = plt.gca()
    ax.scatter(arr[1], arr[0])
    ax.quiver(arr[1, :-1], arr[0, :-1], arr[1, 1:] - arr[1, :-1], arr[0, 1:] - arr[0, :-1],
              scale_units='xy', angles='xy', scale=1)
    ax.set_xticks(np.arange(*range_x), minor=True)
    ax.set_yticks(np.arange(*range_y), minor=True)
    ax.set_xlim(*range_x)
    ax.set_ylim(*range_y)
    ax.set_xlabel("Column")
    ax.invert_yaxis()
    ax.get_xaxis().set_tick_params(labeltop=True, labelbottom=False)
    plt.grid(which="minor")
    plt.show()
In [ ]:
GRID_SIZE = (20, 20)
WINNER_TILE = (10, 20)
#WINDY_ARRAY = (0, 1, 1, 2, -2, -1, -1, 1, 1, 0, 1, 2, -3, 3, -1, -1, 2, -1, 2, 0)
WINDY_ARRAY = np.zeros(20)
START_TILE = None
In [ ]:
# Scratch cell: sample 10 random integers in [-2, 2], presumably candidate wind strengths
np.random.randint(-2, 3, 10)
In [ ]:
action_values_table = {}
epsilon = 1
episode_number = 0
average = 0
# Outer loop: episodes
for _ in range(3000):
    print("-", end="")
    episode_number += 1
    if episode_number % 500 == 0:
        print("Game " + str(episode_number))
        print("Avg. steps / start distance: " + str(average / 500))
        average = 0
    # Decay epsilon with the cube root of the episode number
    epsilon = 1 / episode_number ** (1 / 3)
    # Containers
    agent_positions = deque()
    states_reward_list = []
    actions_deque = deque()
    # End flag
    is_ended = False
    # New game
    game = WindyGridWorld(GRID_SIZE, WINNER_TILE, WINDY_ARRAY, START_TILE, only_first_row=False)
    current_pos = game.current_pos()
    # Inner loop: steps within one episode
    while not is_ended:
        # Append current agent position
        agent_positions.append(current_pos)
        # Look up the action values belonging to the current state
        action_value_vector = lookup_action_value(current_pos)
        # Choose an action based on the current action-value vector and epsilon
        action = epsilon_greedy_policy(action_value_vector, epsilon)
        # Append action
        actions_deque.append(action)
        # Get next state, reward
        current_pos, reward, is_ended = game.step(action)
        # Append reward
        states_reward_list.append(reward)
        # Update the action value of the state we just left
        action_values_table[agent_positions[-1]] = td0_update(current_pos, action_value_vector,
                                                              action, reward, epsilon)
    # Normalize the episode length by the larger start coordinate (rough distance estimate)
    optimal = max(agent_positions[0][0], agent_positions[0][1])
    average += len(agent_positions) / optimal
In [ ]:
with open("/home/atoth/temp/act_vals.p", "wb") as f:
    pickle.dump(action_values_table, f)
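In [ ]:
# Sketch of loading the pickled table back for later analysis (same path as the dump above;
# loaded_action_values is just an illustrative name):
with open("/home/atoth/temp/act_vals.p", "rb") as f:
    loaded_action_values = pickle.load(f)
len(loaded_action_values)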
In [ ]:
a, b, c = try_table()
a.append(c)
show_moves(a, GRID_SIZE)
In [ ]: