In [ ]:
%matplotlib inline

In [ ]:
from windy import WindyGridWorld
import numpy as np
from collections import deque
import pickle
import matplotlib.pyplot as plt

In [ ]:
def discounted_reward(rewards, gamma=0.95):
    # Discounted return of a reward sequence: sum_k gamma^k * r_k
    r = np.asarray(rewards, dtype=np.float64)
    g = gamma ** np.arange(len(r))
    return np.sum(r * g)
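
The helper above computes the discounted return of a reward list, G = r0 + gamma * r1 + gamma^2 * r2 + ... A quick sanity check with made-up rewards (not produced by the environment):


In [ ]:
# With gamma = 0.95 the return of [1, 0, 1] is 1 + 0 + 0.95**2 = 1.9025
assert np.isclose(discounted_reward([1, 0, 1]), 1 + 0.95 ** 2)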

In [ ]:
def epsilon_greedy_policy(q_vector, epsilon):
    # With probability 1 - epsilon act greedily, otherwise take a uniformly random action (1-4)
    is_greedy_action = np.random.uniform() > epsilon
    if is_greedy_action:
        # Break ties at random when several Q-values share the maximum
        # TODO: check what would happen if lookup_action_value() initialized the Q-values randomly around 0
        max_q_args = np.argwhere(q_vector == np.amax(q_vector)).ravel()
        action = np.random.choice(max_q_args) + 1
    else:
        action = np.random.randint(1, 5)
    return action
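
With epsilon = 0 the policy above is (almost surely) greedy, and ties between equal Q-values are broken uniformly at random. A small check on a hand-made Q-vector (actions are 1-based, so the maxima at indices 0 and 2 correspond to actions 1 and 3):


In [ ]:
# Repeated greedy picks on a tied vector should only ever return actions 1 or 3
picks = {epsilon_greedy_policy([1.0, 0.5, 1.0, 0.0], epsilon=0) for _ in range(200)}
assert picks <= {1, 3}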

In [ ]:
def random_policy():
    # Uniformly random action; note the range (1-8) differs from the 4 actions used elsewhere in this notebook
    return np.random.randint(1, 9)

In [ ]:
def action_value_update(q, discounted_reward, n):
    return q + (discounted_reward - q) / n
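
action_value_update applies the incremental-mean form Q <- Q + (G - Q) / n: when n counts the returns seen so far, Q stays equal to their running average. A quick check with two hypothetical returns:


In [ ]:
# Averaging returns 2.0 and 4.0 incrementally should give 3.0
q = action_value_update(0.0, 2.0, 1)
q = action_value_update(q, 4.0, 2)
assert np.isclose(q, 3.0)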

In [ ]:
def lookup_action_value(state):
    # Lazily initialize unseen states with zero Q-values for the 4 actions
    if state not in action_values_table:
        action_values_table[state] = [0, 0, 0, 0]
    return action_values_table[state]

In [ ]:
def lookup_action_value_wo_update(state):
    # Read-only lookup: return zeros for unseen states without modifying the table
    return action_values_table.get(state, [0, 0, 0, 0])

In [ ]:
def try_table():
    # Roll out one episode with a near-greedy policy (epsilon = 0.001) using the learned table
    game = WindyGridWorld(GRID_SIZE, WINNER_TILE, WINDY_ARRAY, START_TILE)
    current_pos = game.current_pos()
    is_ended = False
    agent_positions = deque()
    actions_deque = deque()
    epsilon = 0.001
    
    while not is_ended:
        
        # Append current agent position
        agent_positions.append(current_pos)

        # Lookup action value belonging to current state
        action_value_vector = lookup_action_value(current_pos)

        # Choose action based on current action value vector and epsilon
        action = epsilon_greedy_policy(action_value_vector, epsilon)
        # Append action
        actions_deque.append(action)

        # Get next state, reward
        current_pos, reward, is_ended = game.step(action)
    
    return agent_positions, actions_deque, current_pos

In [ ]:
def show_moves(visited_states, grid_size):
    arr = np.asarray(visited_states).T
    range_x = (0.5, grid_size[1] + 0.5)
    range_y = (0.5, grid_size[0] + 0.5)
    ax = plt.gca()
    ax.scatter(arr[1], arr[0])
    ax.quiver(arr[1, :-1], arr[0, :-1], arr[1, 1:] - arr[1, :-1], arr[0, 1:] - arr[0, :-1],
              scale_units='xy', angles='xy', scale=1)
    ax.set_xticks(np.arange(*range_x), minor=True)
    ax.set_yticks(np.arange(*range_y), minor=True)
    ax.set_xlim(*range_x)
    ax.set_ylim(*range_y)
    ax.invert_yaxis()
    ax.get_xaxis().set_tick_params(labeltop=True, labelbottom=False)
    plt.grid(which="minor")
    plt.show()

Game parameters


In [ ]:
GRID_SIZE = (10, 7)
WINNER_TILE = (1, 7)
WINDY_ARRAY = (0, 1, 1, 2, -2, -1, 0)
START_TILE = (4, 1)

Main learning loop


In [ ]:
action_values_table = {}
#with open("c:\\Work\\Jupyter\\2048\\action_value_dict.p", "rb") as f:
#    action_values_table = pickle.load(f)

epsilon = 1
episode_number = 0

# Outer loop (episodes)
for _ in range(1000):
    episode_number += 1
    epsilon = 1 / np.sqrt(episode_number)
    # Containers
    agent_positions = deque()
    states_reward_list = []
    actions_deque = deque()
    # End bool
    is_ended = False
    # New game
    game = WindyGridWorld(GRID_SIZE, WINNER_TILE, WINDY_ARRAY, START_TILE)
    current_pos = game.current_pos()
    # Inner loop (steps within one episode)
    while not is_ended:
        
        # Append current agent position
        agent_positions.append(current_pos)

        # Lookup action value belonging to current state
        action_value_vector = lookup_action_value(current_pos)

        # Choose action based on current action value vector and epsilon
        action = epsilon_greedy_policy(action_value_vector, epsilon)
        # Append action
        actions_deque.append(action)

        # Make the move in game (environment)
        current_pos, reward, is_ended = game.step(action)

        # Append reward
        states_reward_list.append(reward)

    # Update action values loop
    for i, (state, action) in enumerate(zip(agent_positions, actions_deque)):
        action_value_vect = lookup_action_value(state)
        disc_rew = discounted_reward(states_reward_list[i:])
        q = action_value_vect[action - 1]
        action_values_table[state][action - 1] = action_value_update(q, disc_rew, len(states_reward_list) - i)
    
    # Check after some games
    print("-", end="")
    if episode_number % 100 == 0:
        print(str(episode_number) + ". game")
        print("Number of steps: " + str(len(actions_deque)))
        #with open("c:\\Work\\Jupyter\\rl_games\\2048\\action_value_dict_-10_reward.p", "wb") as f:
        #    pickle.dump(action_values_table, f)
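
After training, the greedy action at any visited tile can be read from the table; for instance at the start tile (assuming the state keys are the same (row, column) tuples as START_TILE):


In [ ]:
# Greedy action (1-4) suggested by the learned table at the start tile
print(np.argmax(lookup_action_value_wo_update(START_TILE)) + 1)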

In [ ]:
a, b, c = try_table()
a.append(c)
show_moves(a, GRID_SIZE)