In [ ]:
from twenty_forty_eight_linux import TwentyFortyEight
import numpy as np
#from scipy.stats import rv_discrete
from collections import deque
import pickle
In [ ]:
def preprocess_game_state(prev_game_state, current_game_state):
    return tuple(np.asarray(current_game_state) / np.asarray(prev_game_state))
In [ ]:
def discounted_reward(rewards, gamma=0.5):
    # Discounted return: sum_t gamma^t * r_t
    r = np.asarray(rewards, dtype=np.float64)
    g = gamma ** np.arange(len(rewards))
    return np.sum(r * g)
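A quick, illustrative sanity check of discounted_reward (the numbers are arbitrary and not part of the training flow): with gamma = 0.5, the rewards [1, 2, 4] should give 1 + 0.5 * 2 + 0.25 * 4 = 3.0.
In [ ]:
# Illustrative check only (not used by the training loop below)
assert np.isclose(discounted_reward([1, 2, 4], gamma=0.5), 3.0)
discounted_reward([1, 2, 4], gamma=0.5)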
In [ ]:
def epsilon_greedy_policy(q_vector, epsilon):
    # Action probabilities under epsilon-greedy (currently unused, kept for reference)
    prob_max = epsilon / 4 + 1 - epsilon
    prob_random = epsilon / 4
    is_greedy_action = np.random.uniform() > epsilon
    if is_greedy_action:
        # Break ties randomly if at least two actions share the maximal Q-value
        # TODO: check what happens if lookup_action_value() initializes q_vector with small random values around 0
        max_q_args = np.argwhere(q_vector == np.amax(q_vector))
        if len(max_q_args) > 1:
            action = np.random.choice(max_q_args.ravel(), 1)[0] + 1
        else:
            action = np.argmax(q_vector) + 1
    else:
        action = np.random.randint(1, 5)
    return action
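A brief, hedged sanity check of the policy (illustrative only): with epsilon = 0 it should always return the arg-max action plus 1 (moves are numbered 1 to 4), and with epsilon = 1 every action should appear.
In [ ]:
# Illustrative check only: epsilon=0 is (essentially) fully greedy, epsilon=1 is fully random
q = [0.1, 0.5, 0.2, 0.3]
assert all(epsilon_greedy_policy(q, 0) == 2 for _ in range(20))
print({epsilon_greedy_policy(q, 1) for _ in range(200)})  # expected to cover {1, 2, 3, 4}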
In [ ]:
def td0_update_action_value(current_action_value, next_action_value, reward, alpha=0.25, gamma=0.95):
    # TD(0) update: Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
    return current_action_value + alpha * (reward + gamma * next_action_value - current_action_value)
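For orientation, a tiny worked example of the TD(0) target with made-up numbers: with Q(s,a) = 0, r = 4, gamma = 0.95 and Q(s',a') = 2, the update with alpha = 0.25 moves Q a quarter of the way toward 4 + 0.95 * 2 = 5.9.
In [ ]:
# Illustrative arithmetic only: Q <- Q + alpha * (r + gamma * Q' - Q)
q_new = td0_update_action_value(0.0, 2.0, 4.0, alpha=0.25, gamma=0.95)
print(q_new)  # 0 + 0.25 * (4 + 0.95 * 2 - 0) = 1.475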
In [ ]:
def td0_update(next_state, action_value_vect, action, reward, epsilon):
    current = action_value_vect[action - 1]
    n_action_value = lookup_action_value_wo_update(next_state)
    # Choose the next action with the same epsilon-greedy policy (SARSA-style bootstrap)
    n_action = epsilon_greedy_policy(n_action_value, epsilon)
    new_q = td0_update_action_value(current, n_action_value[n_action - 1], reward)
    action_value_vect[action - 1] = new_q
    return action_value_vect
In [ ]:
def random_policy():
    return np.random.randint(1, 5)
In [ ]:
def action_value_update(q, discounted_reward, n):
    # Incremental mean: Q_n = Q_{n-1} + (G - Q_{n-1}) / n
    return q + (discounted_reward - q) / n
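As a hedged illustration of the incremental-mean form in action_value_update: feeding in returns one by one reproduces their running average (this Monte-Carlo-style helper appears not to be used by the TD(0) training loop below).
In [ ]:
# Illustrative check only: the incremental update reproduces the running mean of the returns
returns = [2.0, 4.0, 6.0]
q = 0.0
for n, g in enumerate(returns, start=1):
    q = action_value_update(q, g, n)
print(q, np.mean(returns))  # both 4.0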
In [ ]:
def lookup_action_value(state):
    if state not in action_values_table:
        action_values_table[state] = [0, 0, 0, 0]
    return action_values_table[state]
In [ ]:
def lookup_action_value_wo_update(state):
    av = action_values_table.get(state)
    return av if av else [0, 0, 0, 0]
In [ ]:
# Start from an empty table; alternatively, load a previously saved one
action_values_table = {}
#with open("c:\\Work\\Jupyter\\2048\\action_value_dict.p", "rb") as f:
#    action_values_table = pickle.load(f)
epsilon = 1
episode_number = 0
# Outer loop over episodes
for _ in range(20000):
    print("-", end="")
    episode_number += 1
    if episode_number % 1000 == 0:
        print("Game #" + str(episode_number))
        print("Number of steps: " + str(len(actions_deque)))
        print("Maximum tile: " + str(max(game_state)))
        #with open("c:\\Work\\Jupyter\\rl_games\\2048\\action_value_dict_-10_reward.p", "wb") as f:
        #    pickle.dump(action_values_table, f)
    epsilon = 1 / np.sqrt(episode_number)
    # Containers
    states_input_deque = deque()
    states_reward_list = []
    actions_deque = deque()
    # End flag
    is_ended = False
    # New game
    game = TwentyFortyEight(4, 4)
    game_state = game.table_as_array()
    prev_game_state = tuple(np.ones(16))
    # Inner loop over the steps of one episode
    while not is_ended:
        # Append current game state
        states_input_deque.append(game_state)
        # Look up the action values belonging to the current state
        action_value_vector = lookup_action_value(game_state)
        # Choose action based on the current action-value vector and epsilon
        action = epsilon_greedy_policy(action_value_vector, epsilon)
        # Append action
        actions_deque.append(action)
        # Make the move in the game (environment)
        game.move(action)
        # Get next state, reward, end flag
        game_state, reward, is_ended = game.table_as_array(), game.reward(), game.is_ended()
        # Append reward
        states_reward_list.append(reward)
        # TD(0) update of the action values of the previous state
        action_values_table[states_input_deque[-1]] = td0_update(game_state, action_value_vector,
                                                                 action, reward, epsilon)
In [ ]:
len(action_values_table)
In [ ]:
# Largest tile value occurring in any stored state
ll = deque()
for k in action_values_table.keys():
    ll.append(max(k))
max(ll)
In [ ]:
for i, k in enumerate(action_values_table.values()):
    if 1000 < i < 1015:
        print(k)
    if i == 1015:
        break
In [ ]:
def try_table():
    gam = TwentyFortyEight(4, 4)
    game_state = gam.table_as_array()
    prev_game_state = tuple(np.ones(16))
    is_ended = False
    states_input_deque = deque()
    states_reward_list = []
    actions_deque = deque()
    while not is_ended:
        # Append current game state
        states_input_deque.append(game_state)
        # Look up the action values belonging to the current state
        action_value_vector = lookup_action_value_wo_update(game_state)
        # Choose action based on random_policy
        action = random_policy()
        # Append action
        actions_deque.append(action)
        # Make the move in the game (environment)
        gam.move(action)
        # Get next state, reward, end flag
        game_state, reward, is_ended = gam.table_as_array(), gam.reward(), gam.is_ended()
        # Append reward
        states_reward_list.append(reward)
    return states_input_deque, states_reward_list, actions_deque, gam
In [ ]:
with open("/home/atoth/Jupyter_notebooks/2048/action_value_dict_2016_10_08_19_10.p", "rb") as f:
action_values_table = pickle.load(f)
In [ ]:
# Average total tile sum of the final board over 3000 games played with random_policy
np.sum(np.asarray([sum(try_table()[3].table_as_array()) for _ in range(3000)])) / 3000
In [ ]:
# Count stored states whose action-value vector has no zero entries
aaa = []
for v in action_values_table.values():
    if v.count(0) == 0:
        aaa.append(v)
len(aaa)
In [ ]:
# Average tile sum of the last recorded board state over 1000 games played with random_policy
np.sum(np.asarray([sum(try_table()[0][-1]) for _ in range(1000)])) / 1000
In [ ]:
a, b, c, gam = try_table()
In [ ]:
print(gam)