(Simplified) Blackjack - Monte Carlo Learning (Simple table lookup)
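
Note: the `lookup_table` model trained below is a tabular Monte Carlo estimate of the action value Q(state, action), i.e. the running average of the returns observed after taking an action from a state. The actual update lives inside `rl_learning` / `bandits` and is not shown in this notebook; the snippet below is only a minimal sketch of the idea, with a made-up episode format and hypothetical helper names.

# Sketch only: tabular first-visit Monte Carlo averaging of action values.
# The state encoding, episode format and helper names are assumptions,
# not the actual rl_learning / bandits internals.
from collections import defaultdict

q_sum = defaultdict(float)   # running sum of returns per (state, action)
q_count = defaultdict(int)   # number of visits per (state, action)

def update_q_table(episode, reward):
    # episode: list of (state, action) pairs seen in one simulated hand;
    # reward: the final return of that hand (e.g. +1 for a win).
    seen = set()
    for state, action in episode:
        if (state, action) in seen:   # first-visit: count each pair once per hand
            continue
        seen.add((state, action))
        q_sum[(state, action)] += reward
        q_count[(state, action)] += 1

def q_value(state, action):
    n = q_count[(state, action)]
    return q_sum[(state, action)] / n if n else 0.0

# Hypothetical episode: player holds 15 vs dealer 10, hits, busts, reward -1.
update_q_table([((15, 10), 'hit')], -1.0)
print(q_value((15, 10), 'hit'))

With enough simulated hands the averages converge toward the expected return of each action, which is what the policy and score tables further down read off.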


In [11]:
from games.blackjack.blackjack import BlackJack
from bandits import BanditAlgorithm
import pandas as pd
import rl_learning as rl
from IPython.display import display, HTML
import cPickle as pickle

In [12]:
def test_training_monte_carlo_for_blackjack(model_class, epochs):
    blackjack = BlackJack()
    policy, model = rl.train_reinforcement_learning_strategy(num_sims=epochs, game_obs=blackjack, model_class=model_class)
    # TODO Add ipython notebook 3D graph
    return policy, model

def test_policy(game_obs, model=None):
    print "---------- Testing policy:-----------"
    banditAlgorithm = BanditAlgorithm(params=0.1)
    game_obs.initiate_game()
    print "Initial state:"
    print game_obs.state
    move = 1

    # Unpickle the saved model object if one was not passed in
    if not model:
        model = pickle.load(open(game_obs.base_folder_name + '/model_obs.pkl', mode='rb'))

    if model.model_class == 'vw_python':
        from vowpal_wabbit import pyvw
        model.model = pyvw.vw("--quiet -i {0}".format(model.model_path))
    while game_obs.game_status == 'in process':
        new_qval_table = banditAlgorithm.return_decision_reward_tuples(game_obs.state, model)
        best_action, value_estimate = banditAlgorithm.return_decision_with_max_reward(new_qval_table)
        print('Move #: %s; Taking action: %s' % (move, best_action))
        reward = game_obs.play(best_action)
        print game_obs.state
        if game_obs.game_status != 'in process': print "Summary: " + game_obs.game_status + " :Player Reward: " + str(reward)
        move += 1

        if move > 15:
            print "Too many moves"
            break
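
`test_policy` above leans on `BanditAlgorithm.return_decision_reward_tuples` and `return_decision_with_max_reward`, whose implementations are not shown in this notebook. For a lookup-table model they presumably score every legal action for the current state and take the one with the highest estimated value (epsilon-greedy during training, with `params=0.1` as the exploration rate). The following is a hedged sketch of that idea with hypothetical names; it is not the actual `bandits` code.

# Sketch with hypothetical names -- not the actual bandits implementation.
import random

def decision_reward_tuples_sketch(state, q_table, actions=('hit', 'stay')):
    # Score every legal action for this state; unseen pairs default to 0.
    return {action: q_table.get((state, action), 0.0) for action in actions}

def decision_with_max_reward_sketch(qvals, epsilon=0.1):
    # Epsilon-greedy: explore with probability epsilon, otherwise take the argmax.
    if random.random() < epsilon:
        action = random.choice(list(qvals))
    else:
        action = max(qvals, key=qvals.get)
    return action, qvals[action]

# Tiny hand-written Q-table: standing on 16 vs a dealer 10 looks marginally worse here.
q_table = {((16, 10), 'hit'): -0.52, ((16, 10), 'stay'): -0.54}
print(decision_with_max_reward_sketch(
    decision_reward_tuples_sketch((16, 10), q_table), epsilon=0.0))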

In [3]:
policy, model = test_training_monte_carlo_for_blackjack(model_class='lookup_table', epochs=25000)


took time: 9

In [4]:
df = pd.DataFrame(policy).T
df.columns = ['player_value', 'dealer_value', 'decision', 'score']
policy_Q_table = df.pivot('player_value', 'dealer_value')['decision']
display(policy_Q_table)


dealer_value     2     3     4     5     6     7     8     9    10    11
player_value
12            stay   hit  stay  stay  stay   hit   hit   hit   hit   hit
13            stay   hit   hit   hit  stay   hit   hit  stay   hit   hit
14            stay   hit  stay  stay   hit   hit   hit   hit   hit   hit
15             hit  stay  stay  stay  stay   hit   hit  stay  stay   hit
16             hit  stay  stay  stay  stay   hit  stay   hit   hit   hit
17             hit  stay  stay  stay  stay  stay   hit   hit   hit   hit
18            stay  stay  stay  stay  stay  stay  stay  stay  stay  stay
19            stay  stay  stay  stay  stay  stay  stay  stay  stay  stay
20            stay  stay  stay  stay  stay  stay  stay  stay  stay  stay
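
The policy grid is easier to scan as a picture. A quick sketch (assuming matplotlib is available in this environment) maps the actions to numbers and plots the pivoted table as an image:

# Sketch: map 'hit'/'stay' to 1/0 and show the policy as a grid
# (assumes matplotlib is installed in this environment).
import matplotlib.pyplot as plt

numeric_policy = policy_Q_table.replace({'hit': 1, 'stay': 0})
plt.imshow(numeric_policy.values.astype(float), cmap='coolwarm', aspect='auto')
plt.xticks(range(len(numeric_policy.columns)), numeric_policy.columns)
plt.yticks(range(len(numeric_policy.index)), numeric_policy.index)
plt.xlabel('dealer_value')
plt.ylabel('player_value')
plt.colorbar(label='1 = hit, 0 = stay')
plt.show()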

In [5]:
policy_Q_score = df.pivot('player_value', 'dealer_value')['score']
display(policy_Q_score)


dealer_value           2           3           4           5           6           7           8           9          10          11
player_value
12              -0.19802   -0.101942   -0.302752   -0.131034  -0.0798122   -0.206897   -0.257732    -0.38674   -0.414489   -0.604348
13             -0.280488        -0.2   -0.361991   -0.252101       -0.14   -0.191111    -0.39819   -0.570815    -0.45672   -0.570281
14             -0.307087    -0.34715   -0.256545   -0.156757   -0.212766   -0.185022    -0.32093   -0.398104   -0.539195   -0.566265
15             -0.345946   -0.164948   -0.274854   -0.288889   -0.298077   -0.311404   -0.404858   -0.560345    -0.58598   -0.645669
16             -0.497487   -0.205128   -0.287356   -0.146919    -0.19774   -0.441441   -0.582609   -0.521053   -0.522333   -0.551181
17             -0.511013   -0.136126   -0.021164 1.38066e-17   -0.052381   -0.290323   -0.336032   -0.512931   -0.570558   -0.661597
18              0.140909    0.268041    0.144444    0.234043    0.184358    0.374468    0.101124   -0.216216   -0.241546   -0.349776
19              0.368889    0.534653    0.347561    0.460123    0.573034    0.539007    0.660465    0.252632 -0.00215517   -0.148325
20              0.639456    0.680921     0.59919    0.644128    0.658915    0.786408    0.864238    0.770642    0.401249   0.0721003
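
The TODO in `test_training_monte_carlo_for_blackjack` mentions a 3D graph. A minimal sketch of such a plot over the pivoted score table, assuming matplotlib with the mplot3d toolkit is available, could look like this:

# Sketch: 3D surface of the learned scores (dealer up-card x player total).
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection

X, Y = np.meshgrid(policy_Q_score.columns.astype(float),
                   policy_Q_score.index.astype(float))
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, policy_Q_score.values.astype(float), cmap='viridis')
ax.set_xlabel('dealer_value')
ax.set_ylabel('player_value')
ax.set_zlabel('Q score')
plt.show()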

In [13]:
blackjack = BlackJack()
test_policy(blackjack)


---------- Testing policy:-----------
Initial state:
state_info(player_value=20, dealer_value=3)
Move #: 1; Taking action: stay
state_info(player_value=20, dealer_value=17)
Summary: player wins :Player Reward: 1
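
A single hand like the one above is anecdotal. To judge the learned policy, one can play many hands greedily against the lookup table and average the rewards. The sketch below reuses the interfaces from `test_policy`; the idea that `params=0.0` disables exploration is an assumption, not a documented `BanditAlgorithm` behaviour.

# Sketch: average reward per hand for the (assumed) greedy policy.
def evaluate_policy(model, n_games=1000):
    bandit = BanditAlgorithm(params=0.0)  # assumption: 0.0 exploration = act greedily
    total = 0.0
    for _ in range(n_games):
        game = BlackJack()
        game.initiate_game()
        reward, moves = 0, 0
        while game.game_status == 'in process' and moves < 15:
            qvals = bandit.return_decision_reward_tuples(game.state, model)
            action, _ = bandit.return_decision_with_max_reward(qvals)
            reward = game.play(action)
            moves += 1
        total += reward
    return total / n_games

# Example (using the model returned by the training cell above):
# print("Average reward per hand: %s" % evaluate_policy(model))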

In [ ]: