(Simplified) Blackjack - Monte Carlo Learning (Simple table lookup)
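
Note: the `lookup_table` model trained below is a tabular Monte Carlo estimate of the action value Q(state, action), i.e. the running average of the returns observed after taking an action from a state. The actual update lives inside `rl_learning` / `bandits` and is not shown in this notebook; the snippet below is only a minimal sketch of the idea, with a made-up episode format and hypothetical helper names.

# Sketch only: tabular first-visit Monte Carlo averaging of action values.
# The state encoding, episode format and helper names are assumptions,
# not the actual rl_learning / bandits internals.
from collections import defaultdict

q_sum = defaultdict(float)   # running sum of returns per (state, action)
q_count = defaultdict(int)   # number of visits per (state, action)

def update_q_table(episode, reward):
    # episode: list of (state, action) pairs seen in one simulated hand;
    # reward: the final return of that hand (e.g. +1 for a win).
    seen = set()
    for state, action in episode:
        if (state, action) in seen:   # first-visit: count each pair once per hand
            continue
        seen.add((state, action))
        q_sum[(state, action)] += reward
        q_count[(state, action)] += 1

def q_value(state, action):
    n = q_count[(state, action)]
    return q_sum[(state, action)] / n if n else 0.0

# Hypothetical episode: player holds 15 vs dealer 10, hits, busts, reward -1.
update_q_table([((15, 10), 'hit')], -1.0)
print(q_value((15, 10), 'hit'))

With enough simulated hands the averages converge toward the expected return of each action, which is what the policy and score tables further down read off.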


In [11]:
from games.blackjack.blackjack import BlackJack
from bandits import BanditAlgorithm
import pandas as pd
import rl_learning as rl
from IPython.display import display, HTML
import cPickle as pickle

In [12]:
def test_training_monte_carlo_for_blackjack(model_class, epochs):
    blackjack = BlackJack()
    policy, model = rl.train_reinforcement_learning_strategy(num_sims=epochs, game_obs=blackjack, model_class=model_class)
    # TODO Add ipython notebook 3D graph
    return policy, model

def test_policy(game_obs, model=None):
    print "---------- Testing policy:-----------"
    banditAlgorithm = BanditAlgorithm(params=0.1)
    game_obs.initiate_game()
    print "Initial state:"
    print game_obs.state
    move = 1

    # Unpickle the saved model object if one was not passed in
    if not model:
        model = pickle.load(open(game_obs.base_folder_name + '/model_obs.pkl', mode='rb'))

    if model.model_class == 'vw_python':
        from vowpal_wabbit import pyvw
        model.model = pyvw.vw("--quiet -i {0}".format(model.model_path))
    while game_obs.game_status == 'in process':
        new_qval_table = banditAlgorithm.return_decision_reward_tuples(game_obs.state, model)
        best_action, value_estimate = banditAlgorithm.return_decision_with_max_reward(new_qval_table)
        print('Move #: %s; Taking action: %s' % (move, best_action))
        reward = game_obs.play(best_action)
        print game_obs.state
        if game_obs.game_status != 'in process': print "Summary: " + game_obs.game_status + " :Player Reward: " + str(reward)
        move += 1

        if move > 15:
            print "Too many moves"
            break
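
`test_policy` above leans on `BanditAlgorithm.return_decision_reward_tuples` and `return_decision_with_max_reward`, whose implementations are not shown in this notebook. For a lookup-table model they presumably score every legal action for the current state and take the one with the highest estimated value (epsilon-greedy during training, with `params=0.1` as the exploration rate). The following is a hedged sketch of that idea with hypothetical names; it is not the actual `bandits` code.

# Sketch with hypothetical names -- not the actual bandits implementation.
import random

def decision_reward_tuples_sketch(state, q_table, actions=('hit', 'stay')):
    # Score every legal action for this state; unseen pairs default to 0.
    return {action: q_table.get((state, action), 0.0) for action in actions}

def decision_with_max_reward_sketch(qvals, epsilon=0.1):
    # Epsilon-greedy: explore with probability epsilon, otherwise take the argmax.
    if random.random() < epsilon:
        action = random.choice(list(qvals))
    else:
        action = max(qvals, key=qvals.get)
    return action, qvals[action]

# Tiny hand-written Q-table: standing on 16 vs a dealer 10 looks marginally worse here.
q_table = {((16, 10), 'hit'): -0.52, ((16, 10), 'stay'): -0.54}
print(decision_with_max_reward_sketch(
    decision_reward_tuples_sketch((16, 10), q_table), epsilon=0.0))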

In [3]:
policy, model = test_training_monte_carlo_for_blackjack(model_class='lookup_table', epochs=25000)


took time: 9

In [4]:
df = pd.DataFrame(policy).T
df.columns = ['player_value', 'dealer_value', 'decision', 'score']
policy_Q_table = df.pivot('player_value', 'dealer_value')['decision']
display(policy_Q_table)


dealer_value     2     3     4     5     6     7     8     9    10    11
player_value
12            stay   hit  stay  stay  stay   hit   hit   hit   hit   hit
13            stay   hit   hit   hit  stay   hit   hit  stay   hit   hit
14            stay   hit  stay  stay   hit   hit   hit   hit   hit   hit
15             hit  stay  stay  stay  stay   hit   hit  stay  stay   hit
16             hit  stay  stay  stay  stay   hit  stay   hit   hit   hit
17             hit  stay  stay  stay  stay  stay   hit   hit   hit   hit
18            stay  stay  stay  stay  stay  stay  stay  stay  stay  stay
19            stay  stay  stay  stay  stay  stay  stay  stay  stay  stay
20            stay  stay  stay  stay  stay  stay  stay  stay  stay  stay
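
The policy grid is easier to scan as a picture. A quick sketch (assuming matplotlib is available in this environment) maps the actions to numbers and plots the pivoted table as an image:

# Sketch: map 'hit'/'stay' to 1/0 and show the policy as a grid
# (assumes matplotlib is installed in this environment).
import matplotlib.pyplot as plt

numeric_policy = policy_Q_table.replace({'hit': 1, 'stay': 0})
plt.imshow(numeric_policy.values.astype(float), cmap='coolwarm', aspect='auto')
plt.xticks(range(len(numeric_policy.columns)), numeric_policy.columns)
plt.yticks(range(len(numeric_policy.index)), numeric_policy.index)
plt.xlabel('dealer_value')
plt.ylabel('player_value')
plt.colorbar(label='1 = hit, 0 = stay')
plt.show()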

In [5]:
policy_Q_score = df.pivot('player_value', 'dealer_value')['score']
display(policy_Q_score)


dealer_value           2           3           4           5           6           7           8           9          10          11
player_value
12              -0.19802   -0.101942   -0.302752   -0.131034  -0.0798122   -0.206897   -0.257732    -0.38674   -0.414489   -0.604348
13             -0.280488        -0.2   -0.361991   -0.252101       -0.14   -0.191111    -0.39819   -0.570815    -0.45672   -0.570281
14             -0.307087    -0.34715   -0.256545   -0.156757   -0.212766   -0.185022    -0.32093   -0.398104   -0.539195   -0.566265
15             -0.345946   -0.164948   -0.274854   -0.288889   -0.298077   -0.311404   -0.404858   -0.560345    -0.58598   -0.645669
16             -0.497487   -0.205128   -0.287356   -0.146919    -0.19774   -0.441441   -0.582609   -0.521053   -0.522333   -0.551181
17             -0.511013   -0.136126   -0.021164 1.38066e-17   -0.052381   -0.290323   -0.336032   -0.512931   -0.570558   -0.661597
18              0.140909    0.268041    0.144444    0.234043    0.184358    0.374468    0.101124   -0.216216   -0.241546   -0.349776
19              0.368889    0.534653    0.347561    0.460123    0.573034    0.539007    0.660465    0.252632 -0.00215517   -0.148325
20              0.639456    0.680921     0.59919    0.644128    0.658915    0.786408    0.864238    0.770642    0.401249   0.0721003
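
The TODO in `test_training_monte_carlo_for_blackjack` mentions a 3D graph. A minimal sketch of such a plot over the pivoted score table, assuming matplotlib with the mplot3d toolkit is available, could look like this:

# Sketch: 3D surface of the learned scores (dealer up-card x player total).
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection

X, Y = np.meshgrid(policy_Q_score.columns.astype(float),
                   policy_Q_score.index.astype(float))
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, policy_Q_score.values.astype(float), cmap='viridis')
ax.set_xlabel('dealer_value')
ax.set_ylabel('player_value')
ax.set_zlabel('Q score')
plt.show()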

In [13]:
blackjack = BlackJack()
test_policy(blackjack)


---------- Testing policy:-----------
Initial state:
state_info(player_value=20, dealer_value=3)
Move #: 1; Taking action: stay
state_info(player_value=20, dealer_value=17)
Summary: player wins :Player Reward: 1
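
A single hand like the one above is anecdotal. To judge the learned policy, one can play many hands greedily against the lookup table and average the rewards. The sketch below reuses the interfaces from `test_policy`; the idea that `params=0.0` disables exploration is an assumption, not a documented `BanditAlgorithm` behaviour.

# Sketch: average reward per hand for the (assumed) greedy policy.
def evaluate_policy(model, n_games=1000):
    bandit = BanditAlgorithm(params=0.0)  # assumption: 0.0 exploration = act greedily
    total = 0.0
    for _ in range(n_games):
        game = BlackJack()
        game.initiate_game()
        reward, moves = 0, 0
        while game.game_status == 'in process' and moves < 15:
            qvals = bandit.return_decision_reward_tuples(game.state, model)
            action, _ = bandit.return_decision_with_max_reward(qvals)
            reward = game.play(action)
            moves += 1
        total += reward
    return total / n_games

# Example (using the model returned by the training cell above):
# print("Average reward per hand: %s" % evaluate_policy(model))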

In [ ]: