In [11]:
    
from games.blackjack.blackjack import BlackJack
from bandits import BanditAlgorithm
import pandas as pd
import rl_learning as rl
from IPython.display import display, HTML
import cPickle as pickle
    
In [12]:
    
def test_training_monte_carlo_for_blackjack(model_class, epochs):
    """Train a reinforcement-learning strategy on a fresh BlackJack game.

    Args:
        model_class: Forwarded unchanged as ``model_class`` to
            ``rl.train_reinforcement_learning_strategy``.
        epochs: Forwarded as ``num_sims`` to the trainer.

    Returns:
        The ``(policy, model)`` pair produced by the trainer.
    """
    game = BlackJack()
    learned_policy, trained_model = rl.train_reinforcement_learning_strategy(
        num_sims=epochs,
        game_obs=game,
        model_class=model_class,
    )
    # TODO: add a 3D graph of the result to the IPython notebook
    return learned_policy, trained_model
def test_policy(game_obs, model=None):
    """Play one game, always taking the highest-valued action, and print each step.

    Args:
        game_obs: Game object. Must provide ``initiate_game()``,
            ``play(action)`` and the ``state`` and ``game_status``
            attributes; ``base_folder_name`` is also read when ``model``
            is omitted.
        model: Trained model object exposing ``model_class`` (and
            ``model_path`` when ``model_class == 'vw_python'``). When
            ``None``, the model is unpickled from
            ``<game_obs.base_folder_name>/model_obs.pkl``.

    Returns:
        None. Progress and the final summary are printed.
    """
    print("---------- Testing policy:-----------")
    banditAlgorithm = BanditAlgorithm(params=0.1)
    game_obs.initiate_game()
    print("Initial state:")
    print(game_obs.state)
    move = 1
    # Unpickle only when no model was supplied. An explicit `is None` check
    # keeps a supplied-but-falsy model object from being silently replaced.
    if model is None:
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at model files you produced yourself.
        model_file_path = game_obs.base_folder_name + '/model_obs.pkl'
        # `with` guarantees the file handle is closed even if loading fails.
        with open(model_file_path, mode='rb') as model_file:
            model = pickle.load(model_file)
    if model.model_class == 'vw_python':
        # Imported lazily so vowpal_wabbit is only required for this model class.
        from vowpal_wabbit import pyvw
        # Rebuild the learner from the regressor file stored at model_path.
        model.model = pyvw.vw("--quiet -i {0}".format(model.model_path))
    while game_obs.game_status == 'in process':
        new_qval_table = banditAlgorithm.return_decision_reward_tuples(game_obs.state, model)
        best_action, value_estimate = banditAlgorithm.return_decision_with_max_reward(new_qval_table)
        print('Move #: %s; Taking action: %s' % (move, best_action))
        reward = game_obs.play(best_action)
        print(game_obs.state)
        if game_obs.game_status != 'in process':
            print("Summary: " + game_obs.game_status + " :Player Reward: " + str(reward))
        move += 1
        # Safety valve: stop after 15 moves so a game that never leaves
        # 'in process' cannot loop forever.
        if move > 15:
            print("Too many moves")
            break
    
In [3]:
    
# Train with the 'lookup_table' model class for 25000 simulations (epochs is
# forwarded to the trainer as num_sims); `policy` feeds the tables below.
policy, model = test_training_monte_carlo_for_blackjack(model_class='lookup_table', epochs=25000)
    
    
In [4]:
    
# Transpose the learned policy so that each row carries the four fields named
# below (assumes `policy` holds exactly four fields per entry - TODO confirm
# against rl.train_reinforcement_learning_strategy).
df = pd.DataFrame(policy).T
df.columns = ['player_value', 'dealer_value', 'decision', 'score']
# Grid of the chosen decision: rows = player value, columns = dealer value.
# index/columns are passed by keyword because pandas 2.0 made them keyword-only.
policy_Q_table = df.pivot(index='player_value', columns='dealer_value')['decision']
display(policy_Q_table)
    
    
In [5]:
    
# Grid of the score per state: rows = player value, columns = dealer value.
# index/columns are passed by keyword because pandas 2.0 made them keyword-only.
policy_Q_score = df.pivot(index='player_value', columns='dealer_value')['score']
display(policy_Q_score)
    
    
In [13]:
    
# Play one game on a fresh BlackJack instance. No model is passed, so
# test_policy unpickles it from <base_folder_name>/model_obs.pkl rather than
# reusing the in-memory `model` returned by the training cell.
blackjack = BlackJack()
test_policy(blackjack)
    
    
In [ ]: