In [11]:
from games.blackjack.blackjack import BlackJack
from bandits import BanditAlgorithm
import pandas as pd
import rl_learning as rl
from IPython.display import display, HTML
import cPickle as pickle
In [12]:
def test_training_monte_carlo_for_blackjack(model_class, epochs):
    """Train a reinforcement-learning policy for blackjack.

    :param model_class: model backend identifier (e.g. 'lookup_table').
    :param epochs: number of Monte Carlo game simulations to train on.
    :return: (policy, model) tuple produced by the training routine.
    """
    game = BlackJack()
    policy, model = rl.train_reinforcement_learning_strategy(
        num_sims=epochs, game_obs=game, model_class=model_class)
    # TODO: add an IPython-notebook 3D graph of the learned value surface
    return policy, model
def test_policy(game_obs, model=None):
    """Play a single game greedily following the learned policy, printing each move.

    :param game_obs: game instance (e.g. BlackJack) exposing initiate_game(),
                     play(action), state, game_status and base_folder_name.
    :param model: trained model object; if None, it is unpickled from
                  <base_folder_name>/model_obs.pkl on disk.
    """
    print("---------- Testing policy:-----------")
    banditAlgorithm = BanditAlgorithm(params=0.1)
    game_obs.initiate_game()
    print("Initial state:")
    print(game_obs.state)
    move = 1
    # Unpickle if model obs not provided
    if not model:
        # Fix: the original pickle.load(open(...)) never closed the file handle;
        # `with` guarantees it is closed even if unpickling raises.
        # NOTE(review): pickle.load executes arbitrary code for untrusted files --
        # acceptable here because the file is produced by our own training run.
        with open(game_obs.base_folder_name + '/model_obs.pkl', mode='rb') as model_file:
            model = pickle.load(model_file)
        if model.model_class == 'vw_python':
            from vowpal_wabbit import pyvw
            model.model = pyvw.vw("--quiet -i {0}".format(model.model_path))
    while game_obs.game_status == 'in process':
        new_qval_table = banditAlgorithm.return_decision_reward_tuples(game_obs.state, model)
        best_action, value_estimate = banditAlgorithm.return_decision_with_max_reward(new_qval_table)
        print('Move #: %s; Taking action: %s' % (move, best_action))
        reward = game_obs.play(best_action)
        print(game_obs.state)
        if game_obs.game_status != 'in process':
            print("Summary: " + game_obs.game_status + " :Player Reward: " + str(reward))
        move += 1
        # Safety valve: a well-behaved game should finish long before 15 moves.
        if move > 15:
            print("Too many moves")
            break
In [3]:
# Train a lookup-table policy over 25,000 simulated blackjack hands.
# NOTE(review): this is the expensive cell -- consider caching/pickling the
# result so Restart-and-Run-All stays feasible.
policy, model = test_training_monte_carlo_for_blackjack(model_class='lookup_table', epochs=25000)
In [4]:
# Reshape the flat policy records into a Q-table: rows = player hand value,
# columns = dealer shown value, cells = best decision.
df = pd.DataFrame(policy).T
df.columns = ['player_value', 'dealer_value', 'decision', 'score']
# Fix: use keyword arguments -- positional index/columns for DataFrame.pivot
# were deprecated and removed in pandas 2.0; keywords behave identically on
# older pandas too.
policy_Q_table = df.pivot(index='player_value', columns='dealer_value')['decision']
display(policy_Q_table)
In [5]:
# Same pivot as the decision table, but showing the learned Q-score per cell.
# Fix: keyword arguments -- positional index/columns for DataFrame.pivot were
# deprecated and removed in pandas 2.0.
policy_Q_score = df.pivot(index='player_value', columns='dealer_value')['score']
display(policy_Q_score)
In [13]:
# Play one full game with the trained model. No `model` argument is passed,
# so test_policy unpickles it from <base_folder_name>/model_obs.pkl on disk.
blackjack = BlackJack()
test_policy(blackjack)
In [ ]: