In [1]:
import numpy as np
import sys
if "../" not in sys.path:
sys.path.append("../")
from collections import defaultdict
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')
from envs.blackjack import BlackjackEnv
from utils import plotting
from mc_control_importance_sampling import mc_control_importance_sampling
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
In [2]:
env = BlackjackEnv()
In [3]:
def create_random_policy(nA):
"""
Creates a random policy function.
Args:
nA: Number of actions in the environment.
Returns:
A function that takes an observation as input and returns a vector
of action probabilities
"""
A = np.ones(nA, dtype=float) / nA
def policy_fn(observation):
return A
return policy_fn
In [4]:
random_policy = create_random_policy(env.action_space.n)
Q, policy = mc_control_importance_sampling(env, num_episodes=500000, behavior_policy=random_policy)
In [5]:
# For plotting: Create value function from action-value function
# by picking the best action at each state
V = defaultdict(float)
for state, action_values in Q.items():
action_value = np.max(action_values)
V[state] = action_value
plotting.plot_value_function(V, title="Optimal Value Function")