In [10]:
import pickle
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from math import sqrt
from mpl_toolkits.mplot3d import axes3d
import os
os.environ["FONTCONFIG_PATH"] = "/etc/fonts"  # fontconfig workaround for matplotlib



from easy21 import *

# load the Monte Carlo reference Q-table (assumed shape (21, 10, 2):
# player sum, dealer card, action)
with open('Qtable_monte_carlo_1e6.pkl', 'rb') as pkl_file:
    Q_table_mc = pickle.load(pkl_file)
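
The reference table is only loaded here. For context, the following is a minimal sketch of how such a 1e6-episode Monte Carlo control table might be produced, assuming easy21.step returns (reward, successor, terminated) as used below and that Easy21 pays rewards only at termination; monte_carlo_control and its internals are illustrative names, not the notebook's actual generator.

def monte_carlo_control(num_episodes=int(1e6), N_0=100):
    Q = np.zeros((21, 10, 2))
    N = np.zeros((21, 10, 2))
    for _ in range(num_episodes):
        state = (np.random.randint(1, 11), np.random.randint(1, 11))
        visited = []
        terminated = False
        while not terminated:
            # epsilon-greedy with decaying exploration, as in policy() below
            Ns = N[state[0]-1, state[1]-1, :].sum()
            eps = N_0 / (N_0 + Ns)
            if np.random.rand() < eps:
                a = np.random.choice([0, 1])
            else:
                a = np.argmax(Q[state[0]-1, state[1]-1, :])
            reward, successor, terminated = step(state[0], state[1], a)
            visited.append((state, a))
            state = successor
        # undiscounted return = terminal reward (no intermediate rewards)
        for s, a in visited:
            N[s[0]-1, s[1]-1, a] += 1
            Q[s[0]-1, s[1]-1, a] += (reward - Q[s[0]-1, s[1]-1, a]) / N[s[0]-1, s[1]-1, a]
    return Q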





def compare_qtables(Qtable, Q_table_mc):
    """Root-mean-squared error between two flattened Q-tables."""
    return sqrt(mean_squared_error(Qtable.flatten(), Q_table_mc.flatten()))
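
A quick sanity check of the metric (hypothetical arrays, not Easy21 data): identical tables give 0, and a constant offset of 0.5 gives an RMSE of exactly 0.5.

a = np.zeros((21, 10, 2))
assert compare_qtables(a, a) == 0.0
assert compare_qtables(a + 0.5, a) == 0.5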



def runepisode():

    # initialize player and dealer cards uniformly in 1..10 at the start of the
    # episode (np.random.randint's upper bound is exclusive, hence high=11)
    state = (np.random.randint(1, 11), np.random.randint(1, 11))  # (player, dealer)
    A = policy(state)

    terminated = False
    while not terminated:

        reward, successor, terminated = step(state[0], state[1], A)
        if not terminated:
            # Sarsa: bootstrap from the next state-action pair actually chosen
            A_prime = policy(successor)
            Qsprime_aprime = Qtable[successor[0]-1, successor[1]-1, A_prime]
        else:
            Qsprime_aprime = 0

        # TD error (gamma = 1 in Easy21)
        delta = reward + Qsprime_aprime - Qtable[state[0]-1, state[1]-1, A]

        # count the visit and bump the eligibility trace of the current pair
        Nsa[state[0]-1, state[1]-1, A] += 1
        Esa[state[0]-1, state[1]-1, A] += 1

        # backward-view Sarsa(lambda): update every pair with a live trace once
        # per step, using the visit-count step size alpha = 1/N(s,a), then
        # decay all traces
        active = Esa > 0
        Qtable[active] += (delta / Nsa[active]) * Esa[active]
        Esa *= lambda_



        if not terminated:
            A = A_prime
            state = successor


def policy(state):
    # epsilon-greedy with the decaying exploration rate eps = N_0/(N_0 + N(s))
    Ns = Nsa[state[0]-1, state[1]-1, 0] + Nsa[state[0]-1, state[1]-1, 1]
    N_0 = 100
    epsilon = N_0 / (N_0 + Ns)

    if np.random.rand() < epsilon:
        return np.random.choice([0, 1])                      # explore
    else:
        return np.argmax(Qtable[state[0]-1, state[1]-1, :])  # exploit
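
A quick illustration (a hypothetical standalone snippet) of the exploration schedule used in policy: unvisited states act fully at random, and exploration decays toward greedy action selection as visits accumulate.

for visits in [0, 100, 1000, 10000]:
    print(visits, 100 / (100 + visits))   # eps: 1.0, 0.5, ~0.091, ~0.0099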

numiter = 1000  # episodes per lambda setting


error_lists = []   # learning curve (RMSE samples) per lambda
mse_1000 = []      # final RMSE after training, one entry per lambda

for lambda_ in np.linspace(0,1,11):


    mse = []


    # fresh tables for each lambda: (player sum 1-21, dealer card 1-10, action)
    Qtable = np.zeros((21, 10, 2))
    Nsa = np.zeros((21, 10, 2))

    print(lambda_)

    for i in range(numiter):
        Esa = np.zeros((21, 10, 2))   # reset eligibility traces each episode
        runepisode()

        # record the RMSE to the Monte Carlo reference for the learning curves
        if i % 1000 == 0:
            mse.append(compare_qtables(Qtable, Q_table_mc))

    mse_1000.append(compare_qtables(Qtable,Q_table_mc))

    error_lists.append(mse)

# optimal value function V*(s) = max_a Q(s, a), from the last run (lambda = 1.0)
opt_Valuefunction = np.max(Qtable, axis=2)
print(opt_Valuefunction.shape)


## save to file (disabled: note this filename would overwrite the
## Monte Carlo reference table loaded above)
save = False
if save:
    with open('Qtable_monte_carlo_1e6.pkl', 'wb') as output:
        pickle.dump(opt_Valuefunction, output)


0.0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
(21, 10)

In [11]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
X, Y = np.meshgrid(range(1, 11), range(1, 22))  # dealer 1-10, player 1-21
ax.plot_wireframe(X, Y, opt_Valuefunction)
ax.set_xlabel("dealer")
ax.set_ylabel("player")
ax.set_zlabel("value")


fig = plt.figure()
opt_policy = np.argmax(Qtable, axis=2)   # greedy action in each state
plt.imshow(opt_policy, cmap=plt.get_cmap('gray'), interpolation='none')
plt.xlabel("dealer")
plt.ylabel("player")

fig = plt.figure()
plt.plot(np.linspace(0, 1, 11), mse_1000)
plt.xlabel("lambda")
plt.ylabel("RMSE vs. 1e6-episode Monte Carlo")


fig = plt.figure()
lambdas = np.linspace(0, 1, 11)
for i in range(11):
    plt.plot(range(len(error_lists[i])), error_lists[i], label=r'$\lambda$=' + str(lambdas[i]))

plt.legend()

plt.xlabel("episodes (x1000)")
plt.ylabel("RMSE vs. 1e6-episode Monte Carlo")

plt.show()
