Example 5.1:

Monte Carlo and DP solution of simple blackjack.

https://webdocs.cs.ualberta.ca/~sutton/book/code/blackjack1.lisp


In [1]:
import numpy as np

class blackjack(object):
    """Monte Carlo evaluation of a fixed policy for simple blackjack.

    All tables have shape (11, 22, 2) and are indexed as
    ``[dealer_showing_card, player_sum, usable_ace]`` where the last index
    is 1 when the player holds an ace currently counted as 11, else 0.

    Attributes:
        V: running average of episode outcomes per state.
        N: number of updates applied to each entry of ``V``.
        policy: 1 means "draw another card", anything else means "stop".
            Initialised to stop only on a player sum of 20 or 21.
        pc: the player's current card sum.
        ace: whether the player currently holds a usable ace.
        _episode: ``(dealer_card, player_sum, ace)`` states visited by the
            player during the most recent episode.
    """

    # Number used for the next matplotlib figure opened by gr().
    _fig_count = 0

    def __init__(self):
        shape = (11, 22, 2)
        self.V = np.zeros(shape)
        # Builtin ``int`` instead of ``np.int``: the alias was deprecated in
        # NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
        self.N = np.zeros(shape, dtype=int)
        self.policy = np.ones(shape, dtype=int)
        self.policy[1:11, 20:22, 0:2] = 0

        self.pc = 0
        self.ace = False
        self._episode = []

    @staticmethod
    def _add_card(total, usable_ace, card):
        """Add ``card`` to a hand and return the new ``(total, usable_ace)``.

        An ace (card value 1) is counted as 11 when the hand has no usable
        ace yet; a usable ace is demoted back to 1 if the total passes 21.
        """
        total += card
        if not usable_ace and card == 1:
            total += 10
            usable_ace = True
        if usable_ace and total > 21:
            total -= 10
            usable_ace = False
        return total, usable_ace

    @staticmethod
    def _two_card_hand(card1, card2):
        """Return ``(total, usable_ace)`` for a freshly dealt two-card hand."""
        usable_ace = (card1 == 1) or (card2 == 1)
        total = card1 + card2
        if usable_ace:
            total += 10
        return total, usable_ace

    def card(self):
        """Draw a random card value in 1..10 (13 ranks, ranks above 10 count 10)."""
        return min(10, 1 + np.random.randint(13))

    def bust(self):
        """Return True when the player's sum exceeds 21."""
        return self.pc > 21

    def draw_card(self):
        """Deal one card to the player, updating ``pc`` and ``ace``."""
        self.pc, self.ace = self._add_card(self.pc, self.ace, self.card())

    def episode(self):
        """Play one hand under ``policy`` and update ``V``/``N`` from it."""
        self._episode = []
        # Draw order (dealer hidden, dealer showing, player, player) is kept
        # fixed so seeded runs stay reproducible.
        dc_hidden = self.card()
        dc = self.card()
        pcard1 = self.card()
        pcard2 = self.card()
        self.pc, self.ace = self._two_card_hand(pcard1, pcard2)

        # A two-card 21 records no states; outcome() detects the natural by
        # the empty episode list.
        if self.pc != 21:
            while True:
                self._episode.append((dc, self.pc, self.ace))
                if self.policy[dc, self.pc, 1 if self.ace else 0] != 1:
                    break
                self.draw_card()
                if self.bust():
                    break
        self.learn(self.outcome(dc, dc_hidden))

    def learn(self, outcome):
        """Fold ``outcome`` into the running average of every recorded state.

        States with a player sum of 11 or less are skipped.
        """
        for dc, pc, ace_boolean in self._episode:
            if pc <= 11:
                continue
            idx = (dc, pc, 1 if ace_boolean else 0)
            self.N[idx] += 1
            # Incremental mean: V <- V + (outcome - V) / N.
            self.V[idx] += (outcome - self.V[idx]) / self.N[idx]

    def outcome(self, dc, dc_hidden):
        """Play out the dealer's hand and score the episode.

        Args:
            dc: the dealer's showing card.
            dc_hidden: the dealer's face-down card.

        Returns:
            1 if the player wins, -1 if the player loses, 0 for a draw.
        """
        dcount, dace = self._two_card_hand(dc, dc_hidden)
        dnatural = (dcount == 21)
        pnatural = not self._episode

        if pnatural:
            return 0 if dnatural else 1
        if dnatural or self.bust():
            return -1

        # Dealer draws until reaching 17 or more.
        while dcount < 17:
            dcount, dace = self._add_card(dcount, dace, self.card())

        if dcount > 21:
            return 1
        if dcount > self.pc:
            return -1
        if dcount == self.pc:
            return 0
        return 1

    def gr(self):
        """Show ``V`` as two images: without (left) and with (right) usable ace.

        Axes are cropped to player sums 12..21 and dealer cards 1..10.
        """
        import matplotlib.pyplot as plt
        plt.figure(self._fig_count)
        self._fig_count += 1

        for ace in (0, 1):
            plt.subplot(1, 2, ace + 1)
            plt.imshow(self.V[:, :, ace], interpolation='none', origin='lower')
            plt.xlim((11.5, 21.5))
            plt.ylim((0.5, 10.5))

        plt.show()

In [2]:
# experiment
# Train a single learner in ten batches of 100000 episodes, printing the
# batch index and plotting the current value estimates after each batch.
# print(...) with one argument and range(...) run on both Python 2 and 3.
bj = blackjack()
for count in range(10):
    print(count)
    for _ in range(100000):
        bj.episode()
    bj.gr()


0
1
2
3
4
5
6
7
8
9