In [19]:
import numpy as np
import matplotlib.pyplot as plt

num_trial = 500

P = [0.1, 0.5, 0.5, 0.5, 0.7]  # success probability of each Bernoulli arm
num_bandit = len(P)
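
Each arm is modeled as a Bernoulli machine: arm i pays 1 with probability P[i], so the last arm (P[4] = 0.7) is the best one. A quick sanity check of the reward model, as a minimal sketch; the fixed seed here is an arbitrary choice:

In [ ]:
rs = np.random.RandomState(0)
# The empirical payout rate of the best arm should land near 0.7.
print(np.mean(rs.binomial(1, P[4], size=10000)))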

In [81]:
class Bandit(object):

    def __init__(self):
        self.RandomState = np.random.RandomState()
        self.observed = [[] for _ in range(num_bandit)]  # rewards seen per arm
        self.reward = []  # reward of every pull, in order
        self.choice = []  # arm chosen at every pull

    def get_reward(self, mac):
        # Pull arm mac: a Bernoulli draw with success probability P[mac].
        return self.RandomState.binomial(1, P[mac])

    def draw(self, num_trial):
        # Run the bandit for num_trial pulls, recording every choice and reward.
        for _ in range(num_trial):
            choice = self.choose()
            reward = self.get_reward(choice)
            self.choice.append(choice)
            self.reward.append(reward)
            self.observed[choice].append(reward)
        return self
    
    def get_avg_reward(self):
        # Running mean of the reward sequence after each pull.
        return np.cumsum(self.reward) / (1. + np.arange(len(self.reward)))

    def estimate_E(self):
        # Empirical mean reward per arm; unplayed arms get an optimistic
        # estimate of 1 so that every arm is tried at least once.
        return [
            (sum(observed) / float(len(observed)) if len(observed) != 0 else 1)
            for observed in self.observed]

class EpsilonGreedy(Bandit):

    def __init__(self, eps):
        self.eps = eps
        super(EpsilonGreedy, self).__init__()

    def choose(self):
        # With probability eps explore a random arm; otherwise exploit
        # the arm with the highest empirical mean.
        if self.RandomState.binomial(1, self.eps) == 1:
            choice = self.RandomState.randint(num_bandit)
        else:
            choice = np.argmax(self.estimate_E())
        return choice

class UCB(Bandit):

    def choose(self):
        # UCB-style choice: empirical mean plus an exploration bonus of
        # sqrt(log N / n), where N is the total number of pulls so far and
        # n the number of pulls of arm i. Rarely pulled arms get a large
        # bonus, so they keep being explored.
        _eps = 1e-16  # avoids division by zero for unplayed arms
        est = self.estimate_E()
        N = max(len(self.reward), 1)  # keeps log(N) finite on the first pull
        for i in range(len(est)):
            n = len(self.observed[i]) + _eps
            est[i] += np.sqrt(np.log(N) / n)
        return np.argmax(est)

class UCBtuned(Bandit):

    def choose(self):
        # Variance-aware variant: the exploration bonus is scaled by the
        # empirical variance of each arm's observed rewards.
        _eps = 1e-16  # avoids division by zero for unplayed arms
        est = self.estimate_E()
        N = max(len(self.reward), 1)  # keeps log(N) finite on the first pull
        for i in range(len(est)):
            n = len(self.observed[i]) + _eps
            # np.var of an empty list is NaN, so fall back to 1/4, the
            # maximum possible variance of a Bernoulli reward.
            V = np.var(self.observed[i]) if self.observed[i] else 0.25
            est[i] += np.sqrt(np.log(N) * V / n)
        return np.argmax(est)

# model = EpsilonGreedy(eps=0.01).draw(num_trial)
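
For intuition on the UCB bonus, sqrt(log N / n) can be evaluated by hand. A minimal sketch with made-up counts (N = 100 total pulls; the values of n are illustrative, not outputs of the runs below): an arm pulled 5 times gets a bonus of roughly 0.96, while one pulled 50 times gets only about 0.30, so rarely pulled arms are favored even when their empirical mean is lower.

In [ ]:
N = 100
for n in [5, 50]:
    print('n=%d bonus=%.3f' % (n, np.sqrt(np.log(N) / n)))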

In [68]:
# Mean average reward over 1000 independent runs per strategy.
MAR_greedy = np.mean([
    EpsilonGreedy(eps=0).draw(num_trial).get_avg_reward() for _ in range(1000)],
    axis=0)

In [69]:
MAR_eps1 = np.mean([
    EpsilonGreedy(eps=0.1).draw(num_trial).get_avg_reward() for _ in range(1000)],
    axis=0)

In [70]:
MAR_eps5 = np.mean([
    EpsilonGreedy(eps=0.5).draw(num_trial).get_avg_reward() for _ in range(1000)],
    axis=0)

In [71]:
MAR_UCB = np.mean([
    UCB().draw(num_trial).get_avg_reward() for _ in range(1000)],
    axis=0)

In [82]:
MAR_UCBt = np.mean([
    UCBtuned().draw(num_trial).get_avg_reward() for _ in range(1000)],
    axis=0)


In [85]:
plt.plot(MAR_greedy, label='greedy (eps=0)')
plt.plot(MAR_eps1, label='eps=0.1')
plt.plot(MAR_eps5, label='eps=0.5')
plt.plot(MAR_UCB, label='UCB')
plt.plot(MAR_UCBt, label='UCB tuned')
plt.legend(loc='lower right')
plt.ylim([0.5, 0.7])

Out[85]:
(0.5, 0.7)
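
As a convenience, and as a small added sketch rather than part of the original session, the end-of-horizon values can be printed to compare the strategies numerically:

In [ ]:
for name, mar in [('greedy', MAR_greedy), ('eps=0.1', MAR_eps1),
                  ('eps=0.5', MAR_eps5), ('UCB', MAR_UCB),
                  ('UCB tuned', MAR_UCBt)]:
    print('{}: {:.3f}'.format(name, mar[-1]))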
