In [19]:
import numpy as np
import matplotlib.pyplot as plt
# Success probability of each arm; the last arm (0.7) is the best one.
P = [0.1, 0.5, 0.5, 0.5, 0.7]
# Number of arms, derived from P so the two cannot drift apart.
num_bandit = len(P)
# Number of pulls in a single simulated run (spelling kept: later cells use this name).
num_traial = 500
In [81]:
class Bandit(object):
    """Base class for simulating a Bernoulli multi-armed bandit.

    The success probability of each arm is read from the module-level list
    ``P``. Subclasses provide the arm-selection policy by overriding
    ``choose``; this class records the history and computes statistics.
    """

    def __init__(self, seed=None):
        """Start with an empty play history.

        seed: optional seed passed to ``np.random.RandomState``. The default
            ``None`` gives an unseeded generator, as before.
        """
        self.RandomState = np.random.RandomState(seed)
        # One list of observed rewards per arm. Sized from P (previously a
        # hard-coded 5) so changing the arm list cannot cause an IndexError.
        self.observed = [[] for _ in range(len(P))]
        # Reward obtained at each trial, in play order.
        self.reward = []
        # Index of the arm pulled at each trial, in play order.
        self.choice = []

    def choose(self):
        """Return the index of the arm to pull next; subclasses must override."""
        raise NotImplementedError("subclasses of Bandit must implement choose()")

    def get_reward(self, mac):
        """Draw a 0/1 reward from arm ``mac`` with success probability ``P[mac]``."""
        return self.RandomState.binomial(1, P[mac])

    def draw(self, num_trial):
        """Play ``num_trial`` rounds with the policy and record the outcomes.

        Returns ``self`` so calls can be chained.
        """
        for _ in range(num_trial):
            choice = self.choose()
            reward = self.get_reward(choice)
            self.choice.append(choice)
            self.reward.append(reward)
            self.observed[choice].append(reward)
        return self

    def get_avg_reward(self):
        """Return the running mean of the rewards after each trial."""
        # The explicit float denominator keeps this a true division on Python 2.
        return np.cumsum(self.reward) / (1. + np.arange(len(self.reward)))

    def estimate_E(self):
        """Return the empirical mean reward of every arm as a list.

        An arm that has never been pulled is given the value 1.
        """
        return [
            (sum(observed) / float(len(observed)) if len(observed) != 0 else 1)
            for observed in self.observed]
class EpsilonGreedy(Bandit):
    """Epsilon-greedy policy.

    On each trial a Bernoulli(eps) draw decides whether to explore. When
    exploring, an arm index is drawn with ``randint(num_bandit)``; otherwise
    the arm with the largest estimated mean is taken.
    """

    def __init__(self, eps):
        """Store the exploration probability ``eps`` and initialise the history."""
        super(EpsilonGreedy, self).__init__()
        self.eps = eps

    def choose(self):
        """Return the index of the arm to pull next."""
        explore = self.RandomState.binomial(1, self.eps) == 1
        if explore:
            return self.RandomState.randint(num_bandit)
        # np.argmax returns the first index among tied maxima.
        return np.argmax(self.estimate_E())
class UCB(Bandit):
    """Upper-confidence-bound policy.

    Each arm is scored as ``mean_i + sqrt(ln(N) / n_i)``, where ``N`` is the
    number of trials played so far and ``n_i`` the number of pulls of arm
    ``i``. The arm with the highest score is pulled.
    """

    def choose(self):
        """Return the index of the arm to pull next."""
        counts = [len(rewards) for rewards in self.observed]
        # Play every arm once before using the bound. The previous version
        # added a tiny epsilon to N and n instead, which made the first call
        # take the square root of a negative number (NaN plus a warning).
        for arm, count in enumerate(counts):
            if count == 0:
                return arm
        # Every arm has been pulled, so N >= 1 and the logarithm is >= 0.
        log_total = np.log(len(self.reward))
        scores = [
            mean + np.sqrt(log_total / count)
            for mean, count in zip(self.estimate_E(), counts)]
        return int(np.argmax(scores))
class UCBtuned(Bandit):
    """UCB1-Tuned policy (Auer, Cesa-Bianchi and Fischer, 2002).

    Each arm is scored as

        mean_i + sqrt( ln(N) / n_i * min(1/4, V_i) )
        V_i = var_i + sqrt( 2 * ln(N) / n_i )

    where ``N`` is the number of trials played so far, ``n_i`` the number of
    pulls of arm ``i`` and ``var_i`` the sample variance of its rewards.
    1/4 is the largest variance a 0/1 reward can have.
    """

    def choose(self):
        """Return the index of the arm to pull next."""
        counts = [len(rewards) for rewards in self.observed]
        # Play every arm once first. The previous version evaluated
        # np.log(0) and np.var([]) here and relied on NaN ordering in argmax.
        for arm, count in enumerate(counts):
            if count == 0:
                return arm
        log_total = np.log(len(self.reward))
        scores = []
        for mean, rewards, count in zip(self.estimate_E(), self.observed, counts):
            # The sqrt term keeps the bonus positive when the sample variance
            # is zero, so an arm whose few pulls all agreed is still revisited.
            variance_bound = np.var(rewards) + np.sqrt(2.0 * log_total / count)
            bonus = np.sqrt(log_total / count * min(0.25, variance_bound))
            scores.append(mean + bonus)
        return int(np.argmax(scores))
# model = EpsilonGreedy(eps=0.01).draw(num_traial)
In [68]:
# Running-mean reward curve of the purely greedy policy (eps=0), averaged
# point-wise over 1000 independent runs.
# range instead of xrange so the cell runs on Python 3 as well as Python 2.
MAR_greedy = np.mean(
    [EpsilonGreedy(eps=0).draw(num_traial).get_avg_reward() for _ in range(1000)],
    axis=0)
In [69]:
# Running-mean reward curve of epsilon-greedy with eps=0.1, averaged
# point-wise over 1000 independent runs.
# range instead of xrange so the cell runs on Python 3 as well as Python 2.
MAR_eps1 = np.mean(
    [EpsilonGreedy(eps=0.1).draw(num_traial).get_avg_reward() for _ in range(1000)],
    axis=0)
In [70]:
# Running-mean reward curve of epsilon-greedy with eps=0.5, averaged
# point-wise over 1000 independent runs.
# range instead of xrange so the cell runs on Python 3 as well as Python 2.
MAR_eps5 = np.mean(
    [EpsilonGreedy(eps=0.5).draw(num_traial).get_avg_reward() for _ in range(1000)],
    axis=0)
In [71]:
# Running-mean reward curve of the UCB policy, averaged point-wise over
# 1000 independent runs.
# range instead of xrange so the cell runs on Python 3 as well as Python 2.
MAR_UCB = np.mean(
    [UCB().draw(num_traial).get_avg_reward() for _ in range(1000)],
    axis=0)
In [82]:
# Running-mean reward curve of the UCBtuned policy, averaged point-wise over
# 1000 independent runs.
# range instead of xrange so the cell runs on Python 3 as well as Python 2.
MAR_UCBt = np.mean(
    [UCBtuned().draw(num_traial).get_avg_reward() for _ in range(1000)],
    axis=0)
In [85]:
# Compare the averaged running-mean reward of every policy on one figure.
# Labels and a legend are needed to tell the curves apart, and the UCBtuned
# curve is now drawn too (it was computed but never plotted).
plt.plot(MAR_greedy, label="greedy (eps=0)")
plt.plot(MAR_eps1, label="eps-greedy (eps=0.1)")
plt.plot(MAR_eps5, label="eps-greedy (eps=0.5)")
plt.plot(MAR_UCB, label="UCB")
plt.plot(MAR_UCBt, label="UCB-tuned")
plt.xlabel("trial")
plt.ylabel("mean average reward")
# Zoom on the range between the mediocre arms (0.5) and the best arm (0.7).
plt.ylim([0.5, 0.7])
plt.legend(loc="lower right")
plt.show()
Out[85]:
In [ ]: