In [4]:
%matplotlib inline
from __future__ import division, print_function
from matplotlib import pyplot as plt
import numpy as np
from ipywidgets import interact
In [9]:
def play(bandits, n_repetitions, mu, metric='average', noise=.3):
    mu_star = np.max(mu)
    rewards = {name: np.zeros(n_repetitions) for name in bandits}
    for i in range(n_repetitions):
        for name, bandit in bandits.items():
            arm = bandit.play()
            rewards[name][i] = mu[arm]
            bandit.feedback(arm, np.clip(mu[arm] + noise * np.random.randn(), 0, 1))
    for name, reward in rewards.items():
        if metric == 'regret':
            data = mu_star - reward
        elif metric == 'cumulative':  # Also known as total regret.
            data = np.cumsum(mu_star - reward)
        else:
            assert metric == 'average'
            data = np.cumsum(mu_star - reward) / np.arange(1, n_repetitions + 1)
        plt.plot(data, '--', label=name)
    plt.xlabel("Rounds")
    plt.ylabel("Metric (%s)" % metric)
    plt.legend()
    plt.show()
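The play helper above assumes each bandit exposes a play() method that returns an arm index and a feedback(arm, reward) method. As a minimal sketch of that interface (the RandomBandit class below is ours, not part of the original notebook), a uniformly random baseline would look like this:

class RandomBandit:
    # Hypothetical baseline, just to illustrate the interface that play() expects.
    def __init__(self, n_arms):
        self.n_arms = n_arms

    def play(self):
        # Pick an arm uniformly at random, ignoring all past feedback.
        return np.random.randint(self.n_arms)

    def feedback(self, arm, reward):
        # A purely random player learns nothing from rewards.
        pass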
This strategy involves picking a (small) $\epsilon$; once every arm has been played at least once, it explores (picks a uniformly random arm) with probability $\epsilon$ and exploits (picks the empirically best arm) otherwise.
For a suitable (decaying) choice of $\epsilon_t$, $R_T = O(k \log T)$, which means that $\frac{R_T}{T} = O\left( \frac{\log T}{T} \right)$, which goes to 0 as $T \to \infty$, i.e. the strategy is no-regret.
In [10]:
class EpsGreedy:
    def __init__(self, n_arms, eps=0):
        self.eps = eps
        self.n_arms = n_arms
        self.payoffs = np.zeros(n_arms)  # Sum of rewards observed for each arm.
        self.n_plays = np.zeros(n_arms)  # Number of times each arm was pulled.

    def play(self):
        # Note that the theory tells us to pick epsilon as O(1/t), not constant (which we use here).
        # First make sure every arm has been played at least once.
        idx = np.argmin(self.n_plays)
        if self.n_plays[idx] == 0:
            return idx
        if np.random.rand() <= self.eps:
            # Explore: pick an arm uniformly at random.
            return np.random.randint(self.n_arms)
        else:
            # Exploit: pick the arm with the best empirical mean payoff.
            return np.argmax(self.payoffs / self.n_plays)

    def feedback(self, arm, reward):
        self.payoffs[arm] += reward
        self.n_plays[arm] += 1
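The comment in play() points at the schedule the theory asks for. As a minimal sketch (the EpsGreedyDecay class and the constant c are ours, not from the notebook), a decaying-$\epsilon$ variant using $\epsilon_t = \min(1, c \cdot k / t)$ could be built on top of EpsGreedy like this:

class EpsGreedyDecay(EpsGreedy):
    # Hypothetical variant with eps_t = min(1, c * n_arms / t), as the O(1/t) theory suggests.
    def __init__(self, n_arms, c=1.0):
        EpsGreedy.__init__(self, n_arms, eps=1.0)
        self.c = c
        self.t = 0

    def play(self):
        self.t += 1
        self.eps = min(1.0, self.c * self.n_arms / self.t)
        return EpsGreedy.play(self)

It drops into the same bandits dictionary that play() consumes, so it can be compared against the constant-$\epsilon$ version directly.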
This algorithm keeps track of an upper confidence bound for every arm, and always picks the arm with the highest upper confidence bound.
At any point in time $t$, we know each arm's draw count $n_i^{(t)}$, as well as its average payoff $\hat{\mu}_i^{(t)}$. Based on this, we can compute every arm's upper confidence bound (or UCB):
\begin{equation} \operatorname{UCB}(i) = \hat{\mu}_i + \sqrt{\frac{2\ln t}{n_i}} \end{equation}
UCB is also no-regret, just like $\epsilon$-greedy. The math is just a bit fluffier (see slide dm-11:20).
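To get a feel for the bound (the numbers below are made up for illustration, not taken from the notebook), note how a rarely pulled arm receives a much larger exploration bonus than a heavily pulled one:

# Made-up example: at t = 1000, compare a rarely pulled arm with a heavily pulled one.
t = 1000
for mean_i, n_i in [(0.5, 10), (0.6, 900)]:
    bonus = np.sqrt(2 * np.log(t) / n_i)
    print('mean = %.2f, n_i = %4d  ->  UCB = %.3f (bonus = %.3f)' % (mean_i, n_i, mean_i + bonus, bonus))

Even though the rarely pulled arm has the lower empirical mean, its larger bonus gives it the higher UCB, so it is the arm UCB would pick next.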
In [19]:
class UCB:
    def __init__(self, n_arms, tau):
        self.n_arms = n_arms
        self.means = np.zeros(n_arms)
        # Note that the UCB1 algorithm has tau=1.
        self.n_plays = np.zeros(n_arms)
        self.tau = tau
        self.t = 0

    def play(self, plot=True):
        # If plot is true, it will plot the means + bounds every 100 iterations.
        self.t += 1
        idx = np.argmin(self.n_plays)
        if self.n_plays[idx] == 0:
            return idx
        ub = self.tau * np.sqrt(2 * np.log(self.t) / self.n_plays)
        ucb = self.means + ub
        if plot and self.t % 100 == 0:
            plt.errorbar(list(range(self.n_arms)), self.means, yerr=ub)
            plt.show()
            print('chose arm', np.argmax(ucb))
        return np.argmax(ucb)

    def feedback(self, arm, reward):
        self.n_plays[arm] += 1
        self.means[arm] += 1 / (self.n_plays[arm]) * (reward - self.means[arm])
In [20]:
@interact(n_arms=(10, 100, 1), n_rounds=(100, 1000, 10), eps=(0, 1, .01), tau=(0, 1, .01))
def run(n_arms, n_rounds, eps, tau):
    np.random.seed(123)
    # Initialize the arm payoffs.
    mu = np.random.randn(n_arms)
    # Some other strategies for sampling.
    # mu = np.random.standard_cauchy(n_arms)
    # mu = np.random.gamma(shape=.1, size=(n_arms, 1))
    mu = np.abs(mu)
    mu /= np.max(mu)
    plt.bar(list(range(n_arms)), mu)
    plt.xlabel('arms')
    plt.ylabel('rewards')
    plt.show()
    bandits = {
        'eps-{0}'.format(eps): EpsGreedy(n_arms, eps=eps),
        'ucb-{0}'.format(tau): UCB(n_arms, tau=tau)
    }
    play(bandits, n_rounds, mu)
    # Hint: You can also plot the upper bound from UCB1 and see how tight it is.
In [ ]: