In [3]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn
from joblib import Parallel, delayed
In [4]:
# Experiment configuration (read as globals by the simulation cell below).
k = 2            # number of arms on each bandit
bandits = 2_000  # number of independent bandit problems to simulate
pulls = 20_000   # number of arm pulls per bandit
epsilon = 0.1    # threshold compared against a uniform draw to trigger a random arm choice
In [5]:
%%time
def run_bandit(n_arms=None, n_pulls=None, explore_prob=None, seed=None):
    """Simulate one epsilon-greedy agent on a single Gaussian multi-armed bandit.

    The true value of each arm is drawn from Normal(5, 1). On every pull the
    agent picks a uniformly random arm with probability ``explore_prob`` and
    otherwise the arm with the highest current estimate (``np.argmax``, so ties
    go to the lowest index). The reward is drawn from Normal(true value, 1) and
    the arm's estimate is updated as the running sample average of its rewards.

    Parameters
    ----------
    n_arms : int, optional
        Number of arms. Defaults to the notebook-level ``k``.
    n_pulls : int, optional
        Number of pulls to simulate. Defaults to the notebook-level ``pulls``.
    explore_prob : float, optional
        Probability of choosing a random arm. Defaults to the notebook-level
        ``epsilon``.
    seed : int or None, optional
        Seed for this run's private random generator. ``None`` (the default)
        draws fresh entropy from the OS, so every call is independent.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_pulls`` holding the reward received on each pull.
    """
    # Fall back to the notebook configuration lazily, so the globals are only
    # required when the caller does not pass explicit values.
    n_arms = k if n_arms is None else n_arms
    n_pulls = pulls if n_pulls is None else n_pulls
    explore_prob = epsilon if explore_prob is None else explore_prob

    # Use a private generator instead of the process-global np.random state:
    # joblib documents that worker processes can end up sharing the same global
    # random sequence, which would make "independent" runs identical.
    rng = np.random.default_rng(seed)

    rewards = np.zeros(n_pulls)
    q_star = rng.normal(5, 1, n_arms)  # true (hidden) mean reward of each arm
    values = np.zeros(n_arms)          # current estimate of each arm's mean
    counts = np.zeros(n_arms)          # number of times each arm was played

    for p in range(n_pulls):
        # rng.random() is in [0, 1); strict '<' means explore_prob == 0 never explores.
        if rng.random() < explore_prob:
            arm = rng.integers(n_arms)
        else:
            arm = np.argmax(values)

        # Play that arm
        reward = rng.normal(q_star[arm], 1)
        counts[arm] += 1

        # Incremental sample-average update of the played arm's estimate
        values[arm] += (reward - values[arm]) / counts[arm]
        rewards[p] = reward

    return rewards
# Run `bandits` independent simulations in parallel (n_jobs=-1 asks joblib to
# use all available cores). Parallel returns one result per task as a list.
traces = Parallel(n_jobs=-1)(delayed(run_bandit)() for _ in range(bandits))
results = np.vstack(traces)  # one row per bandit, one column per pull

# Average across bandits once and reuse it for both the summary and the plot.
mean_reward = results.mean(axis=0)
print(mean_reward.max())

ax = pd.Series(mean_reward).plot()
ax.set(xlabel="Pull", ylabel="Average reward",
       title="Epsilon-greedy: average reward per pull");