In [10]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn
In [11]:
# Experiment configuration for the epsilon-greedy multi-armed bandit simulation.
# Number of arms (actions) in each bandit.
number_arms = 100
# Number of pulls (time steps) simulated for every bandit.
number_pulls = 50000
number_bandits = 2000 # number of randomly generated bandit configurations
# Exploration threshold: on each pull a bandit explores when its uniform draw
# falls below this value, otherwise it plays its current greedy arm.
epsilon = 0.1
In [12]:
%%time
# Simulate `number_bandits` independent epsilon-greedy agents in lockstep:
# each loop iteration performs one pull for every bandit, vectorised with numpy.

# True action values Q*: one row per bandit, drawn from N(mean=5, std=1).
q_star = np.random.normal(5, 1, (number_bandits, number_arms))
# Sample-average estimates of Q* and per-arm pull counts, both starting at zero.
q = np.zeros((number_bandits, number_arms))
counts = np.zeros((number_bandits, number_arms))
# Uniform draws deciding explore vs. exploit for every (pull, bandit) pair.
# NOTE: this allocates number_pulls * number_bandits float64 values up front.
rand = np.random.rand(number_pulls, number_bandits)
# Row indices 0..number_bandits-1. Paired with `arms` in an advanced index they
# address exactly one (bandit, chosen arm) cell per bandit.
range_bandits = np.arange(number_bandits)
# Reward received by each bandit (row) at each pull (column).
rewards_global = np.zeros((number_bandits, number_pulls))

for p in range(number_pulls):
    # Select an arm per bandit: a uniformly random arm when exploring,
    # otherwise the arm with the highest current estimate.
    explore = rand[p] < epsilon
    # Draw a separate random arm for every bandit. A single scalar draw would
    # make all exploring bandits pull the same arm on this step and correlate
    # runs that are supposed to be independent.
    random_arms = np.random.choice(number_arms, size=number_bandits)
    greedy_arms = np.argmax(q, axis=1)
    arms = np.where(explore, random_arms, greedy_arms)

    # Reward for the chosen arm: N(mean=Q*(arm), std=1).
    rewards = np.random.normal(q_star[range_bandits, arms], 1)
    rewards_global[:, p] = rewards

    # Incremental sample-average update: Q <- Q + (R - Q) / N.
    # In-place fancy-index updates are safe here because every bandit row
    # appears exactly once in `range_bandits`, so no index pair is repeated.
    counts[range_bandits, arms] += 1
    q[range_bandits, arms] += (rewards - q[range_bandits, arms]) / counts[range_bandits, arms]
In [13]:
# Learning curve: reward at each pull, averaged over all bandits
# (rows of rewards_global are bandits, columns are pulls).
ax = pd.Series(rewards_global.mean(axis=0)).plot()
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    xlabel="pull",
    ylabel="mean reward across bandits",
    title=f"Epsilon-greedy average reward (epsilon = {epsilon})",
);