In [22]:
## imports
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [28]:
# Experiment parameters: number of independent trials and steps per trial
n_trials = 2000
t_max = 500
In [29]:
## And three actions (arms); each returns +10 or -10 with the given probabilities
n_actions = 3

def action_0():
    # Expected reward: 0.5*10 + 0.5*(-10) = 0
    return np.random.choice([10, -10], p=[0.5, 0.5])

def action_1():
    # Expected reward: 0.6*10 + 0.4*(-10) = 2 (the optimal arm)
    return np.random.choice([10, -10], p=[0.6, 0.4])

def action_2():
    # Expected reward: 0.2*10 + 0.8*(-10) = -6
    return np.random.choice([10, -10], p=[0.2, 0.8])

rewards = [action_0, action_1, action_2]
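In [ ]:
# Optional sanity-check cell (not in the original run): the empirical means
# should approach the true expected rewards 0, 2 and -6, confirming that
# action_1 is the optimal arm.
for name, action in zip(["action_0", "action_1", "action_2"], rewards):
    print(name, np.mean([action() for _ in range(10000)]))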
In [30]:
class Bandit(object):
    def __init__(self, n_actions):
        self.counts = np.zeros(n_actions)
        self.action_rewards = [[] for _ in range(n_actions)]
        self.rewards = []
        self.n_actions = n_actions

    def select_action(self):
        """Select which arm/action to pull."""
        pass

    def update(self, action, reward):
        """Record the reward observed for the chosen action."""
        self.counts[action] += 1
        self.action_rewards[action].append(reward)
        self.rewards.append(reward)

    def get_Q_values(self):
        """Empirical mean reward of each action."""
        return np.array([np.array(q_v).mean() for q_v in self.action_rewards])

    def get_V_value(self):
        """Empirical mean reward over all pulls so far."""
        return np.array(self.rewards).mean()
## E-greedy
class Egreedy(Bandit):
    def __init__(self, epsilon, *args, **kwargs):
        super(Egreedy, self).__init__(*args, **kwargs)
        self.epsilon = epsilon

    def select_action(self):
        # Pull each arm at least once before trusting the estimates
        never_visited = np.where(self.counts == 0)[0]
        if len(never_visited) != 0:
            return np.random.choice(never_visited)
        # Exploit the best empirical arm with probability 1 - epsilon,
        # otherwise explore uniformly at random
        Q_values = self.get_Q_values()
        if np.random.random() > self.epsilon:
            return np.argmax(Q_values)
        else:
            return np.random.randint(self.n_actions)
class UCB(Bandit):
    def __init__(self, *args, **kwargs):
        super(UCB, self).__init__(*args, **kwargs)

    def select_action(self):
        # Pull each arm at least once before trusting the estimates
        never_visited = np.where(self.counts == 0)[0]
        if len(never_visited) != 0:
            return np.random.choice(never_visited)
        # Optimism bonus: Q_i + c * sqrt(ln t / n_i), with c = 0.5 here
        # (classic UCB1 uses c = sqrt(2); a smaller c explores less)
        Q_values = self.get_Q_values()
        t = len(self.rewards)
        for i in range(len(Q_values)):
            Q_values[i] += 0.5 * np.sqrt(np.log(t) / len(self.action_rewards[i]))
        return np.argmax(Q_values)
class BootstrapThompson(Bandit):
    def __init__(self, *args, **kwargs):
        super(BootstrapThompson, self).__init__(*args, **kwargs)

    def select_action(self):
        # Pull each arm at least 5 times so the bootstrap has something to resample
        never_visited = np.where(self.counts < 5)[0]
        if len(never_visited) != 0:
            return np.random.choice(never_visited)
        # Thompson-style sampling via the bootstrap: resample each arm's reward
        # history with replacement, then act greedily on the resampled means
        Q_values = []
        for q_v in self.action_rewards:
            b_sample = np.random.choice(q_v, len(q_v), replace=True)
            Q_values.append(b_sample.mean())
        return np.array(Q_values).argmax()
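In [ ]:
# Quick smoke test (an added check, not part of the original notebook):
# run one UCB bandit for a few hundred steps and confirm the Q-value
# estimates move toward the true means (0, 2, -6). `check` is just an
# illustrative name.
check = UCB(n_actions=n_actions)
for _ in range(300):
    a = check.select_action()
    check.update(a, rewards[a]())
print(check.get_Q_values())   # should be roughly [0, 2, -6]
print(check.counts)           # the optimal arm (index 1) should dominate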
In [31]:
data = []
bandits = [
    #("random", lambda: Egreedy(epsilon=1.0, n_actions=n_actions)),
    #("greedy", lambda: Egreedy(epsilon=0.0, n_actions=n_actions)),
    #("0.2-greedy", lambda: Egreedy(epsilon=0.2, n_actions=n_actions)),
    ("0.2-decreasing", lambda: Egreedy(epsilon=0.2, n_actions=n_actions)),
    ("UCB", lambda: UCB(n_actions=n_actions)),
    ("BootstrapTS", lambda: BootstrapThompson(n_actions=n_actions)),
]
columns = ["Step", "Cumulative Regret", "trial", "Algorithm"]
for b in bandits:
    for trial in range(n_trials):
        gaps = []
        bandit = b[1]()
        for i in range(t_max):
            # select an action
            action = bandit.select_action()
            # get the reward
            reward = rewards[action]()
            # update the bandit's statistics
            bandit.update(action, reward)
            # Super hack for the lazy: decay epsilon every step
            if b[0].endswith("decreasing"):
                bandit.epsilon *= 0.99
            # Regret against the optimal arm's expected reward (action_1: E = 2)
            gaps.append(2 - reward)
            regret = np.array(gaps).sum()
            data.append([i, regret, trial, b[0]])
    # `data` accumulates across algorithms, so each successive figure
    # overlays every algorithm run so far
    df = pd.DataFrame(data, columns=columns)
    plt.figure(figsize=(10, 5))
    # sns.tsplot was removed from recent seaborn; lineplot aggregates
    # over trials (mean plus confidence band) the same way
    axes = sns.lineplot(data=df, x="Step", y="Cumulative Regret", hue="Algorithm")
    axes.set_ylim([0, 40])
    axes.set_xlim([0, 100])
    plt.savefig(b[0] + ".pdf", bbox_inches='tight')
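In [ ]:
# Rough comparison (an added check, not part of the original notebook):
# mean cumulative regret at the final step for each algorithm, using the
# `df` left over from the loop above, which holds all three algorithms.
final = df[df["Step"] == t_max - 1]
print(final.groupby("Algorithm")["Cumulative Regret"].mean())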