In [1]:
import tensorflow as tf
import numpy as np
Here we define our bandits. For this example we are using a four-armed bandit. The pullBandit function draws a random number from a normal distribution with a mean of 0 and returns a positive reward when that number exceeds the bandit's value. The lower a bandit's value, the more likely it is to return a positive reward. We want our agent to learn to always choose the bandit that is most likely to give a positive reward.
In [2]:
# Reward threshold of each arm: pullBandit returns +1 when a standard-normal
# draw exceeds the arm's threshold, so a lower value means a more generous arm.
bandits = [0.2, 0, -0.2, -5] # Thresholds, listed in no particular order
num_bandits = len(bandits) # Number of arms the agent can choose from
def pullBandit(bandit):
    """Pull one arm of the bandit and return the reward it pays out.

    A single sample is drawn from the standard normal distribution and
    compared against the arm's threshold.

    Args:
        bandit: Threshold of the arm being pulled. The lower the threshold,
            the more likely the draw exceeds it and the reward is positive.

    Returns:
        1 if the random draw is greater than ``bandit``, otherwise -1.
    """
    # Draw a plain scalar (not a 1-element array) so the comparison below
    # yields a real bool instead of relying on array truthiness.
    result = np.random.randn()
    return 1 if result > bandit else -1
The code below establishes our simple neural agent. It consists of a set of values, one for each of the bandits. Each value is an estimate of the return from choosing that bandit. We use a policy gradient method to update the agent by moving the value for the selected action toward the received reward.
In [6]:
# Clear the default graph so this cell always builds into a fresh one.
tf.reset_default_graph()

# Feed points for training: the reward that was received and the index of
# the arm that produced it (each fed as a one-element list).
reward_holder = tf.placeholder(tf.float32, shape=[1])
action_holder = tf.placeholder(tf.int32, shape=[1])

# One trainable value per arm, all starting at 1. The greedy action is the
# index of the largest value.
weights = tf.Variable(tf.ones([num_bandits]))
chosen_action = tf.argmax(weights, 0)

# Pick out the single weight that belongs to the arm that was pulled.
responsible_weight = tf.slice(weights, begin=action_holder, size=[1])

# loss = -(log(w_action) * reward), minimised by plain gradient descent.
loss = -(reward_holder * tf.log(responsible_weight))
optimizer = tf.train.GradientDescentOptimizer(0.001)
update = optimizer.minimize(loss)
In [11]:
total_episodes = 1000 #Set total number of episodes to train agent on.
total_reward = np.zeros(num_bandits) #Set scoreboard for bandits to 0.
e = 0.1 #Set the chance of taking a random action.
# tf.initialize_all_variables() is deprecated; this is its direct replacement.
init = tf.global_variables_initializer()
# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        # Epsilon-greedy: with probability e explore a random arm, otherwise
        # exploit the arm the network currently rates highest.
        if np.random.rand() < e:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(chosen_action)
        reward = pullBandit(bandits[action]) #Get our reward from picking one of the bandits.
        # Update the network with the observed (action, reward) pair.
        sess.run(update, feed_dict={reward_holder: [reward], action_holder: [action]})
        # Update our running tally of scores.
        total_reward[action] += reward
        if i % 50 == 0:
            print("Running reward for the " + str(num_bandits) + " bandits: " + str(total_reward))
    # Read the learned weights once, after all updates have been applied.
    # Fetching them in the same run() call as `update` leaves it unspecified
    # whether the pre- or post-update values are returned, and would leave
    # `ww` undefined if total_episodes were 0.
    ww = sess.run(weights)
print("The agent thinks bandit " + str(np.argmax(ww)+1) + " is the most promising....")
# The best arm is the one with the lowest threshold, hence argmax of the negation.
if np.argmax(ww) == np.argmax(-np.array(bandits)):
    print("...and it was right!")
else:
    print("...and it was wrong!")
In [ ]: