The code here is adapted from https://github.com/awjuliani/DeepRL-Agents/blob/master/Contextual-Policy.ipynb

In [1]:
#Importing dependencies
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

In [2]:
#Define the contextual bandit environment: four four-armed bandits
class contextual_bandit():
    def __init__(self):
        self.state = 0
        #List out our bandits. Lower values make a positive reward more likely, so arms 4, 2, 1 and 1 (respectively) are currently the most optimal.
        self.bandits = np.array([[0.2,0,-0.0,-5],[0.1,-5,1,0.25],[-5,5,5,5],[-5,0.2,0,1]])
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
        
    def getBandit(self):
        self.state = np.random.randint(0,len(self.bandits)) #Returns a random state for each episode.
        return self.state
        
    def pullArm(self,action):
        #Get the chosen arm's threshold and draw a random number.
        bandit = self.bandits[self.state,action]
        result = np.random.randn(1)
        if result > bandit:
            #return a positive reward.
            return 1
        else:
            #return a negative reward.
            return -1
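
To see how this reward scheme behaves, here is a quick Monte Carlo check (a minimal sketch, not part of the original notebook; the n_pulls sample size is arbitrary) that estimates the probability of a positive reward for each arm of the first bandit:

#Rough sanity check: estimate P(reward = +1) for each arm of bandit 0 by pulling it repeatedly.
env = contextual_bandit()
env.state = 0
n_pulls = 2000  #arbitrary sample size for the estimate
for arm in range(env.num_actions):
    rewards = [env.pullArm(arm) for _ in range(n_pulls)]
    print("Arm %d: P(+1) ~ %.2f" % (arm + 1, np.mean(np.array(rewards) == 1)))

With thresholds [0.2, 0, -0.0, -5] for the first bandit, arm 4 should come out close to 1.0 while the other arms hover around 0.5, which is why it is the optimal arm.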

In [3]:
#Define the policy-based agent. It takes the current state as input and returns an action
class agent():
    def __init__(self, lr, s_size,a_size):
        #These lines establish the feed-forward part of the network. The agent takes a state and produces an action.
        self.state_in= tf.placeholder(shape=[1],dtype=tf.int32)
        state_in_OH = slim.one_hot_encoding(self.state_in,s_size)
        output = slim.fully_connected(state_in_OH,a_size,\
            biases_initializer=None,activation_fn=tf.nn.sigmoid,weights_initializer=tf.ones_initializer())
        self.output = tf.reshape(output,[-1])
        self.chosen_action = tf.argmax(self.output,0)

        #The next six lines establish the training procedure. We feed the reward and chosen action into the network
        #to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1],dtype=tf.int32)
        self.responsible_weight = tf.slice(self.output,self.action_holder,[1])
        self.loss = -(tf.log(self.responsible_weight)*self.reward_holder)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
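
For readers unfamiliar with the TF1/slim graph API, here is a hedged NumPy sketch of what one training step above computes (the function name numpy_update and its defaults are made up for illustration): the one-hot state selects a row of the weight matrix, a sigmoid gives the per-arm output, and the loss -log(output[action]) * reward is reduced by one gradient-descent step.

#Plain-NumPy equivalent of a single update step (illustrative only).
def numpy_update(W, state, action, reward, lr=0.001):
    z = W[state]                          #the one-hot state just selects a row of W
    out = 1.0 / (1.0 + np.exp(-z))        #sigmoid activation, as in slim.fully_connected
    loss = -np.log(out[action]) * reward  #policy-gradient style loss on the chosen arm
    #d(loss)/d(W[state, action]) = -reward * (1 - out[action]) for the sigmoid output
    grad = -reward * (1.0 - out[action])
    W[state, action] -= lr * grad         #gradient descent, as optimizer.minimize does
    return W, loss

Because the weights start at one and the loss only touches the chosen arm, a positive reward nudges that arm's weight up and a negative reward nudges it down, which is why taking the argmax over a bandit's weight row later recovers the arm the agent prefers.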

In [4]:
#Train the network: get a state from the environment, take an action, and receive a reward
tf.reset_default_graph() #Clear the Tensorflow graph.

cBandit = contextual_bandit() #Load the bandits.
myAgent = agent(lr=0.001,s_size=cBandit.num_bandits,a_size=cBandit.num_actions) #Load the agent.
weights = tf.trainable_variables()[0] #The weights we will evaluate to look into the network.

total_episodes = 10000 #Set total number of episodes to train agent on.
total_reward = np.zeros([cBandit.num_bandits,cBandit.num_actions]) #Set scoreboard for bandits to 0.
e = 0.1 #Set the chance of taking a random action.

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        s = cBandit.getBandit() #Get a state from the environment.
        
        #Choose either a random action or one from our network.
        if np.random.rand(1) < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(myAgent.chosen_action,feed_dict={myAgent.state_in:[s]})
        
        reward = cBandit.pullArm(action) #Get our reward for taking an action given a bandit.
        
        #Update the network.
        feed_dict={myAgent.reward_holder:[reward],myAgent.action_holder:[action],myAgent.state_in:[s]}
        _,ww = sess.run([myAgent.update,weights], feed_dict=feed_dict)
        
        #Update our running tally of scores.
        total_reward[s,action] += reward
        if i % 500 == 0:
            print("Mean reward for each of the " + str(cBandit.num_bandits) + " bandits: " + str(np.mean(total_reward,axis=1)))
        i+=1
for a in range(cBandit.num_bandits):
    print("The agent thinks action " + str(np.argmax(ww[a])+1) + " for bandit " + str(a+1) + " is the most promising....")
    if np.argmax(ww[a]) == np.argmin(cBandit.bandits[a]):
        print("...and it was right!")
    else:
        print("...and it was wrong!")


Mean reward for each of the 4 bandits: [ 0.    0.25  0.    0.  ]
Mean reward for each of the 4 bandits: [  3.5   19.5   30.25  31.  ]
Mean reward for each of the 4 bandits: [  5.5   48.5   59.    62.75]
Mean reward for each of the 4 bandits: [  8.75  75.25  83.75  91.5 ]
Mean reward for each of the 4 bandits: [   7.5   100.5   109.5   120.25]
Mean reward for each of the 4 bandits: [  16.5   130.5   133.25  148.5 ]
Mean reward for each of the 4 bandits: [  47.5   157.25  157.75  176.25]
Mean reward for each of the 4 bandits: [  76.    184.25  186.    207.5 ]
Mean reward for each of the 4 bandits: [ 104.75  214.25  212.25  236.5 ]
Mean reward for each of the 4 bandits: [ 133.    240.25  242.25  263.25]
Mean reward for each of the 4 bandits: [ 156.    271.25  269.75  292.75]
Mean reward for each of the 4 bandits: [ 186.75  301.75  298.75  317.5 ]
Mean reward for each of the 4 bandits: [ 212.25  329.25  328.    347.25]
Mean reward for each of the 4 bandits: [ 240.25  360.    353.    376.  ]
Mean reward for each of the 4 bandits: [ 270.25  391.5   373.25  403.75]
Mean reward for each of the 4 bandits: [ 293.25  421.25  401.25  433.5 ]
Mean reward for each of the 4 bandits: [ 322.25  448.5   428.75  460.75]
Mean reward for each of the 4 bandits: [ 350.25  477.75  454.25  489.  ]
Mean reward for each of the 4 bandits: [ 376.    511.25  478.    518.  ]
Mean reward for each of the 4 bandits: [ 404.75  537.75  505.25  546.5 ]
The agent thinks action 4 for bandit 1 is the most promising....
...and it was right!
The agent thinks action 2 for bandit 2 is the most promising....
...and it was right!
The agent thinks action 1 for bandit 3 is the most promising....
...and it was right!
The agent thinks action 1 for bandit 4 is the most promising....
...and it was right!

In [5]:
# Now assume each bandit is a person and each arm is a film genre.
#Create a dataset with the ratings of 4 persons for 4 different movie genres
import pandas as pd
genre_names = ["Adventure", "Sci-Fi", "Romance", "Horror"]
person_names = ["person1", "person2", "person3", "person4"]
df = pd.DataFrame(index = person_names, columns = genre_names)

In [6]:
#Now let person 1 rate their preference for each genre on a scale of 1 to 5
df.loc['person1'] = pd.Series({'Adventure':1, 'Sci-Fi':5, 'Romance':2, 'Horror':1})
#Similarly for others
df.loc['person2'] = pd.Series({'Adventure':1, 'Sci-Fi':2, 'Romance':1, 'Horror':5})
df.loc['person3'] = pd.Series({'Adventure':5, 'Sci-Fi':3, 'Romance':2, 'Horror':2})
df.loc['person4'] = pd.Series({'Adventure':1, 'Sci-Fi':2, 'Romance':5, 'Horror':1})

In [7]:
#Looking at the dataset
df


Out[7]:
        Adventure  Sci-Fi  Romance  Horror
person1         1       5        2       1
person2         1       2        1       5
person3         5       3        2       2
person4         1       2        5       1

In [8]:
#Now represent this dataset as a contextual bandit problem. The lower the bandit value, the more likely a positive reward.
#Convert all the positive ratings to negatives and train on the bandit problem defined above.
class contextual_bandit():
    def __init__(self):
        self.state = 0
        #Bandits are persons and arms are genres. Currently Sci-Fi, Horror, Adventure and Romance
        #(respectively) are the most optimal genres.
        self.bandits = np.array([[-1,-5,-2,-1],[-1,-2,-1,-5],[-5,-3,-2,-2],[-1,-2,-5,-1]])
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
        
    def getBandit(self):
        self.state = np.random.randint(0,len(self.bandits)) #Returns a random state for each episode.
        return self.state
        
    def pullArm(self,action):
        #Get the chosen arm's threshold and draw a random number.
        bandit = self.bandits[self.state,action]
        result = np.random.randn(1)
        if result > bandit:
            #return a positive reward.
            return 1
        else:
            #return a negative reward.
            return -1
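
Rather than hard-coding the array, the same thresholds can be derived from the DataFrame built earlier (a minimal sketch, assuming df from In [6] is still in scope):

#Equivalent to the hard-coded self.bandits above: negate the ratings so a higher rating means a lower threshold.
bandits_from_df = -df.values.astype(np.float64)
print(bandits_from_df)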

In [9]:
#Train the network: get a state from the environment, take an action, and receive a reward
tf.reset_default_graph() #Clear the Tensorflow graph.

cBandit = contextual_bandit() #Load the bandits.
myAgent = agent(lr=0.001,s_size=cBandit.num_bandits,a_size=cBandit.num_actions) #Load the agent.
weights = tf.trainable_variables()[0] #The weights we will evaluate to look into the network.

total_episodes = 10000 #Set total number of episodes to train agent on.
total_reward = np.zeros([cBandit.num_bandits,cBandit.num_actions]) #Set scoreboard for bandits to 0.
e = 0.1 #Set the chance of taking a random action.

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        s = cBandit.getBandit() #Get a state from the environment.
        
        #Choose either a random action or one from our network.
        if np.random.rand(1) < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(myAgent.chosen_action,feed_dict={myAgent.state_in:[s]})
        
        reward = cBandit.pullArm(action) #Get our reward for taking an action given a bandit.
        
        #Update the network.
        feed_dict={myAgent.reward_holder:[reward],myAgent.action_holder:[action],myAgent.state_in:[s]}
        _,ww = sess.run([myAgent.update,weights], feed_dict=feed_dict)
        
        #Update our running tally of scores.
        total_reward[s,action] += reward
        if i % 500 == 0:
            print("Mean reward for each of the " + str(cBandit.num_bandits) + " persons: " + str(np.mean(total_reward,axis=1)))
        i+=1
for a in range(cBandit.num_bandits):
    print("The agent thinks genre " + str(np.argmax(ww[a])+1) + " for person " + str(a+1) + " is the most promising....")
    if np.argmax(ww[a]) == np.argmin(cBandit.bandits[a]):
        print("...and it was right!")
    else:
        print("...and it was wrong!")


Mean reward for each of the 4 persons: [ 0.    0.25  0.    0.  ]
Mean reward for each of the 4 persons: [ 16.25  23.25  32.75  18.5 ]
Mean reward for each of the 4 persons: [ 36.5   47.    64.25  39.5 ]
Mean reward for each of the 4 persons: [ 57.    70.75  94.5   60.5 ]
Mean reward for each of the 4 persons: [  79.     88.5   127.75   78.5 ]
Mean reward for each of the 4 persons: [ 102.75  109.5   162.5    96.  ]
Mean reward for each of the 4 persons: [ 131.25  133.    187.75  117.75]
Mean reward for each of the 4 persons: [ 157.    152.5   216.5   142.25]
Mean reward for each of the 4 persons: [ 178.5   179.5   247.    162.75]
Mean reward for each of the 4 persons: [ 202.5   204.25  281.    183.5 ]
Mean reward for each of the 4 persons: [ 224.25  231.    311.25  200.25]
Mean reward for each of the 4 persons: [ 250.5   252.25  344.    218.5 ]
Mean reward for each of the 4 persons: [ 272.75  273.    375.    237.5 ]
Mean reward for each of the 4 persons: [ 292.25  292.25  408.    258.25]
Mean reward for each of the 4 persons: [ 310.25  312.25  440.25  280.5 ]
Mean reward for each of the 4 persons: [ 330.75  335.    474.    303.  ]
Mean reward for each of the 4 persons: [ 351.75  355.5   502.    327.5 ]
Mean reward for each of the 4 persons: [ 372.75  373.25  532.5   347.25]
Mean reward for each of the 4 persons: [ 398.5   392.75  567.75  368.75]
Mean reward for each of the 4 persons: [ 422.    414.5   598.5   390.25]
The agent thinks genre 1 for person 1 is the most promising....
...and it was wrong!
The agent thinks genre 3 for person 2 is the most promising....
...and it was wrong!
The agent thinks genre 1 for person 3 is the most promising....
...and it was right!
The agent thinks genre 1 for person 4 is the most promising....
...and it was wrong!

In [ ]:
#It does not get it right for all persons. I am assuming the use case might be wrong or the data needs some pre-processing.
#One likely cause: after negating the ratings, every threshold is negative, so every arm returns a positive reward
#at least ~84% of the time and the reward signal barely separates the genres.
#Will have to learn more about it.
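
One possible pre-processing step (a hypothetical fix, not verified here) is to shift the ratings before negating them, so that only the well-liked genres end up with clearly negative thresholds while the rest sit at or above zero:

#Hypothetical pre-processing: shift ratings so the thresholds straddle zero.
#A rating of 5 maps to a threshold of -2 (positive reward ~98% of the time),
#while a rating of 1 maps to +2 (positive reward ~2% of the time).
bandits_centered = 3.0 - df.values.astype(np.float64)
print(bandits_centered)

With thresholds spread on both sides of zero, the gap in expected reward between the favorite genre and the rest is much larger, which should make the updates far more informative.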