In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import gym
import numpy as np
import random
import tensorflow as tf
env = gym.make('FrozenLake-v0')
alpha = 0.1         # learning rate for gradient descent
beta = 0.1          # exploration probability (decayed on success)
gamma = .95         # discount factor
num_episodes = 1000
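The training loop in cell In [3] below pushes this linear Q-function toward the standard Q-learning target, target = r + gamma * max_a' Q(s', a'). A minimal NumPy sketch of that target computation (r and q_next here are illustrative values, not taken from the run):

import numpy as np

gamma = 0.95
r = 1.0                                     # illustrative terminal reward
q_next = np.array([[0.2, 0.5, 0.1, 0.3]])  # illustrative Q(s', .) row
target = r + gamma * np.max(q_next)         # r + gamma * max_a' Q(s', a')
print(target)                               # 1.475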
In [2]:
#Forward pass: a single linear layer mapping a one-hot state to 4 Q-values
tf.reset_default_graph()
tf_s = tf.placeholder(shape=[1,16],dtype=tf.float32)   # one-hot encoded state
tf_W = tf.Variable(tf.random_uniform([16,4],0,0.01))   # weights
tf_b = tf.Variable(tf.random_uniform([1,4],0,0.01))    # bias
tf_Q = tf.add(tf.matmul(tf_s,tf_W),tf_b)               # Q(s, .) for all 4 actions
tf_a = tf.argmax(tf_Q,1)                               # greedy action
#Backward pass: minimize the squared error between target and predicted Q
tf_Qn = tf.placeholder(shape=[1,4],dtype=tf.float32)   # target Q-row
Error = tf.reduce_sum(tf.square(tf_Qn - tf_Q))
train = tf.train.GradientDescentOptimizer(learning_rate=alpha).minimize(Error)
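For reference, the same forward pass in plain NumPy, with freshly initialized (untrained) parameters; this is only a sketch to make the shapes explicit:

import numpy as np

W = np.random.uniform(0, 0.01, size=(16, 4))
b = np.random.uniform(0, 0.01, size=(1, 4))
s = np.identity(16)[3:4]     # one-hot row for state 3, shape (1, 16)
Q = s.dot(W) + b             # matches tf.add(tf.matmul(tf_s, tf_W), tf_b)
a = np.argmax(Q, axis=1)     # matches tf.argmax(tf_Q, 1)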
In [3]:
init = tf.global_variables_initializer()
reward_list = []
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        s = env.reset()
        done = False
        while not done:
            # greedy action and current Q-row for the one-hot encoded state
            a, allQ = sess.run([tf_a, tf_Q], feed_dict={tf_s: np.identity(16)[s:s+1]})
            # epsilon-greedy exploration with probability beta
            if np.random.rand(1) < beta:
                a[0] = env.action_space.sample()
            # take action
            s1, reward, done, info = env.step(a[0])
            if done:  # reached the goal or fell into a hole
                if reward:  # goal reached: decay exploration, reward +1
                    beta = beta * 0.99
                    r = 1
                else:  # fell into a hole
                    r = -1
            else:  # non-terminal step
                r = 0.0
            # obtain Q of the next state
            Q_next = sess.run(tf_Q, feed_dict={tf_s: np.identity(16)[s1:s1+1]})
            # Q-learning target for the chosen action
            allQ[0, a[0]] = r + gamma * np.max(Q_next)
            sess.run([train], feed_dict={tf_s: np.identity(16)[s:s+1], tf_Qn: allQ})
            s = s1
            if done:
                break
        reward_list.append(reward)
    W, b = sess.run([tf_W, tf_b])
plt.plot(np.convolve(np.ones(100), reward_list, "valid"))  # moving sum over a 100-episode window
Out[3]:
[plot: moving sum of episode rewards over a 100-episode window]
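With W and b pulled out of the session, the learned greedy policy can be evaluated without TensorFlow. A minimal sketch (the helper run_greedy_episode is hypothetical, not part of the original notebook):

import numpy as np

def run_greedy_episode(env, W, b):
    # roll out one episode following argmax_a Q(s, a) under the learned linear Q
    s = env.reset()
    done = False
    total = 0.0
    while not done:
        Q = np.identity(16)[s:s+1].dot(W) + b
        s, reward, done, _ = env.step(int(np.argmax(Q)))
        total += reward
    return total

print(np.mean([run_greedy_episode(env, W, b) for _ in range(100)]))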
In [4]:
np.identity(16)[[2]]  # one-hot row for state 2, shape (1, 16)
Out[4]:
array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
In [5]:
a = np.random.randint(10, size=(3, 3))  # scratch 3x3 array for the indexing check below
In [6]:
a
Out[6]:
In [7]:
a.T[[1]]  # column 1 of a (its second column), returned as a (1, 3) row
Out[7]:
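The scratch cells above check the indexing trick used in the feed_dicts: slicing with s:s+1 and fancy indexing with [[s]] both pull a single row out of the identity matrix as a 2-D (1, 16) array, which is the shape the tf_s placeholder expects. A quick check (variable names are illustrative):

import numpy as np

s = 2
row_slice = np.identity(16)[s:s+1]  # shape (1, 16), via slicing
row_fancy = np.identity(16)[[s]]    # shape (1, 16), via fancy indexing
assert np.array_equal(row_slice, row_fancy)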