Q Learning

$Q_{s,a} \leftarrow (1-\alpha)\,Q_{s,a} + \alpha \left( R_{s,a} + \gamma \max_{a'} Q_{s',a'} \right)$

where $\alpha$ is the learning rate, $\gamma$ the discount factor, $R_{s,a}$ the immediate reward, and the maximum is taken over the actions $a'$ available in the next state $s'$.
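
As a quick numeric check of the update (the numbers here are illustrative, not taken from the runs below): with $\alpha = 0.1$, $\gamma = 0.95$, a current estimate $Q_{s,a} = 0$, a step reward $R_{s,a} = 1$, and $\max_{a'} Q_{s',a'} = 0.5$,

$Q_{s,a} \leftarrow 0.9 \cdot 0 + 0.1 \left( 1 + 0.95 \cdot 0.5 \right) = 0.1475$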


In [8]:
%matplotlib inline
from matplotlib import pyplot as plt
import gym
import numpy as np
from gym.envs.registration import register
from gym import wrappers
import shutil

env = gym.make('FrozenLake-v0')  # 4x4 grid: 16 states, 4 actions

In [69]:
Q = np.zeros((env.observation_space.n, env.action_space.n))  # 16x4 table of action values
alpha = 0.1        # learning rate
beta = 0.001       # exploration rate: probability of taking a random action
gamma = 0.95       # discount factor
num_episodes = 1000
reward_list = []
for i in range(num_episodes):
    s = env.reset()
    done = False
    while not done:
        # epsilon-greedy action selection (epsilon is called beta here)
        if np.random.rand() < beta:
            a = np.random.randint(env.action_space.n)
        else:
            a = np.argmax(Q[s, :])
        s_next, reward, done, info = env.step(a)
        if done:   # episode ended: goal, hole, or step limit
            r = 1.0 if reward > 0.0 else -1.0
        else:      # non-terminal step: no intermediate reward
            r = 0.0
        Q[s, a] = (1 - alpha) * Q[s, a] + alpha * (r + gamma * np.max(Q[s_next, :]))
        s = s_next
    reward_list.append(reward)

# rolling count of successful episodes over a 100-episode window
plt.plot(np.convolve(np.ones(100), reward_list, "valid"))


Out[69]:
[<matplotlib.lines.Line2D at 0x11a920a20>]
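
The curve above is a rolling count of successful episodes in each 100-episode window. A small variation (not part of the original run) normalizes the kernel so the plot reads as a success rate between 0 and 1:

window = 100
plt.plot(np.convolve(np.ones(window) / window, reward_list, "valid"))
plt.xlabel("episode")
plt.ylabel("success rate over the last %d episodes" % window)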

In [91]:
Q = np.zeros((env.observation_space.n, env.action_space.n))  # 16x4 table of action values
alpha = 0.1
beta = 0.8         # start with heavy exploration, decayed each time the goal is reached
gamma = 0.95
num_episodes = 1000
reward_list = []
for i in range(num_episodes):
    s = env.reset()
    done = False
    while not done:
        if np.random.rand() < beta:
            a = np.random.randint(env.action_space.n)
        else:
            a = np.argmax(Q[s, :])
        s_next, reward, done, info = env.step(a)
        if done:
            if reward:             # reached the goal
                beta = beta * 0.8  # explore less once a successful path is known
                r = 1.0
            else:                  # fell into a hole or hit the step limit
                r = -1.0
        else:
            r = 0.0
        Q[s, a] = (1 - alpha) * Q[s, a] + alpha * (r + gamma * np.max(Q[s_next, :]))
        s = s_next
    reward_list.append(reward)

plt.plot(np.convolve(np.ones(100), reward_list, "valid"))


Out[91]:
[<matplotlib.lines.Line2D at 0x11c318fd0>]

In [38]:
print("Final Q-Table Values")
print(Q)


Final Q-Table Values
[[ 0.1479462  -0.11722328 -0.14986112 -0.1131554 ]
 [-0.22252525 -0.31275729 -0.32022589  0.01289748]
 [-0.25324413 -0.17120073 -0.25778181 -0.24802181]
 [-0.37508302 -0.28421851 -0.28924321 -0.14126051]
 [ 0.17230254 -0.40551402 -0.39557484 -0.21985928]
 [ 0.          0.          0.          0.        ]
 [-0.56797402 -0.5988329  -0.29660334 -0.62491104]
 [ 0.          0.          0.          0.        ]
 [-0.19243743 -0.53446208 -0.36717141  0.21624092]
 [-0.34666298  0.2913575  -0.21430814 -0.407001  ]
 [ 0.29637695 -0.21388738 -0.14513224 -0.21129386]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [-0.14600718 -0.17888614  0.48617864 -0.12385322]
 [ 0.0840284   0.80000165  0.34125469  0.19159725]
 [ 0.          0.          0.          0.        ]]
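
A small sketch (not in the original notebook) that reads the greedy policy out of this table; FrozenLake encodes actions as 0=Left, 1=Down, 2=Right, 3=Up:

arrows = np.array(["<", "v", ">", "^"])   # 0=Left, 1=Down, 2=Right, 3=Up
greedy = np.argmax(Q, axis=1)             # best action in each of the 16 states
print(arrows[greedy].reshape(4, 4))       # laid out as the 4x4 lake grid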

In [6]:
# Roll out the greedy policy once, rendering each step
s = env.reset()
done = False
n = 0
while not done:
    n += 1
    a = np.argmax(Q[s, :])          # always take the greedy action
    s, r, done, info = env.step(a)
    #print("%s %s %s %s" % (s, r, done, info))
    env.render()
print(n)


  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFFG
27
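
A single rendered rollout is anecdotal; a short sketch (assuming the Q table trained above) runs the greedy policy for many episodes and reports the fraction that reach the goal:

n_eval = 1000
wins = 0.0
for _ in range(n_eval):
    s = env.reset()
    done = False
    while not done:
        s, r, done, _ = env.step(np.argmax(Q[s, :]))
    wins += r                       # reward is 1.0 only when the goal is reached
print("greedy-policy success rate: %.3f" % (wins / n_eval))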

In [7]:
env.close()
#gym.upload('/tmp/FrozenLake_01', api_key='sk_o9OoYpSkKamkW8MrKuHw')
