Q Learning

$Q_{s,a} = (1-\alpha).Q_{s,a} + \alpha \left( R_{s,a} + \gamma . max(Q_{s',a'})\right)$


In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import gym
import numpy as np
from gym.envs.registration import register
from gym import wrappers
import shutil

In [2]:
env = gym.make('FrozenLake-v0')
shutil.rmtree('/tmp/FrozenLake_01') 
env = wrappers.Monitor(env, '/tmp/FrozenLake_01')


[2017-10-14 21:03:41,986] Making new env: FrozenLake-v0
[2017-10-14 21:03:42,305] Creating monitor directory /tmp/FrozenLake_01

In [3]:
Q= np.zeros((env.observation_space.n, env.action_space.n)) #16x4
alpha=0.1
beta=0.1
gamma=0.95
num_episodes = 1000
reward_list=[]
for i in range(num_episodes):
    s = env.reset();
    done=False
    while done==False:
        if np.random.rand() < beta:
            a = env.action_space.sample()
        else:
            a = np.argmax(Q[s,:] )
        s_next,reward,done,info = env.step(a)
        if done:#reaching goal or falling into a hole
            if reward:
                beta=beta*0.99
            else:
                r=-1
        else:#reaching maximum step
            r = 0.0
        Q[s,a]= (1-alpha)*Q[s,a] + alpha*( r + gamma*np.max(Q[s_next,:]) )
        if done == True:
            break
        s=s_next
    reward_list.append(reward)
                                        
plt.plot(np.convolve(np.ones(100),reward_list,"valid"))


[2017-10-14 21:03:42,336] Starting new video recorder writing to C:\tmp\FrozenLake_01\openaigym.video.0.8676.video000000.json
[2017-10-14 21:03:42,342] Starting new video recorder writing to C:\tmp\FrozenLake_01\openaigym.video.0.8676.video000001.json
[2017-10-14 21:03:42,355] Starting new video recorder writing to C:\tmp\FrozenLake_01\openaigym.video.0.8676.video000008.json
[2017-10-14 21:03:42,378] Starting new video recorder writing to C:\tmp\FrozenLake_01\openaigym.video.0.8676.video000027.json
[2017-10-14 21:03:42,417] Starting new video recorder writing to C:\tmp\FrozenLake_01\openaigym.video.0.8676.video000064.json
[2017-10-14 21:03:42,463] Starting new video recorder writing to C:\tmp\FrozenLake_01\openaigym.video.0.8676.video000125.json
[2017-10-14 21:03:42,545] Starting new video recorder writing to C:\tmp\FrozenLake_01\openaigym.video.0.8676.video000216.json
[2017-10-14 21:03:42,694] Starting new video recorder writing to C:\tmp\FrozenLake_01\openaigym.video.0.8676.video000343.json
[2017-10-14 21:03:42,888] Starting new video recorder writing to C:\tmp\FrozenLake_01\openaigym.video.0.8676.video000512.json
[2017-10-14 21:03:43,187] Starting new video recorder writing to C:\tmp\FrozenLake_01\openaigym.video.0.8676.video000729.json
Out[3]:
[<matplotlib.lines.Line2D at 0x94e0390>]

In [5]:
print "Final Q-Table Values"
print Q


Final Q-Table Values
[[-0.09149649 -0.23377089 -0.21617013 -0.23818903]
 [-0.31280844 -0.4288004  -0.40376696 -0.08594572]
 [-0.24402572 -0.28141842 -0.27359322 -0.0994729 ]
 [-0.40569488 -0.51255241 -0.51304478 -0.15135847]
 [-0.10579963 -0.43293104 -0.29143473 -0.3602942 ]
 [ 0.          0.          0.          0.        ]
 [-0.49285554 -0.64427525 -0.63779959 -0.63933445]
 [ 0.          0.          0.          0.        ]
 [-0.34660231 -0.46498879 -0.46442033 -0.13046356]
 [-0.41639458 -0.17169656 -0.30377459 -0.30236889]
 [-0.29214208 -0.33428237 -0.43835109 -0.35113586]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [-0.2250294  -0.2452645  -0.09712384 -0.28107151]
 [-0.14254959 -0.05614017 -0.1393619  -0.14698056]
 [ 0.          0.          0.          0.        ]]

In [6]:
s = env.reset()
d=False
n=0
while d==False:
    n+=1
    a = np.argmax(Q[s,:])
    s,r,d,x = env.step(a)
    #print("%s %s %s %s"%(s,r,d,x))
    env.render()
print n


[2017-10-14 20:00:18,387] Starting new video recorder writing to C:\tmp\FrozenLake_01\openaigym.video.0.8464.video001000.json
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
  (Up)
SFFF
FHFH
FFFH
HFFG
100

In [7]:
env.close()
#gym.upload('/tmp/FrozenLake_01', api_key='sk_o9OoYpSkKamkW8MrKuHw')


[2017-10-14 20:00:18,515] Finished writing results. You can upload them to the scoreboard via gym.upload('C:\\tmp\\FrozenLake_01')

In [ ]: