In [16]:
import gym
import numpy as np
In [17]:
env = gym.make('FrozenLake-v0')
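Before sizing the Q-table, it is worth a quick sanity check of the environment's dimensions. The cell below is an added sketch, not part of the original run; it just prints the discrete state and action counts and renders the starting map.
In [ ]:
# FrozenLake-v0 is a 4x4 grid world: 16 discrete states, 4 discrete actions
print("States:  %d" % env.observation_space.n)
print("Actions: %d" % env.action_space.n)
env.render()  # S = start, F = frozen, H = hole, G = goal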
In [18]:
# Initialize the Q-table with small random values in [-0.05, 0.05];
# the all-zeros alternative is left commented out
#Q = np.zeros([env.observation_space.n, env.action_space.n])
Q = np.random.rand(env.observation_space.n, env.action_space.n) * 0.1 - 0.05
# Set learning parameters
lr = 0.03            # learning rate
y = 0.9              # discount factor
num_episodes = 1000
# Create lists to contain total rewards and steps per episode
#jList = []
rList = []
for i in range(num_episodes):
    # Reset environment and get first new observation
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    # The Q-table learning algorithm
    while j < 99:
        j += 1
        # Choose an action greedily, with a small (0.1%) chance of a random action
        if np.random.rand() < 0.001:
            a = np.random.randint(env.action_space.n)
        else:
            a = np.argmax(Q[s, :])
        # Get new state and reward from environment
        s1, reward, d, _ = env.step(a)
        # Reshape the sparse reward: +1 for reaching the goal,
        # -1 for ending in a hole, 0 for every intermediate step
        if d:
            reward = 1.0 if reward > 0.0 else -1.0
        else:
            reward = 0.0
        # Update the Q-table with the Q-learning rule
        Q[s, a] = Q[s, a] + lr * (reward + y * np.max(Q[s1, :]) - Q[s, a])
        rAll += reward
        s = s1
        if d:
            break
    #jList.append(j)
    rList.append(rAll)
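The loop above explores with a fixed 0.1% random-action chance, so it relies almost entirely on the random Q-table initialization for exploration. A common alternative is epsilon-greedy selection with a decaying exploration rate; the cell below is a sketch of that variant, with eps_start, eps_min, and eps_decay as illustrative values rather than tuned hyperparameters.
In [ ]:
# Epsilon-greedy training sketch (assumed hyperparameters, not the run above)
eps_start, eps_min, eps_decay = 1.0, 0.01, 0.995
eps = eps_start
for episode in range(num_episodes):
    s = env.reset()
    d = False
    while not d:
        if np.random.rand() < eps:
            a = env.action_space.sample()      # explore
        else:
            a = np.argmax(Q[s, :])             # exploit
        s1, r, d, _ = env.step(a)
        Q[s, a] += lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
        s = s1
    eps = max(eps_min, eps * eps_decay)        # decay exploration over episodes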
In [19]:
print "Score over time: " + str(sum(rList)/num_episodes)
In [20]:
print "Final Q-Table Values"
print Q
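The raw Q-values are easier to interpret as a greedy policy: take the argmax action in each state and lay it out on the grid. The cell below is an added sketch that assumes the default 4x4 map and Gym's FrozenLake action encoding (0 = left, 1 = down, 2 = right, 3 = up).
In [ ]:
# Print the greedy policy as arrows on the 4x4 grid
arrows = np.array(['<', 'v', '>', '^'])   # indices match actions 0..3
policy = np.argmax(Q, axis=1)             # best action per state
print(arrows[policy].reshape(4, 4))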
In [21]:
s = env.reset()
d = False
while not d:
    a = np.argmax(Q[s, :])
    s, r, d, x = env.step(a)
    print("%s %s %s %s" % (s, r, d, x))
    env.render()
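A single rendered episode is a noisy measure of the policy because the environment is stochastic. A more stable estimate is the success rate of the greedy policy over many silent episodes; in the sketch below, n_eval is an illustrative choice.
In [ ]:
# Estimate the greedy policy's success rate (reward > 0 means the goal was reached)
n_eval = 1000
wins = 0
for _ in range(n_eval):
    s = env.reset()
    d = False
    while not d:
        s, r, d, _ = env.step(np.argmax(Q[s, :]))
    wins += r > 0
print("Greedy success rate: %.3f" % (float(wins) / n_eval))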
In [62]:
s = env.reset()
In [63]:
s, r, d, p = env.step(1)   # action 1 = down
print("s:%s, r:%s, d:%s, p:%s" % (s, r, d, p))
env.render()
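Note that env.step(1) does not deterministically move down: FrozenLake-v0 is slippery by default, so each action goes in the intended direction with probability 1/3 and slides to a perpendicular neighbor otherwise. The cell below is an added sketch that repeats action 1 from the start state to make the transition distribution visible.
In [ ]:
# Sample the stochastic transitions for action 1 (down) from the start state
from collections import Counter
outcomes = Counter()
for _ in range(1000):
    env.reset()
    s1, _, _, _ = env.step(1)
    outcomes[s1] += 1
print(outcomes)   # expect roughly equal mass on states 0, 1, and 4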
In [21]:
# FrozenLake-v0 action encoding: 0 = left, 1 = down, 2 = right, 3 = up