In [6]:
import numpy as np
import pprint
import sys
if "../" not in sys.path:
sys.path.append("../")
from lib.envs.gridworld import GridworldEnv
In [7]:
pp = pprint.PrettyPrinter(indent=2)
env = GridworldEnv()
In [8]:
def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.

    Args:
        env: OpenAI environment. env.P represents the transition probabilities
            of the environment. env.P[s][a] is a list of transition tuples
            (prob, next_state, reward, done).
        theta: Stopping threshold. If the value of every state changes by less
            than theta in one sweep, we are done.
        discount_factor: Gamma discount factor.

    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """
    V = np.zeros(env.nS)
    policy = np.zeros([env.nS, env.nA])
    while True:
        delta = 0
        for s in range(env.nS):
            # Evaluate the action values for state s with a one-step lookahead
            action_values = np.zeros(env.nA)
            for a in range(env.nA):
                for prob, next_state, reward, done in env.P[s][a]:
                    action_values[a] += prob * (reward + discount_factor * V[next_state])
            # Select the best action based on the action values
            best_a = np.argmax(action_values)
            v = action_values[best_a]
            # delta is the largest change in any state value during this sweep
            delta = max(delta, np.abs(v - V[s]))
            V[s] = v
            # Update the policy to be greedy with respect to the current values
            policy[s] = np.eye(env.nA)[best_a]
        if delta < theta:
            break
    return policy, V
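A note on the update above: the inner pass over the actions is a one-step lookahead that computes the expected return of every action under the current value estimate and then backs up the maximum over actions. Purely as an illustrative sketch (not part of the original notebook), the same lookahead could be factored into a small helper; the name one_step_lookahead and its signature are assumptions, not part of the lib.envs.gridworld API.

def one_step_lookahead(env, state, V, discount_factor=1.0):
    """Hypothetical helper: expected return of each action from `state` under V."""
    action_values = np.zeros(env.nA)
    for a in range(env.nA):
        for prob, next_state, reward, done in env.P[state][a]:
            action_values[a] += prob * (reward + discount_factor * V[next_state])
    return action_values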
In [9]:
policy, v = value_iteration(env)
print("Policy Probability Distribution:")
print(policy)
print("")
print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
print(np.reshape(np.argmax(policy, axis=1), env.shape))
print("")
print("Value Function:")
print(v)
print("")
print("Reshaped Grid Value Function:")
print(v.reshape(env.shape))
print("")
In [10]:
# Test the value function
expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])
np.testing.assert_array_almost_equal(v, expected_v, decimal=2)
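As an additional, optional sanity check (not in the original notebook), the converged values should approximately satisfy the Bellman optimality equation: one more greedy backup of v should leave every state essentially unchanged. The snippet below is a minimal sketch assuming the same env.P interface and the discount factor of 1.0 used above.

bellman_residual = 0.0
for s in range(env.nS):
    action_values = np.zeros(env.nA)
    for a in range(env.nA):
        for prob, next_state, reward, done in env.P[s][a]:
            action_values[a] += prob * (reward + 1.0 * v[next_state])
    # How much would another greedy backup change this state's value?
    bellman_residual = max(bellman_residual, np.abs(np.max(action_values) - v[s]))
print("Max Bellman optimality residual:", bellman_residual)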
In [ ]: