In [1]:
# import statements
from __future__ import division  # must come before the other imports
import matplotlib.pyplot as plt
import os, sys, random
import numpy as np
# function to draw the table for Values and greedy policy
from utilities import *
is_latex_output_needed = False  # set to True to get LaTeX table output
In [2]:
# check whether the state we end up in after taking an action is valid
# (handles the grid-boundary corner cases)
def get_valid_state(prev_state, curr_state):
    # check the validity of the current state
    if ((curr_state[0] >= 0 and curr_state[0] < 10) and
            (curr_state[1] >= 0 and curr_state[1] < 10)):
        # if the next state is valid, return it
        return curr_state
    else:
        # otherwise stay in the previous state
        return prev_state
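As a quick sanity check (a sketch, not part of the original notebook), moving off the 10 x 10 grid leaves the agent in its previous cell, while an in-bounds move is returned unchanged:

    # hypothetical sanity checks for the boundary clamping above
    assert get_valid_state((0, 5), (-1, 5)) == (0, 5)   # UP off the top row: stay put
    assert get_valid_state((4, 9), (4, 10)) == (4, 9)   # RIGHT off the last column: stay put
    assert get_valid_state((4, 4), (3, 4)) == (3, 4)    # interior move passes through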
In [3]:
# function to get the next state after performing the action
def get_state(current_state, action):
    next_state = None
    if action == 'UP':
        next_state = get_valid_state(current_state,
                                     (current_state[0] - 1, current_state[1]))
    elif action == 'DOWN':
        next_state = get_valid_state(current_state,
                                     (current_state[0] + 1, current_state[1]))
    elif action == 'LEFT':
        next_state = get_valid_state(current_state,
                                     (current_state[0], current_state[1] - 1))
    else:  # RIGHT
        next_state = get_valid_state(current_state,
                                     (current_state[0], current_state[1] + 1))
    return next_state
In [4]:
# get the other uncertain actions (with probability 0.1 each) associated
# with an action
def get_uncertain_actions(action):
    if action == 'UP' or action == 'DOWN':
        return ['LEFT', 'RIGHT']
    else:
        return ['UP', 'DOWN']
In [5]:
# get the list of (probability, next state) transitions for taking an action
def get_transition_states(current_state, action):
    transitions = []
    # handle the grey IN states (transition is independent of action)
    if current_state in [(3, 2), (4, 2), (5, 2), (6, 2)]:
        transitions.append([1, (9, 0)])
    # handle the brown IN states (transition is independent of action)
    elif current_state in [(8, 7)]:
        transitions.append([1, (0, 7)])
    elif current_state == GOAL:
        # do nothing, stay in the same state
        pass
    else:
        # handle actual states and actions
        transitions.append([0.8, get_state(current_state, action)])
        uncertain_actions = get_uncertain_actions(action)
        for uncert_action in uncertain_actions:
            transitions.append([0.1, get_state(current_state, uncert_action)])
    return transitions
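To illustrate the 0.8/0.1/0.1 model (a hypothetical check, runnable once GOAL is defined in a later cell): taking UP from the corner (0, 0) keeps the agent in place with total probability 0.9, since both the intended UP and the perpendicular LEFT hit walls, while the perpendicular RIGHT moves it to (0, 1).

    # hypothetical check of the transition model at a corner state
    print(get_transition_states((0, 0), 'UP'))
    # expected: [[0.8, (0, 0)], [0.1, (0, 0)], [0.1, (0, 1)]]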
In [6]:
# to get all the states of the grid for value iteration
def states_enumeration(is_random=True):
    x, y = np.meshgrid(np.arange(10), np.arange(10))
    states = list(zip(x.reshape(-1), y.reshape(-1)))
    if is_random:
        random.shuffle(states)
    return states
In [7]:
# function to return the immediate reward of entering a state
def get_immediate_reward(state):
    if state == GOAL:
        return +100
    else:
        return -1
In [8]:
# compute the value of taking a particular action in a state,
# using the previous value table V
def get_value(state, action, V):
    transitions = get_transition_states(state, action)
    current_value = 0
    # for each transition from the state, accumulate the expected return
    for transition in transitions:
        probability = transition[0]
        target_state = transition[1]
        current_value += (probability * (get_immediate_reward(target_state)
                                         + V[target_state]))
    return current_value
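In equation form (a restatement of the code above, not an addition to the model), this is the undiscounted one-step backup with the reward attached to the successor state, i.e. there is no discount factor (gamma = 1):

$$Q(s, a) = \sum_{s'} P(s' \mid s, a)\,\big[ R(s') + V(s') \big]$$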
In [9]:
# function to perform value iteration
# note: get_transition_states and get_immediate_reward read the module-level
# GOAL, which is set to the same value as this parameter before the call
def value_iteration(GOAL):
    V_trace = []
    policy_trace = []
    max_costs = []
    N = 10
    # initialize the value of all states to zero
    V = np.zeros((N, N))
    actions = ['UP', 'DOWN', 'LEFT', 'RIGHT']
    pi = np.chararray(V.shape)  # greedy action (first letter) per state
    eps = 1e-10
    is_stop = False
    # loop for value iteration
    while not is_stop:
        prev_V = V.copy()
        V = np.zeros((N, N))
        # get all the states for performing VI
        states = states_enumeration()
        for state in states:
            # do not update the value of the GOAL state
            if state == GOAL:
                continue
            current_values = []
            # for each action in a state, get the values
            for action in actions:
                current_values.append(get_value(state, action, prev_V))
            # keep the max value and the corresponding greedy action
            V[state] = max(current_values)
            pi[state] = actions[np.argmax(current_values)][0]
        # check for the stopping condition
        max_error = np.max(np.abs(prev_V - V))
        max_costs.append(max_error)
        # note down V, pi after 10 and 25 iterations
        if len(max_costs) in [10, 25]:
            V_trace.append(V.copy())
            policy_trace.append(pi.copy())
        if max_error <= eps:
            is_stop = True
    print('In total,', len(max_costs), 'iterations')
    # V, pi after VI has stopped
    V_trace.append(V.copy())
    policy_trace.append(pi.copy())
    return V, pi, max_costs, V_trace, policy_trace
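Written out (again just restating what the loop above does), each sweep applies the synchronous Bellman optimality update to every non-goal state and stops once successive value tables agree to within eps; because every update reads prev_V, the random ordering from states_enumeration() has no effect on the result:

$$V_{k+1}(s) = \max_{a} \sum_{s'} P(s' \mid s, a)\,\big[ R(s') + V_k(s') \big], \qquad \max_{s}\, \lvert V_{k+1}(s) - V_k(s) \rvert \le \varepsilon$$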
In [10]:
def plot_graph(max_costs, filename):
    # function to plot the convergence curve as a line graph
    plt.figure()  # start a fresh figure so consecutive calls do not overlap
    plt.plot(range(len(max_costs)), max_costs)
    plt.xlabel('iterations')
    plt.ylabel('difference between consecutive max values')
    plt.title('iterations vs value difference')
    plt.savefig(filename)
In [11]:
# Q2.1
# perform value iteration and display greedy policy and value table
GOAL = (0, 9)
V, pi, max_costs, V_trace, pi_trace = value_iteration(GOAL)
display_table(pi, is_latex_output_needed)
display_table(np.around(V, decimals=3), is_latex_output_needed)
plot_graph(max_costs, 'goal1.png')
In [12]:
# show the V values and the policy after 10, 25, and the final iteration
for i, (_v, _pi) in enumerate(zip(V_trace, pi_trace)):
    display_table(np.around(_v, decimals=3), is_latex_output_needed)
    display_table(_pi, is_latex_output_needed)
In [13]:
# GOAL 2
GOAL = (9, 3)
# perform value iteration and display greedy policy and value table
V, pi, max_costs, V_trace, pi_trace = value_iteration(GOAL)
display_table(pi, is_latex_output_needed)
display_table(np.around(V, decimals=3), is_latex_output_needed)
plot_graph(max_costs, 'goal2.png')
In [14]:
# show the V values and the policy after 10, 25, and the final iteration
for i, (_v, _pi) in enumerate(zip(V_trace, pi_trace)):
    display_table(_pi, is_latex_output_needed)
    display_table(np.around(_v, decimals=3), is_latex_output_needed)
In [ ]: