In [1]:
from __future__ import division
import os, sys
import numpy as np
import utilities  # contains display_table, used to render the J and mu tables below
reload(utilities)  # Python 2 built-in; under Python 3 use importlib.reload
from utilities import *
In [2]:
is_latex_output_needed = False  # set to True to get LaTeX code for the tables
# names of the states
states = ['A', 'B', 'C']
In [3]:
# buffer to hold transition probabilities
probabilities = {
    'A': np.array([[1/2, 1/4, 1/4],
                   [1/16, 3/4, 3/16],
                   [1/4, 1/8, 5/8]]),
    'B': np.array([[1/2, 0, 1/2],
                   [1/16, 7/8, 1/16]]),
    'C': np.array([[1/4, 1/4, 1/2],
                   [1/8, 3/4, 1/8],
                   [3/4, 1/16, 3/16]])
}
In [4]:
# buffer to hold rewards
rewards = {
    'A': np.array([[10, 4, 8],
                   [8, 2, 4],
                   [4, 6, 4]]),
    'B': np.array([[14, 0, 18],
                   [8, 16, 8]]),
    'C': np.array([[10, 2, 8],
                   [6, 4, 2],
                   [4, 0, 8]]),
}
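Both dictionaries share the layout used by calculate_J below: for each state, rows are actions and columns are next states in the order of the `states` list (note state B admits only two actions). As a small reading aid, not part of the original notebook:

# probability and reward of the transition A -> C under action 1 (row 0, column 2)
print(probabilities['A'][0, 2], rewards['A'][0, 2])  # 0.25 8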
In [5]:
# transition-probability sanity check (outgoing probabilities must sum to 1)
for state, transition_probs in probabilities.items():
    sum_probs = transition_probs.sum(axis=1)
    assert np.allclose(sum_probs, 1), 'transition probabilities of %s are not valid: %s' % (state, transition_probs)
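A similar check, added here only as an extra safeguard and not in the original notebook, confirms that every reward matrix lines up entry-for-entry with its transition matrix:

# each reward matrix must have the same [action, next_state] shape as its transition matrix
for state in states:
    assert probabilities[state].shape == rewards[state].shape, state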
In [6]:
N = 10
J = {}
mu = {}
# reset the buffers (value table J and policy table mu) for a finite-horizon problem of length N
def reset_buffers():
    global J, mu, N
    J = {}
    # terminal condition: the value-to-go at the last time step is 0 for every state
    J[N] = {}
    J[N]['A'] = 0
    J[N]['B'] = 0
    J[N]['C'] = 0
    # no action is taken at the final time step
    mu = {}
    mu[N] = {}
    mu[N]['A'] = '-'
    mu[N]['B'] = '-'
    mu[N]['C'] = '-'
reset_buffers()
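After reset_buffers(), J and mu hold only the terminal time step; calculate_J below fills the earlier time steps into the same nested layout J[k][state], mu[k][state]. A quick inspection (expected contents noted in the comments, key order may vary):

print(J)   # {10: {'A': 0, 'B': 0, 'C': 0}} for N = 10
print(mu)  # {10: {'A': '-', 'B': '-', 'C': '-'}}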
In [7]:
def calculate_J(J, k, state, probabilities, rewards):
    # recursively calculate J (value-to-go) at time step 'k' from state 'state'
    # given transition probabilities 'probabilities' and per-transition rewards 'rewards'
    # k = time step, state = 'A', 'B', or 'C'
    if k not in J:
        J[k] = {}
        mu[k] = {}
    # if the value is not already memoized, calculate it
    if state not in J[k]:
        all_values = []
        # for each action, compute the expected value under the transition probabilities
        for action in range(probabilities[state].shape[0]):
            current_value = 0
            for next_state in range(probabilities[state].shape[1]):
                # expected immediate reward plus value-to-go of the successor state
                current_value += (probabilities[state][action, next_state] *
                                  (rewards[state][action, next_state] +
                                   calculate_J(J, k+1, states[next_state], probabilities, rewards)))
            all_values.append(np.around(current_value, decimals=4))
        # keep the best value and record the maximizing action (1-based) in the policy table
        J[k][state] = max(all_values)
        mu[k][state] = np.argmax(all_values) + 1
    return J[k][state]
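In equation form, calculate_J is a memoized implementation of the finite-horizon dynamic-programming recursion (it maximizes expected reward; each action's sum is rounded to 4 decimals before the max, exactly as in the code):

\[
J_N(s) = 0, \qquad
J_k(s) = \max_{a} \sum_{s'} p(s' \mid s, a)\,\bigl(r(s, a, s') + J_{k+1}(s')\bigr),
\quad k = N-1, \dots, 0,
\]

with \(\mu_k(s)\) recording the maximizing action (reported 1-based).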
In [8]:
# Q1.2)
N = 10
reset_buffers()
# calculate the optimal value J_0 for each starting state
# by dynamic programming over the horizon N = 10
print(calculate_J(J, 0, 'A', probabilities, rewards))
print(calculate_J(J, 0, 'B', probabilities, rewards))
print(calculate_J(J, 0, 'C', probabilities, rewards))
# display the details
display_table(J, is_latex_output_needed, is_transpose=True)
display_table(mu, is_latex_output_needed)
In [9]:
# Q1.2)
N = 20
reset_buffers()
# calculate the optimal value J_0 for each starting state
# by dynamic programming over the horizon N = 20
print(calculate_J(J, 0, 'A', probabilities, rewards))
print(calculate_J(J, 0, 'B', probabilities, rewards))
print(calculate_J(J, 0, 'C', probabilities, rewards))
# display the details
display_table(J, is_latex_output_needed, is_transpose=True)
display_table(mu, is_latex_output_needed)
In [10]:
# Q1.3)
# restrict every state to action 2 only (the second row of the original matrices)
# buffer to hold transition probabilities
probabilities = {
    'A': np.array([[1/16, 3/4, 3/16]]),
    'B': np.array([[1/16, 7/8, 1/16]]),
    'C': np.array([[1/8, 3/4, 1/8]])
}
# buffer to hold rewards
rewards = {
    'A': np.array([[8, 2, 4]]),
    'B': np.array([[8, 16, 8]]),
    'C': np.array([[6, 4, 2]]),
}
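With a single admissible action per state, the recursion reduces to evaluating the fixed policy "always take action 2", so the numbers below can be cross-checked with plain matrix-vector arithmetic. This is a hedged sketch, not part of the original notebook; P, R, c and J_vec are names introduced here, and tiny differences from calculate_J are possible because that function rounds to 4 decimals at every step:

# stack the single-action rows into 3x3 matrices (row order follows `states`)
P = np.vstack([probabilities[s] for s in states])
R = np.vstack([rewards[s] for s in states])
c = (P * R).sum(axis=1)        # expected one-step reward from each state
J_vec = np.zeros(len(states))  # terminal values J_N = 0
for _ in range(10):            # N = 10 backward steps: J_k = c + P J_{k+1}
    J_vec = c + P.dot(J_vec)
print(dict(zip(states, np.round(J_vec, 4))))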
In [11]:
N = 10
reset_buffers()
# calculate the optimal value J_0 for each starting state
# by dynamic programming over the horizon N = 10
print(calculate_J(J, 0, 'A', probabilities, rewards))
print(calculate_J(J, 0, 'B', probabilities, rewards))
print(calculate_J(J, 0, 'C', probabilities, rewards))
# display the details
display_table(J, is_latex_output_needed, is_transpose=True)
display_table(mu, is_latex_output_needed)
In [12]:
# Q1.3)
N = 20
reset_buffers()
# calculate the optimal value J_0 for each starting state
# by dynamic programming over the horizon N = 20
print(calculate_J(J, 0, 'A', probabilities, rewards))
print(calculate_J(J, 0, 'B', probabilities, rewards))
print(calculate_J(J, 0, 'C', probabilities, rewards))
# display the details
display_table(J, is_latex_output_needed, is_transpose=True)
display_table(mu, is_latex_output_needed)