In [1]:
from __future__ import division
import os, sys
import numpy as np
import utilities # contains API to display table
reload(utilities)
from utilities import *

In [2]:
# number of states
is_latex_output_needed = False # set to True to get code for latex tables
states = ['A', 'B', 'C']

In [3]:
# buffer to hold transitions probablities
probabilities = {
    'A' : np.array([[1/2, 1/4, 1/4],
                    [1/16, 3/4, 3/16],
                    [1/4, 1/8, 5/8]]),
    
    'B' : np.array([[1/2, 0, 1/2],
                    [1/16, 7/8, 1/16]]),
    
    'C' : np.array([[1/4, 1/4, 1/2],
                    [1/8, 3/4, 1/8],
                    [3/4, 1/16, 3/16]])
}

In [4]:
# buffer to hold rewards
rewards = {
    'A': np.array([[10, 4, 8],
                   [8, 2, 4],
                   [4, 6, 4]]),
    
    'B': np.array([[14, 0, 18],
                   [8, 16, 8]]),
    
    'C': np.array([[10, 2, 8],
                   [6, 4, 2],
                   [4, 0, 8]]),
}

In [5]:
# transition probability sanity check (outgoing probability sum to 1)
for state, transition_probs in probabilities.items():
    sum_probs = transition_probs.sum(axis=1)
    assert np.all(sum_probs == 1), ['transition probabilities of ', state, ' is not valid', transition_probs]

In [6]:
N = 10
J = {}
m = {}

# API to reset the buffers (mainly J, policy \pi)
def reset_buffers():
    # implement finite horizon MDPs
    global J, mu, N

    J = {}

    # fill last time step
    J[N] = {}
    J[N]['A'] = 0
    J[N]['B'] = 0
    J[N]['C'] = 0

    mu = {}
    mu[N] = {}
    mu[N]['A'] = '-'
    mu[N]['B'] = '-'
    mu[N]['C'] = '-'

reset_buffers()

In [7]:
def calculate_J(J, k, state, probabilities, rewards):
    # function to calculate J (cost-to-go) for a particular time step 'k' from state 'state'
    # for given probabilty 'probabilities', and single state cost 'rewards'
    
    # k = timestep, state = A, B, C
    if k not in J:
        J[k] = {}
        mu[k] = {}
    
    value = 0
    
    # if the value is not already calculated, calculate it
    if state not in J[k]:
        all_values = []
        
        # for each action, find the value based on transition probabilities
        for action in range(probabilities[state].shape[0]):
            current_value = 0
            
            for next_state in range(probabilities[state].shape[1]):
                # calculate the value for this particular action                
                current_value += (probabilities[state][action, next_state] * 
                                  (rewards[state][action, next_state] + 
                                   calculate_J(J, k+1, states[next_state], probabilities, rewards)))
            
            all_values.append(np.around(current_value, decimals=4))
        
        J[k][state] = max(all_values)
        # print(len(all_values))
        mu[k][state] = np.argmax(all_values) + 1
    
    return J[k][state]

In [8]:
# Q1.2)

N = 10
reset_buffers()

# calculate optimal cost for each state on timestep 0
# by utilizing bottom-up dynamic programming upto N=10
print(calculate_J(J, 0, 'A', probabilities, rewards))
print(calculate_J(J, 0, 'B', probabilities, rewards))
print(calculate_J(J, 0, 'C', probabilities, rewards))

# display the details
display_table(J, is_latex_output_needed, is_transpose=True)
display_table(mu, is_latex_output_needed)


123.0173
136.8492
124.1938
           A         B         C
0   123.0173  136.8492  124.1938
1   109.6728  123.5047  110.8493
2    96.3283  110.1602   97.5048
3    82.9838   96.8157   84.1602
4    69.6394   83.4711   70.8158
5    56.2960   70.1263   57.4727
6    42.9653   56.7798   44.1377
7    29.6641   43.4219   30.9062
8    17.7500   29.9375   17.8750
9     8.0000   16.0000    7.0000
10    0.0000    0.0000    0.0000


   0   1   2   3   4   5   6   7   8   9  10
A   2   2   2   2   2   2   2   2   1   1  -
B   2   2   2   2   2   2   2   2   2   1  -
C   2   2   2   2   2   2   2   2   2   1  -



In [9]:
# Q1.2)
N = 20
reset_buffers()

# calculate optimal cost for each state on timestep 0
# by utilizing bottom-up dynamic programming upto N=20
print(calculate_J(J, 0, 'A', probabilities, rewards))
print(calculate_J(J, 0, 'B', probabilities, rewards))
print(calculate_J(J, 0, 'C', probabilities, rewards))

# display the details
display_table(J, is_latex_output_needed, is_transpose = True)
display_table(mu, is_latex_output_needed)


256.4623
270.2942
257.6388
           A         B         C
0   256.4623  270.2942  257.6388
1   243.1178  256.9497  244.2943
2   229.7733  243.6052  230.9498
3   216.4288  230.2607  217.6053
4   203.0843  216.9162  204.2608
5   189.7398  203.5717  190.9163
6   176.3953  190.2272  177.5718
7   163.0508  176.8827  164.2273
8   149.7063  163.5382  150.8828
9   136.3618  150.1937  137.5383
10  123.0173  136.8492  124.1938
11  109.6728  123.5047  110.8493
12   96.3283  110.1602   97.5048
13   82.9838   96.8157   84.1602
14   69.6394   83.4711   70.8158
15   56.2960   70.1263   57.4727
16   42.9653   56.7798   44.1377
17   29.6641   43.4219   30.9062
18   17.7500   29.9375   17.8750
19    8.0000   16.0000    7.0000
20    0.0000    0.0000    0.0000


   0   1   2   3   4   5   6   7   8   9  ...  11  12  13  14  15  16  17  18  \
A   2   2   2   2   2   2   2   2   2   2 ...   2   2   2   2   2   2   2   1   
B   2   2   2   2   2   2   2   2   2   2 ...   2   2   2   2   2   2   2   2   
C   2   2   2   2   2   2   2   2   2   2 ...   2   2   2   2   2   2   2   2   

   19  20  
A   1   -  
B   1   -  
C   1   -  

[3 rows x 21 columns]



In [10]:
# Q1.3)
# change the probability to only allow action-2

# buffer to hold transitions probablities
probabilities = {
    'A' : np.array([[1/16, 3/4, 3/16]]),
    
    'B' : np.array([[1/16, 7/8, 1/16]]),
    
    'C' : np.array([[1/8, 3/4, 1/8]])
}

# buffer to hold rewards
rewards = {
    'A': np.array([[8, 2, 4]]),
    
    'B': np.array([[8, 16, 8]]),
    
    'C': np.array([[6, 4, 2]]),
}

In [11]:
N = 10
reset_buffers()

# calculate optimal cost for each state on timestep 0
# by utilizing bottom-up dynamic programming upto N=10
print(calculate_J(J, 0, 'A', probabilities, rewards))
print(calculate_J(J, 0, 'B', probabilities, rewards))
print(calculate_J(J, 0, 'C', probabilities, rewards))

# display the details
display_table(J, is_latex_output_needed, is_transpose=True)
display_table(mu, is_latex_output_needed)


121.5003
135.3322
122.6768
           A         B         C
0   121.5003  135.3322  122.6768
1   108.1558  121.9877  109.3323
2    94.8113  108.6432   95.9878
3    81.4668   95.2987   82.6433
4    68.1223   81.9542   69.2988
5    54.7781   68.6096   55.9546
6    41.4360   55.2647   42.6124
7    28.1104   41.9170   29.2871
8    14.9219   28.5469   16.0938
9     2.7500   15.0000    4.0000
10    0.0000    0.0000    0.0000


   0   1   2   3   4   5   6   7   8   9  10
A   1   1   1   1   1   1   1   1   1   1  -
B   1   1   1   1   1   1   1   1   1   1  -
C   1   1   1   1   1   1   1   1   1   1  -



In [12]:
# Q1.3)

N = 20
reset_buffers()

# calculate optimal cost for each state on timestep 0
# by utilizing bottom-up dynamic programming upto N=20
print(calculate_J(J, 0, 'A', probabilities, rewards))
print(calculate_J(J, 0, 'B', probabilities, rewards))
print(calculate_J(J, 0, 'C', probabilities, rewards))

# display the details
display_table(J, is_latex_output_needed, is_transpose=True)
display_table(mu, is_latex_output_needed)


254.9453
268.7772
256.1218
           A         B         C
0   254.9453  268.7772  256.1218
1   241.6008  255.4327  242.7773
2   228.2563  242.0882  229.4328
3   214.9118  228.7437  216.0883
4   201.5673  215.3992  202.7438
5   188.2228  202.0547  189.3993
6   174.8783  188.7102  176.0548
7   161.5338  175.3657  162.7103
8   148.1893  162.0212  149.3658
9   134.8448  148.6767  136.0213
10  121.5003  135.3322  122.6768
11  108.1558  121.9877  109.3323
12   94.8113  108.6432   95.9878
13   81.4668   95.2987   82.6433
14   68.1223   81.9542   69.2988
15   54.7781   68.6096   55.9546
16   41.4360   55.2647   42.6124
17   28.1104   41.9170   29.2871
18   14.9219   28.5469   16.0938
19    2.7500   15.0000    4.0000
20    0.0000    0.0000    0.0000


   0   1   2   3   4   5   6   7   8   9  ...  11  12  13  14  15  16  17  18  \
A   1   1   1   1   1   1   1   1   1   1 ...   1   1   1   1   1   1   1   1   
B   1   1   1   1   1   1   1   1   1   1 ...   1   1   1   1   1   1   1   1   
C   1   1   1   1   1   1   1   1   1   1 ...   1   1   1   1   1   1   1   1   

   19  20  
A   1   -  
B   1   -  
C   1   -  

[3 rows x 21 columns]



In [ ]: