After solving all the questions in the exam, save your notebook as username.ipynb
and submit it to: https://www.dropbox.com/request/0Eh9d2PvQMdAyJviK4Nl
In [1]:
from collections import defaultdict  # used below for the Q-learning policy extraction

from mdp import *
from rl import *
In [60]:
class LinearMDP(GridMDP):
    """A linear MDP built on the two-dimensional GridMDP of [Figure 17.1]. All you
    have to do is specify the grid as a list of lists of rewards; use None for an
    obstacle (unreachable state). Also, you should specify the terminal states.
    An action is an (x, y) unit vector; e.g. (1, 0) means move east."""

    def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
        GridMDP.__init__(self, grid=grid, terminals=terminals, init=init, gamma=gamma)

    def T(self, state, action):
        """
        This function must return a list of tuples (p, state'), where p is the
        probability of going to state' when action is taken in state.
        """
        # Your code here #
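# A minimal sketch of the expected return format of T (an illustration only, not
# the HW4 transition model): each tuple pairs a probability with a successor
# state, and the probabilities must sum to 1, e.g.
#     [(0.8, next_state), (0.2, state)]
# The inherited GridMDP.go(state, action) helper returns the state reached by
# moving one step in that direction, staying put if the move is off the grid or
# into an obstacle.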
def calculate_v_star(rew_a, gamma):
    '''
    This function must create an instance of LinearMDP that corresponds to the
    MDP in question 2 of HW4 and use it to calculate the expected reward for each state.
    The function receives as parameters the reward for state 'a' (10 in the example,
    but variable here) and the value of gamma.
    The reward for state 'e' is 1. The function must return a dictionary with the V* values.
    '''
    # Your code here #
    mdp =
    v_star =
    ##################
    return v_star
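# --- Illustrative sketch only, not the HW4 MDP -------------------------------
# Assuming the aima-python GridMDP and value_iteration imported above via
# `from mdp import *`, this toy 1 x 3 chain shows how V* is usually obtained
# once a concrete MDP instance exists; calculate_v_star can follow the same
# pattern with the LinearMDP defined above.
def _toy_v_star_example():
    toy = GridMDP([[-0.04, -0.04, +1]], terminals=[(2, 0)], gamma=0.9)
    return value_iteration(toy, epsilon=1e-5)   # dict: state -> V*(state)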
In [12]:
class ExtendedMDP(MDP):

    def __init__(self, transition_matrix, rewards, terminals, init, gamma=.9):
        # Collect all possible actions from the transition matrix.
        actlist = []
        for state in transition_matrix.keys():
            actlist.extend(transition_matrix[state])
        actlist = list(set(actlist))

        MDP.__init__(self, init, actlist, terminals=terminals, gamma=gamma)
        self.t = transition_matrix
        self.reward = rewards
        for state in self.t:
            self.states.add(state)

    def T(self, state, action):
        return [(prob, new_state) for new_state, prob
                in self.t[state][action].items()]

    def R(self, state, action, statep):
        "Return a numeric reward for the combination of the current state, the action and the next state."
        return self.reward[state][action][statep]
def ext_value_iteration(mdp, epsilon=0.001):
    "Solve an MDP by value iteration, using a reward function that depends on s, a and s'."
    U1 = {s: 0 for s in mdp.states}
    R, T, gamma = mdp.R, mdp.T, mdp.gamma
    while True:
        U = U1.copy()
        delta = 0
        for s in mdp.states:
            # Your code here #
            U1[s] =
            ##################
            delta = max(delta, abs(U1[s] - U[s]))
        if delta < epsilon * (1 - gamma) / gamma:
            return U
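# A minimal sketch of the Bellman backup when the reward depends on (s, a, s').
# It illustrates the quantity the missing update above must compute; it is not
# necessarily the required answer.
def _bellman_backup(mdp, U, s):
    """Return max_a sum_{s'} P(s'|s,a) * (R(s,a,s') + gamma * U[s'])."""
    return max(sum(p * (mdp.R(s, a, s1) + mdp.gamma * U[s1])
                   for (p, s1) in mdp.T(s, a))
               for a in mdp.actions(s))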
def ext_calculate_v_star(gamma):
    '''
    This function must create an instance of ExtendedMDP that corresponds to the
    MDP in question 3 of HW4 and use it to calculate the expected reward for each state.
    The function receives as a parameter the value of gamma.
    The function must return a dictionary with the V* values.
    '''
    # Your code here #
    t =
    rewards =
    ##################
    emdp = ExtendedMDP(t, rewards, [], None, gamma=gamma)
    v_star = ext_value_iteration(emdp, epsilon=0.00001)
    return v_star
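# --- Illustrative sketch only, not the HW4 MDP -------------------------------
# From the T and R methods above, the transition matrix and rewards are nested
# dicts with these shapes:
#     t[state][action]       == {next_state: probability, ...}
#     rewards[state][action] == {next_state: reward, ...}
# The two-state toy below only demonstrates that format.
def _toy_extended_mdp_example(gamma=0.9):
    toy_t = {'x': {'go': {'y': 1.0}},
             'y': {'go': {'y': 1.0}}}
    toy_r = {'x': {'go': {'y': 1.0}},
             'y': {'go': {'y': 0.0}}}
    toy = ExtendedMDP(toy_t, toy_r, [], None, gamma=gamma)
    return ext_value_iteration(toy, epsilon=1e-5)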
In [58]:
def calculate_policy(rew_a, gamma):
    '''
    This function must create an instance of LinearMDP that corresponds to the
    MDP in question 2 of HW4 and use it to calculate an optimal policy by applying
    Q-learning. The function receives as parameters the reward for state 'a'
    (10 in the example, but variable here) and the value of gamma.
    The reward for state 'e' is 1. The function must return a dictionary with an action for
    each state.
    '''
    # Your code here #
    mdp =
    ##################
    q_agent = QLearningAgent(mdp, Ne=5, Rplus=10,
                             alpha=lambda n: 60. / (59 + n))
    for i in range(200):
        run_single_trial(q_agent, mdp)

    U = defaultdict(lambda: -1000.)  # very large negative value used when comparing Q-values (see below)
    policy = {}
    # Your code here #
    ##################
    return policy
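# --- Illustrative sketch only -------------------------------------------------
# In the aima-python rl.py implementation, q_agent.Q maps (state, action) pairs
# to learned Q-values, so a greedy policy can be read off by keeping, for each
# state, the action with the largest Q-value; calculate_policy can fill its
# U / policy dictionaries with the same idea.
def _greedy_policy_from_q(q_agent):
    best_value = defaultdict(lambda: -1000.)   # best Q-value seen so far per state
    policy = {}
    for (s, a), q in q_agent.Q.items():
        if q > best_value[s]:
            best_value[s] = q
            policy[s] = a
    return policy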