In [ ]:
%load_ext autoreload
%autoreload 2
In [ ]:
import numpy as np
# Each row of `policy` is a probability distribution over the three actions (= next states) from that state.
policy = np.array([[0.3, 0.2, 0.5], [0.5, 0.4, 0.1], [0.8, 0.1, 0.1]])
# `raw_rewards` contains the reward obtained after transitioning into each state;
# in our example it does not depend on the source state.
raw_rewards = np.array([1.5, -1.833333333, 19.833333333])
# `rewards` contains the expected value of the next reward for each source state.
rewards = np.matmul(policy, raw_rewards)
assert np.allclose(rewards, np.array([10., 2., 3.]))
state_value_function = np.zeros(3)
for i in range(20):
    print(state_value_function)
    state_value_function = # TODO: implement the policy evaluation update with a discount rate of 0.1
print(state_value_function)
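One possible way to fill in the update (a sketch, assuming the synchronous Bellman evaluation update V ← R + γ P V, with `policy` acting as the transition matrix P):

gamma = 0.1
state_value_function = np.zeros(3)
for i in range(20):
    print(state_value_function)
    # Synchronous Bellman evaluation update: V <- R + gamma * P V
    state_value_function = rewards + gamma * np.matmul(policy, state_value_function)
print(state_value_function)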
In [ ]:
solution = # TODO: implement the linear programming solution with a discount rate of 0.1
print(solution)
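The TODO asks for a linear-programming solution; as a sketch of the same fixed point, one can instead solve the Bellman evaluation equation directly as a linear system, V = (I − γP)⁻¹ R (note this is plain linear algebra rather than an LP):

gamma = 0.1
# Solve (I - gamma * P) V = R for V; `policy` plays the role of the transition matrix P.
solution = np.linalg.solve(np.eye(3) - gamma * policy, rewards)
print(solution)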
The result is the same as the value function obtained by iterative policy evaluation above.
In [ ]:
import random
from collections import defaultdict
reward_counter = np.array([0., 0., 0.])
visit_counter = np.array([0., 0., 0.])

def gt(rewardlist, gamma=0.1):
    '''
    Function to calculate the total discounted reward
    >>> gt([10, 2, 3], gamma=0.1)
    10.23
    '''
    # TODO: implement the total discounted reward
    return 0

# Sample 400 episodes of length 250 and average the discounted returns observed
# from every visit to each state (every-visit Monte Carlo).
for i in range(400):
    start_state = random.randint(0, 2)
    next_state = start_state
    rewardlist = []
    occurrence = defaultdict(list)
    for t in range(250):
        rewardlist.append(rewards[next_state])
        occurrence[next_state].append(len(rewardlist) - 1)
        action = np.random.choice(np.arange(0, 3), p=policy[next_state])
        next_state = action
    for state in occurrence:
        for value in occurrence[state]:
            rew = gt(rewardlist[value:])
            reward_counter[state] += rew
            visit_counter[state] += 1
            # break  # uncomment to count only the first visit to each state (first-visit Monte Carlo)
print(reward_counter / visit_counter)
As can be seen, the Monte Carlo estimate is close to the state-value function calculated above.
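For reference, a possible implementation of the `gt` helper used above (a sketch; the docstring example `gt([10, 2, 3], gamma=0.1) == 10.23` pins down the intended behaviour):

def gt(rewardlist, gamma=0.1):
    '''Total discounted reward: sum over k of gamma**k * r_k for the reward sequence.'''
    return sum((gamma ** k) * r for k, r in enumerate(rewardlist))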
In [ ]:
q_table = np.zeros((3, 3))
for i in range(1001):
    state = random.randint(0, 2)
    action = random.randint(0, 2)
    next_state = action
    reward = rewards[next_state]
    next_q = max(q_table[next_state])
    q_table[state, action] = # TODO: implement the Q-table update
    if i % 100 == 0:
        print(q_table)
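One way to fill in the Q-table update (a sketch): because taking action `a` here moves deterministically to state `a`, the tabular target `reward + gamma * max_a' Q(next_state, a')` can be written into the table directly, without a learning rate; gamma = 0.1 is assumed, as in the rest of the notebook.

gamma = 0.1
q_table = np.zeros((3, 3))
for i in range(1001):
    state = random.randint(0, 2)
    action = random.randint(0, 2)
    next_state = action                 # transitions are deterministic in the chosen action
    reward = rewards[next_state]
    next_q = max(q_table[next_state])   # value of the greedy action in the successor state
    q_table[state, action] = reward + gamma * next_q
    if i % 100 == 0:
        print(q_table)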
In [ ]:
%matplotlib inline
from lxmls.reinforcement_learning.score_function_estimator import train
train()
In [ ]:
import numpy as np
rewards = np.array([10., 2., 3.])
gamma = 0.1
state_value_function = np.zeros(3)
for i in range(1000):
    for s in range(3):
        state_value_function[s] = # TODO: implement the state value function update
print(state_value_function)
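A possible fill for this update (a sketch, assuming the `policy` matrix from the first cell is the intended transition matrix): an in-place, state-by-state version of the evaluation update used earlier.

import numpy as np
rewards = np.array([10., 2., 3.])
gamma = 0.1
state_value_function = np.zeros(3)
for i in range(1000):
    for s in range(3):
        # In-place (Gauss-Seidel style) evaluation update for state s,
        # reusing `policy` from the first cell as the transition matrix.
        state_value_function[s] = rewards[s] + gamma * np.dot(policy[s], state_value_function)
print(state_value_function)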
In [ ]:
from lxmls.reinforcement_learning.policy_gradient import train
train()
In [ ]:
from lxmls.reinforcement_learning.actor_critic import train
train()