In [ ]:
    
%load_ext autoreload
%autoreload 2
    
In [ ]:
    
import numpy as np
policy=np.array([[0.3, 0.2, 0.5], [0.5, 0.4, 0.1], [0.8, 0.1, 0.1]])
# 'raw_rewards' contains the reward obtained after transitioning into each state.
# In this example it does not depend on the source state.
raw_rewards = np.array([1.5, -1.833333333, 19.833333333])
# 'rewards' contains the expected value of the next reward for each state under the policy
rewards = np.matmul(policy, raw_rewards)
assert np.allclose(rewards, np.array([10., 2., 3.]))
state_value_function = np.zeros(3)
for i in range(20):
    print(state_value_function)
    
    state_value_function=#TODO: Implement the Policy Evaluation Update with a Discount Rate of 0.1
print(state_value_function)
    
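One possible way to fill in the policy-evaluation TODO is the Bellman expectation update V ← R + γPV, with R = `rewards`, P = `policy` and γ = 0.1. The self-contained sketch below is one reading of the exercise, not the official solution.
In [ ]:

import numpy as np

gamma = 0.1
policy = np.array([[0.3, 0.2, 0.5], [0.5, 0.4, 0.1], [0.8, 0.1, 0.1]])
rewards = np.array([10., 2., 3.])
V = np.zeros(3)
for _ in range(20):
    # Bellman expectation update: V <- R + gamma * P V
    V = rewards + gamma * policy.dot(V)
print(V)
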
In [ ]:
    
solution=#TODO: Implement the linear programming solution with a discount rate of 0.1
print(solution)
    
The result is the same as the state-value function obtained by iterative policy evaluation above.
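The TODO above asks for a linear programming solution with γ = 0.1; an equivalent way to obtain the same fixed point is to solve the linear system (I − γP)V = R directly. The sketch below takes that route with `np.linalg.solve`, which is an assumption about what the exercise expects rather than the intended LP formulation.
In [ ]:

import numpy as np

gamma = 0.1
policy = np.array([[0.3, 0.2, 0.5], [0.5, 0.4, 0.1], [0.8, 0.1, 0.1]])
rewards = np.array([10., 2., 3.])
# Solve (I - gamma * P) V = R exactly instead of iterating
solution = np.linalg.solve(np.eye(3) - gamma * policy, rewards)
print(solution)
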
In [ ]:
    
import random
from collections import defaultdict
reward_counter=np.array([0., 0., 0.])
visit_counter=np.array([0., 0., 0.])
def gt(rewardlist, gamma=0.1):
    '''
    Function to calculate the total discounted reward
    >>> gt([10, 2, 3], gamma=0.1)
    10.23
    '''
    #TODO: Implement the total discounted reward
    return 0
for episode in range(400):
    start_state = random.randint(0, 2)
    next_state = start_state
    rewardlist = []
    occurrence = defaultdict(list)
    # Roll out one episode of 250 steps under the policy
    for t in range(250):
        rewardlist.append(rewards[next_state])
        occurrence[next_state].append(len(rewardlist) - 1)
        action = np.random.choice(np.arange(0, 3), p=policy[next_state])
        next_state = action
    # Accumulate the discounted return following every visit to each state
    for state in occurrence:
        for value in occurrence[state]:
            rew = gt(rewardlist[value:])
            reward_counter[state] += rew
            visit_counter[state] += 1
            #break  # uncomment to use only the first visit to each state (first-visit MC)
print(reward_counter / visit_counter)
    
As can be seen, the result is close to the state-value function computed above.
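For reference, one way the `gt` helper could be implemented, consistent with its docstring (a sketch, not necessarily the intended solution):
In [ ]:

def gt(rewardlist, gamma=0.1):
    '''Total discounted reward: sum_k gamma**k * rewardlist[k].'''
    return sum(gamma ** k * r for k, r in enumerate(rewardlist))

assert abs(gt([10, 2, 3], gamma=0.1) - 10.23) < 1e-9
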
In [ ]:
    
q_table=np.zeros((3, 3))
for i in range(1001):
    # Sample a random (state, action) pair; in this toy MDP taking action a moves to state a
    state=random.randint(0, 2)
    action=random.randint(0, 2)
    next_state=action
    reward=rewards[next_state]
    # Best Q-value reachable from the next state (one-step lookahead)
    next_q=max(q_table[next_state])
    q_table[state, action]= #TODO: Implement the Q-Table update
    if i%100==0:
        print(q_table)
    
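A plausible Q-table update for the TODO above is the one-step target r + γ·max_a' Q(s', a') with γ = 0.1. Because this toy environment is deterministic (taking action a always leads to state a) and the reward is known, the target can be assigned directly rather than blended in with a learning rate; this is an assumption, not the official solution.
In [ ]:

import random
import numpy as np

gamma = 0.1
rewards = np.array([10., 2., 3.])
q_table = np.zeros((3, 3))
for i in range(1001):
    state = random.randint(0, 2)
    action = random.randint(0, 2)
    next_state = action                  # taking action a moves to state a
    reward = rewards[next_state]
    next_q = max(q_table[next_state])
    # One-step lookahead target; no learning rate needed in this deterministic toy MDP
    q_table[state, action] = reward + gamma * next_q
print(q_table)
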
In [ ]:
    
%matplotlib inline
from lxmls.reinforcement_learning.score_function_estimator import train
train()
    
In [ ]:
    
import numpy as np
rewards=np.array([10., 2., 3.])
gamma = 0.1
state_value_function = np.zeros(3)
for i in range(1000):
    for s in range(3):
        state_value_function[s]=#TODO: Implement the state value function update
print(state_value_function)
    
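One reading of the TODO above, consistent with the Q-table cell (action a deterministically leads to state a, with immediate reward `rewards[a]`), is the value-iteration update V(s) ← max_a (r(a) + γV(a)). The sketch below rests on that assumption and may differ from the intended solution.
In [ ]:

import numpy as np

gamma = 0.1
rewards = np.array([10., 2., 3.])
V = np.zeros(3)
for _ in range(1000):
    for s in range(3):
        # Value iteration: best achievable one-step lookahead from state s
        V[s] = np.max(rewards + gamma * V)
print(V)
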
In [ ]:
    
from lxmls.reinforcement_learning.policy_gradient import train
train()
    
In [ ]:
    
from lxmls.reinforcement_learning.actor_critic import train
train()