In [1]:
#using PyPlot
using Distributed                # provides addprocs and @everywhere
#addprocs(4)
@everywhere using Random         # randperm is used below
@everywhere using DataFrames
In [2]:
using ProgressMeter
In [3]:
@everywhere mutable struct Experiment
    current_state::Int64
    policy::Vector{Float64}                       # probability of choosing each action
    world_state_policies::Matrix{Float64}         # placeholder, not used in this notebook
    nr_world_states::Int64
    nr_actions::Int64
    target_sequence::Vector{Float64}              # the rewarded action for each world state
    sum_of_rewards::Vector{Float64}               # Monte Carlo return accumulators, per start state
    nr_of_evaluations::Vector{Float64}            # Monte Carlo visit counts, per start state
    value_function::Vector{Float64}               # on the world state
    state_action_value_function::Matrix{Float64}  # on the world state
    k::Int64                                      # number of state-action values kept per state
    value_history::Matrix{Float64}                # one row of value estimates per policy update
end
In [4]:
@everywhere function create_experiment(nr_world_states::Int64, nr_actions::Int64, target_actions::Int64, k::Int64)
    # start from a uniform policy over actions
    policy = ones(nr_actions)
    policy = policy / sum(policy)
    # build a target sequence with one rewarded action per world state: the first entries
    # enumerate distinct actions, the rest are drawn at random, then everything is shuffled
    nr_distinct = min(target_actions, nr_world_states)
    nr_random = max(0, nr_world_states - nr_actions)
    sequence = vcat(collect(1:nr_distinct), ceil.(Int, rand(nr_random) * nr_actions))
    sequence = sequence[randperm(length(sequence))]
    return Experiment(1, policy, zeros(2, 2), nr_world_states, nr_actions, sequence, ones(nr_world_states),
                      zeros(nr_world_states), zeros(nr_world_states), zeros(nr_world_states, nr_actions), k, zeros(2, 2))
end
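# Illustrative only: for nr_world_states = nr_actions = target_actions = 3, create_experiment might
# produce target_sequence = [2.0, 3.0, 1.0], i.e. the agent is rewarded for emitting action 2 in
# world state 1, action 3 in state 2 and action 1 in state 3.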
@everywhere function run_mc_episode!(experiment::Experiment, initial_state::Int64, T::Int64)
    experiment.current_state = initial_state
    for t = 1:T
        # sample an action from the current policy by inverting its cumulative distribution
        p = rand()
        action = experiment.nr_actions
        for i = 1:experiment.nr_actions
            if p < sum(experiment.policy[1:i])
                action = i
                break
            end
        end
        # the correct action advances the world state, any other action resets it to 1
        if action == experiment.target_sequence[experiment.current_state]
            experiment.current_state = experiment.current_state + 1
        else
            experiment.current_state = 1
        end
        # the episode succeeds once the last world state is reached
        if experiment.current_state >= experiment.nr_world_states
            return 1
        end
    end
    return 0
end
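# The inner loop above is an inverse-CDF draw from the categorical distribution defined by
# experiment.policy. A minimal standalone sketch of the same idea (hypothetical helper, not used below):
function sample_action(policy::Vector{Float64})
    p = rand()
    c = 0.0
    for i in eachindex(policy)
        c += policy[i]
        if p < c
            return i            # first index whose cumulative probability exceeds p
        end
    end
    return length(policy)       # guard against rounding when the probabilities sum to slightly below 1
end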
@everywhere function monte_carlo_estimation_of_world_state_function!(experiment::Experiment, T::Int64, nr_of_episodes::Int64)
    # reset the accumulators
    for w = 1:experiment.nr_world_states
        experiment.sum_of_rewards[w] = 0.0
        experiment.nr_of_evaluations[w] = 0.0
    end
    # run episodes from uniformly drawn start states and average the returns per start state
    for e = 1:nr_of_episodes
        initial_state = rand(1:experiment.nr_world_states)
        reward = run_mc_episode!(experiment, initial_state, T)
        experiment.sum_of_rewards[initial_state] += reward
        experiment.nr_of_evaluations[initial_state] += 1.0
        experiment.value_function[initial_state] =
            experiment.sum_of_rewards[initial_state] / experiment.nr_of_evaluations[initial_state]
    end
end
@everywhere function run_td_episode!(experiment::Experiment, T::Int64)
    experiment.current_state = rand(1:experiment.nr_world_states)
    α = 0.5   # learning rate
    γ = 0.9   # discount factor
    for t = 1:T
        # explore with a uniformly random action
        action = rand(1:experiment.nr_actions)
        old_state = experiment.current_state
        new_state = experiment.current_state
        if action == experiment.target_sequence[experiment.current_state]
            new_state = new_state + 1
        else
            new_state = 1
        end
        old_value = experiment.value_function[old_state]
        new_value = (new_state > experiment.nr_world_states) ? experiment.value_function[1] : experiment.value_function[new_state]
        reward = (new_state > experiment.nr_world_states) ? 1 : 0
        if new_state > experiment.nr_world_states
            # terminal transition: restart from a random world state
            experiment.current_state = rand(1:experiment.nr_world_states)
        else
            # non-terminal transition: move on to the successor state
            experiment.current_state = new_state
        end
        experiment.value_function[old_state] = old_value + α * (reward + γ * new_value - old_value)
    end
end
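# The last line of the loop above is the standard TD(0) update V(s) ← V(s) + α ⋅ (r + γ ⋅ V(s') − V(s))
# with α = 0.5 and γ = 0.9 hard-coded; at the terminal transition the code bootstraps from V(1)
# rather than from a value of zero.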
@everywhere function td_estimation_of_world_state_function!(experiment::Experiment, T::Int64, nr_of_episodes::Int64)
    for e = 1:nr_of_episodes
        experiment.current_state = rand(1:experiment.nr_world_states)
        run_td_episode!(experiment, T)
    end
end
@everywhere function calculate_world_state_action_function!(experiment::Experiment)
    # one-step lookahead: Q(s, a) is nonzero only for the target action of state s,
    # where it equals the value of the successor state (or 1 at the last state)
    fill!(experiment.state_action_value_function, 0.0)
    for i = 1:experiment.nr_world_states
        for action = 1:experiment.nr_actions
            if action == experiment.target_sequence[i]
                if i == experiment.nr_world_states
                    experiment.state_action_value_function[i, action] = 1.0
                else
                    experiment.state_action_value_function[i, action] = experiment.value_function[i+1]
                end
            else
                experiment.state_action_value_function[i, action] = 0.0
            end
        end
    end
end
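# Illustrative example (numbers assumed, not computed here): with three world states,
# target_sequence = [2, 3, 1] and value_function = [0.2, 0.5, 1.0], the function above yields
# Q[1, 2] = 0.5, Q[2, 3] = 1.0, Q[3, 1] = 1.0 and zero everywhere else.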
@everywhere function prune_world_state_action_function!(experiment::Experiment)
    # keep only the k largest state-action values of every world state
    nr_of_values = min(experiment.k, experiment.nr_actions)
    for i = 1:experiment.nr_world_states
        if sum(experiment.state_action_value_function[i, :]) > 0.0
            vals = DataFrame(VALUES = experiment.state_action_value_function[i, :],
                             INDICES = collect(1:experiment.nr_actions))
            sort!(vals, :VALUES, rev = true)
            vals = vals[1:nr_of_values, :]
            experiment.state_action_value_function[i, :] .= 0.0
            for a = 1:size(vals, 1)
                experiment.state_action_value_function[i, vals[a, :INDICES]] = vals[a, :VALUES]
            end
        else
            # no value information for this state yet: give k random actions a small seed value
            indices = randperm(experiment.nr_actions)[1:nr_of_values]
            for a in indices
                experiment.state_action_value_function[i, a] = 0.001
            end
        end
    end
end
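# A DataFrame-free sketch of the same top-k pruning, using Base's sortperm
# (hypothetical helper for illustration, not used below):
function prune_row_topk(q_row::Vector{Float64}, k::Int64)
    pruned = zeros(length(q_row))
    for idx in sortperm(q_row, rev = true)[1:min(k, length(q_row))]
        pruned[idx] = q_row[idx]   # keep only the k largest entries, zero the rest
    end
    return pruned
end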
@everywhere function update_policy_from_world_state_action_function!(experiment::Experiment)
    # the new policy is the column sum of the (pruned) Q values, normalised over actions
    sum_of_values = zeros(experiment.nr_actions)
    for a = 1:experiment.nr_actions
        sum_of_values[a] = sum(experiment.state_action_value_function[:, a])
    end
    s = sum(sum_of_values)
    for a = 1:experiment.nr_actions
        experiment.policy[a] = sum_of_values[a] / s
    end
end
@everywhere function update_policy!(experiment::Experiment)
    # policy improvement step: recompute Q from the value function, prune to the top k, renormalise
    calculate_world_state_action_function!(experiment)
    prune_world_state_action_function!(experiment)
    update_policy_from_world_state_action_function!(experiment)
end
@everywhere function scan_over_k(k::Int64, N::Int64, episode_length::Int64, nr_of_episodes::Int64)
    nr_world_states = k
    nr_of_actions = k
    experiment = create_experiment(nr_world_states, nr_of_actions, nr_of_actions, k)
    experiment.value_history = zeros(N, nr_world_states)
    pm = Progress(N, 1)
    for i = 1:N
        # alternate value estimation and policy update; the Monte Carlo variant is kept for reference
        # monte_carlo_estimation_of_world_state_function!(experiment, episode_length, nr_of_episodes)
        td_estimation_of_world_state_function!(experiment, episode_length, nr_of_episodes)
        update_policy!(experiment)
        experiment.value_history[i, :] = experiment.value_function
        next!(pm)
    end
    return experiment
end
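# Hypothetical usage (the parameter values are only illustrative and are not taken from the cells below):
#     result = scan_over_k(5, 100, 100, 1000)
#     result.value_history   # one row of value-function estimates per policy update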
function evaluate(experiment::Experiment, N::Int64)
    # count how many of N episodes, started in world state 1 and run for at most k + 1 steps
    # under the current policy, reach the end of the target sequence
    r = 0.0
    for n = 1:N
        r = r + run_mc_episode!(experiment, 1, experiment.k + 1)
    end
    r
end
Out[4]:
In [5]:
N = 100000
k = 5
nr_of_episodes = 1000
nr_world_states = k
nr_of_actions = k
episode_length = 100
experiment = create_experiment(nr_world_states, nr_of_actions, nr_of_actions, k)
experiment.value_history = zeros(N, nr_world_states)
td_estimation_of_world_state_function!(experiment, episode_length, nr_of_episodes)
update_policy!(experiment)
println(experiment.value_function)
println(experiment.policy)
println(experiment.state_action_value_function)
# control: the same target sequence evaluated under a uniform random policy
control = Experiment(1, ones(experiment.nr_actions) / experiment.nr_actions, zeros(2, 2), experiment.nr_world_states, experiment.nr_actions, experiment.target_sequence, ones(1), zeros(1), zeros(1), zeros(2, 2), k, zeros(2, 2))
control_reward = evaluate(control, 10000);
In [6]:
for i = 1:10000
    td_estimation_of_world_state_function!(experiment, k, nr_of_episodes)
    update_policy!(experiment)
end
println("policy $(round.(experiment.policy, digits=2)) $(sum(experiment.policy))")
println("target sequence: $(round.(experiment.target_sequence, digits=2))")
println("value function: $(round.(experiment.value_function, digits=2))")
println("state action value function $(round.(experiment.state_action_value_function, digits=2))")
learned = evaluate(experiment, 10000)
println("learned $learned vs. control $control_reward")