In [1]:
#using PyPlot
#addprocs(4)
@everywhere using DataFrames

In [2]:
using ProgressMeter

In [3]:
@everywhere type Experiment
    current_state::Int64
    policy::Vector{Float64}
    world_state_policies::Matrix{Float64}
    nr_world_states::Int64
    nr_actions::Int64
    target_sequence::Vector{Float64}
    sum_of_rewards::Vector{Float64}
    nr_of_evaluations::Vector{Float64}
    value_function::Vector{Float64} # on the world state
    state_action_value_function::Matrix{Float64} # on the world state
    k::Int64
    value_history::Matrix{Float64}
end

In [4]:
@everywhere function create_experiment(nr_world_states::Int64, nr_actions::Int64, target_actions::Int64, k::Int64)
    policy = ones(nr_actions)
    policy = policy / sum(policy)
    
    min = minimum([target_actions, nr_world_states])
    max = maximum([0, nr_world_states - nr_actions])
    
    sequence = vcat([1:min], int(ceil(rand(max) * nr_actions)))
    sequence = sequence[randperm(length(sequence))]
    
    return Experiment(1, policy, zeros(2,2), nr_world_states, nr_actions, sequence, ones(nr_world_states), zeros(nr_world_states), zeros(nr_world_states), zeros(nr_world_states, nr_actions), k, zeros(2,2))
end

@everywhere function run_mc_episode!(experiment::Experiment, initial_state::Int64, T::Int64)
    experiment.current_state = initial_state
    for t = 1:T
        p = rand()
        action = experiment.nr_actions
        for i = 1:experiment.nr_actions
            if p < sum(experiment.policy[1:i])
                action = i
                break
            end
        end
        if action == experiment.target_sequence[experiment.current_state]
            experiment.current_state = experiment.current_state + 1
        else
            experiment.current_state = 1
        end
        if experiment.current_state >= experiment.nr_world_states
            return 1
        end
    end
    return 0
end

@everywhere function monte_carlo_estimation_of_world_state_function!(experiment::Experiment, T::Int64, nr_of_episodes::Int64)
    for w = 1:experiment.nr_world_states
        experiment.sum_of_rewards[w]    = 0.0
        experiment.nr_of_evaluations[w] = 0.0
    end

    for e = 1:nr_of_episodes
        initial_state                               = int64(ceil(rand()*experiment.nr_world_states))
        reward                                      = run_mc_episode!(experiment, initial_state, T)
        experiment.sum_of_rewards[initial_state]    = experiment.sum_of_rewards[initial_state] + reward
        experiment.nr_of_evaluations[initial_state] = experiment.nr_of_evaluations[initial_state] + 1.0
        experiment.value_function[initial_state]    = experiment.sum_of_rewards[initial_state] /
                                                      experiment.nr_of_evaluations[initial_state]
    end
end

@everywhere function run_td_episode!(experiment::Experiment, T::Int64)
    experiment.current_state = ceil(rand() * experiment.nr_world_states)
    for t = 1:T
        action = ceil(rand() * experiment.nr_actions)

        old_state = experiment.current_state
        new_state = experiment.current_state
        
        if action == experiment.target_sequence[experiment.current_state]
            new_state = new_state + 1
        else
            new_state = 1
        end
        
        old_value = experiment.value_function[old_state]
        new_value = (new_state > experiment.nr_world_states)?experiment.value_function[1]:experiment.value_function[new_state]
        reward    = (new_state > experiment.nr_world_states)?1:0
        α = 0.5
        γ = 0.9
        
        if new_state > experiment.nr_world_states
            experiment.current_state = ceil(rand() * experiment.nr_world_states)
        end
        
        experiment.value_function[old_state] = old_value + α * (reward + γ * new_value - old_value)
    end
end

@everywhere function td_estimation_of_world_state_function!(experiment::Experiment, T::Int64, nr_of_episodes::Int64)
    for e = 1:nr_of_episodes
        experiment.current_state = ceil(rand() * experiment.nr_world_states)
        run_td_episode!(experiment, T)
    end
end

@everywhere function calculate_world_state_action_function!(experiment::Experiment)
    for i = 1:experiment.nr_world_states
        for action = 1:experiment.nr_actions
            experiment.state_action_value_function[i,action] = 0.0
        end
    end

    for i = 1:experiment.nr_world_states
        for action = 1:experiment.nr_actions
            if action == experiment.target_sequence[i]
                if i == experiment.nr_world_states
                    experiment.state_action_value_function[i,action] = 1.0
                else
                    experiment.state_action_value_function[i,action] = experiment.value_function[i+1]
                end
            else
                experiment.state_action_value_function[i,action] = 0.0
            end
        end
    end
end

@everywhere function prune_world_state_action_function!(experiment::Experiment)
    nr_of_values = minimum([experiment.k, experiment.nr_actions])
    for i = 1:experiment.nr_world_states
        if sum(experiment.state_action_value_function[i,:]) > 0.0
            values = DataFrame(VALUES=[v for v in experiment.state_action_value_function[i,:]], INDICES=[1:experiment.nr_actions])
            sort!(values, cols = (:VALUES), rev=true)

            values = values[1:nr_of_values,:]
            for a = 1:experiment.nr_actions
                experiment.state_action_value_function[i,a] = 0.0
            end

            for a = 1:size(values)[1]
                experiment.state_action_value_function[i,values[:INDICES][a]] = values[:VALUES][a]
            end
        else
            indices = randperm(experiment.nr_actions)[1:nr_of_values]
            for a in indices
                experiment.state_action_value_function[i, a] = 0.001
            end
        end
    end
end

@everywhere function update_policy_from_world_state_action_function!(experiment::Experiment)
    sum_of_values = zeros(experiment.nr_actions)
    for a = 1:experiment.nr_actions
        sum_of_values[a] = sum(experiment.state_action_value_function[:,a])
    end
    
    s = sum(sum_of_values)
    for a = 1:experiment.nr_actions
        experiment.policy[a] = sum_of_values[a] / s
    end
end

@everywhere function update_policy!(experiment::Experiment)
    calculate_world_state_action_function!(experiment)
    prune_world_state_action_function!(experiment)
    update_policy_from_world_state_action_function!(experiment)
end

@everywhere function scan_over_k(k::Int64, N::Int64, episode_length::Int64, nr_of_episodes::Int64)
    nr_world_states   = k
    nr_of_actions     = k
    experiment        = create_experiment(nr_world_states, nr_of_actions, nr_of_actions, k)
    experiment.value_history = zeros(N, nr_world_states)
    pm = Progress(N, 1)
    for i = 1:N
        #        monte_carlo_estimation_of_world_state_function!(exp, episode_length, nr_of_episodes)
        td_estimation_of_world_state_function!(experiment, episode_length, nr_of_episodes)        
        update_policy!(exp)
        experiment.value_history[i,:] = experiment.value_function
        next!(pm)
    end
    return exp
end

function evaluate(experiment::Experiment, N::Int64)
    r = 0.0
    for n = 1:N
        r = r + run_mc_episode!(experiment, 1, experiment.k + 1)
    end
    r
end


Out[4]:
evaluate (generic function with 1 method)

In [5]:
N                 = 100000
k                 = 5
nr_of_episodes    = 1000
nr_world_states   = k
nr_of_actions     = k
episode_length    = 100
experiment        = create_experiment(nr_world_states, nr_of_actions, nr_of_actions, k)
experiment.value_history = zeros(N, nr_world_states)

td_estimation_of_world_state_function!(experiment, episode_length, nr_of_episodes)
update_policy!(experiment)
println(experiment.value_function)
println(experiment.policy)
println(experiment.state_action_value_function)

control = Experiment(1, ones(experiment.nr_actions) * 1.0 / float64(experiment.nr_actions), experiment.nr_world_states, experiment.nr_actions, experiment.target_sequence, ones(1), zeros(1), zeros(1), zeros(2, 2), k, zeros(2,2))
control = evaluate(control, 10000);


[0.00013925831220640592,0.0002846353881672688,0.006702701851759142,0.29645408339326584,0.6643396772658501]
[0.3376085266674994,0.15065399485226458,0.00014464789222295505,0.5081866072743958,0.0034062233136172887]
[0.0 0.0 0.0002846353881672688 0.0 0.0
 0.0 0.0 0.0 0.0 0.006702701851759142
 0.0 0.29645408339326584 0.0 0.0 0.0
 0.6643396772658501 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 1.0 0.0]
`Experiment` has no method matching Experiment(::Int64, ::Array{Float64,1}, ::Int64, ::Int64, ::Array{Float64,1}, ::Array{Float64,1}, ::Array{Float64,1}, ::Array{Float64,1}, ::Array{Float64,2}, ::Int64, ::Array{Float64,2})
while loading In[5], in expression starting on line 16

In [6]:
for i = 1:10000
    td_estimation_of_world_state_function!(experiment, k, nr_of_episodes)
    update_policy!(experiment)
end
println("policy           $(round(experiment.policy,2)) $(sum(experiment.policy))")
println("target sequence: $(round(experiment.target_sequence,2))")
println("value function:  $(round(experiment.value_function,2))")
println("state action value function $(round(experiment.state_action_value_function,2))")

learned = evaluate(experiment, 10000)
println("learned $learned vs. control $control")


policy           [0.01,0.01,0.01,0.96,0.01] 1.0000000000000002
target sequence: [3.0,5.0,2.0,1.0,4.0]
value function:  [0.01,0.01,0.01,0.01,0.01]
state action value function [0.0 0.0 0.01 0.0 0.0
 0.0 0.0 0.0 0.0 0.01
 0.0 0.01 0.0 0.0 0.0
 0.01 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 1.0 0.0]
control not defined
while loading In[6], in expression starting on line 11

In [ ]:


In [ ]: