Guy 2018-09-23

This is a fork on a GitHub from tjvandal (https://github.com/tjvandal/generic-pred/tree/master/julia)

Generic-Pred stand for "Generic-Prediction" and was introduced by Golestani Abbas & Robin Gras, 2014 paper "Can we predict the unpredictable?" (http://www.nature.com/articles/srep06834 and http://www.nature.com/articles/srep06834#supplementary-information)

It is about "an original approach that brings a novel perspective to the field of long-term time series forecasting".

One demonstrated application is Dow Jones Industrial Average (DJIA) forcasting. Authors seems to have used closing index data from September 1993 to September 2001:

"Extract from the paper"

We have downloaded (http://quotes.wsj.com/index/DJIA/historical-prices) DJIA data from September 1st, 1993 to September 28, 2001. And found the code tjvandal put in its github.

Our intend is to reproduce Golestani Abbas & Robin Gras paper's demonstration on DJIA.

Objectives:

  • Understand
  • Add explanations and comments
  • Compare tjvandal results with the paper
  • Analyse some alternatives (TBD)

In [1]:
#Pkg.add("Distributed") #https://docs.julialang.org/en/v1/stdlib/Distributed/index.html
#Pkg.add("Gadfly") # install GadFly if not already present
#Pkg.add("DataFrames") # install DataFrames if not already present
#Pkg.update() # get the latest versions of your packages

push!(LOAD_PATH, pwd()) #LOAD_PATH contains the directories Julia searches for modules : pwd() Get the current working directory


Out[1]:
3-element Array{Any,1}:
 "C:\\Program Files\\Julia-0.6.4\\local\\share\\julia\\site\\v0.6"
 "C:\\Program Files\\Julia-0.6.4\\share\\julia\\site\\v0.6"       
 "c:\\Users\\guypa\\Documents\\GitHub\\Generic-Pred"              

In [2]:
using DataFrames #DataFrames package provides the essential tools you need for working with statistical data
using Gadfly #Gadfly is a system for plotting and visualization
using CSV # fast and flexible pure-Julia library for handling delimited text files

# @everywhere using Lyaponuv # @everywhere : this tag will insure that all the processes have access to the function
# @everywhere is a way to include a fonction library. Here, the fonctions are within this notebook
# Lyaponuv is a distinct file with all the functions. I have inserted those function in the next cell.

f = "DJIA 09-1993 to 09-2001.csv" 
# a File name which is a .csv containing two columns Date and DJIA
# Dow Jones Industrial Average from Sept 01, 1993 to Sept 28, 2001

#data=readtable(f, header=true) #read data from a CSV-like file
data=CSV.read(f)


INFO: Precompiling module DataFrames.
WARNING: static parameter R does not occur in signature for catvaluetype at C:\Users\guypa\.julia\v0.6\CategoricalArrays\src\value.jl:52.
The method will not be callable.
INFO: Precompiling module Gadfly.
INFO: Precompiling module CSV.
Out[2]:
DateDJIA
11993-09-013645.1
21993-09-023626.1
31993-09-033633.93
41993-09-073607.1
51993-09-083588.93
61993-09-093589.49
71993-09-103621.63
81993-09-133634.21
91993-09-143615.76
101993-09-153633.65
111993-09-163630.85
121993-09-173613.25
131993-09-203575.8
141993-09-213537.24
151993-09-223547.02
161993-09-233539.75
171993-09-243543.11
181993-09-273567.7
191993-09-283566.02
201993-09-293566.3
211993-09-303555.12
221993-10-013581.11
231993-10-043577.76
241993-10-053587.26
251993-10-063598.99
261993-10-073583.63
271993-10-083584.74
281993-10-113593.41
291993-10-123593.13
301993-10-133603.19
&vellip&vellip&vellip

In [3]:
#using Distributed #https://docs.julialang.org/en/v1/stdlib/Distributed/index.html
addprocs(16) #add 4 processes on the local machine. This can be used to take advantage of multiple cores.

@everywhere function lyaponuv_k(time_series, J, m, ref)
    #println("test:1")
    X = attractor(time_series, m, J)
    #println("test:2")
    norms = compute_norms(X)
    #println("test:3")
    pairs = match_pairs(norms)
    #println("test:4")
    y = follow_points(pairs, norms, ref)
    #println("test:5")
    return(norms,y)
end

@everywhere function match_pairs(norms)
    M = size(norms)[1]
    pairs = Array{Int}(M)
    for row in 1:M
        mn, idx = findmin(norms[row, :])
        pairs[row] = idx
    end
    return(pairs)
end

@everywhere function attractor(time_series, m, J)
    N = length(time_series)
    M = N - (m - 1) * J  
    X = Array{Float64}(m, M)
    i = 1
    for i=1:M
        X[:,i] = time_series[i:J:(i+(m-1)*J)]
    end
    return(X)
end

@everywhere function follow_points(pairs, norms, ref)
    y = Array{Float64}(ref)
    M = size(norms)[1]
    for i=0:ref-1
        agg = 0 
        count = 0
        for j=1:M
            jhat = pairs[j]+i
            jtrue = j+i

            if jhat <= M && jtrue <= M
                agg = agg + log(norms[jtrue, jhat])
               # agg = agg + log(vecnorm(X[:, jtrue] - X[:, jhat]))
                count = count + 1
            end
        end
        y[i+1] = agg/count # divide by delta-t also?
    end
    return(y)
end


@everywhere function compute_norms(X)
    M = size(X)[2]
norms = Array{Float64}(M, M)
    for i=1:M
        norms[i,:] = column_norms(X, i)      
    end    
    return(norms)
end


@everywhere function column_norms(X, i)
    M = size(X)[2]
    X_diff = X .- X[:, i]
    norm_vector = [vecnorm(X_diff[:, k]) for k=1:M]
    norm_vector[i] = 10^10
    return(norm_vector)
end


@everywhere function lyaponuv_exp(series)
    nn = .!isnan.(series)
    A = ones(length(series), 2)
    A[:,1] = linspace(1, length(series), length(series))
    gradient = \(A, series)
    return(gradient[1])
end


@everywhere function lyaponuv(time_series, J, m, ref)
    ts = lyaponuv_k(time_series, J, m, ref)[2]
    exponent = lyaponuv_exp(ts[isfinite.(ts)])  ## only input those which are finite
    return(exponent)
end

#=
@everywhere function get_next(ts, m, M, norms, ref, J)
    
    attractor_array = attractor(ts, m, J)
    temp_norms = Array{Float64}(M+1, M+1)
    temp_norms[1:M, 1:M] = norms
    col = column_norms(attractor_array, M+1)
    temp_norms[M+1, :] = col
    temp_norms[:, M+1] = col

    pairs=match_pairs(temp_norms)
    lyap_k_temp = follow_points(pairs, temp_norms, ref)
    return(lyaponuv_exp(lyap_k_temp))
end

@everywhere function lyaponuv_next(time_series, J, m, ref, sample_size)
    ts_diff = time_series[2:end] - time_series[1:end-1]
    sigma = std(ts_diff)
    samples = randn(sample_size) * sigma + time_series[end]
    @time norms, lyap_k = lyaponuv_k(time_series, J, m, ref)
    true_exponent = lyaponuv_exp(lyap_k)
    exponents = Array{Float64}(sample_size)
    M = size(norms)[1]
    tasks = Array{Future}(sample_size)

    for i=1:sample_size
        s = samples[i]
        tasks[i] = @spawn get_next(vcat(time_series, s), m, M, norms, ref, J)
        #@time exponents[i] = get_next(vcat(time_series, s), m, M, norms, ref, J)        
        #@printf("process: %d\n", i) 
    end
    
    for i=1:sample_size
        exponents[i]=fetch(tasks[i])
    end

    diff = abs.(exponents .-  true_exponent)    
    val, idx = findmin(diff)
    println("Next Value:", samples[idx])
    return(samples[idx])
end
=#

In [103]:
#Variables needed by lyaponuv function
J = 2  ## reconstruction delay
m = 3  ## embedding dimension
r = 11 ## 

sliding_window = 1400
next_x_points = 521
sample_size = 10

ts = deepcopy(data[:DJIA][1:end-next_x_points])/10000 
#Create a deep copy of "data", everything is copied recursively, resulting in a fully independent object.
#The copy lenghy has next_x_points less points than the original "data"
#Futhurmore, each points is divided per 10000

#println("ts:", ts, "\n")
#println("length(ts):", length(ts), "\n")
#println("length(ts[1:end-1]):", length(ts[1:end-1]), "\n")
#println("length(ts[2:end]):", length(ts[2:end]), "\n")

diff = ts[1:end-1] - ts[2:end]
#

#println("diff:", diff, "\n")
#println("length(diff):", length(diff), "\n")
#println("diff avg:", mean(abs.(diff)), "\tdiff std:", std(diff))

println(now())
tictoc = tic()
for i=1:next_x_points
    #println("i:", i, ":")
    lyap_exp = lyaponuv(ts[end-sliding_window:end], J, m, r)
    #println("lyap_exp:", lyap_exp, "\n")
    tasks = Array{Future}(sample_size)
    #println("tasks:", tasks, "\n")
    mu = mean(ts[end-sliding_window:end])
    #println("mu:", mu, "\n")

    diff = ts[end-sliding_window+1:end] - ts[end-sliding_window:end-1]
    mu = mean(diff)
    #println("mu:", mu, "\n")
    sigma = std(diff)
    #println("sigma:", sigma, "\n")

    sample_values = randn(sample_size) .* sigma .+ ts[end]
    #println("sample_values:", sample_values, "\n")

    for j=1:sample_size
        #println("j:", j, "\n")
        tempts=deepcopy(ts[end-sliding_window:end])
        #println("tempts:", tempts, "\n")
        append!(tempts, [sample_values[j]])
        #println("sample_values:", sample_values, "\n")
        tasks[j] = @spawn lyaponuv(tempts, J, m, r)
        #println("tasks[j]:", tasks[j], "\n")
    end
    
    exponents = Array{Float64}(sample_size)
    #println("exponents:", exponents, "\n")
    
    for j=1:sample_size
        #println("j:", j, "\n")
        exponents[j] = fetch(tasks[j])
        #println("exponents[j]:", exponents[j], "\n")
    end
    #println("exponent:", mean(lyap_exp))
    
    exp_diff = abs.(exponents .- lyap_exp)
    min_index = findmin(exp_diff)
    best_val = sample_values[min_index[2]]
    append!(ts, [best_val])
    #println(i, "   best value:", best_val, "\t")
end
#toc()
println("This took %s", duration([0, 0, toc(tictoc)]))

# use writetable("output.csv", df) To write data to a CSV file -> deprecated USE CSV.writer instead


2018-10-04T19:04:36.167
syntax: invalid character literal

Stacktrace:
 [1] include_string(::String, ::String) at .\loading.jl:522

In [108]:
println(tictoc)
duration([0,0,tictoc])


125519891768510
UndefVarError: duration not defined

Stacktrace:
 [1] include_string(::String, ::String) at .\loading.jl:522

In [104]:
x=linspace(1, length(data[:DJIA]), length(data[:DJIA]))
        #linspace(start, stop, n=50) Construct a range of n linearly spaced elements from start to stop.
l1 = layer(x=x, y=data[:DJIA], Geom.line)

x0 = length(data[:DJIA])-next_x_points+1
x_ts = linspace(x0, length(ts), length(ts)-x0+1)
y_ts = ts[x0:end]*10000
l2 = layer(x=x_ts, y=y_ts, Geom.line, Theme(default_color=colorant"red"))
plot(l1, l2)


Out[104]:
x -3000 -2500 -2000 -1500 -1000 -500 0 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 5500 -2500 -2400 -2300 -2200 -2100 -2000 -1900 -1800 -1700 -1600 -1500 -1400 -1300 -1200 -1100 -1000 -900 -800 -700 -600 -500 -400 -300 -200 -100 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 -2500 0 2500 5000 -2600 -2400 -2200 -2000 -1800 -1600 -1400 -1200 -1000 -800 -600 -400 -200 0 200 400 600 800 1000 1200 1400 1600 1800 2000 2200 2400 2600 2800 3000 3200 3400 3600 3800 4000 4200 4400 4600 4800 5000 h,j,k,l,arrows,drag to pan i,o,+,-,scroll,shift-drag to zoom r,dbl-click to reset c for coordinates ? for help ? -2.0×10⁴ -1.5×10⁴ -1.0×10⁴ -5.0×10³ 0 5.0×10³ 1.0×10⁴ 1.5×10⁴ 2.0×10⁴ 2.5×10⁴ 3.0×10⁴ 3.5×10⁴ -1.50×10⁴ -1.45×10⁴ -1.40×10⁴ -1.35×10⁴ -1.30×10⁴ -1.25×10⁴ -1.20×10⁴ -1.15×10⁴ -1.10×10⁴ -1.05×10⁴ -1.00×10⁴ -9.50×10³ -9.00×10³ -8.50×10³ -8.00×10³ -7.50×10³ -7.00×10³ -6.50×10³ -6.00×10³ -5.50×10³ -5.00×10³ -4.50×10³ -4.00×10³ -3.50×10³ -3.00×10³ -2.50×10³ -2.00×10³ -1.50×10³ -1.00×10³ -5.00×10² 0 5.00×10² 1.00×10³ 1.50×10³ 2.00×10³ 2.50×10³ 3.00×10³ 3.50×10³ 4.00×10³ 4.50×10³ 5.00×10³ 5.50×10³ 6.00×10³ 6.50×10³ 7.00×10³ 7.50×10³ 8.00×10³ 8.50×10³ 9.00×10³ 9.50×10³ 1.00×10⁴ 1.05×10⁴ 1.10×10⁴ 1.15×10⁴ 1.20×10⁴ 1.25×10⁴ 1.30×10⁴ 1.35×10⁴ 1.40×10⁴ 1.45×10⁴ 1.50×10⁴ 1.55×10⁴ 1.60×10⁴ 1.65×10⁴ 1.70×10⁴ 1.75×10⁴ 1.80×10⁴ 1.85×10⁴ 1.90×10⁴ 1.95×10⁴ 2.00×10⁴ 2.05×10⁴ 2.10×10⁴ 2.15×10⁴ 2.20×10⁴ 2.25×10⁴ 2.30×10⁴ 2.35×10⁴ 2.40×10⁴ 2.45×10⁴ 2.50×10⁴ 2.55×10⁴ 2.60×10⁴ 2.65×10⁴ 2.70×10⁴ 2.75×10⁴ 2.80×10⁴ 2.85×10⁴ 2.90×10⁴ 2.95×10⁴ 3.00×10⁴ -2×10⁴ 0 2×10⁴ 4×10⁴ -1.5×10⁴ -1.4×10⁴ -1.3×10⁴ -1.2×10⁴ -1.1×10⁴ -1.0×10⁴ -9.0×10³ -8.0×10³ -7.0×10³ -6.0×10³ -5.0×10³ -4.0×10³ -3.0×10³ -2.0×10³ -1.0×10³ 0 1.0×10³ 2.0×10³ 3.0×10³ 4.0×10³ 5.0×10³ 6.0×10³ 7.0×10³ 8.0×10³ 9.0×10³ 1.0×10⁴ 1.1×10⁴ 1.2×10⁴ 1.3×10⁴ 1.4×10⁴ 1.5×10⁴ 1.6×10⁴ 1.7×10⁴ 1.8×10⁴ 1.9×10⁴ 2.0×10⁴ 2.1×10⁴ 2.2×10⁴ 2.3×10⁴ 2.4×10⁴ 2.5×10⁴ 2.6×10⁴ 2.7×10⁴ 2.8×10⁴ 2.9×10⁴ 3.0×10⁴ y

In [52]:
println(data)
println(x0)


2036×3 DataFrames.DataFrame
│ Row  │ Date       │ DJIA    │ DJIA_p   │
├──────┼────────────┼─────────┼──────────┤
│ 1    │ 1993-09-01 │ 3645.1  │ 0.36451  │
│ 2    │ 1993-09-02 │ 3626.1  │ 0.36261  │
│ 3    │ 1993-09-03 │ 3633.93 │ 0.363393 │
│ 4    │ 1993-09-07 │ 3607.1  │ 0.36071  │
│ 5    │ 1993-09-08 │ 3588.93 │ 0.358893 │
│ 6    │ 1993-09-09 │ 3589.49 │ 0.358949 │
│ 7    │ 1993-09-10 │ 3621.63 │ 0.362163 │
│ 8    │ 1993-09-13 │ 3634.21 │ 0.363421 │
⋮
│ 2028 │ 2001-09-18 │ 8903.4  │ 1.0273   │
│ 2029 │ 2001-09-19 │ 8759.13 │ 1.03633  │
│ 2030 │ 2001-09-20 │ 8376.21 │ 1.03585  │
│ 2031 │ 2001-09-21 │ 8235.81 │ 1.04673  │
│ 2032 │ 2001-09-24 │ 8603.86 │ 1.05039  │
│ 2033 │ 2001-09-25 │ 8659.97 │ 1.04651  │
│ 2034 │ 2001-09-26 │ 8567.39 │ 1.05792  │
│ 2035 │ 2001-09-27 │ 8681.42 │ 1.07111  │
│ 2036 │ 2001-09-28 │ 8847.56 │ 1.07388  │
1516

In [101]:
lay1 = layer(data, x=:Date, y=:DJIA, Geom.line, Theme(default_color="cyan"))
#lay2 = layer(ts, )
plot(lay1)


Out[101]:
Date Jan 1, 1980 1982 1985 1988 1990 1992 1995 1998 2000 2002 2005 2008 2010 2012 2015 Jan 1, 1980 1990 2000 2010 2020 Jan 1, 1980 1990 2000 2010 2020 Jan 1, 1980 1990 2000 2010 2020 h,j,k,l,arrows,drag to pan i,o,+,-,scroll,shift-drag to zoom r,dbl-click to reset c for coordinates ? for help ? -2.0×10⁴ -1.5×10⁴ -1.0×10⁴ -5.0×10³ 0 5.0×10³ 1.0×10⁴ 1.5×10⁴ 2.0×10⁴ 2.5×10⁴ 3.0×10⁴ 3.5×10⁴ -1.50×10⁴ -1.45×10⁴ -1.40×10⁴ -1.35×10⁴ -1.30×10⁴ -1.25×10⁴ -1.20×10⁴ -1.15×10⁴ -1.10×10⁴ -1.05×10⁴ -1.00×10⁴ -9.50×10³ -9.00×10³ -8.50×10³ -8.00×10³ -7.50×10³ -7.00×10³ -6.50×10³ -6.00×10³ -5.50×10³ -5.00×10³ -4.50×10³ -4.00×10³ -3.50×10³ -3.00×10³ -2.50×10³ -2.00×10³ -1.50×10³ -1.00×10³ -5.00×10² 0 5.00×10² 1.00×10³ 1.50×10³ 2.00×10³ 2.50×10³ 3.00×10³ 3.50×10³ 4.00×10³ 4.50×10³ 5.00×10³ 5.50×10³ 6.00×10³ 6.50×10³ 7.00×10³ 7.50×10³ 8.00×10³ 8.50×10³ 9.00×10³ 9.50×10³ 1.00×10⁴ 1.05×10⁴ 1.10×10⁴ 1.15×10⁴ 1.20×10⁴ 1.25×10⁴ 1.30×10⁴ 1.35×10⁴ 1.40×10⁴ 1.45×10⁴ 1.50×10⁴ 1.55×10⁴ 1.60×10⁴ 1.65×10⁴ 1.70×10⁴ 1.75×10⁴ 1.80×10⁴ 1.85×10⁴ 1.90×10⁴ 1.95×10⁴ 2.00×10⁴ 2.05×10⁴ 2.10×10⁴ 2.15×10⁴ 2.20×10⁴ 2.25×10⁴ 2.30×10⁴ 2.35×10⁴ 2.40×10⁴ 2.45×10⁴ 2.50×10⁴ 2.55×10⁴ 2.60×10⁴ 2.65×10⁴ 2.70×10⁴ 2.75×10⁴ 2.80×10⁴ 2.85×10⁴ 2.90×10⁴ 2.95×10⁴ 3.00×10⁴ -2×10⁴ 0 2×10⁴ 4×10⁴ -1.5×10⁴ -1.4×10⁴ -1.3×10⁴ -1.2×10⁴ -1.1×10⁴ -1.0×10⁴ -9.0×10³ -8.0×10³ -7.0×10³ -6.0×10³ -5.0×10³ -4.0×10³ -3.0×10³ -2.0×10³ -1.0×10³ 0 1.0×10³ 2.0×10³ 3.0×10³ 4.0×10³ 5.0×10³ 6.0×10³ 7.0×10³ 8.0×10³ 9.0×10³ 1.0×10⁴ 1.1×10⁴ 1.2×10⁴ 1.3×10⁴ 1.4×10⁴ 1.5×10⁴ 1.6×10⁴ 1.7×10⁴ 1.8×10⁴ 1.9×10⁴ 2.0×10⁴ 2.1×10⁴ 2.2×10⁴ 2.3×10⁴ 2.4×10⁴ 2.5×10⁴ 2.6×10⁴ 2.7×10⁴ 2.8×10⁴ 2.9×10⁴ 3.0×10⁴ DJIA

In [72]:
println(length(ts))
println(length(data[:DJIA]))
println(size(data,1))
println(length(ts)-next_x_points+1)
size(data)


2036
2036
2036
1516
Out[72]:
(2036, 3)

In [31]:
typeof(data[:DJIA])


Out[31]:
Array{Union{Float64, Missings.Missing},1}

In [76]:
ds = deepcopy(data)
#ds[:DJIA_p] = 1:size(data,1) #ajoute une colonne avec chiffre de 1 à n
ds[:DJIA_p] = ts*10000
#delete!(ds, :DJIA_P)
println(data)


2036×3 DataFrames.DataFrame
│ Row  │ Date       │ DJIA    │ DJIA_p  │
├──────┼────────────┼─────────┼─────────┤
│ 1    │ 1993-09-01 │ 3645.1  │ 3645.1  │
│ 2    │ 1993-09-02 │ 3626.1  │ 3626.1  │
│ 3    │ 1993-09-03 │ 3633.93 │ 3633.93 │
│ 4    │ 1993-09-07 │ 3607.1  │ 3607.1  │
│ 5    │ 1993-09-08 │ 3588.93 │ 3588.93 │
│ 6    │ 1993-09-09 │ 3589.49 │ 3589.49 │
│ 7    │ 1993-09-10 │ 3621.63 │ 3621.63 │
│ 8    │ 1993-09-13 │ 3634.21 │ 3634.21 │
⋮
│ 2028 │ 2001-09-18 │ 8903.4  │ 10273.0 │
│ 2029 │ 2001-09-19 │ 8759.13 │ 10363.3 │
│ 2030 │ 2001-09-20 │ 8376.21 │ 10358.5 │
│ 2031 │ 2001-09-21 │ 8235.81 │ 10467.3 │
│ 2032 │ 2001-09-24 │ 8603.86 │ 10503.9 │
│ 2033 │ 2001-09-25 │ 8659.97 │ 10465.1 │
│ 2034 │ 2001-09-26 │ 8567.39 │ 10579.2 │
│ 2035 │ 2001-09-27 │ 8681.42 │ 10711.1 │
│ 2036 │ 2001-09-28 │ 8847.56 │ 10738.8 │

In [ ]:
lay1 = layer(data, x=:Date, y=:DJIA, Geom.line, Theme(default_color="cyan"))
#lay2 = layer(ts, )
plot(lay1)