In [1]:
using PyPlot

In [2]:
abstract Kernel

# squared exponential kernel
type SEKernel <: Kernel
    params::AbstractArray # log(σ_f^2), log(l^2), log(σ_y^2)
    SEKernel(l::Float64,σ::Float64,σy::Float64) = new([log(σ^2); log(l^2); log(σy^2)])
    SEKernel(params::AbstractArray) = new(params)
end

# kernel defintion for SE kernel
function kern(se::SEKernel, x::AbstractArray,x_::AbstractArray)
    diff = x - x_
    k = exp(se.params[1]) * exp(-diff'*diff/(2*exp(se.params[2])))[1]
    return k
end

# dk/dx (not x_)
function d_kern(se::SEKernel, x::AbstractArray,x_::AbstractArray)
    δ = x - x_
    result =  -exp(se.params[1]) * δ * exp(-δ'*δ/(2exp(se.params[2]))) / exp(se.params[2])
    return result[1]
end

# d^2k/dx^2 (not x_)
function d2_kern(se::SEKernel, x::AbstractArray,x_::AbstractArray)
    δ = x - x_
    outer_product = δ*δ'
    result =  -(eye(outer_product) - outer_product/exp(se.params[2]))* exp(se.params[1]) * exp(-δ'*δ/(2exp(se.params[2])))/exp(se.params[2])
    return result[1]
end

# d^3k/dx^3 (not x_)
# note that we assume that x and x_ are
# 1x1 arrays here so we don't have to return a tensor!
function d3_kern(se::SEKernel, x::AbstractArray,x_::AbstractArray)
    δ = x - x_
    return (exp(se.params[1]) * (3δ - δ^3/exp(se.params[2])) * exp(-δ'*δ/(2exp(se.params[2])))[1] / exp(se.params[2])^2)[1]
end

# create convariance matrix K
# X,X_ have rows of data points
function K(kernel::Kernel, X::AbstractArray,X_::AbstractArray)
    K_ = zeros(size(X)[1], size(X_)[1])
    for i=1:size(K_)[1]
        for j=1:size(K_)[2]
            @inbounds K_[i,j] = kern(kernel, X[i,:], X_[j,:])
        end
    end
    if size(K_)[1] == size(K_)[2] && typeof(kernel) == SEKernel
        K_ += exp(kernel.params[3])*eye(K_)
    end
    return K_
end
function d_K(kernel::Kernel, X::AbstractArray,X_::AbstractArray)
    K_ = zeros(size(X)[1], size(X_)[1])
    for i=1:size(K_)[1]
        for j=1:size(K_)[2]
            @inbounds K_[i,j] = -d_kern(kernel, X[i,:], X_[j,:])
        end
    end
    return K_
end
function d2_K(kernel::Kernel, X::AbstractArray,X_::AbstractArray)
    K_ = zeros(size(X)[1], size(X_)[1])
    for i=1:size(K_)[1]
        for j=1:size(K_)[2]
            @inbounds K_[i,j] = d2_kern(kernel, X[i,:], X_[j,:])
        end
    end
    return K_
end
function d3_K(kernel::Kernel, X::AbstractArray,X_::AbstractArray)
    K_ = zeros(size(X)[1], size(X_)[1])
    for i=1:size(K_)[1]
        for j=1:size(K_)[2]
            @inbounds K_[i,j] = -d3_kern(kernel, X[i,:], X_[j,:])
        end
    end
    return K_
end

function K(se::SEKernel, x::Float64,x_::Float64)
    return kern(se, [x], [x_])
end


Out[2]:
K (generic function with 2 methods)

Algorithm 15.1: GP regression: (Murphy pp. 526)

  1. $L = \mathrm{cholesky}(K + \sigma_y^2 I)$
  2. $\alpha = L^T \backslash (L \backslash y)$
  3. $\mathbb{E}[f_*] = k_8^T\alpha$
  4. $v = L \backslash k_*$
  5. $\mathrm{var}[f_*] = \kappa(x_*,x_*) - v^Tv$
  6. $\log p(y|X) = -\frac{1}{2}y^T\alpha - \sum_i \log L_{ii} - \frac{N}{2}\log(2\pi)$

In [3]:
σ = 2.
l = 0.75
σ_y = 1.5
se = SEKernel(l,σ,σ_y)

# define some fake data

t = collect(0:0.5:10)
y_true = t.*sin(0.25*2π*t) + t
y = y_true + randn!(zeros(t))*2


println("==> Construct Kernels")
t_ = collect(-1:0.01:15)
K_ = K(se, t, t)

k_ = K(se, t, t_)
k__ = K(se, t_,t_)


println("==> Run GP Regression")
# GP Regression algorithm
# -- Machine Learning: A Probabalistic Perspective
#    Kevin Murphy, pp. 526
L = chol(K_)
α = L'\(L\y)
μ = k_'*α
v = L'\k_
σ = sqrt(diag(k__) - diag(v'*v))
logP = -y'*α/2 - sum(log(diag(L))) - size(y)[1]*log(2π)/2
println("--  done.")
println("==> Marginal Likelihood: log P(y|X) = $(logP[1])")


==> Construct Kernels
==> Run GP Regression
--  done.
==> Marginal Likelihood: log P(y|X) = -95.27477553008154

In [4]:
plot(t,y, alpha=0.6) #label="\$f(t) = t\\sin(\\pi t/2) + t + \\epsilon\$", alpha=0.6)
plot(t_, μ, label="\$\\mathbb{E}[f_*]\$", color="#FF425B")
fill_between(t_, μ-2σ, μ+2σ, color="#dddddd", alpha=0.3)
plot(t_, t_.*sin(0.25*2π*t_) + t_, label="Ground Truth", color="#84FF80")
title("Gaussian Process Regression")
xlabel("\$t\$")
ylabel("\$f\$")
legend(loc="upper left")

xlim(-1,15)


Out[4]:
(-1,15)

(Murphy pp. 520)

Our Gaussian Process is given by the joint density

$$ \begin{pmatrix}y\\f_* \end{pmatrix} \sim \mathcal{N}\left( 0, \begin{pmatrix} K_y & K_* \\ K_*^T & K_{**} \end{pmatrix} \right) $$

Note we assume the mean $\mu = 0$. Then our posterior predictive distribution is given by

$$ \begin{align*} p(f_*|x_*,X,y) &= \mathcal{N}(f_*|\mu_*,\Sigma_*)\\ \mu &= K_*^TK_y^{-1}y\\ \Sigma_* &= K_{**} - K_*^TK_y^{-1}K_* \end{align*} $$

When we have a single test input this becomes

$$ p(f_*|x_*,X,y) = \mathcal{N}(f_*| k_*^TK_y^{-1}y,\; k_{**} - k_*^TK_y^{-1}k_*), $$

where the posterior mean can be written

$$ \overline f_* = k_*^TK_y^{-1}y = \sum_{i=1}^N \alpha_i \kappa(x_i, x_*)\qquad\alpha = K_y^{-1}y $$

This gives the first derivative

$$ \frac{\partial \overline f_*}{\partial x_*} = \sum_{i=1}^N \alpha_i \kappa'(x_i,x_*), $$

For our Squared Exponential Kernel,

$$ \kappa(x,x') = \sigma_f^2 \exp\left( -\frac{1}{2}(x-x')^T\Lambda^{-1}(x-x') \right) = \sigma_f^2 \exp\left( -\frac{1}{2}||x-x' ||_{\Lambda^{-1}}^2 \right). $$

We have can differentiate this, as shown in (Andrew McHutchon 2013 pp. 6), giving (with the third derivative in one dimension $x\in\mathbb{R}$ to avoid a tensor)

$$ \begin{align*} \frac{\partial \kappa(x,x')}{\partial x} &= -\sigma_f^2\Lambda^{-1}(x-x')\exp\left(-\frac{1}{2}||x-x'||_{\Lambda^{-1}}^2\right)\\ &= -\Lambda^{-1}(x-x')\kappa(x,x')\\ \frac{\partial^2 \kappa(x,x')}{\partial x^2} &= -\Lambda^{-1}\left(I - \Lambda^{-1}(x-x')(x-x')^T\right)\kappa(x,x')\\ \frac{\partial^3 \kappa(x,x')}{\partial x^3} &= \lambda^{-1}\left[ 3(x-x') - \lambda^{-1}(x-x')^3 \right]\kappa(x,x')\\ \end{align*} $$

In [5]:
# define some fake data

t = collect(0:0.5:10)
y_true = t.*sin(0.25*2π*t) + t
y = y_true + randn!(zeros(t))*2

println("==> Construct Kernels")
σ = 2.
l = 0.75
σ_y = 1.
se = SEKernel(l,σ,σ_y)

t_ = collect(-1:0.01:15)
K_ = K(se, t, t)

k_ = K(se, t, t_)
dk_ = d_K(se,t,t_)
d2k_ = d2_K(se,t,t_)
d3k_ = d3_K(se,t,t_)
k__ = K(se, t_,t_)


println("==> Run GP Regression with Derivatives")
# GP Regression algorithm + derivatives
# -- Machine Learning: A Probabalistic Perspective
#    Kevin Murphy, pp. 526
L = chol(K_)
α = L\(L'\y)
μ = k_'*α
 = dk_'*α
d2μ = d2k_'*α
d3μ = d3k_'*α
v = L'\k_
σ = sqrt(diag(k__) - diag(v'*v))
logP = -y'*α/2 - sum(log(diag(L))) - size(y)[1]*log(2π)/2
println("--  done.")
println("==> Marginal Likelihood: log P(y|X) = $(logP[1])")

println("==> Plot Results")
plot(t,y, alpha=0.6)
plot(t_, μ, label="\$\\mathbb{E}[f_*]\$", color="#FF425B")
plot(t_, , label="\$\\mathbb{E}\\left[\\frac{\\partial f_*}{\\partial x}\\right]\$", color="black")
plot(t_, d2μ, label="\$\\mathbb{E}\\left[\\frac{\\partial^2 f_*}{\\partial x^2}\\right]\$", color="green")
plot(t_, d3μ, label="\$\\mathbb{E}\\left[\\frac{\\partial^3 f_*}{\\partial x^3}\\right]\$", color="cyan")
fill_between(t_, μ-2σ, μ+2σ, color="#dddddd", alpha=0.3)
#plot(t_, t_.*sin(0.25*2π*t_) + t_, label="Ground Truth", color="#84FF80")
title("Gaussian Process Regression")
xlabel("\$t\$")
ylabel("\$f\$")
legend(loc="upper right")

grid()

xlim(-1,15)


==> Construct Kernels
==> Run GP Regression with Derivatives
--  done.
==> Marginal Likelihood: log P(y|X) = -92.50921678845933
==> Plot Results
Out[5]:
(-1,15)

Financial Data


In [6]:
using DataFrames
using Dates # will throw warning for some reason


WARNING: requiring "Dates" in module "Main" did not define a corresponding module.

In [7]:
tsla_raw = readtable("../data/raw/financial/tsla.csv")

tsla_raw[:_Date] = Date(tsla_raw[:_Date],"d-u-y")

tsla_raw[:Date] = [Float64(datetime2rata(DateTime(date))) for date in tsla_raw[:_Date]]

sort!(tsla_raw, cols=[:Date])

head(tsla_raw)


Out[7]:
_DateOpenHighLowCloseVolumeDate
10015-05-05237.76239.5229.13232.9557968735238.0
20015-05-06234.1234.47228.2230.4352709335239.0
30015-05-07221.0237.48220.25236.894559095240.0
40015-05-08235.99238.41233.7236.6146682365241.0
50015-05-11236.29242.88235.31239.4956722625244.0
60015-05-12240.11246.35238.19244.7463634295245.0

In [8]:
t    = collect(tsla_raw[:Date])
t    = (t - mean(t)) / std(t)
tsla = collect(tsla_raw[:Open])
tsla = (tsla - mean(tsla)) / std(tsla)
t_   = collect(minimum(t):1e-2:maximum(t))

println("==> Run GP Regression over TSLA Open Prices")
println("--  between May 5, 2015 and May 4, 2016")

σ = 0.75
l = 0.2
σ_y = 0.2
se = SEKernel(l,σ,σ_y)

println("==> Construct Kernels")
K_ = K(se, t, t)

k_ = K(se, t, t_)
dk_ = d_K(se,t,t_)
d2k_ = d2_K(se,t,t_)
k__ = K(se, t_,t_)

println("--  done.")


==> Run GP Regression over TSLA Open Prices
--  between May 5, 2015 and May 4, 2016
==> Construct Kernels
--  done.

In [9]:
println("==> Run GP Regression with Derivatives")
# GP Regression algorithm + derivatives
# -- Machine Learning: A Probabalistic Perspective
#    Kevin Murphy, pp. 526
L = chol(K_)
α = L\(L'\tsla)
μ = k_'*α
 = dk_'*α
d2μ = d2k_'*α
v = L'\k_
σ = sqrt(diag(k__) - diag(v'*v))
logP = -tsla'*α/2 - sum(log(diag(L))) - size(tsla)[1]*log(2π)/2
println("--  done.")
println("==> Marginal Likelihood: log P(y|X) = $(logP[1])")


==> Run GP Regression with Derivatives
--  done.
==> Marginal Likelihood: log P(y|X) = -84.39008409897264

In [10]:
println("==> Plot Results")
plot(t,tsla)
plot(t_, μ, label="\$\\mathbb{E}[f_*]\$", color="#FF425B")
plot(t_, , label="\$\\mathbb{E}\\left[\\frac{\\partial f_*}{\\partial x}\\right]\$", color="black")
#plot(t_, d2μ, label="\$\\mathbb{E}\\left[\\frac{\\partial^2 f_*}{\\partial x^2}\\right]\$")
fill_between(t_, μ-2σ, μ+2σ, color="#dddddd", alpha=0.3)
title("TSLA Open Prices")
xlabel("\$t\$")
ylabel("\$f\$")
legend(loc="lower left")

grid()
xlim(minimum(t), maximum(t))


==> Plot Results
Out[10]:
(-1.7101709607151185,1.7265814083171187)

Numerical Gradient Check

Check the first derivative numerically off of the mean. Check the second derivative numerically off of the analytic first derivative.


In [11]:
eps = 0.1
println("Checking First and Second Derivatives with ϵ=$(eps)")
for i=2:Integer(floor(size(t_)[1]/10)):size(t_)[1]
    pass = true
    
    # check first derivative
    if abs([i] - (μ[i+1]-μ[i-1])/(t_[i+1]-t_[i-1])) > eps
        pass = false
    end
    
    # check second derivative
    if abs(d2μ[i] - ([i+1]-[i-1])/(t_[i+1]-t_[i-1])) > eps
        pass = false
    end
    
    if pass
        println("t=$(t_[i])\t [√]")
    else
        println("t=$(t_[i])\t [x]")
    end
end


Checking First and Second Derivatives with ϵ=0.1
t=-1.7001709607151185	 [√]
t=-1.3601709607151184	 [√]
t=-1.0201709607151184	 [√]
t=-0.6801709607151185	 [x]
t=-0.34017096071511843	 [x]
t=-0.00017096071511857325	 [√]
t=0.3398290392848813	 [√]
t=0.6798290392848816	 [√]
t=1.0198290392848814	 [x]
t=1.3598290392848817	 [√]
t=1.6998290392848816	 [√]

Finding Critical Points

We use Newton's Method to find critical points of the time series, where $\mathbb{E}\left[\frac{\partial f^*}{\partial x}\right] = 0$. We first approximate $f$ quadratically about our current guess $x_n$:

$$ f \approx f(x_n) + \nabla f(x_0)^T(x-x_n) + \frac{1}{2}(x-x_n)^T\nabla^2 f(x_0)(x-x_n). $$

where $\nabla^2 f(x_n)$ is the Hessian of $f$ at $x_n$. Now, we find the critical point $x_{n+1}$ from our current quadratic approximation of $f$. Taking the first derivative we find

$$ \begin{align*} \frac{\partial f}{\partial x} &= \nabla f(x_n) + \nabla^2 f(x_0)(x-x_n) = 0\\ x_{n+1} &= x_n - \nabla^2 f(x_n)^{-1}\nabla f(x_n), \end{align*} $$

giving us our desired update rule.

Algorithm 2: Newton-Raphson Method

  1. Input: Initial Guess $x_0$
  2. Repeat Until Convergence ( $||\nabla f(x_n)||_2 \leq \epsilon$ ):
    1. $x_{n+1} = x_n - \nabla^2 f(x_n)^{-1}\nabla f(x_n)$
  3. Output Optimal $x_{n+1}$

In [31]:
# find critical points of TSLA data

ϵ = 1e-6 # convergence tolerance
N = 50   # number of starting points
t_opt = maximum(t)*2*(rand(N,1)-0.5)

∇f  = d_K(se,t,t_opt)'*α
∇2f = d2_K(se,t,t_opt)'*α

i  = 0
@time while any(∇f.*∇f .> ϵ^2) && i < 100
    t_opt = t_opt - ∇f ./ ∇2f
    
    if i > 5
        # drop points that aren't converging quickly
        t_opt = t_opt[∇f - mean(∇f) .< 2std(∇f)]
        
        # remove any of t out of the range of data
        t_opt = t_opt[!(t_opt .> maximum(t))]
        t_opt = t_opt[!(t_opt .< minimum(t))]
    end
    
    ∇f  = d_K(se,t,t_opt)'*α
    ∇2f = d2_K(se,t,t_opt)'*α
    
    i = i+1
end

println("==> Done.")
println("--  Steps:   $(i)")
println("--  Maximum? $(∇2f[1] < 0)")


  0.479888 seconds (6.29 M allocations: 284.126 MB, 8.57% gc time)
==> Done.
--  Steps:   7
--  Maximum? false

In [32]:
unique(t_opt)


Out[32]:
19-element Array{Float64,1}:
 -0.0683816
  0.442814 
 -0.0683816
  1.57425  
  0.977516 
 -0.408594 
 -1.08562  
 -0.0683816
  0.442814 
 -1.08562  
  0.442814 
 -0.652616 
 -1.08562  
 -0.652616 
 -1.08562  
 -0.408594 
 -0.652616 
 -0.0683816
 -0.0683816

In [33]:
println("==> Plot Results of Critical Point Optimization")

t_opt  = unique(t_opt[!isnan(t_opt)])
f_opt  = K(se, t, t_opt)'*α
∇f_opt = d_K(se, t, t_opt)'*α

plot(t,tsla)
plot(t_, μ, label="\$\\mathbb{E}[f_*]\$", color="#FF425B")
scatter(t_opt, f_opt, label="\$\\mathbb{E}\\left[\\frac{\\partial f^*}{\\partial x}\\right] = 0\$")

fill_between(t_, μ-2σ, μ+2σ, color="#dddddd", alpha=0.3)
title("TSLA Open Prices")
xlabel("\$t\$")
ylabel("\$f\$")
legend(loc="lower left")

grid()
xlim(minimum(t), maximum(t))


==> Plot Results of Critical Point Optimization
Out[33]:
(-1.7101709607151185,1.7265814083171187)

Approximate Generation of Gram Matrix

We note that the kernel matrix $K_y$ and $k_{\star\star}$ are both heavily concentrated on the diagonal. We try to exploit this implicit structure given by sorting our times $t$ prior to constructing the matrix.


In [34]:
plt[:cm][:copper]


Out[34]:
copper

In [49]:
imshow(K_, cmap=plt[:cm][:copper])
colorbar()


Out[49]:
PyObject <matplotlib.colorbar.Colorbar instance at 0x315e158c0>

Bayesian Optimization of Hyperparameters

(Murphy pp. 523)

We can optimize out the (non-convex) marginal likelihood of the model by maximizing the $\log$ marginal likelihood. We have

$$ p(y|X) = \int p(y|f,X)p(f|X)df. $$

One can show that $$ \log p(y|X) = \log \mathcal{N}(y|0,K_y) = -\frac{1}{2}yK_Y^{-1}y - \frac{1}{2}\log|K_y| - \frac{N}{2}\log(2\pi) $$ and, with the kernel hyperparameters denoted $\theta$, $$ \frac{\partial}{\partial \theta_j}\log p(y|X) = \frac{1}{2}\mathrm{tr}\left( (\alpha\alpha^T - K_y^{-1})\frac{\partial K_y}{\partial \theta_j} \right) = \frac{1}{2}\mathrm{tr}\left(A\frac{\partial K_y}{\partial \theta_j}\right). $$

With this expression for the gradient, we can optimize our hyperparameters in a Bayesian way using any standard gradient-based method like gradient descent.

For the squared exponential kernel in the 1-D case, letting $$ \begin{align*} \theta_1 &= \log(\sigma_f^2)\\ \theta_2 &= \log(l^2)\\ \theta_3 &= \log(\sigma_y^2) \end{align*} $$ which gives

$$ \begin{align*} \kappa(x,x') &= \exp\left(\theta_1 -\frac{1}{2\exp(\theta_2)}(x-x')^2 \right) + \exp(\theta_3)\mathbb{1}\{x=x'\}\\ \frac{\partial \kappa}{\partial \theta_1} &= \exp\left(\theta_1 -\frac{1}{2\exp(\theta_2)}(x-x')^2 \right)\\ \frac{\partial \kappa}{\partial \theta_2} &= \frac{1}{2}(x-x')^2 \exp\left(\theta_1 - \theta_2 -\frac{1}{2\exp(\theta_2)}(x-x')^2\right)\\ \frac{\partial \kappa}{\partial \theta_3} &= \exp(\theta_3)\mathbb{1}\{x=x'\} \end{align*} $$

In [12]:
## Perform Gradient Descent on Parameters
## first define gradients
function d_σf(se::SEKernel, X::AbstractArray,X_::AbstractArray)
    K_ = zeros(size(X)[1], size(X_)[1])
    for i=1:size(K_)[1]
        for j=1:size(K_)[2]
            diff = X[i,:] - X_[j,:]
            @inbounds K_[i,j] = exp(se.params[1] - diff'*diff*exp(-se.params[2])/2)[1]
        end
    end
    return K_
end
function d_l(se::SEKernel, X::AbstractArray,X_::AbstractArray)
    K_ = zeros(size(X)[1], size(X_)[1])
    for i=1:size(K_)[1]
        for j=1:size(K_)[2]
            diff = X[i,:] - X_[j,:]
            δ2 = diff'*diff
            @inbounds K_[i,j] = (δ2[1]*exp(se.params[1]-se.params[2] - δ2*exp(-se.params[2])/2))[1]/2
        end
    end
    return K_
end
function d_σy(se::SEKernel, X::AbstractArray,X_::AbstractArray)
    return exp(se.params[3])*eye(size(X)[1], size(X_)[1])
end

# derivs returns a vector and the marginal likelihood
#  (dlogP/dθ, logP)
function derivs(se::SEKernel, X::AbstractArray, y::AbstractVector)
    d_params = zeros(3,1)
    
    # calculate common factors from data
    K_ = K(se, X,X)
    L = chol(K_)
    α = L\(L'\y)
    logP = -y'*α/2 - sum(log(diag(L))) - size(y)[1]*log(2π)/2
    
    L_inv = inv(L)
    A = α*α' - L_inv*L_inv'
    
    d_params[1] = trace(A*d_σf(se,X,X))/2
    d_params[2] = trace(A*d_l(se,X,X))/2
    d_params[3] = trace(A*d_σy(se,X,X))/2
    
    return d_params, logP[1]
end


Out[12]:
derivs (generic function with 1 method)

In [46]:
# run gradient descent

σ = 0.75
l = 0.2
σ_y = 0.2
params = [log(σ^2); log(l^2); log(σ_y^2)]''
# params = randn(3,1) # random instantiation converges much slower!
se = SEKernel(params)

n_iters = 100
lr = 0.05
β = 0.75
∇logP = [0.; 0.; 0.]
logP  = zeros(n_iters)
for i=1:n_iters
    ∇logP, logP[i] = derivs(se, t, tsla)
    
    # we want to maximize logP so we
    # will step `lr/i` in the direction of
    # the graditent
    se.params += (lr/(β*sqrt(i)))*∇logP
    
    if i%25 == 0
        println("==> logP[$(i)] = $(logP[i])\n |∇logP| = $(norm(∇logP))")
    end
end

plot(collect(1:n_iters), logP)
title("Learning Curve")
xlabel("Iterations")
ylabel("Marginal Likelihood \$\\log\\; P(\\mathbf{y}|\\mathbf{X})\$")


==> logP[25] = -35.479696420639215
 |∇logP| = 1.0574794334156705
==> logP[50] = -35.4747695428708
 |∇logP| = 1.2105451177842266e-5
==> logP[75] = -35.47476954286745
 |∇logP| = 3.6082897086890046e-8
==> logP[100] = -35.47476954286739
 |∇logP| = 2.968107980559532e-10
Out[46]:
PyObject <matplotlib.text.Text object at 0x31ed3b590>

In [47]:
∇logP


Out[47]:
3x1 Array{Float64,2}:
 2.81052e-10
 9.30817e-11
 2.10236e-11

In [45]:
# plot results
K_ = K(se, t, t)

k_ = K(se, t, t_)
dk_ = d_K(se,t,t_)
d2k_ = d2_K(se,t,t_)
k__ = K(se, t_,t_)

println("==> Run GP Regression")
# GP Regression algorithm + derivatives
# -- Machine Learning: A Probabalistic Perspective
#    Kevin Murphy, pp. 526
L = chol(K_)
α = L\(L'\tsla)
μ = k_'*α
v = L'\k_
σ = sqrt(diag(k__) - diag(v'*v))
logP_ = -tsla'*α/2 - sum(log(diag(L))) - size(tsla)[1]*log(2π)/2
println("--  done.")
println("==> Marginal Likelihood: log P(y|X) = $(logP_[1])")

println("==> Plot Results")
plot(t,tsla)
plot(t_, μ, label="\$\\mathbb{E}[f_*]\$", color="#FF425B")
fill_between(t_, μ-2σ, μ+2σ, color="#dddddd", alpha=0.3)
title("TSLA Open Prices with Optimized Hyperparameters")
xlabel("\$t\$")
ylabel("\$f\$")
legend(loc="lower left")

grid()
xlim(minimum(t), maximum(t))


==> Run GP Regression
--  done.
==> Marginal Likelihood: log P(y|X) = -35.474769542867676
==> Plot Results
Out[45]:
(-1.7101709607151185,1.7265814083171187)

In [33]:
## we are going to plot a surface over the
## marginal likelihood to see the nonconvexity

# adapted from https://github.com/gizmaa/Julia_Examples/blob/master/pyplot_surfaceplot.jl

n = 10
σ_y = linspace(0.01,0.5,n)
σ_f = 0.75
l = linspace(0.25,3,n)

σgrid = repmat(σ_y',n,1)
lgrid = repmat(l,1,n)

z = zeros(n,n)

for i in 1:n
    for j in 1:n
        # calculate marginal likelihood for this
        # pair of parameters
        se = SEKernel(σ_f,l[j],σ_y[i])
        
        K_ = K(se, t, t)
        L = chol(K_)
        α = L\(L'\tsla)
        logP = -tsla'*α/2 - sum(log(diag(L))) - size(tsla)[1]*log(2π)/2
        
        z[i:i,j:j] = logP
        
        if j%10 + i%10 == 0
            println("==> Finished (i,j)=($(i),$(j))")
        end
    end
end

cp = plt[:contourf](σgrid, lgrid, z, levels=linspace(minimum(z),maximum(z),100), cmap=plt[:cm][:jet]) 
xlabel("\$\\sigma_y\$") 
ylabel("\$l\$")
title("Marginal Likelihood for Different Hyperparameters")
colorbar()


==> Finished (i,j)=(10,10)
Out[33]:
PyObject <matplotlib.colorbar.Colorbar instance at 0x320776440>