In [239]:
# Written against pre-0.7 Julia: readdlm lives in Base, and Optim
# still accepts the symbol-based `method` keyword.
using Gadfly
using StatsBase: sample
using Optim: optimize
In [ ]:
# Boston housing data: 506 instances x 14 columns (13 features plus the
# median price MEDV as the last column)
data = readdlm("./stanford_dl_ex/ex1/housing.data")
# transpose so features are rows and instances are columns
data = data'
In [191]:
# prepend a row of ones so theta[1] acts as the intercept term
data = vcat(ones(eltype(data), 1, size(data, 2)), data)
In [ ]:
# random permutation of the instance indices for the train/test split
mask = sample(1:size(data, 2), size(data, 2), replace=false)
In [ ]:
# training data (vec keeps the targets as plain vectors)
train_X = data[1:end-1, mask[1:400]]
train_y = vec(data[end, mask[1:400]])
# test data
test_X = data[1:end-1, mask[401:end]]
test_y = vec(data[end, mask[401:end]])
In [ ]:
# number of training instances
m = size(train_X, 2)
# number of features
n = size(train_X, 1)
In [ ]:
# random initial guess for the parameter vector
theta = rand(n)
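The next cell implements the least-squares objective with explicit loops. In notation, it computes

$$J(\theta) = \frac{1}{2} \sum_{j=1}^{m} \bigl(\theta^\top x^{(j)} - y^{(j)}\bigr)^2,$$

where $x^{(j)}$ is the $j$-th column of `train_X` (whose first entry is the constant 1 for the intercept) and $y^{(j)}$ is the corresponding median price.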
In [287]:
function f(theta::Vector)
    # loop implementation; iterating down columns is fast because Julia arrays
    # are column-major (see: http://stackoverflow.com/questions/21778374/what-is-the-recommended-way-to-iterate-a-matrix-over-rows)
    obj = 0.0
    for j = 1:m
        # prediction for the j-th training instance
        y = 0.0
        for i = 1:n
            y += theta[i] * train_X[i, j]
        end
        obj += (y - train_y[j])^2
    end
    obj / 2
end
In [ ]:
f(theta)
In [289]:
function f_vec(theta::Vector)
    # partially vectorized variant: one dot product per instance;
    # the [1] unwraps the one-element result of theta' * column
    obj = 0.0
    for i = 1:size(train_X, 2)
        obj += (theta' * train_X[:, i] - train_y[i])[1]^2
    end
    obj / 2
end
In [ ]:
f_vec(theta)
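Differentiating $J$ coordinate-wise gives the gradient that `optimize` needs:

$$\frac{\partial J}{\partial \theta_k} = \sum_{j=1}^{m} \bigl(\theta^\top x^{(j)} - y^{(j)}\bigr)\, x_k^{(j)},$$

i.e. each instance contributes its residual scaled by the feature value $x_k^{(j)}$, which is what the inner loop below accumulates.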
In [290]:
function g!(theta::Vector, storage::Vector)
    # in-place gradient, using Optim's old g!(x, storage) calling convention
    fill!(storage, 0.0)
    for j = 1:size(train_X, 2)
        # prediction for the j-th training instance
        y = 0.0
        for i = 1:size(train_X, 1)
            y += theta[i] * train_X[i, j]
        end
        # gradient of the squared residual: residual times the feature value
        for k = 1:length(storage)
            storage[k] += (y - train_y[j]) * train_X[k, j]
        end
    end
end
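Before handing g! to the optimizer, a central-difference check is cheap insurance. This is a minimal sketch; `numgrad` is a helper of our own, not part of the exercise code:

In [ ]:
# central-difference approximation of the gradient of f at theta
function numgrad(f, theta; eps=1e-6)
    g = zeros(length(theta))
    for k = 1:length(theta)
        e = zeros(length(theta))
        e[k] = eps
        g[k] = (f(theta + e) - f(theta - e)) / (2eps)
    end
    g
end

storage = zeros(n)
g!(theta, storage)
# relative discrepancy should be tiny if the analytic gradient is right
maximum(map(abs, storage - numgrad(f, theta))) / maximum(map(abs, storage))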
In [296]:
res = optimize(f_vec, g!, theta, method=:gradient_descent, iterations=200)
# the minimizer is stored in `res.minimum` in this old Optim API
# (later versions renamed it to `minimizer`)
θ = res.minimum
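Because this is plain least squares, the normal equations give a closed-form answer to check the optimizer against. A minimal sketch, assuming `train_X` is features-by-instances as above (the name `theta_closed` is ours):

In [ ]:
# closed-form least-squares solution: solve (X X') theta = X y
theta_closed = (train_X * train_X') \ (train_X * train_y)
# the two objective values should be close
f_vec(θ), f_vec(theta_closed)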
In [238]:
# root-mean-square error on both splits; MEDV is in $1000s, so the RMS is too
actual_prices = train_y
predicted_prices = train_X' * θ
train_rms = sqrt(mean((predicted_prices - actual_prices).^2))
println("Train RMS: $train_rms")
actual_prices = test_y
predicted_prices = test_X' * θ
test_rms = sqrt(mean((predicted_prices - actual_prices).^2))
println("Test RMS: $test_rms")
In [271]:
Gadfly.plot(
layer(x=1:length(test_y), y=sort(test_y'[:, 1]), Geom.point),
layer(x=1:length(test_y), y=sort(predicted_prices'[:, 1]), Geom.point),
)