In [1]:
using DataFrames
using JSON
using Iterators
#using taxis
using HDF5, JLD
using Stats
using kNN
#using sequenceCompare
#reload("taxis")
# Show how many Julia processes are available; the pmap-based search defined
# further down in this notebook distributes its work across them.
nprocs()
Out[1]:
In [2]:
# Load the training and validation trips and derive the per-trip path columns
# (COORDS, NUM_COORDS, START, END) used by the rest of the notebook.
println("Begin")
println("loading csv files")
# Both tables are read from hard-coded local CSV paths.
taxi_df = readtable("/home/tony/ML/taxi/taxi2_time/train_100k.csv")
taxi_validation_df = readtable("/home/tony/ML/taxi/taxi2_time/test.csv")
println("loading coords")
# Each POLYLINE entry is parsed as JSON and its elements are concatenated
# column-wise, so every parsed element becomes one column of the COORDS matrix.
# NOTE(review): presumably every element is a 2-value coordinate pair, giving a
# 2xN matrix per trip -- confirm against the CSV.
taxi_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in taxi_df[:POLYLINE]]
taxi_validation_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in taxi_validation_df[:POLYLINE]]
println("getting coords counts")
# length() counts every scalar stored in the matrix, not its number of columns.
# NOTE(review): if COORDS is 2xN, NUM_COORDS holds 2N rather than the number of
# points -- confirm that this is what is wanted.
taxi_df[:NUM_COORDS] = [length(x)::Int64 for x in taxi_df[:COORDS]]
taxi_validation_df[:NUM_COORDS] = [length(x)::Int64 for x in taxi_validation_df[:COORDS]]
println("deleting unneeded data rows/columns")
# The raw JSON strings are dropped now that COORDS has been built from them.
delete!(taxi_validation_df, :POLYLINE)
delete!(taxi_df, :POLYLINE)
println("adding start/end point columns")
# START is the first column of each path, END the last one.
taxi_df[:START] = [x[:,1] for x in taxi_df[:COORDS]]
taxi_validation_df[:START] = [x[:,1] for x in taxi_validation_df[:COORDS]]
taxi_df[:END] = [x[:,end] for x in taxi_df[:COORDS]]
taxi_validation_df[:END] = [x[:,end] for x in taxi_validation_df[:COORDS]]
println("deleting training examples with no coords!")
#These examples are not going to be useful!
# Only the training table is filtered; the validation table keeps all its rows.
deleterows!(taxi_df, find(taxi_df[:NUM_COORDS] .== 0))
println("done!")
In [3]:
println("finding unique number of coords")
# Concatenate the columns of every path into one wide matrix per data set.
# NOTE(review): despite the message above, nothing in this cell de-duplicates
# or counts the coordinates.
all_coords_val = hcat(taxi_validation_df[:COORDS]...)
all_coords = hcat(taxi_df[:COORDS]...)
Out[3]:
In [4]:
#small_taxi_df = GetTableOrderedSubset(taxi_df, 10000)
#coordsDB = ConstructCoordsDatabase(small_taxi_df, 4)
In [ ]:
# Return `df` unchanged when it already has a :DAYOFWEEK entry.
# The code that would derive that column was never written, so any other input
# raises an error instead of silently handing back a table without the column.
function GetDateInfo(df)
    if haskey(df, :DAYOFWEEK)
        return df
    end
    # TODO: derive :DAYOFWEEK from the trip data and return the extended table.
    error("GetDateInfo: computing :DAYOFWEEK is not implemented")
end

# Return `df` unchanged when it already has a :DISTANCE entry.
# As with GetDateInfo, the computation itself is still missing, so the
# not-yet-computed case fails loudly.
function GetDistanceData(df)
    if haskey(df, :DISTANCE)
        return df
    end
    # TODO: derive :DISTANCE from the trip data and return the extended table.
    error("GetDistanceData: computing :DISTANCE is not implemented")
end
In [4]:
# http://en.wikipedia.org/wiki/Dynamic_time_warping
# Dynamic-time-warping distance between two sequences stored column-wise:
# column k of `s` (resp. `t`) is the k-th element of the sequence.
# The per-step cost is the squared Euclidean distance between two columns.
# Returns the accumulated cost of the cheapest alignment as a Float64; this is
# Inf when exactly one of the sequences has no columns and 0 when both do.
function DTWDistance{T}(s::Array{T}, t::Array{T})
    len_s = size(s, 2)
    len_t = size(t, 2)
    # Row 1 / column 1 stand for the empty prefix: unreachable (Inf) except for
    # the corner, which is the zero-cost starting point. Every other cell is
    # overwritten below before it is ever read.
    acc = fill(Inf, len_s + 1, len_t + 1)
    acc[1, 1] = 0
    for row in 1:len_s
        for col in 1:len_t
            delta = s[:, row] - t[:, col]
            step_cost = sum(delta .^ 2)
            # Cheapest way to reach this cell: insertion, deletion or match.
            cheapest_prev = min(acc[row, col + 1], acc[row + 1, col], acc[row, col])
            acc[row + 1, col + 1] = step_cost + cheapest_prev
        end
    end
    return acc[len_s + 1, len_t + 1]
end
# Note: there is also a windowed variant of DTW with better performance; see the Wikipedia article linked above.
# Unfinished estimator for a coordinate sequence (one column per coordinate pair).
# Only the preprocessing exists so far: the coordinates are rounded to
# `round_len` digits and the number of columns is returned. The estimate
# sketched in the outline below is not implemented.
#
# coords    -- matrix whose columns are the coordinate pairs of one sequence
# round_len -- number of digits to round to (default 5, the same default that
#              GetAverageDistancesByCoordsDict uses)
function GetAverageDistanceFromCoordsSequence(coords, round_len=5)
    #Outline of the basic algorithm
    #for coord_pair c_1..c_n
    # estimate <- for i=1:n average(15*(i-1) + average(for c_j in D[c+i] * exp(-|L_j-L_i|) * L_j)
    # Round the argument itself; the previous body indexed an undefined
    # `Coords[i]` with an undefined `round_len` and could not run.
    rounded = round(coords, round_len)
    num_coords = size(rounded, 2)
    # TODO: compute the estimate from the outline; for now the column count is
    # returned, as the original body's last expression did.
    return num_coords
end
# Build a lookup table from rounded coordinate pairs to the trips passing
# through them. (Despite its name, this function computes no averages.)
#
# TripIds   -- one trip id per trip
# Coords    -- one coordinate matrix per trip, in the same order as TripIds,
#              with one column per coordinate pair; empty entries are skipped
# round_len -- number of digits the coordinates are rounded to before they are
#              used as keys (default 5)
#
# Returns a freshly created Dict mapping (coords[1,j], coords[2,j]) tuples to a
# vector of SequenceRef(trip_id, j, remaining) values, where `remaining` is the
# number of columns after column j. SequenceRef is defined outside this file.
function GetAverageDistancesByCoordsDict(TripIds, Coords, round_len=5)
    # The table is created here on every call; the previous body never defined
    # `D` and therefore depended on (and mutated) whatever global `D` existed.
    D = Dict()
    num_trips = size(TripIds, 1)
    for i = 1:num_trips
        trip_id = TripIds[i]
        if length(Coords[i]) == 0
            continue
        end
        coords = round(Coords[i], round_len)
        num_coords = size(coords, 2)
        for j = 1:num_coords
            coord_pair = (coords[1, j], coords[2, j])
            ref = SequenceRef(trip_id, j, num_coords - j)
            if haskey(D, coord_pair)
                push!(D[coord_pair], ref)
            else
                D[coord_pair] = [ref]
            end
        end
    end
    return D
end
# Return the training path with the smallest distance to `test_path`.
#
# all_train_coords -- non-empty collection of training paths
# test_path        -- the path to match
# dist_fn          -- distance function called as dist_fn(train_path, test_path);
#                     defaults to DTWDistance
#
# The first training path is returned when no distance is below Inf (for
# example when every distance is Inf or NaN); on ties the earliest path wins.
# Indexing an empty `all_train_coords` raises a BoundsError, as before.
function findClosestTrainingExample(all_train_coords, test_path, dist_fn=DTWDistance)
    best_path = all_train_coords[1]
    # Start from Inf rather than a fixed cut-off such as 9999.0: with a finite
    # cut-off, any search whose distances all exceed it returns the first path
    # regardless of which one is actually closest.
    best_dist = Inf
    for train_path in all_train_coords
        dist = dist_fn(train_path, test_path)
        if dist < best_dist
            best_dist = dist
            best_path = train_path
        end
    end
    return best_path
end
# For every test path, look up the training path chosen by
# findClosestTrainingExample, printing a progress line on every 20th test path.
# Returns an Any-typed vector holding one matched training path per test path,
# in the order of `all_test_paths`.
function findClosestTrainingExampleForTestSet(all_train_paths, all_test_paths)
    train_total = length(all_train_paths)
    test_total = length(all_test_paths)
    matches = Any[]
    for (idx, candidate) in enumerate(all_test_paths)
        # The progress line is printed before the search for this path starts.
        rem(idx, 20) == 0 &&
            println(idx, "/", test_total, " for ", train_total, " train path examples")
        push!(matches, findClosestTrainingExample(all_train_paths, candidate))
    end
    return matches
end
# Parallel counterpart of findClosestTrainingExampleForTestSet: the per-path
# search is handed to pmap, which returns one matched training path per test
# path, in the order of `all_test_paths`.
#
# all_train_paths -- non-empty collection of training paths
# all_test_paths  -- collection of paths to match
# dist_fn         -- distance function called as dist_fn(train_path, test_path);
#                    defaults to DTWDistance
function pFindClosestTrainingExampleForTestSet(all_train_paths, all_test_paths, dist_fn=DTWDistance)
    # The search is defined locally instead of calling the top-level function
    # of the same name.
    # NOTE(review): presumably so the closure given to pmap carries its own
    # code to the worker processes -- confirm before merging the two copies.
    function findClosestTrainingExample(all_train_coords, test_path)
        best_path = all_train_coords[1]
        # Inf instead of a fixed 9999.0 cut-off, so the closest path is found
        # even when every distance is larger than that cut-off.
        best_dist = Inf
        for train_path in all_train_coords
            dist = dist_fn(train_path, test_path)
            if dist < best_dist
                best_dist = dist
                best_path = train_path
            end
        end
        return best_path
    end
    getClosestExample = p -> findClosestTrainingExample(all_train_paths, p)
    return pmap(getClosestExample, all_test_paths)
end
Out[4]:
In [8]:
#@everywhere using taxis
#@everywhere using sequenceCompare
#@everywhere reload("taxis")
#@everywhere reload("sequenceCompare")
#reload("taxis")
# Use only the first 20000 training paths as candidates, then look up the
# closest candidate for every validation path with the serial search.
all_train_coords = taxi_df[:COORDS][1:20000]
test_guess_paths = findClosestTrainingExampleForTestSet(all_train_coords, taxi_validation_df[:COORDS])
Out[8]:
In [ ]:
# Store the matched training path next to each validation trip.
taxi_validation_df[:GUESS_PATHS] = test_guess_paths
In [ ]:
#guess_times = [length(x)*15 for x in test_guess_paths]
# Turn every matched training path into a travel-time guess: 15 per path point,
# raised to max(660, time already covered) whenever the validation path is
# already longer than the matched training path.

# Number of points in a path. Paths are built by concatenating the parsed
# POLYLINE elements column-wise, so a point is one column; length() would count
# every scalar of every point and inflate each time. Empty paths count as zero.
pathPointCount(path) = length(path) == 0 ? 0 : size(path, 2)

num_test_examples = length(test_guess_paths)
guess_times = Array(Int64,num_test_examples)
# Not used in this cell; kept in case a later cell relies on the name.
dest_coords = cell(num_test_examples)
all_test_paths = taxi_validation_df[:COORDS]
for k=1:num_test_examples
    observed_points = pathPointCount(all_test_paths[k])
    guessed_points = pathPointCount(test_guess_paths[k])
    test_path_time = observed_points*15
    best_guess_time = guessed_points*15
    if observed_points > guessed_points
        println(k, ": guessing ", best_guess_time, " but existing time is ", test_path_time)
        # Never predict less than what has already elapsed, with a floor of 660.
        best_guess_time = max(660, test_path_time)
    end
    guess_times[k] = best_guess_time
end
submission_validation = guess_times
In [ ]:
#mean_time = mean(times_validation)
#submission_validation = [max(x, mean_time) for x in times_validation]
# Assemble the submission table (trip id plus predicted travel time) and write
# it as a CSV file into the current working directory.
df_submission = DataFrame()
df_submission[:TRIP_ID] = taxi_validation_df[:TRIP_ID]
df_submission[:TRAVEL_TIME] = submission_validation
writetable("third_submission_50k.csv", df_submission)
In [46]:
In [388]:
# Scratch experiment: is a user-defined immutable type usable as a Dict key?
immutable Point2{T}
x::T
y::T
end
# Build a one-entry dictionary keyed by a Point2, then look it up with a second,
# equal-valued Point2. The trailing comment records the result the author saw.
D = [Point2(1.,2.) => 42]
haskey(D, Point2(1., 2.)) #False!
Out[388]:
In [42]:
In [39]:
# Display the parsed validation paths for inspection.
taxi_validation_df[:COORDS]
Out[39]:
In [ ]: