In [1]:
using DataFrames
using JSON
using Iterators
using taxis
using HDF5, JLD
using Stats
using kNN
using sequenceCompare
#reload("taxis")
#reload("sequenceCompare")
# Display how many Julia processes are available to this session.
nprocs()
Out[1]:
In [2]:
# Load the training and validation trips, decode every POLYLINE into a
# coordinate matrix, and derive the helper columns used by the later cells.
println("Begin")
println("loading csv files")
taxi_df = readtable("/home/tony/ML/taxi/taxi2_time/train_100k.csv")
taxi_validation_df = readtable("/home/tony/ML/taxi/taxi2_time/test.csv")

println("loading coords")
# hcat of the parsed points puts one point per column, so a trip with N points
# becomes a 2xN matrix (points are assumed to be two-element pairs -- the
# 1:2 row slice further down relies on the same layout).
taxi_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in taxi_df[:POLYLINE]]
taxi_validation_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in taxi_validation_df[:POLYLINE]]

println("getting coords counts")
# Count points rather than matrix elements: length() of a 2xN matrix is 2N.
# Halving the element count also yields 0 for an empty polyline.
taxi_df[:NUM_COORDS] = Int64[div(length(x), 2) for x in taxi_df[:COORDS]]
taxi_validation_df[:NUM_COORDS] = Int64[div(length(x), 2) for x in taxi_validation_df[:COORDS]]

println("deleting unneeded data rows/columns")
delete!(taxi_validation_df, :POLYLINE)
delete!(taxi_df, :POLYLINE)

println("adding start/end point columns")
# First and last column of each coordinate matrix.
taxi_df[:START] = [x[:,1] for x in taxi_df[:COORDS]]
taxi_validation_df[:START] = [x[:,1] for x in taxi_validation_df[:COORDS]]
taxi_df[:END] = [x[:,end] for x in taxi_df[:COORDS]]
taxi_validation_df[:END] = [x[:,end] for x in taxi_validation_df[:COORDS]]

println("deleting training examples with no coords!")
# These examples are not going to be useful!
deleterows!(taxi_df, find(taxi_df[:NUM_COORDS] .== 0))

println("generating test coords column")
# Simulate a partially observed trip by keeping a random-length prefix of each
# training path. The prefix length is drawn as an integer in 1:N so that the
# prefix is never empty and the range bound is never a float; every remaining
# row has N >= 1 because empty trips were removed just above.
taxi_df[:COORDS_TEST] = [x[1:2, 1:rand(1:size(x, 2))] for x in taxi_df[:COORDS]]
println("done!")
In [3]:
# Concatenate every trip's coordinate matrix side by side into one wide matrix
# per data set.
# NOTE(review): the progress message mentions a unique count, but this cell
# only performs the concatenation.
println("finding unique number of coords")
all_coords_val = hcat(taxi_validation_df[:COORDS]...)
all_coords = hcat(taxi_df[:COORDS]...)
Out[3]:
In [58]:
#small_taxi_df = GetTableOrderedSubset(taxi_df, 10000)
#coordsDB = ConstructCoordsDatabase(small_taxi_df, 4)
In [ ]:
# Ensure `df` carries day-of-week information.
# Returns `df` straight away when a :DAYOFWEEK column is already present.
# TODO: the code that derives the column is missing from this notebook cell;
# until it is written, `df` is returned unchanged.
function GetDateInfo(df)
    if haskey(df, :DAYOFWEEK)
        return df
    end
    # TODO: compute and attach :DAYOFWEEK here.
    return df
end

# Ensure `df` carries trip-distance information.
# Returns `df` straight away when a :DISTANCE column is already present.
# TODO: the code that derives the column is missing from this notebook cell;
# until it is written, `df` is returned unchanged.
function GetDistanceData(df)
    if haskey(df, :DISTANCE)
        return df
    end
    # TODO: compute and attach :DISTANCE here.
    return df
end
In [58]:
# Guess a path for every validation trip, using only the first 200 training
# trips as candidates, and store the guesses on the validation table.
all_train_coords = taxi_df[:COORDS][1:200]
all_validation_coords = taxi_validation_df[:COORDS]
# NOTE(review): findClosestTrainingExampleForTestSet is not defined in this
# notebook (presumably it comes from the taxis or sequenceCompare module); the
# meaning of the trailing argument 2 is not visible here -- confirm.
test_guess_paths = findClosestTrainingExampleForTestSet(all_train_coords, all_validation_coords, 2)
taxi_validation_df[:GUESS_PATHS] = test_guess_paths
Out[58]:
In [59]:
# Local evaluation: use training rows 1000:1200 as candidates and the last 100
# training rows as a held-out set, matching on their truncated COORDS_TEST
# paths.
all_train_coords = taxi_df[:COORDS][1000:1200]
test_df = tail(taxi_df, 100)
all_test_coords = test_df[:COORDS_TEST]
# NOTE: this rebinds the globals all_train_coords and test_guess_paths that
# the validation-set cell also assigns.
test_guess_paths = findClosestTrainingExampleForTestSet(all_train_coords, all_test_coords, 2)
test_df[:GUESS_PATHS] = test_guess_paths
Out[59]:
In [88]:
# Print and return the root-mean-squared logarithmic error between the travel
# times implied by the guessed paths and by the actual paths.
#
# Arguments:
#   test_df       - table indexable by :GUESS_PATHS and :COORDS, each holding
#                   one path per trip.
#   min_pred_time - floor (in seconds) applied to every predicted time before
#                   scoring; defaults to the 1100 used originally.
#
# Returns the score; also prints it followed by one "pred/actual" line per trip.
#
# Paths are taken to be 2xN coordinate matrices (one point per column), which
# is how COORDS is built in this notebook; GUESS_PATHS is assumed to share that
# layout -- TODO confirm against findClosestTrainingExampleForTestSet.
function score_path_guess(test_df, min_pred_time=1100)
    pred_paths = test_df[:GUESS_PATHS]
    actual_paths = test_df[:COORDS]

    # Halve the element count to get the number of points: length() of a 2xN
    # matrix is 2N, which would double every time. Each point counts for 15 s.
    pred_times = Int64[div(length(p), 2) * 15 for p in pred_paths]
    actual_times = Int64[div(length(p), 2) * 15 for p in actual_paths]

    n = length(pred_times)
    n == length(actual_times) || error("GUESS_PATHS and COORDS differ in length")

    # RMSLE uses log(x + 1) on both sides; the previous log(actual - 1) was
    # asymmetric and threw a DomainError for an empty actual path.
    total = 0.0
    for k = 1:n
        log_err = log(max(min_pred_time, pred_times[k]) + 1) - log(actual_times[k] + 1)
        total += log_err^2
    end
    score = sqrt(total / n)

    println("time score: ", score)
    for k = 1:n
        println("pred: ", pred_times[k], ", actual: ", actual_times[k])
    end
    return score
end
# Score the guesses stored on the held-out table test_df.
score_path_guess(test_df)
In [126]:
# Scratch check: log applied to a 1x3 row matrix (presumably to confirm that
# log accepts a whole array).
log([1 2 3])
Out[126]:
In [62]:
# Turn the guessed validation paths into travel-time predictions.
#
# The guesses are read back from the :GUESS_PATHS column stored on the
# validation table instead of the global test_guess_paths, because that global
# is rebound by the held-out evaluation cell and may no longer describe the
# validation set.
validation_guess_paths = taxi_validation_df[:GUESS_PATHS]
#guess_times = [length(x)*15 for x in test_guess_paths]
num_test_examples = length(validation_guess_paths)
guess_times = zeros(Int64, num_test_examples)
# Allocated but never filled in this cell.
dest_coords = cell(num_test_examples)
all_test_paths = taxi_validation_df[:COORDS]
for k = 1:num_test_examples
    test_path = all_test_paths[k]
    best_guess_path = validation_guess_paths[k]
    # Paths are 2xN matrices (one point per column), so the point count is half
    # the element count; length() alone would double every time. GUESS_PATHS is
    # assumed to share the COORDS layout -- TODO confirm. Each point is 15 s.
    test_path_time = div(length(test_path), 2) * 15
    best_guess_time = div(length(best_guess_path), 2) * 15
    # A trip cannot finish before the part already observed: when the guess is
    # shorter than the elapsed time, predict the elapsed time plus 100 s.
    if test_path_time > best_guess_time
        println(k, ": guessing ", best_guess_time, " but existing time is ", test_path_time)
        best_guess_time = test_path_time + 100
    end
    guess_times[k] = best_guess_time
end
submission_validation = guess_times
Out[62]:
In [53]:
# beat the benchmark example
#mean_time = mean(times_validation)
#submission_validation = [max(x, mean_time) for x in times_validation]
#submission_validation
# Assemble the submission table (trip id plus predicted travel time) from the
# validation table and submission_validation, then write it as CSV into the
# current working directory.
df_submission = DataFrame()
df_submission[:TRIP_ID] = taxi_validation_df[:TRIP_ID]
df_submission[:TRAVEL_TIME] = submission_validation
writetable("second_submission.csv", df_submission)
In [50]:
In [388]:
# Scratch experiment: a parametric immutable 2-D point used as a Dict key.
immutable Point2{T}
x::T
y::T
end
# Dict with a single Point2 key.
D = [Point2(1.,2.) => 42]
# Look the key up with a freshly constructed but equal-valued point. The
# author recorded that this returned false.
# NOTE(review): presumably the default hashing/equality for this type did not
# treat the two instances as the same key; defining hash and isequal for
# Point2 is the usual remedy -- confirm on the Julia version in use.
haskey(D, Point2(1., 2.)) #False!
Out[388]:
In [42]:
In [39]:
# Display the decoded coordinate matrices of the validation trips.
taxi_validation_df[:COORDS]
Out[39]:
In [ ]: