In [130]:
using DataFrames
using JSON
using Iterators
using taxis
using HDF5, JLD
using Stats
using kNN
using sequenceCompare
#reload("taxis")
#reload("sequenceCompare")
nprocs()
Out[130]:
In [131]:
taxi_df, taxi_validation_df = taxis.LoadData("/home/tony/ML/taxi/taxi2_time/train_200k.csv", "/home/tony/ML/taxi/taxi2_time/test.csv")
0
Out[131]:
In [133]:
train_coords = taxi_df[:COORDS_TEST][1:4]
#train_lens = [c[:,1:int(ceil(rand(1)[1]*size(c,2)))] for c in train_coords]
Out[133]:
In [240]:
println("looking at taxi id information")
head(taxi_df)
taxi_ids = taxi_df[:TAXI_ID]
taxi_ids_dict = Dict{Int64, Int64}()
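# Tally how many training trips each TAXI_ID has; validation trips whose TAXI_ID
# never appears in training will need the generic fallback search later on.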
for id in taxi_ids
taxi_ids_dict[id] = get(taxi_ids_dict, id, 0) + 1
end
taxi_id_counts = [x::Int64 for x in values(taxi_ids_dict)]
describe(taxi_id_counts)
println("number less than 10: ", sum(taxi_id_counts .< 20))
println("number of taxi ids: ", length(keys(taxi_ids_dict)))
#taxis_by_id = [id => taxi_df[taxi_df[:TAXI_ID].==id,:] for id in keys(taxi_ids_dict)]
val_ids = [get(taxi_ids_dict, x, 0)::Int64 for x in taxi_validation_df[:TAXI_ID]]
sum(val_ids .== 0)
Out[240]:
In [104]:
println("finding unique number of coords")
all_coords_val = hcat(taxi_validation_df[:COORDS]...)
all_coords = hcat(taxi_df[:COORDS]...)
Out[104]:
In [53]:
small_taxi_df = GetTableOrderedSubset(taxi_df, 190000)
coordsDB = ConstructCoordsDatabase(small_taxi_df, 2)
describe([length(x)::Int64 for x in values(coordsDB)])
In [63]:
taxi_df[:GRID_START] = [round(c,2) for c in taxi_df[:START]]
taxi_validation_df[:GRID_START] = [round(c,2) for c in taxi_validation_df[:START]]
taxi_df[:GRID_END] = [round(c,2) for c in taxi_df[:END]]
taxi_validation_df[:GRID_END] = [round(c,2) for c in taxi_validation_df[:END]]
Out[63]:
In [128]:
function getGridDicts(train_df, test_df, grid_delta=.01) # the round(c, 2) calls below implement the default grid_delta of 0.01 degrees (grid_delta is not applied directly)
train_df[:GRID_START] = [round(c,2) for c in train_df[:START]]
test_df[:GRID_START] = [round(c,2) for c in test_df[:START] ]
train_df[:GRID_END] = [round(c,2) for c in train_df[:END] ]
test_df[:GRID_END] = [round(c,2) for c in test_df[:END] ]
grids_dict = Dict()
for i in 1:length(train_df[:GRID_START])
coord_pair = train_df[:GRID_START][i]
res = get(grids_dict, coord_pair, Any[])
push!(res, train_df[:COORDS][i])
grids_dict[coord_pair] = res
end
mean_length_dict = Dict()
mean_dest_dict = Dict()
for grid_coord in keys(grids_dict)
paths = grids_dict[grid_coord]
endpoints = [x[:,end] for x in paths]
path_lens = [int(length(x)/2)::Int64 for x in paths]
#println("hey")
mean_dest_dict[grid_coord] = mean(endpoints)
#println("2222")
mean_length_dict[grid_coord] = round(mean(path_lens))
end
return mean_length_dict, mean_dest_dict
end
mean_length_dict, mean_dest_dict = getGridDicts(taxi_df, taxi_validation_df)
In [129]:
values(mean_length_dict)
Out[129]:
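A minimal sketch of how these grid dictionaries could drive a baseline prediction: look up each validation trip's rounded start cell, falling back to the median over all grid cells when the cell was never seen in training (the fallback choice and the resulting variable names are assumptions, not from the notebook):
    # predicted coordinate count per validation trip from the start-cell grid,
    # with an assumed fallback for unseen cells; 15 s per coordinate, as elsewhere
    fallback_len = median(collect(values(mean_length_dict)))
    guess_lens_grid = [haskey(mean_length_dict, g) ? mean_length_dict[g] : fallback_len for g in taxi_validation_df[:GRID_START]]
    guess_times_grid = [(n - 1) * 15 for n in guess_lens_grid]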
In [122]:
pairs = collect(keys(grids_dict))
grid_coord = pairs[1]
p = grids_dict[grid_coord]
println("num paths: ", length(p))
endpoints = [x[:,end] for x in p]
paths_lens = [int(length(x)/2)::Int64 for x in p]
println("endpoiktns and paths")
println(mean(paths_lens))
println(length(p[1]))
for x in p
println(length(x))
end
println(mean(endpoints))
In [90]:
sum([haskey(grids_dict, x) for x in taxi_validation_df[:GRID_START]])
mean([c for c in train_df[:END][1:5]])
median(train_df[:NUM_COORDS][1:5])
Out[90]:
In [ ]:
function GetDateInfo(df)
if haskey(df, :DAYOFWEEK)
return df
end
# (date-feature construction not included in this cell)
return df
end
function GetDistanceData(df)
if haskey(df, :DISTANCE)
return df
end
# (distance-feature construction not included in this cell)
return df
end
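The two stubs above only guard against recomputing an existing column. As an annotation, here is a minimal sketch of the kind of date feature GetDateInfo presumably builds, assuming the dataframe still carries the raw Kaggle TIMESTAMP column as Unix epoch seconds (the column name, units, and UTC handling are all assumptions):
    function GetDateInfoSketch(df)
        if haskey(df, :DAYOFWEEK)
            return df
        end
        # days since the epoch; 1970-01-01 was a Thursday, so shift by 3 to get
        # Monday = 1 ... Sunday = 7, ignoring time zones for simplicity
        df[:DAYOFWEEK] = [mod(div(t, 86400) + 3, 7) + 1 for t in df[:TIMESTAMP]]
        return df
    end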
In [48]:
function euclideanDist(p1, p2)
return sqrt((p1[1]-p2[1])^2 + (p1[2]-p2[2])^2)
end
function findClosestTrainingExample2(all_train_coords, test_path, w=1)
num_paths = length(all_train_coords)
best_dist = 9999.0
best_path = all_train_coords[1]
for k=1:num_paths
train_path = all_train_coords[k]
if w <= 1
dist = DTWDistance(train_path, test_path)
else
dist = DTWDistance(train_path, test_path, w)
end
#println("k=", k, " dist=", dist)
if dist < best_dist
#println(k, ", old best: ", best_dist, " new best: ", dist)
best_dist = dist
best_path = all_train_coords[k]
end
end
return best_path
end
# Create a DTW/START_DIFF metric
function findClosestTrainingExample3(all_train_coords, test_path, w=1)
num_paths = length(all_train_coords)
best_score = 999999.0
best_path = all_train_coords[1]
for k=1:num_paths
train_path = all_train_coords[k]
# Truncate a copy of the test path so the original isn't progressively shortened across loop iterations.
test_path_trunc = test_path[:,1:min(end, size(train_path,2)+2)]
if size(train_path,2) < size(test_path_trunc, 2)
continue
end
if w <= 1
dist = DTWDistance(train_path, test_path_trunc)
else
dist = DTWDistance(train_path, test_path_trunc, w)
end
start_diff = float(euclideanDist(test_path_trunc[:,1], train_path[:,1])+0.000001)
# score is 1/dist * 1/start_diff = 1/(dist*start_diff)
# we'll minimize score = (dist*start_diff)
score = float(dist * start_diff)
if score < best_score
best_score = score
best_path = all_train_coords[k]
end
end
return best_path
end
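# Note: unlike the two scanners above, this returns a predicted path length (the median
# NUM_COORDS of the num_avg nearest neighbours under the DTW * start-diff score), not a closest path.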
function findClosestTrainingExamplesDTWstarDiffAvg(all_train_coords, test_path, w=2, num_avg=30)
num_paths = length(all_train_coords)
dists = [float(DTWDistance(train_path, test_path, w)) for train_path in all_train_coords]
start_diffs = [float(euclideanDist(test_path[:,1], train_path[:,1])+0.0000001) for train_path in all_train_coords]
num_coords = [size(train_path,2) for train_path in all_train_coords]
df = DataFrame(DISTS = dists, START_DIFF = start_diffs, NUM_COORDS = num_coords, DTW_START_DIFF = dists .* start_diffs)
sort!(df, cols=[:DTW_START_DIFF])
avg_length = median(df[:NUM_COORDS][1:min(end,num_avg)])
return int(avg_length)
end
function findClosestTrainingExampleForTestSet2(train_df, test_df, max_subset=1000, w=1)
all_train_paths = train_df[:COORDS]
all_test_paths = test_df[:COORDS]
num_train_paths = length(all_train_paths)
num_test_paths = length(all_test_paths)
train_paths_subset = all_train_paths[7000:(7000+max_subset)]
closest_examples = cell(num_test_paths)
for k=1:num_test_paths
if k % 20 == 0
println(k, "/", num_test_paths, " for ", num_train_paths, " train path examples")
end
test_path = all_test_paths[k]
test_path_len = size(test_path, 2)
test_taxi_id = test_df[:TAXI_ID][k]
same_taxi_df = train_df[train_df[:TAXI_ID] .== test_taxi_id,:]
if size(same_taxi_df,1) == 0
println("no taxi ids found for ", test_taxi_id)
closest_training_example = findClosestTrainingExample(train_paths_subset, test_path, w)
closest_examples[k] = closest_training_example
else
#println(size(same_taxi_df,1), " - number of taxi id's routes")
#println("test path length: ", test_path_len)
#println("taxi id: ", test_taxi_id)
same_taxi_paths = same_taxi_df[same_taxi_df[:NUM_COORDS] .>= test_path_len,:][:COORDS][1:min(end,max_subset)]
println(size(same_taxi_df,1), " - number of this taxi id's routes, of which ", length(same_taxi_paths),
" are at least length ", test_path_len, "!")
if length(same_taxi_paths) == 0
println("all paths filtered out! resorting to random search again!")
closest_training_example = findClosestTrainingExample2(train_paths_subset, test_path, w)
closest_examples[k] = closest_training_example
else
closest_training_example = findClosestTrainingExample2(same_taxi_paths, test_path, w)
closest_examples[k] = closest_training_example
end
end
end
return closest_examples
end
function findClosestTrainingExampleForTestSet3(train_df, test_df, max_subset=4000, w=1)
all_train_paths = train_df[:COORDS]
all_test_paths = test_df[:COORDS]
num_train_paths = length(all_train_paths)
num_test_paths = length(all_test_paths)
train_paths_subset = all_train_paths[7000:(7000+max_subset)]
closest_examples = cell(num_test_paths)
for k=1:num_test_paths
if k % 20 == 0
println(k, "/", num_test_paths, " for ", num_train_paths, " train path examples")
end
test_path = all_test_paths[k]
test_path_len = size(test_path, 2)
test_taxi_id = test_df[:TAXI_ID][k]
same_taxi_df = train_df[train_df[:TAXI_ID] .== test_taxi_id,:]
if size(same_taxi_df,1) == 0
println("no taxi ids found for ", test_taxi_id)
closest_training_example = findClosestTrainingExample3(train_paths_subset, test_path, w)
closest_examples[k] = closest_training_example
else
#println(size(same_taxi_df,1), " - number of taxi id's routes")
#println("test path length: ", test_path_len)
#println("taxi id: ", test_taxi_id)
same_taxi_paths = same_taxi_df[same_taxi_df[:NUM_COORDS] .>= test_path_len,:][:COORDS]
println(size(same_taxi_df,1), " - number of this taxi id's routes, of which ", length(same_taxi_paths),
" are at least length ", test_path_len, "!")
if length(same_taxi_paths) == 0
println("all paths filtered out! resorting to random search again!")
closest_training_example = findClosestTrainingExample3(train_paths_subset, test_path, w)
closest_examples[k] = closest_training_example
else
closest_training_example = findClosestTrainingExample3(same_taxi_paths, test_path, w)
closest_examples[k] = closest_training_example
end
end
end
return closest_examples
end
function avgDTWstartDiffScore(train_df, test_df, max_subset=4000, w=2, num_avg=30)
all_train_paths = train_df[:COORDS]
all_test_paths = test_df[:COORDS]
num_train_paths = length(all_train_paths)
num_test_paths = length(all_test_paths)
train_paths_subset = all_train_paths[7000:(7000+max_subset)]
closest_examples = cell(num_test_paths)
for k=1:num_test_paths
if k % 20 == 0
println(k, "/", num_test_paths, " for ", num_train_paths, " train path examples")
end
test_path = all_test_paths[k]
test_path_len = size(test_path, 2)
test_taxi_id = test_df[:TAXI_ID][k]
same_taxi_df = train_df[train_df[:TAXI_ID] .== test_taxi_id,:]
if size(same_taxi_df,1) == 0
println("no taxi ids found for ", test_taxi_id)
closest_training_example = findClosestTrainingExamplesDTWstarDiffAvg(train_paths_subset, test_path, w, num_avg)
closest_examples[k] = closest_training_example
else
#println(size(same_taxi_df,1), " - number of taxi id's routes")
#println("test path length: ", test_path_len)
#println("taxi id: ", test_taxi_id)
same_taxi_paths = same_taxi_df[same_taxi_df[:NUM_COORDS] .>= test_path_len,:][:COORDS]
println(size(same_taxi_df,1), " - number of this taxi id's routes, of which ", length(same_taxi_paths),
" are at least length ", test_path_len, "!")
if length(same_taxi_paths) == 0
println("all paths filtered out! resorting to random search again!")
closest_training_example = findClosestTrainingExamplesDTWstarDiffAvg(train_paths_subset, test_path, w, num_avg)
closest_examples[k] = closest_training_example
else
closest_training_example = findClosestTrainingExamplesDTWstarDiffAvg(same_taxi_paths, test_path, w, num_avg)
closest_examples[k] = closest_training_example
end
end
end
return closest_examples
end
Out[48]:
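DTWDistance comes from the author's sequenceCompare module and isn't shown in this notebook. As an annotation, here is a minimal sketch of what a windowed DTW over 2xN coordinate matrices typically computes; the squared-Euclidean point cost, the Sakoe-Chiba band handling, the final sqrt, and the function name are assumptions, not necessarily what sequenceCompare implements:
    function dtw_distance_sketch(a, b, w=1)
        n, m = size(a, 2), size(b, 2)
        w = max(w, abs(n - m))          # the band must at least cover the length difference
        D = fill(Inf, n + 1, m + 1)     # D[i+1, j+1] = best cost aligning a[:,1:i] with b[:,1:j]
        D[1, 1] = 0.0
        for i = 1:n
            for j = max(1, i - w):min(m, i + w)
                cost = (a[1, i] - b[1, j])^2 + (a[2, i] - b[2, j])^2   # squared euclidean point cost
                D[i + 1, j + 1] = cost + min(D[i, j + 1], D[i + 1, j], D[i, j])
            end
        end
        return sqrt(D[n + 1, m + 1])
    end
    # e.g. dtw_distance_sketch(taxi_df[:COORDS][1], taxi_df[:COORDS][2], 2)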
In [ ]:
function startEndPrediction(train_paths, test_paths)
num_test_examples = length(test_paths)
end
In [16]:
println("SUBMISSION PREDICTION")
# 1. Using subset of all_train_coords and scanning it for each test example
#all_train_coords = taxi_df[:COORDS][1:20]
#all_validation_coords = taxi_validation_df[:COORDS]
#test_guess_paths = findClosestTrainingExampleForTestSet(all_train_coords, all_validation_coords)
#taxi_validation_df[:GUESS_PATHS] = test_guess_paths
# 2. Scanning paths of same taxi
all_train_coords = taxi_df[:COORDS]
all_validation_coords = taxi_validation_df[:COORDS]
test_guess_lengths = avgDTWstartDiffScore(taxi_df, taxi_validation_df, 4000, 2, 30)
taxi_validation_df[:GUESS_NUM_COORDS] = test_guess_lengths
Out[16]:
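The guessed coordinate counts above convert to travel times at 15 s per GPS sample, matching the scoring functions further down. A minimal sketch of that conversion for a submission column; the 660 s floor mirrors the clamp applied in the scorer and is an assumption here, as is the variable name:
    guess_times_from_counts = [max((n - 1) * 15, 660) for n in taxi_validation_df[:GUESS_NUM_COORDS]]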
In [50]:
println("Local prediction")
#scanning subset of train paths for all test paths
#all_train_coords = taxi_df[:COORDS][10000:10400]
#test_df = tail(taxi_df, 100)
#all_test_coords = test_df[:COORDS_TEST]
#test_guess_paths = findClosestTrainingExampleForTestSet(all_train_coords, all_test_coords, 2)
#test_df[:GUESS_PATHS] = test_guess_paths
#looking at paths for same driver
train_df = taxi_df[1:90000,:]
test_df = tail(taxi_df, 10)
test_coords = test_df[:COORDS]
test_df[:COORDS] = test_df[:COORDS_TEST]
test_guess_lengths = avgDTWstartDiffScore(train_df, test_df, 1000, 2, 140)
test_df[:GUESS_NUM_COORDS] = test_guess_lengths
test_df[:COORDS] = test_coords #change it back for scoring
score_coords_guess(test_df)
In [52]:
function score_path_guess(test_df)
pred_paths = test_df[:GUESS_PATHS]
actual_paths = test_df[:COORDS]
pred_times = [((size(x,2)-1)*15)::Int64 for x in pred_paths]
actual_times = [((size(x,2)-1)*15)::Int64 for x in actual_paths]
score = sqrt(mean((log(max(pred_times, 660)+1)-log(actual_times+1)).^2))
println("time score: ", score)
for k=1:length(pred_times)
println("pred: ", pred_times[k], ", actual: ", actual_times[k], ", delta error: ", pred_times[k]-actual_times[k])
end
end
function score_coords_guess(test_df)
pred_num_coords = test_df[:GUESS_NUM_COORDS]
actual_paths = test_df[:COORDS]
pred_times = [((x-1)*15)::Int64 for x in pred_num_coords]
actual_times = [((size(x,2)-1)*15)::Int64 for x in actual_paths]
score = sqrt(mean((log(max(pred_times, 660)+1)-log(actual_times+1)).^2))
println("time score: ", score)
for k=1:length(pred_times)
println("pred: ", pred_times[k], ", actual: ", actual_times[k], ", delta error: ", pred_times[k]-actual_times[k])
end
end
score_path_guess(test_df)
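For reference, the score computed above is an RMSLE over travel times in seconds, with the prediction floored at 660 s before the logs are taken. A quick numeric check of the scale of one term:
    abs(log(661) - log(601))   # ≈ 0.095: predicting 660 s against an actual 600 s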
In [126]:
log([1 2 3])
Out[126]:
In [17]:
#taxi_validation_df[:GUESS_PATHS] = test_guess_paths
#guess_times = [length(x)*15 for x in test_guess_paths]
all_test_paths = taxi_validation_df[:COORDS]
test_guess_paths = taxi_validation_df[:GUESS_PATHS]
num_test_examples = length(test_guess_paths)
guess_times = Array(Int64, num_test_examples)
dest_coords = cell(num_test_examples)
for k=1:num_test_examples
test_path = all_test_paths[k]
best_guess_path = test_guess_paths[k]
test_path_time = size(test_path, 2)*15        # length() of a 2xN matrix would double-count the samples
best_guess_time = size(best_guess_path, 2)*15
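# The submitted time can never be shorter than the time already elapsed in the partial test trip, hence the max() below.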
if test_path_time > best_guess_time
println(k, ": guessing ", best_guess_time, " but existing time is ", test_path_time)
#best_guess_time = test_path_time + 100
end
guess_times[k] = max(test_path_time, best_guess_time)
end
submission_validation = guess_times
Out[17]:
In [174]:
size(taxi_df[:COORDS][1],2)
Out[174]:
In [18]:
# beat the benchmark example
#mean_time = mean(times_validation)
#submission_validation = [max(x, mean_time) for x in times_validation]
#submission_validation
df_submission = DataFrame()
df_submission[:TRIP_ID] = taxi_validation_df[:TRIP_ID]
df_submission[:TRAVEL_TIME] = submission_validation
writetable("7th_submission_DTW_START_DIFF.csv", df_submission)
In [27]:
lats = [float(x[2,end])::Float64 for x in taxi_validation_df[:GUESS_PATHS]]
lons = [float(x[1,end])::Float64 for x in taxi_validation_df[:GUESS_PATHS]]
df_submission = DataFrame()
df_submission[:TRIP_ID] = taxi_validation_df[:TRIP_ID]
df_submission[:LATITUDE] = lats
df_submission[:LONGITUDE] = lons
writetable("1th_sub_endpoint_DTW_START_DIFF.csv", df_submission)
In [388]:
immutable Point2{T}
x::T
y::T
end
D = [Point2(1.,2.) => 42]
haskey(D, Point2(1., 2.)) #False!
Out[388]:
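A hedged workaround for the lookup miss above, assuming the default hash for this immutable isn't value-based in the Julia version in use: key the Dict on plain tuples, which hash and compare by value (or define Base.hash and Base.isequal for Point2).
    D2 = [(1., 2.) => 42]
    haskey(D2, (1., 2.))   # true: tuple keys compare and hash by value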
In [39]:
taxi_validation_df[:COORDS]
Out[39]: