In [389]:
    
# Ideas to try
# [ ] Create a coords database
#    [ ] use knn
#    [ ] coords -> SequenceId, remainingTripLength
#        [ ] average the times from each of the trips with respect to remaining trip length
#        per_coord_distances = zeros(k)
#        for c_i in coords c_1..c_k,
#            distances = []
#            for seq in D[c_i]:
#                remaining_trip = coords_df[seq.SequenceId][seq.Index:end]
#                push!(distances, length(remaining_trip))
#            per_coord_distances[i] = average(distances)
#        return average of per_coord_distances
#        [ ] use 2/3/n-gram coords for distances
#
#
#
    
In [2]:
    
using DataFrames
using JSON
using Iterators
using taxis
using HDF5, JLD
#reload("taxis")
nprocs()
    
    Out[2]:
In [3]:
    
# Load the training and validation trip tables from CSV and derive, for every trip,
# a numeric COORDS array and a NUM_COORDS count; the raw POLYLINE column is removed
# from both tables at the end.
println("Begin")
println("loading csv files")
taxi_df = readtable("/home/tony/ML/taxi/taxi2_time/train_200k.csv")
taxi_validation_df = readtable("/home/tony/ML/taxi/taxi2_time/test.csv")
println("loading coords")
# Each POLYLINE cell is parsed as JSON, its elements are splatted into hcat, and the
# result is converted with float().
# NOTE(review): presumably each cell is a JSON list of [x, y] pairs, which would make
# every COORDS entry a 2xN matrix (one column per point) -- confirm against the data.
taxi_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in  taxi_df[:POLYLINE]]
taxi_validation_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in taxi_validation_df[:POLYLINE]]
println("getting coords counts")
# NUM_COORDS is length(x), i.e. the total number of elements in each COORDS array.
# NOTE(review): if COORDS is 2xN this is 2N, not the number of points -- confirm intent.
taxi_df[:NUM_COORDS] = [length(x)::Int64 for x in taxi_df[:COORDS]]
taxi_validation_df[:NUM_COORDS] = [length(x)::Int64 for x in taxi_validation_df[:COORDS]]
# The raw text column is no longer needed once COORDS has been derived.
delete!(taxi_validation_df, :POLYLINE)
delete!(taxi_df, :POLYLINE)
println("done!")
    
    
In [4]:
    
# Notebook display cell: show the `describe` summary of the training table.
describe(taxi_df)
    
    
In [5]:
    
# Notebook display cell: show the first rows of the validation table.
head(taxi_validation_df)
    
    Out[5]:
In [6]:
    
# Generic 2-D point whose x and y fields share the same type T.
# Declared with `type`, so instances are mutable.
type Point{T}
  x::T
  y::T
end
# Reference to one position inside a trip's coordinate sequence.
# ConstructCoordsDatabase2 creates these as SequenceRef(trip_id, j, num_coords - j).
type SequenceRef
    # Identifier of the trip the position belongs to.
    TripId::Int64
    # Index j of the position within that trip's coordinate columns.
    SequenceIndex::Int64
    # Number of positions that follow this one in the trip (num_coords - j).
    LengthRemaining::Int64
end
    
In [12]:
    
#describe(taxi_validation_df)
    
In [7]:
    
# Concatenate every per-trip COORDS array of each table into one combined array by
# splatting the whole column into a single hcat call.
println("finding unique number of coords")
all_coords_val = hcat(taxi_validation_df[:COORDS]...)
all_coords = hcat(taxi_df[:COORDS]...)
    
    
    
In [ ]:
    
# Return the set of distinct (c[r,1], c[r,2]) pairs found in the rows of `c` after
# rounding every element to `round_len` digits. Also prints the total element count
# of `c` next to the number of distinct pairs.
#
# Arguments:
#   c         - numeric matrix read row by row: column 1 and column 2 of each row form
#               a pair; both values must be Float64 after rounding (asserted).
#   round_len - number of digits passed to round (default 4).
#
# NOTE(review): this reads one pair per ROW (N x 2), whereas ConstructCoordsDatabase2
# reads one pair per COLUMN (coords[1,j], coords[2,j]) -- confirm which layout callers
# pass here.
function GetUniqueCoords(c, round_len=4)
    rounded = round(c, round_len)
    row_pairs = [(rounded[r,1]::Float64, rounded[r,2]::Float64) for r = 1:size(rounded, 1)]
    unique_pairs = Set(row_pairs)
    println("all coords: ", length(rounded), ", unique coords: ", length(unique_pairs))
    return unique_pairs
end
# Return a random subset of the rows of `df`, kept in their original relative order.
#
# Arguments:
#   df          - table-like object supporting size(df, 1) and df[rows, :].
#   num_records - number of rows to keep (default 100000). Requests larger than the
#                 number of rows are clamped, so the whole table is returned instead
#                 of raising a BoundsError.
#
# Returns df[idxs, :] where idxs is a sorted random selection of row indices.
function GetTableOrderedSubset(df, num_records=100000)
    total_rows = size(df, 1)
    # Never ask for more rows than the table holds.
    keep = min(num_records, total_rows)
    # Shuffle all row indices, take the first `keep`, then sort so that the selected
    # rows stay in table order.
    idxs = sort(shuffle(collect(1:total_rows))[1:keep])
    return df[idxs, :]
end
# Convenience wrapper: pull the :TRIP_ID and :COORDS columns out of `df`, print their
# lengths (without a trailing newline), and build the coordinate index from them with
# ConstructCoordsDatabase2.
#
# Arguments:
#   df        - table indexable by :TRIP_ID and :COORDS.
#   round_len - number of digits used when rounding coordinates (default 5).
function ConstructCoordsDatabase(df, round_len=5)
    trip_ids = df[:TRIP_ID]
    trip_coords = df[:COORDS]
    print(length(trip_ids), ", ", length(trip_coords))
    return ConstructCoordsDatabase2(trip_ids, trip_coords, round_len)
end
# Build an index from rounded coordinate pairs to every place they occur in the trips.
#
# Arguments:
#   TripIds   - collection of trip identifiers; TripIds[i] belongs to Coords[i].
#   Coords    - collection of per-trip coordinate matrices, read column by column:
#               (coords[1,j], coords[2,j]) is the j-th point of the trip.
#   round_len - number of digits passed to round before a pair is used as a key
#               (default 5).
#
# Returns a Dict keyed by (Float64, Float64) pairs whose values are arrays of
# SequenceRef(trip_id, j, points_in_trip - j). Trips with an empty coordinate array
# are skipped.
function ConstructCoordsDatabase2(TripIds, Coords, round_len=5)
    D = Dict{(Float64,Float64),Array{SequenceRef,1}}()
    num_trips = size(TripIds, 1)
    for i = 1:num_trips
        trip_id = TripIds[i]
        raw_coords = Coords[i]
        if length(raw_coords) == 0
            continue
        end

        rounded = round(Coords[i], round_len)
        num_points = size(rounded, 2)
        for j = 1:num_points
            coord_pair = (rounded[1,j], rounded[2,j])
            ref = SequenceRef(trip_id, j, num_points - j)
            if haskey(D, coord_pair)
                push!(D[coord_pair], ref)
            else
                D[coord_pair] = [ref]
            end
        end
    end

    return D
end
# Placeholder for splitting `df` into training and test sets.
# Nothing is implemented yet: the argument is ignored and `nothing` is returned.
function CreateTrainingTestSet(df)
    return nothing
end
    
In [9]:
    
# Take a random, order-preserving sample of 20000 training trips and index their
# coordinates, rounded with round_len = 4, into coordsDB.
small_taxi_df = GetTableOrderedSubset(taxi_df, 20000)
coordsDB = ConstructCoordsDatabase(small_taxi_df, 4)
    
    
    Out[9]:
In [ ]:
    
# For every coordinate key in coordsDB, count how many entries are stored under it,
# then summarise the distribution of those counts.
coord_counts = [length(x)::Int64 for x in values(coordsDB)]
#coord_counts
describe(coord_counts)
    
In [ ]:
    
# Write the table bound to the global name `df` to "output.dat", using ',' as the
# separator and a single quote as the quote mark.
#
# The original body issued this exact call twice in a row; the second call only
# rewrote the same file with the same contents, so it has been removed.
#
# NOTE(review): `df` is not a parameter and is not defined in the visible cells, and
# the plural name suggests a second table / output path was intended -- confirm.
function saveDfs()
    writetable("output.dat", df, quotemark = '\'', separator = ',')
end
    
In [ ]:
    
# Return `df` with date information attached.
#
# If `df` already has a :DAYOFWEEK entry it is returned as is. Deriving the column is
# not implemented yet, so `df` is currently returned unchanged in that case as well.
# (The original was missing the `end` of the `if`, which left the function unclosed.)
function GetDateInfo(df)
    if haskey(df, :DAYOFWEEK)
        return df
    end
    # TODO: compute the :DAYOFWEEK column here.
    return df
end

# Return `df` with distance information attached.
#
# If `df` already has a :DISTANCE entry it is returned as is. Deriving the column is
# not implemented yet, so `df` is currently returned unchanged in that case as well.
# (The original was missing the `end` of the `if`, which left the function unclosed.)
function GetDistanceData(df)
    if haskey(df, :DISTANCE)
        return df
    end
    # TODO: compute the :DISTANCE column here.
    return df
end
    
In [387]:
    
# Work-in-progress estimator for a coordinate sequence.
#
# Arguments:
#   coords    - coordinate matrix read column by column (one point per column).
#   round_len - number of digits used when rounding (default 5, the same default as
#               ConstructCoordsDatabase2).
#
# Only the preparation step exists so far: the coordinates are rounded and the number
# of columns is returned. The original body referred to `Coords[i]` and `round_len`,
# neither of which was defined inside this function; it now uses the `coords`
# argument and an optional `round_len` parameter instead.
function GetAverageDistanceFromCoordsSequence(coords, round_len=5)
    # Outline of the basic algorithm (author's notes, not implemented yet):
    # for coord_pair c_1..c_n
    #   estimate <- for i=1:n average(15*(i-1) + average(for c_j in D[c_i] * exp(-|L_j-L_i|) * L_j)
    coords = round(coords, round_len)
    num_coords = size(coords, 2)
    return num_coords
end
# Work-in-progress: intended to compute average distances through a coordinate
# dictionary. At present it only builds and returns that dictionary, mapping each
# rounded (coords[1,j], coords[2,j]) pair to an array of
# SequenceRef(trip_id, j, points_in_trip - j); no averaging is performed yet.
#
# Arguments:
#   TripIds   - collection of trip identifiers; TripIds[i] belongs to Coords[i].
#   Coords    - collection of per-trip coordinate matrices (one point per column).
#   round_len - number of digits passed to round before a pair is used as a key
#               (default 5).
#
# Fixes over the original: the dictionary `D` is now created locally (it was never
# defined inside the function), and the unused, misspelled `distaces` array, built
# with an invalid zero-dimensional constructor, has been removed.
function GetAverageDistancesByCoordsDict(TripIds, Coords, round_len=5)
    D = Dict{(Float64,Float64),Array{SequenceRef,1}}()
    num_trips = size(TripIds, 1)
    for i = 1:num_trips
        trip_id = TripIds[i]
        coords = Coords[i]
        # Trips without any coordinates contribute nothing.
        if length(coords) == 0
            continue
        end

        coords = round(coords, round_len)
        num_coords = size(coords, 2)
        for j = 1:num_coords
            coord_pair = (coords[1,j], coords[2,j])
            if !haskey(D, coord_pair)
                D[coord_pair] = [SequenceRef(trip_id, j, num_coords - j)]
            else
                push!(D[coord_pair], SequenceRef(trip_id, j, num_coords - j))
            end
        end
    end

    return D
end
    
    Out[387]:
In [44]:
    
# Build a submission table from `times_validation` and write it to CSV.
# NOTE(review): `times_validation` is not defined in the visible cells -- confirm
# where it is computed.
mean_time = mean(times_validation)
# Every value below the mean is raised to the mean (element-wise max with mean_time).
submission_validation = [max(x, mean_time) for x in times_validation]
df_submission = DataFrame()
df_submission[:TRIP_ID] = taxi_validation_df[:TRIP_ID]
df_submission[:TRAVEL_TIME] = submission_validation
writetable("first_submission.csv", df_submission)
    
    Out[44]:
In [46]:
    
    
In [388]:
    
# Experiment: can an immutable value type be used as a Dict key?
immutable Point2{T}
  x::T
  y::T
end
# Top-level assignment: binds the global name D to a one-entry Dict literal.
D = [Point2(1.,2.) => 42]
# Look the entry up with a freshly constructed key that has the same field values;
# the author's inline note records that this returned false.
haskey(D, Point2(1., 2.))  #False!
    
    Out[388]:
In [42]:
    
    
    
In [39]:
    
# Notebook display cell: show the COORDS column of the validation table.
taxi_validation_df[:COORDS]
    
    Out[39]:
In [ ]: