In [389]:
# Ideas to try
# [ ] Create a coords database
#    [ ] use knn
#    [ ] coords -> SequenceId, remainingTripLength
#        [ ] average the times from each of the trips wrt to remaining trip length
#        per_coord_distances = zeros(k)
#        for c_i in coords c_1..c_k,
#            distances = []
#            for seq in D[c_i]:
#                remaining_trip = coords_df[seq.SequenceId][seq.Index:end]
#                push!(length(distances))
#            per_coord_distances = average(distances)    
#        return averaging of per_coord_distances
#        [ ] use 2/3/n-gram coords for distances
#
#
#

In [2]:
using DataFrames
using JSON
using Iterators
using taxis
using HDF5, JLD
#reload("taxis")

nprocs()


Out[2]:
8

In [3]:
println("Begin")

println("loading csv files")
taxi_df = readtable("/home/tony/ML/taxi/taxi2_time/train_200k.csv")
taxi_validation_df = readtable("/home/tony/ML/taxi/taxi2_time/test.csv")

println("loading coords")
taxi_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in  taxi_df[:POLYLINE]]
taxi_validation_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in taxi_validation_df[:POLYLINE]]

println("getting coords counts")
taxi_df[:NUM_COORDS] = [length(x)::Int64 for x in taxi_df[:COORDS]]
taxi_validation_df[:NUM_COORDS] = [length(x)::Int64 for x in taxi_validation_df[:COORDS]]

delete!(taxi_validation_df, :POLYLINE)
delete!(taxi_df, :POLYLINE)

println("done!")


Begin
loading csv files
loading coords
getting coords counts
done!

In [4]:
describe(taxi_df)


TRIP_ID
Min      1.3726368536200003e18
1st Qu.  1.3735286041200005e18
Median   1.3744274916200003e18
Mean     1.374470565520865e18
3rd Qu.  1.3754004771200005e18
Max      1.3764333326200008e18
NAs      0
NA%      0.0%

CALL_TYPE
Length  199999
Type    UTF8String
NAs     0
NA%     0.0%
Unique  3

ORIGIN_CALL
Min      2001.0
1st Qu.  5568.0
Median   17320.0
Mean     23532.490181765093
3rd Qu.  39488.0
Max      63882.0
NAs      158902
NA%      79.45%

ORIGIN_STAND
Min      1.0
1st Qu.  15.0
Median   26.0
Mean     30.249708435920695
3rd Qu.  49.0
Max      63.0
NAs      99678
NA%      49.84%

TAXI_ID
Min      2.0000001e7
1st Qu.  2.0000171e7
Median   2.0000347e7
Mean     2.000035240828704e7
3rd Qu.  2.0000523e7
Max      2.0000981e7
NAs      0
NA%      0.0%

TIMESTAMP
Min      1.372636853e9
1st Qu.  1.3735286035e9
Median   1.374427491e9
Mean     1.3744705649008646e9
3rd Qu.  1.3754004765e9
Max      1.376433332e9
NAs      0
NA%      0.0%

DAY_TYPE
Length  199999
Type    UTF8String
NAs     0
NA%     0.0%
Unique  1

MISSING_DATA
Length  199999
Type    UTF8String
NAs     0
NA%     0.0%
Unique  2

COORDS
Length  199999
Type    Any
NAs     0
NA%     0.0%
Unique  199207

NUM_COORDS
Min      0.0
1st Qu.  54.0
Median   80.0
Mean     94.47731238656193
3rd Qu.  112.0
Max      7142.0
NAs      0
NA%      0.0%


In [5]:
head(taxi_validation_df)


Out[5]:
TRIP_IDCALL_TYPEORIGIN_CALLORIGIN_STANDTAXI_IDTIMESTAMPDAY_TYPEMISSING_DATACOORDSNUM_COORDS
1T1BNA15200005421408039037AFalse[-8.585676 -8.585712 -8.585685 -8.58573 -8.585982 -8.586396 -8.586072 -8.586324 -8.586999 -8.586576 -8.584884 41.148522 41.148639 41.148855 41.148927 41.148963 41.148954 41.14872 41.147847 41.14746 41.147154 41.146623]22
2T2BNA57200001081408038611AFalse[-8.610876 -8.610858 -8.610903 -8.610444 -8.609445 -8.608896 -8.608968 -8.608707 -8.608347 -8.608149 -8.608041 -8.607654 -8.607348 -8.607393 -8.607357 -8.606817 -8.606358 -8.605719 -8.604981 -8.604783 -8.604828 -8.604801 -8.604648 -8.604522 -8.604513 -8.604378 -8.604378 -8.604369 -8.60436 -8.604162 -8.604126 -8.60409 -8.60409 -8.604108 -8.604126 -8.604135 -8.60391 -8.602929 -8.602551 -8.601894 41.14557 41.145579 41.145768 41.146191 41.146758 41.147118 41.147127 41.147532 41.148117 41.148351 41.148576 41.14926 41.149899 41.149899 41.149962 41.150979 41.151915 41.152788 41.153319 41.154345 41.154372 41.155353 41.156775 41.158197 41.159943 41.160555 41.1606 41.160645 41.160807 41.161176 41.161248 41.161293 41.161266 41.161239 41.161194 41.161275 41.162049 41.162832 41.163111 41.163597]80
3T3BNA15200003701408038568AFalse[-8.585739 -8.58573 -8.585721 -8.586288 -8.586117 -8.586198 -8.586279 -8.587152 -8.585685 -8.584281 -8.583075 -8.581365 -8.579511 -8.579349 -8.579232 -8.5797 -8.578728 -8.580024 -8.581518 -8.582022 -8.582409 -8.582679 -8.582895 -8.582985 -8.582949 -8.582742 -8.582508 -8.582346 -8.58213 -8.5815 -8.580978 -8.580411 -8.579871 -8.578791 -8.577621 -8.577513 -8.577342 -8.576946 -8.575902 -8.574903 41.148558 41.148828 41.148972 41.149017 41.148567 41.148315 41.147919 41.147325 41.146839 41.1462 41.145111 41.14485 41.145651 41.145858 41.146308 41.147847 41.150241 41.152077 41.153292 41.153832 41.15439 41.154903 41.155578 41.156487 41.156991 41.157774 41.158386 41.158773 41.159295 41.160816 41.162715 41.163966 41.164038 41.164767 41.165361 41.165415 41.165739 41.166441 41.167233 41.167719]80
4T4BNA53200004921408039090AFalse[-8.613963 -8.614125 -8.615088 -8.615277 -8.615259 -8.615241 -8.615052 -8.614638 41.141169 41.141124 41.140926 41.140818 41.140809 41.1408 41.140818 41.14098]16
5T5BNA18200006211408039177AFalse[-8.619903 -8.619894 41.148036 41.148036]4
6T6A42612NA200006071408037146AFalse[-8.630613 -8.630613 -8.630739 -8.631513 -8.631306 -8.630028 -8.629425 -8.629173 -8.629056 -8.62866 -8.627958 -8.627967 -8.627985 -8.627778 -8.627157 -8.626275 -8.626221 -8.626419 -8.626419 -8.626428 -8.626428 -8.626419 -8.62641 -8.626401 -8.62641 -8.62641 -8.626419 -8.626419 -8.626419 -8.626419 -8.626437 -8.626446 -8.626446 -8.626446 -8.626437 -8.626428 -8.626437 -8.626437 -8.626437 -8.626428 -8.626428 -8.626437 -8.626446 -8.626446 -8.626446 -8.626437 -8.626437 -8.626446 -8.626446 -8.626446 -8.626446 -8.626446 -8.626446 -8.626437 -8.626446 -8.626455 -8.626464 -8.626464 -8.626464 -8.626464 -8.626455 -8.626455 -8.626446 -8.626446 -8.626437 -8.626437 -8.626437 -8.626446 -8.626455 -8.626464 -8.626464 -8.626464 -8.626455 -8.626455 -8.626446 -8.626446 -8.626437 -8.626437 -8.626446 -8.626446 -8.626446 -8.626455 -8.626464 -8.626464 -8.626464 -8.626473 -8.626473 -8.626464 -8.626464 -8.626455 -8.626455 -8.626455 -8.626455 -8.626446 -8.626437 -8.626437 -8.626437 -8.626446 -8.626446 -8.626446 -8.626446 -8.626455 -8.626455 -8.626464 -8.626473 -8.626473 -8.626473 -8.626509 -8.626689 -8.626698 -8.62668 -8.626662 -8.626878 -8.626842 -8.626833 -8.626842 -8.626842 -8.626851 -8.626842 -8.626833 -8.626824 -8.626779 -8.626509 -8.626455 -8.626455 -8.626455 -8.626428 -8.62641 -8.62641 -8.626419 -8.626419 -8.626428 -8.626419 -8.626401 -8.626392 -8.626401 -8.62641 41.178249 41.178249 41.178231 41.178141 41.178114 41.177889 41.177817 41.17797 41.178042 41.177394 41.176152 41.176125 41.176107 41.175684 41.174127 41.172588 41.171922 41.171895 41.171904 41.171913 41.171913 41.171913 41.171922 41.171922 41.171931 41.171931 41.171931 41.171922 41.171922 41.171913 41.171913 41.171922 41.171913 41.171913 41.171913 41.171913 41.171913 41.171904 41.171904 41.171913 41.171904 41.171895 41.171895 41.171886 41.171886 41.171886 41.171877 41.171877 41.171895 41.171895 41.171895 41.171913 41.171913 41.171922 41.171922 41.171931 41.171922 41.171922 41.171931 41.171922 41.171922 41.171922 41.171931 41.171922 41.171922 41.171913 41.171913 41.171913 41.171913 41.171913 41.171913 41.171904 41.171904 41.171904 41.171913 41.171913 41.171922 41.171922 41.171922 41.171913 41.171904 41.171904 41.171895 41.171895 41.171895 41.171895 41.171895 41.171904 41.171913 41.171913 41.171913 41.171913 41.171904 41.171904 41.171913 41.171922 41.171922 41.171931 41.171931 41.17194 41.171949 41.171949 41.171949 41.17194 41.171931 41.171931 41.171931 41.171931 41.171895 41.171895 41.171868 41.171859 41.171886 41.171859 41.171886 41.171904 41.171904 41.171904 41.171904 41.171904 41.171859 41.171832 41.171886 41.171904 41.171904 41.171904 41.171913 41.171922 41.171922 41.171913 41.171913 41.171913 41.171922 41.17194 41.171958 41.171967 41.171958]274

In [6]:
type Point{T}
  x::T
  y::T
end

type SequenceRef
    TripId::Int64
    SequenceIndex::Int64
    LengthRemaining::Int64
end

Data Analysis


In [12]:
#describe(taxi_validation_df)

In [7]:
println("finding unique number of coords")
all_coords_val = hcat(taxi_validation_df[:COORDS]...)
all_coords = hcat(taxi_df[:COORDS]...)


finding unique number of coords
interrupt
while loading In[7], in expression starting on line 3

 in promote_eltype at abstractarray.jl:506 (repeats 7580 times)
 in cat at abstractarray.jl:675
 in hcat at abstractarray.jl:737

In [ ]:
function GetUniqueCoords(c, round_len=4)
    c = round(c, round_len)
    c2 = Set([(c[i,1]::Float64, c[i,2]::Float64) for i in 1:size(c, 1)])
    println("all coords: ", length(c), ", unique coords: ", length(c2))
    return c2
end

function GetTableOrderedSubset(df, num_records=100000)
    idxs = sort(shuffle([1:size(df,1)])[1:num_records])
    return df[idxs,:]
end

function ConstructCoordsDatabase(df, round_len=5)
    print(length(df[:TRIP_ID]), ", ", length(df[:COORDS]))
    return ConstructCoordsDatabase2(df[:TRIP_ID], df[:COORDS], round_len)
end

function ConstructCoordsDatabase2(TripIds, Coords, round_len=5)
    #println("num trip_ids/coords: ", length(TripIds), " ", length(Coords))
    #print("coords:", Coords)
    D = Dict{(Float64,Float64),Array{SequenceRef,1}}()
    num_coords = size(TripIds,1)
    for i=1:num_coords
        trip_id = TripIds[i]

        coords = Coords[i]
        if length(coords) == 0
            continue
        end
        
        coords = round(Coords[i],round_len)       
        
        num_coords = size(coords,2)
        #println("num_coords: ", num_coords)
        for j=1:num_coords
            coord_pair = (coords[1,j], coords[2,j])
            #println("coord_pair: ", coord_pair, ", j/num_coords: ", j, "/", num_coords, ", thing: ", [SequenceRef(trip_id, j, num_coords-j)])
            if !haskey(D, coord_pair)
                D[coord_pair] = [SequenceRef(trip_id, j, num_coords-j)]
            else
                push!(D[coord_pair], SequenceRef(trip_id, j, num_coords-j))
            end        
        end
    end
    
    return D    
end

function CreateTrainingTestSet(df)
    
end

Creating coord dict


In [9]:
small_taxi_df = GetTableOrderedSubset(taxi_df, 20000)
coordsDB = ConstructCoordsDatabase(small_taxi_df, 4)


20000, 20000
Out[9]:
Dict{(Float64,Float64),Array{SequenceRef,1}} with 172374 entries:
  (-8.6458,41.1608) => [SequenceRef(1372667821620000578,25,32),SequenceRef(1372…
  (-8.6346,41.1609) => [SequenceRef(1375947877620000189,3,22)]
  (-8.6369,41.0949) => [SequenceRef(1374258278620000224,201,9)]
  (-8.6419,41.1401) => [SequenceRef(1373023493620000595,18,3),SequenceRef(13748…
  (-8.3737,41.1834) => [SequenceRef(1375343194620000486,60,24)]
  (-8.684,41.1737)  => [SequenceRef(1376230604620000450,53,33)]
  (-8.6114,41.1629) => [SequenceRef(1374676784620000352,37,31)]
  (-8.6465,41.1617) => [SequenceRef(1372668127620000076,35,2)]
  (-8.5799,41.1453) => [SequenceRef(1372673694620000578,11,38),SequenceRef(1372…
  (-8.5478,41.1741) => [SequenceRef(1374912227620000518,82,9)]
  (-8.613,41.1497)  => [SequenceRef(1373666345620000517,32,9),SequenceRef(13737…
  (-8.6488,41.241)  => [SequenceRef(1372683491620000217,78,12)]
  (-8.5911,41.1546) => [SequenceRef(1372975880620000363,30,20),SequenceRef(1373…
  (-7.3548,40.6326) => [SequenceRef(1373269966620000576,487,137)]
  (-8.6075,41.1425) => [SequenceRef(1372697346620000624,46,7),SequenceRef(13728…
  (-8.6423,41.1671) => [SequenceRef(1372666772620000472,41,30),SequenceRef(1372…
  (-8.5891,41.17)   => [SequenceRef(1372808638620000476,13,5),SequenceRef(13732…
  (-8.6367,41.1258) => [SequenceRef(1373473766620000616,7,82),SequenceRef(13747…
  (-8.6048,41.13)   => [SequenceRef(1374689909620000243,21,35)]
  (-8.5786,41.1693) => [SequenceRef(1375326017620000178,62,7)]
  (-8.6157,41.1438) => [SequenceRef(1375375823620000312,4,183)]
  (-8.6346,41.1717) => [SequenceRef(1374299253620000424,36,0),SequenceRef(13759…
  (-8.6635,41.1645) => [SequenceRef(1376033537620000410,18,2)]
  (-8.6584,41.1497) => [SequenceRef(1376298683620000235,7,277)]
  (-8.5574,41.1721) => [SequenceRef(1375093684620000002,61,6)]
  ⋮                 => ⋮

In [ ]:
coord_counts = [length(x)::Int64 for x in values(coordsDB)]
#coord_counts
describe(coord_counts)

In [ ]:
function saveDfs()
   writetable("output.dat", df, quotemark = '\'', separator = ',')
   writetable("output.dat", df, quotemark = '\'', separator = ',')
end

Creating new features


In [ ]:
function GetDateInfo(df)
    if haskey(df, :DAYOFWEEK)
        return df
end

function GetDistanceData(df)
    if haskey(df, :DISTANCE)
        return df
end

Training models


In [387]:
function GetAverageDistanceFromCoordsSequence(coords)
    #Outline of the basic algorithm
    #for coord_pair c_1..c_n
    # estimate <- for i=1:n average(15*(i-1) + average(for c_j in D[c+i] * exp(-|L_j-L_i|) * L_j)
    coords = round(Coords[i],round_len)       
        
        num_coords = size(coords,2)
end

function GetAverageDistancesByCoordsDict(TripIds, Coords, round_len=5)
    distaces = Array{Float64,0}()
    num_coords = size(TripIds,1)
    for i=1:num_coords
        trip_id = TripIds[i]

        coords = Coords[i]
        if length(coords) == 0
            continue
        end
        
        coords = round(Coords[i],round_len)       
        
        num_coords = size(coords,2)
        #println("num_coords: ", num_coords)
        for j=1:num_coords
            coord_pair = (coords[1,j], coords[2,j])
            #println("coord_pair: ", coord_pair, ", j/num_coords: ", j, "/", num_coords, ", thing: ", [SequenceRef(trip_id, j, num_coords-j)])
            if !haskey(D, coord_pair)
                D[coord_pair] = [SequenceRef(trip_id, j, num_coords-j)]
            else
                push!(D[coord_pair], SequenceRef(trip_id, j, num_coords-j))
            end        
        end
    end
    
    return D    
end


Out[387]:
false

Generating Submission


In [44]:
mean_time = mean(times_validation)
submission_validation = [max(x, mean_time) for x in times_validation]

df_submission = DataFrame()
df_submission[:TRIP_ID] = taxi_validation_df[:TRIP_ID]
df_submission[:TRAVEL_TIME] = submission_validation
writetable("first_submission.csv", df_submission)


Out[44]:
320-element Array{Any,1}:
  676.781
  676.781
  676.781
  676.781
  676.781
 2055.0  
  676.781
  676.781
  676.781
 1185.0  
  945.0  
  676.781
  676.781
    ⋮    
  676.781
  676.781
  676.781
 1080.0  
  676.781
 4005.0  
  705.0  
  720.0  
 1410.0  
  676.781
  676.781
  780.0  

In [46]:


In [388]:
immutable Point2{T}
  x::T
  y::T
end

D = [Point2(1.,2.) => 42]
haskey(D, Point2(1., 2.))  #False!


Out[388]:
true

In [42]:



ArgumentError("setindex!(::DataFrame, ...) only broadcasts scalars, not arrays")
while loading In[42], in expression starting on line 2

 in setindex! at /home/tony/.julia/v0.3/DataFrames/src/dataframe/dataframe.jl:356
 in DataFrame at /home/tony/.julia/v0.3/DataFrames/src/dataframe/dataframe.jl:104

In [39]:
taxi_validation_df[:COORDS]


Out[39]:
320-element DataArray{Any,1}:
 2x11 Array{Float64,2}:
 -8.58568  -8.58571  -8.58568  -8.58573  …  -8.587   -8.58658  -8.58488
 41.1485   41.1486   41.1489   41.1489      41.1475  41.1472   41.1466    
 2x40 Array{Float64,2}:
 -8.61088  -8.61086  -8.6109  -8.61044  …  -8.60293  -8.60255  -8.60189
 41.1456   41.1456   41.1458  41.1462      41.1628   41.1631   41.1636    
 2x40 Array{Float64,2}:
 -8.58574  -8.58573  -8.58572  -8.58629  …  -8.57695  -8.5759  -8.5749
 41.1486   41.1488   41.149    41.149       41.1664   41.1672  41.1677     
 2x8 Array{Float64,2}:
 -8.61396  -8.61412  -8.61509  -8.61528  …  -8.61524  -8.61505  -8.61464
 41.1412   41.1411   41.1409   41.1408      41.1408   41.1408   41.141    
 2x2 Array{Float64,2}:
 -8.6199  -8.61989
 41.148   41.148                                                                                                                
 2x137 Array{Float64,2}:
 -8.63061  -8.63061  -8.63074  -8.63151  …  -8.62639  -8.6264  -8.62641
 41.1782   41.1782   41.1782   41.1781      41.172    41.172   41.172    
 2x24 Array{Float64,2}:
 -8.58562  -8.58564  -8.58592  -8.58637  …  -8.58156  -8.58181  -8.58205
 41.1489   41.1489   41.1489   41.1489      41.1533   41.1535   41.1538  
 2x17 Array{Float64,2}:
 -8.58292  -8.582   -8.58108  -8.58011  …  -8.57703  -8.57753  -8.57877
 41.1811   41.1818  41.183    41.184       41.1861   41.1861   41.1852    
 2x43 Array{Float64,2}:
 -8.60653  -8.60667  -8.6068  -8.60679  …  -8.60548  -8.60549  -8.60549
 41.1447   41.1447   41.1447  41.1447      41.1257   41.1258   41.1258    
 2x79 Array{Float64,2}:
 -8.58566  -8.5857  -8.58573  -8.58574  …  -8.59117  -8.58826  -8.58631
 41.1486   41.1486  41.1486   41.1486      41.1942   41.1974   41.1993    
 2x63 Array{Float64,2}:
 -8.59123  -8.59123  -8.59122  -8.591   …  -8.58767  -8.5881  -8.58823
 41.1627   41.1627   41.1627   41.1626     41.1687   41.1689  41.1689      
 2x17 Array{Float64,2}:
 -8.58569  -8.58576  -8.58571  -8.58576  …  -8.59455  -8.59592  -8.59665
 41.1486   41.1487   41.1489   41.149       41.1507   41.1499   41.1494  
 2x5 Array{Float64,2}:
 -8.5801  -8.58023  -8.58152  -8.58252  -8.58433
 41.1594  41.1594   41.1593   41.1589   41.1583                                                   
 ⋮                                                                                                                                                                        
 2x21 Array{Float64,2}:
 -8.61072  -8.61049  -8.6094  -8.6085  …  -8.59074  -8.58956  -8.58816
 41.1445   41.1437   41.1432  41.1431     41.1469   41.1471   41.1473      
 2x25 Array{Float64,2}:
 -8.6406  -8.64005  -8.64022  -8.63974  …  -8.63605  -8.63605  -8.63605
 41.1549  41.1547   41.1536   41.1533      41.1405   41.1405   41.1406    
 2x19 Array{Float64,2}:
 -8.68929  -8.6893  -8.68873  -8.68765  …  -8.678   -8.67778  -8.67773
 41.1682   41.1682  41.1674   41.1663      41.1521  41.1517   41.1515      
 2x72 Array{Float64,2}:
 -8.60636  -8.60636  -8.60711  -8.6073  …  -8.68831  -8.6866  -8.68486
 41.1445   41.1446   41.1451   41.1457     41.1728   41.1734  41.1734      
 2x45 Array{Float64,2}:
 -8.61253  -8.61253  -8.61287  -8.61289  …  -8.58566  -8.58584  -8.58584
 41.1595   41.1595   41.1595   41.1595      41.1489   41.149    41.149   
 2x267 Array{Float64,2}:
 -8.66747  -8.66735  -8.66717  -8.66798  …  -8.53496  -8.53497  -8.53498
 41.2381   41.2383   41.2384   41.2387      41.1433   41.1433   41.1433 
 2x47 Array{Float64,2}:
 -8.60647  -8.60648  -8.60649  -8.60667  …  -8.5917  -8.59579  -8.60045
 41.1447   41.1447   41.1447   41.1448      41.1973  41.1973   41.1988    
 2x48 Array{Float64,2}:
 -8.5702  -8.57019  -8.56947  -8.56733  …  -8.59311  -8.59333  -8.59331
 41.1595  41.159    41.1591   41.1606      41.1511   41.151    41.1511    
 2x94 Array{Float64,2}:
 -8.61387  -8.61388  -8.61472  -8.61584  …  -8.62978  -8.62977  -8.62979
 41.1412   41.1412   41.1411   41.1407      41.1526   41.1526   41.1527  
 2x6 Array{Float64,2}:
 -8.6481  -8.64746  -8.64688  -8.64593  -8.64534  -8.6433
 41.1525  41.1524   41.1531   41.1538   41.1544   41.1543                                
 2x15 Array{Float64,2}:
 -8.5717  -8.57058  -8.569   -8.57006  …  -8.5658  -8.56669  -8.56921
 41.1561  41.1559   41.1555  41.1561      41.1647  41.1667   41.1676        
 2x52 Array{Float64,2}:
 -8.57456  -8.57225  -8.57049  -8.56883  …  -8.59046  -8.59078  -8.59234
 41.1802   41.1799   41.1795   41.1806      41.1978   41.1952   41.1922  

In [ ]: