In [1]:
using DataFrames
using JSON
using Iterators
#using taxis
using HDF5, JLD
using Stats
using kNN
#using sequenceCompare
#reload("taxis")

# Report how many Julia processes this session has (Out[1] below recorded 8).
nprocs()


Out[1]:
8

In [2]:
# Load the training and validation CSVs, decode every trip's POLYLINE JSON into
# a coordinate matrix, derive per-trip helper columns, and drop training rows
# that carry no coordinates.
println("Begin")

println("loading csv files")
# NOTE(review): absolute, machine-specific paths.
taxi_df = readtable("/home/tony/ML/taxi/taxi2_time/train_100k.csv")
taxi_validation_df = readtable("/home/tony/ML/taxi/taxi2_time/test.csv")

println("loading coords")
# JSON.parse turns the POLYLINE string into a list of points; splatting that
# list into hcat places one point per column (the cell outputs further down
# show the results as 2xN Float64 matrices).
taxi_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in  taxi_df[:POLYLINE]]
taxi_validation_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in taxi_validation_df[:POLYLINE]]

println("getting coords counts")
# NOTE(review): length() counts every element of the matrix, so a 2xN path
# yields 2N here rather than N. In this cell the value is only compared with 0.
taxi_df[:NUM_COORDS] = [length(x)::Int64 for x in taxi_df[:COORDS]]
taxi_validation_df[:NUM_COORDS] = [length(x)::Int64 for x in taxi_validation_df[:COORDS]]

println("deleting unneeded data rows/columns")
# The raw JSON strings are no longer needed once COORDS exists.
delete!(taxi_validation_df, :POLYLINE)
delete!(taxi_df, :POLYLINE)

println("adding start/end point columns")
# First column of each coordinate matrix.
taxi_df[:START] = [x[:,1] for x in taxi_df[:COORDS]]
taxi_validation_df[:START] = [x[:,1] for x in taxi_validation_df[:COORDS]]

# Last column of each coordinate matrix.
taxi_df[:END] = [x[:,end] for x in taxi_df[:COORDS]]
taxi_validation_df[:END] = [x[:,end] for x in taxi_validation_df[:COORDS]]

println("deleting training examples with no coords!")
#These examples are not going to be useful!
# Only the training table is filtered; validation rows are all kept.
deleterows!(taxi_df, find(taxi_df[:NUM_COORDS] .== 0))

println("done!")


Begin
loading csv files
loading coords
getting coords counts
deleting unneeded data rows/columns
adding start/end point columns
deleting training examples with no coords!
done!

In [3]:
# NOTE(review): the message mentions "unique" coords, but this cell only
# concatenates the per-trip matrices; no de-duplication or counting happens here.
println("finding unique number of coords")
# Splat every trip's matrix into hcat to get one wide matrix per table
# (Out[3] below shows 2x4774976 for the training table).
all_coords_val = hcat(taxi_validation_df[:COORDS]...)
all_coords = hcat(taxi_df[:COORDS]...)


finding unique number of coords
Out[3]:
2x4774976 Array{Float64,2}:
 -8.61864  -8.6185  -8.62033  -8.62215  …  -8.6304  -8.63042  -8.63041
 41.1414   41.1414  41.1425   41.1438      41.1579  41.158    41.1579 

Creating coord dict


In [4]:
#small_taxi_df = GetTableOrderedSubset(taxi_df, 10000)
#coordsDB = ConstructCoordsDatabase(small_taxi_df, 4)

Creating new features


In [ ]:
# Return `df` with date-derived feature columns.
#
# Arguments:
#   df - a keyed table; only `haskey(df, :DAYOFWEEK)` is required of it.
# Returns:
#   `df` itself. If the :DAYOFWEEK key is already present nothing is done.
#
# The original definition was missing the `end` that closes the function, so it
# swallowed every definition that followed it in the cell.
function GetDateInfo(df)
    if haskey(df, :DAYOFWEEK)
        return df
    end

    # TODO: the date features were never implemented; until they are, the table
    # is handed back unchanged so callers always receive a table.
    return df
end

# Return `df` with distance-derived feature columns.
#
# Arguments:
#   df - a keyed table; only `haskey(df, :DISTANCE)` is required of it.
# Returns:
#   `df` itself. If the :DISTANCE key is already present nothing is done.
#
# The original definition was missing the `end` that closes the function, which
# left the cell syntactically incomplete.
function GetDistanceData(df)
    if haskey(df, :DISTANCE)
        return df
    end

    # TODO: the distance features were never implemented; until they are, the
    # table is handed back unchanged so callers always receive a table.
    return df
end

Training models


In [4]:
# Dynamic-time-warping distance between two point sequences.
# http://en.wikipedia.org/wiki/Dynamic_time_warping
#
# Arguments:
#   s, t - arrays holding one point per column (e.g. 2xN coordinate matrices).
#          Both must have the same number of rows; element types may differ.
# Returns:
#   The smallest accumulated squared Euclidean distance over all monotone
#   alignments of the columns of `s` with the columns of `t`, as a Float64.
#   Two sequences with no columns give 0.0; a sequence with no columns against
#   a non-empty one gives Inf.
# Throws:
#   DimensionMismatch when the two arrays have different row counts.
function DTWDistance(s::AbstractArray, t::AbstractArray)
    dims = size(s, 1)
    if dims != size(t, 1)
        throw(DimensionMismatch("point dimension differs: $(dims) vs $(size(t, 1))"))
    end
    n, m = size(s, 2), size(t, 2)

    # Row 1 / column 1 stand for "nothing consumed yet"; they stay Inf except
    # for the origin, so every alignment has to start at (first, first).
    DTW = fill(Inf, n + 1, m + 1)
    DTW[1, 1] = 0.0

    for i = 2:n+1
        for j = 2:m+1
            # Squared distance between column i-1 of s and column j-1 of t,
            # accumulated element by element so that no temporary slices are
            # allocated inside this O(n*m) loop.
            cost = 0.0
            for k = 1:dims
                d = s[k, i-1] - t[k, j-1]
                cost += d * d
            end

            DTW[i, j] = cost + min(DTW[i-1, j  ], #insertion
                                   DTW[i  , j-1], #deletion
                                   DTW[i-1, j-1]) #match
        end
    end

    return DTW[n+1, m+1]
end

# Note: the Wikipedia article linked above also describes a windowed variant of DTW that is cheaper to compute.


# Work-in-progress estimator for a coordinate sequence.
#
# Arguments:
#   coords    - array with one point per column.
#   round_len - number of decimal places the coordinates are rounded to
#               (default 5).
# Returns:
#   Currently only the number of columns of `coords`; the estimate sketched in
#   the outline below has not been implemented yet.
#
# The original body read `Coords[i]` and `round_len`, neither of which exists
# in this function (the lines were pasted from the dictionary builder), so any
# call failed on an undefined name.
function GetAverageDistanceFromCoordsSequence(coords, round_len=5)
    #Outline of the basic algorithm
    #for coord_pair c_1..c_n
    # estimate <- for i=1:n average(15*(i-1) + average(for c_j in D[c+i] * exp(-|L_j-L_i|) * L_j)

    # Round a copy element by element: the caller's array is left untouched and
    # the code does not rely on an array method of `round`.
    scale = 10.0^round_len
    rounded = similar(coords, Float64)
    for k = 1:length(coords)
        rounded[k] = round(coords[k] * scale) / scale
    end

    # TODO: look the rounded points up and compute the estimate from the outline.
    num_coords = size(rounded, 2)
    return num_coords
end

# Build a dictionary from rounded coordinate pairs to the trips that pass
# through them.
#
# Arguments:
#   TripIds   - one trip identifier per trip.
#   Coords    - one coordinate array per trip (same order as TripIds); rows 1
#               and 2 of each column are used as the coordinate pair.
#   round_len - decimal places the coordinates are rounded to before being
#               used as keys (default 5).
# Returns:
#   A Dict whose keys are (row 1, row 2) tuples of rounded coordinates and
#   whose values are arrays of SequenceRef(trip_id, j, num_coords - j), where j
#   is the column index within the trip.
#
# NOTE(review): despite its name this function computes no distances; the body
# only builds the coordinate index.
# NOTE(review): SequenceRef is not defined in this notebook - presumably it
# comes from the commented-out `sequenceCompare` module; confirm before use.
#
# Fixes over the original: the dictionary is now created here (it was indexed
# and returned without ever being defined in the function), the misspelled and
# unused `distaces` array is gone, and the trip count no longer shares a
# variable with the per-trip point count.
function GetAverageDistancesByCoordsDict(TripIds, Coords, round_len=5)
    D = Dict()
    num_trips = size(TripIds,1)
    for i=1:num_trips
        trip_id = TripIds[i]

        # Trips without any coordinates contribute nothing to the index.
        if length(Coords[i]) == 0
            continue
        end

        coords = round(Coords[i],round_len)

        num_coords = size(coords,2)
        for j=1:num_coords
            coord_pair = (coords[1,j], coords[2,j])
            ref = SequenceRef(trip_id, j, num_coords-j)
            if !haskey(D, coord_pair)
                D[coord_pair] = [ref]
            else
                push!(D[coord_pair], ref)
            end
        end
    end

    return D
end

# Return the training path that is closest to `test_path`.
#
# Arguments:
#   all_train_coords - non-empty collection of training paths (its first
#                      element is read unconditionally).
#   test_path        - the path to match.
#   distance         - two-argument function returning the distance between a
#                      training path and the test path; defaults to DTWDistance.
# Returns:
#   The element of `all_train_coords` with the smallest distance. Ties keep the
#   earliest element; if no distance is finite the first element is returned.
#
# The running minimum starts at Inf instead of the former 9999.0, which made
# the search fall back to the first path whenever every distance was >= 9999.
function findClosestTrainingExample(all_train_coords, test_path, distance=DTWDistance)
    best_path = all_train_coords[1]
    best_dist = Inf
    for train_path in all_train_coords
        dist = distance(train_path, test_path)
        if dist < best_dist
            best_dist = dist
            best_path = train_path
        end
    end

    return best_path
end

# Serial nearest-path search: for every test path, look up the closest training
# path with findClosestTrainingExample.
#
# Arguments:
#   all_train_paths - collection of training paths to search.
#   all_test_paths  - collection of test paths, indexable from 1 to its length.
# Returns:
#   A cell array with one entry per test path, in the same order.
# A progress line is printed at every 20th test path.
function findClosestTrainingExampleForTestSet(all_train_paths, all_test_paths)
    n_train = length(all_train_paths)
    n_test = length(all_test_paths)
    matches = cell(n_test)

    idx = 0
    while idx < n_test
        idx += 1
        # progress report, emitted before the (slow) search for this path
        rem(idx, 20) == 0 && println(idx, "/", n_test, " for ", n_train, " train path examples")
        matches[idx] = findClosestTrainingExample(all_train_paths, all_test_paths[idx])
    end

    return matches
end

# Parallel nearest-path search: maps every test path to its closest training
# path (by DTWDistance) through pmap.
#
# Arguments:
#   all_train_paths - non-empty collection of training paths to search.
#   all_test_paths  - collection of test paths.
# Returns:
#   Whatever pmap returns for the per-path search: one closest training path
#   per test path.
#
# The running minimum in the helper starts at Inf instead of the former 9999.0,
# which made the search fall back to the first path whenever every distance
# was >= 9999.
function pFindClosestTrainingExampleForTestSet(all_train_paths, all_test_paths)
    # NOTE(review): this is a private copy of the top-level search helper,
    # kept nested as in the original - presumably so that it travels with the
    # closure handed to pmap; confirm before merging the two.
    function findClosestTrainingExample(all_train_coords, test_path)
        best_path = all_train_coords[1]
        best_dist = Inf
        for train_path in all_train_coords
            dist = DTWDistance(train_path, test_path)
            # strict comparison: ties keep the earliest training path
            if dist < best_dist
                best_dist = dist
                best_path = train_path
            end
        end

        return best_path
    end

    getClosestExample = p -> findClosestTrainingExample(all_train_paths, p)
    return pmap(getClosestExample, all_test_paths)
end


Out[4]:
pFindClosestTrainingExampleForTestSet (generic function with 1 method)

In [8]:
#@everywhere using taxis
#@everywhere using sequenceCompare
#@everywhere reload("taxis")
#@everywhere reload("sequenceCompare")
#reload("taxis")

# Only the first 20000 training paths are used as search candidates.
all_train_coords = taxi_df[:COORDS][1:20000]
# Serial search over every validation path; the progress lines below come from
# findClosestTrainingExampleForTestSet.
test_guess_paths = findClosestTrainingExampleForTestSet(all_train_coords, taxi_validation_df[:COORDS])


20/320 for 20000 train path examples
40/320 for 20000 train path examples
60/320 for 20000 train path examples
80/320 for 20000 train path examples
100/320 for 20000 train path examples
120/320 for 20000 train path examples
140/320 for 20000 train path examples
160/320 for 20000 train path examples
180/320 for 20000 train path examples
200/320 for 20000 train path examples
220/320 for 20000 train path examples
240/320 for 20000 train path examples
260/320 for 20000 train path examples
280/320 for 20000 train path examples
300/320 for 20000 train path examples
320/320 for 20000 train path examples
Out[8]:
320-element Array{Any,1}:
 2x1 Array{Float64,2}:
 -8.58588
 41.1483                                                                                                                                
 2x18 Array{Float64,2}:
 -8.61094  -8.61045  -8.60924  -8.60837  …  -8.60393  -8.60395  -8.60395
 41.1457   41.1461   41.1468   41.1477      41.1615   41.1616   41.1616 
 2x7 Array{Float64,2}:
 -8.58016  -8.57873  -8.58034  -8.5828  -8.58244  -8.58103  -8.58287
 41.1471   41.1495   41.1523   41.1551  41.1587   41.1628   41.1661          
 2x3 Array{Float64,2}:
 -8.61555  -8.61535  -8.61453
 41.1407   41.1409   41.1412                                                                                        
 2x3 Array{Float64,2}:
 -8.61981  -8.61979  -8.61978
 41.148    41.1481   41.1481                                                                                        
 2x4 Array{Float64,2}:
 -8.62776  -8.62755  -8.62635  -8.6259
 41.1754   41.1748   41.1723   41.1704                                                                     
 2x10 Array{Float64,2}:
 -8.58568  -8.58625  -8.58621  -8.58715  …  -8.58409  -8.58407  -8.58406
 41.1487   41.149    41.1483   41.1474      41.149    41.149    41.149  
 2x17 Array{Float64,2}:
 -8.58242  -8.58241  -8.58172  -8.5826  …  -8.57537  -8.57489  -8.57427
 41.1806   41.1806   41.1805   41.1809     41.1871   41.1877   41.1879   
 2x29 Array{Float64,2}:
 -8.60653  -8.60674  -8.60735  -8.60762  …  -8.60395  -8.60472  -8.60547
 41.1446   41.1447   41.1444   41.1439      41.125    41.1251   41.1257 
 2x53 Array{Float64,2}:
 -8.58573  -8.58572  -8.58574  -8.58568  …  -8.59819  -8.59819  -8.5982
 41.1486   41.1487   41.1488   41.1489      41.1896   41.1896   41.1896  
 2x14 Array{Float64,2}:
 -8.59004  -8.58999  -8.58998  -8.58949  …  -8.58681  -8.58699  -8.58702
 41.1633   41.1634   41.1634   41.164       41.1677   41.1684   41.1684 
 2x11 Array{Float64,2}:
 -8.58563  -8.58606  -8.58809  -8.59091  …  -8.59361  -8.59361  -8.59362
 41.1486   41.149    41.1494   41.1501      41.1492   41.1492   41.1492 
 2x1 Array{Float64,2}:
 -8.58083
 41.1593                                                                                                                                
 ⋮                                                                                                                                                                       
 2x25 Array{Float64,2}:
 -8.61256  -8.61212  -8.61131  -8.61085  …  -8.58941  -8.58817  -8.58796
 41.146    41.146    41.146    41.1459      41.1471   41.1472   41.1475 
 2x20 Array{Float64,2}:
 -8.63789  -8.63794  -8.638  -8.63562  …  -8.6359  -8.63583  -8.63581
 41.153    41.153    41.153  41.1524      41.1413  41.1415   41.1415       
 2x19 Array{Float64,2}:
 -8.68937  -8.68917  -8.68798  -8.68721  …  -8.67911  -8.67911  -8.67839
 41.1681   41.1675   41.1668   41.1654      41.1538   41.1538   41.1526 
 2x91 Array{Float64,2}:
 -8.61211  -8.62125  -8.62244  -8.62245  …  -8.68418  -8.68418  -8.68416
 41.148    41.1476   41.1477   41.1478      41.1729   41.1729   41.1729 
 2x35 Array{Float64,2}:
 -8.60965  -8.60973  -8.61116  -8.61157  …  -8.58895  -8.58643  -8.58568
 41.1604   41.1604   41.1606   41.1607      41.1494   41.1489   41.1487 
 2x93 Array{Float64,2}:
 -8.65271  -8.65282  -8.65401  -8.65485  …  -8.54163  -8.54167  -8.54136
 41.1778   41.1778   41.1772   41.1778      41.1295   41.1283   41.1274 
 2x38 Array{Float64,2}:
 -8.5989  -8.59888  -8.5992  -8.60111  …  -8.60783  -8.61394  -8.61968
 41.1484  41.1485   41.1485  41.1488      41.2005   41.2015   41.2035     
 2x47 Array{Float64,2}:
 -8.57002  -8.56904  -8.56734  -8.5671  …  -8.58961  -8.58967  -8.58963
 41.166    41.1663   41.1663   41.1663     41.1573   41.1573   41.1573   
 2x26 Array{Float64,2}:
 -8.61384  -8.61383  -8.61434  -8.61579  …  -8.62986  -8.62988  -8.62987
 41.1412   41.1412   41.1411   41.1406      41.1526   41.1526   41.1526 
 2x9 Array{Float64,2}:
 -8.64776  -8.6477  -8.64769  -8.64726  …  -8.64551  -8.64351  -8.64278
 41.1505   41.1506  41.1507   41.1526      41.1544   41.1544   41.1542    
 2x17 Array{Float64,2}:
 -8.56992  -8.57025  -8.57024  -8.56903  …  -8.56301  -8.56295  -8.56293
 41.1597   41.1596   41.159    41.1595      41.1674   41.1674   41.1674 
 2x19 Array{Float64,2}:
 -8.56044  -8.56106  -8.56144  -8.56147  …  -8.57412  -8.57412  -8.5741
 41.1902   41.1911   41.1902   41.1896      41.1924   41.1924   41.1924  

In [ ]:
# Keep the matched training path next to each validation trip.
taxi_validation_df[:GUESS_PATHS] = test_guess_paths

In [ ]:
# Estimate the total travel time of every validation trip from the training
# path matched to it.
#
# Paths are 2xN matrices with one point per column, so the number of recorded
# points is size(path, 2); length(path) counts both rows and is twice that,
# which doubled every estimate in the earlier version of this cell. Per the
# competition's data description, consecutive points are 15 seconds apart, so
# N points span (N - 1) * 15 seconds.
num_test_examples = length(test_guess_paths)

guess_times = Array(Int64,num_test_examples)
dest_coords = cell(num_test_examples)

all_test_paths = taxi_validation_df[:COORDS]
for k=1:num_test_examples
    test_points = size(all_test_paths[k], 2)
    guess_points = size(test_guess_paths[k], 2)

    # max(..., 0) keeps a path without points from producing a negative time
    test_path_time = max(test_points - 1, 0) * 15
    best_guess_time = max(guess_points - 1, 0) * 15

    # The partial test trip is already longer than the matched training trip,
    # so the match cannot be right: fall back to at least the elapsed time.
    # NOTE(review): 660 s is a hard-coded floor - presumably a typical trip
    # duration; confirm where it comes from.
    if test_points > guess_points
        println(k, ":  guessing ", best_guess_time, " but existing time is ", test_path_time)
        best_guess_time = max(660, test_path_time)
    end
    guess_times[k] = best_guess_time
end

submission_validation = guess_times

Generating Submission


In [ ]:
#mean_time = mean(times_validation)
#submission_validation = [max(x, mean_time) for x in times_validation]

# Assemble the submission table column by column: one row per validation trip
# with its estimated travel time.
df_submission = DataFrame()
df_submission[:TRIP_ID] = taxi_validation_df[:TRIP_ID]
df_submission[:TRAVEL_TIME] = submission_validation
# Relative path, so the file lands in the current working directory.
# NOTE(review): the name says "50k" while the search cell above used 20000
# training paths - confirm the file name is still accurate.
writetable("third_submission_50k.csv", df_submission)

In [46]:


In [388]:
# Small experiment: can values of an immutable parametric type serve as Dict keys?
immutable Point2{T}
  x::T
  y::T
end

# Dict with a single Point2 key (this also defines a global named `D`).
D = [Point2(1.,2.) => 42]
haskey(D, Point2(1., 2.))  # true (see Out[388]): a separately constructed, equal Point2 finds the key


Out[388]:
true

In [42]:



ArgumentError("setindex!(::DataFrame, ...) only broadcasts scalars, not arrays")
while loading In[42], in expression starting on line 2

 in setindex! at /home/tony/.julia/v0.3/DataFrames/src/dataframe/dataframe.jl:356
 in DataFrame at /home/tony/.julia/v0.3/DataFrames/src/dataframe/dataframe.jl:104

In [39]:
# Display the validation COORDS column (one coordinate matrix per trip).
taxi_validation_df[:COORDS]


Out[39]:
320-element DataArray{Any,1}:
 2x11 Array{Float64,2}:
 -8.58568  -8.58571  -8.58568  -8.58573  …  -8.587   -8.58658  -8.58488
 41.1485   41.1486   41.1489   41.1489      41.1475  41.1472   41.1466    
 2x40 Array{Float64,2}:
 -8.61088  -8.61086  -8.6109  -8.61044  …  -8.60293  -8.60255  -8.60189
 41.1456   41.1456   41.1458  41.1462      41.1628   41.1631   41.1636    
 2x40 Array{Float64,2}:
 -8.58574  -8.58573  -8.58572  -8.58629  …  -8.57695  -8.5759  -8.5749
 41.1486   41.1488   41.149    41.149       41.1664   41.1672  41.1677     
 2x8 Array{Float64,2}:
 -8.61396  -8.61412  -8.61509  -8.61528  …  -8.61524  -8.61505  -8.61464
 41.1412   41.1411   41.1409   41.1408      41.1408   41.1408   41.141    
 2x2 Array{Float64,2}:
 -8.6199  -8.61989
 41.148   41.148                                                                                                                
 2x137 Array{Float64,2}:
 -8.63061  -8.63061  -8.63074  -8.63151  …  -8.62639  -8.6264  -8.62641
 41.1782   41.1782   41.1782   41.1781      41.172    41.172   41.172    
 2x24 Array{Float64,2}:
 -8.58562  -8.58564  -8.58592  -8.58637  …  -8.58156  -8.58181  -8.58205
 41.1489   41.1489   41.1489   41.1489      41.1533   41.1535   41.1538  
 2x17 Array{Float64,2}:
 -8.58292  -8.582   -8.58108  -8.58011  …  -8.57703  -8.57753  -8.57877
 41.1811   41.1818  41.183    41.184       41.1861   41.1861   41.1852    
 2x43 Array{Float64,2}:
 -8.60653  -8.60667  -8.6068  -8.60679  …  -8.60548  -8.60549  -8.60549
 41.1447   41.1447   41.1447  41.1447      41.1257   41.1258   41.1258    
 2x79 Array{Float64,2}:
 -8.58566  -8.5857  -8.58573  -8.58574  …  -8.59117  -8.58826  -8.58631
 41.1486   41.1486  41.1486   41.1486      41.1942   41.1974   41.1993    
 2x63 Array{Float64,2}:
 -8.59123  -8.59123  -8.59122  -8.591   …  -8.58767  -8.5881  -8.58823
 41.1627   41.1627   41.1627   41.1626     41.1687   41.1689  41.1689      
 2x17 Array{Float64,2}:
 -8.58569  -8.58576  -8.58571  -8.58576  …  -8.59455  -8.59592  -8.59665
 41.1486   41.1487   41.1489   41.149       41.1507   41.1499   41.1494  
 2x5 Array{Float64,2}:
 -8.5801  -8.58023  -8.58152  -8.58252  -8.58433
 41.1594  41.1594   41.1593   41.1589   41.1583                                                   
 ⋮                                                                                                                                                                        
 2x21 Array{Float64,2}:
 -8.61072  -8.61049  -8.6094  -8.6085  …  -8.59074  -8.58956  -8.58816
 41.1445   41.1437   41.1432  41.1431     41.1469   41.1471   41.1473      
 2x25 Array{Float64,2}:
 -8.6406  -8.64005  -8.64022  -8.63974  …  -8.63605  -8.63605  -8.63605
 41.1549  41.1547   41.1536   41.1533      41.1405   41.1405   41.1406    
 2x19 Array{Float64,2}:
 -8.68929  -8.6893  -8.68873  -8.68765  …  -8.678   -8.67778  -8.67773
 41.1682   41.1682  41.1674   41.1663      41.1521  41.1517   41.1515      
 2x72 Array{Float64,2}:
 -8.60636  -8.60636  -8.60711  -8.6073  …  -8.68831  -8.6866  -8.68486
 41.1445   41.1446   41.1451   41.1457     41.1728   41.1734  41.1734      
 2x45 Array{Float64,2}:
 -8.61253  -8.61253  -8.61287  -8.61289  …  -8.58566  -8.58584  -8.58584
 41.1595   41.1595   41.1595   41.1595      41.1489   41.149    41.149   
 2x267 Array{Float64,2}:
 -8.66747  -8.66735  -8.66717  -8.66798  …  -8.53496  -8.53497  -8.53498
 41.2381   41.2383   41.2384   41.2387      41.1433   41.1433   41.1433 
 2x47 Array{Float64,2}:
 -8.60647  -8.60648  -8.60649  -8.60667  …  -8.5917  -8.59579  -8.60045
 41.1447   41.1447   41.1447   41.1448      41.1973  41.1973   41.1988    
 2x48 Array{Float64,2}:
 -8.5702  -8.57019  -8.56947  -8.56733  …  -8.59311  -8.59333  -8.59331
 41.1595  41.159    41.1591   41.1606      41.1511   41.151    41.1511    
 2x94 Array{Float64,2}:
 -8.61387  -8.61388  -8.61472  -8.61584  …  -8.62978  -8.62977  -8.62979
 41.1412   41.1412   41.1411   41.1407      41.1526   41.1526   41.1527  
 2x6 Array{Float64,2}:
 -8.6481  -8.64746  -8.64688  -8.64593  -8.64534  -8.6433
 41.1525  41.1524   41.1531   41.1538   41.1544   41.1543                                
 2x15 Array{Float64,2}:
 -8.5717  -8.57058  -8.569   -8.57006  …  -8.5658  -8.56669  -8.56921
 41.1561  41.1559   41.1555  41.1561      41.1647  41.1667   41.1676        
 2x52 Array{Float64,2}:
 -8.57456  -8.57225  -8.57049  -8.56883  …  -8.59046  -8.59078  -8.59234
 41.1802   41.1799   41.1795   41.1806      41.1978   41.1952   41.1922  

In [ ]: