In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sampleSubmission.csv')

In [3]:
train.head(2)


Out[3]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE
0 1372636858620000589 C NaN NaN 20000589 1372636858 A False [[-8.618643,41.141412],[-8.618499,41.141376],[...
1 1372637303620000596 B NaN 7 20000596 1372637303 A False [[-8.639847,41.159826],[-8.640351,41.159871],[...

In [4]:
test.head(2)


Out[4]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE
0 T1 B NaN 15 20000542 1408039037 A False [[-8.585676,41.148522],[-8.585712,41.148639],[...
1 T2 B NaN 57 20000108 1408038611 A False [[-8.610876,41.14557],[-8.610858,41.145579],[-...

In [6]:
sample.head(2)


Out[6]:
TRIP_ID TRAVEL_TIME
0 T1 660
1 T2 660

In [10]:
print train.isnull().sum()
print 
print test.isnull().sum()


TRIP_ID               0
CALL_TYPE             0
ORIGIN_CALL     1345900
ORIGIN_STAND     904091
TAXI_ID               0
TIMESTAMP             0
DAY_TYPE              0
MISSING_DATA          0
POLYLINE              0
dtype: int64

TRIP_ID           0
CALL_TYPE         0
ORIGIN_CALL     248
ORIGIN_STAND    197
TAXI_ID           0
TIMESTAMP         0
DAY_TYPE          0
MISSING_DATA      0
POLYLINE          0
dtype: int64

In [29]:
# 대부분의 GPS정보는 온전히 기록되지는 않음
pd.value_counts(train['MISSING_DATA'].values.ravel())


Out[29]:
False    1710660
True          10
dtype: int64

In [1]:
import pandas as pd
from ast import literal_eval

In [2]:
train = pd.read_csv('train.csv', usecols=['POLYLINE','TRIP_ID'])

In [4]:
#formula to calculate distance among two gps points
def haversine(coord1,coord2):
    import math
    sin = math.sin
    cos = math.cos
    atan2 = math.atan2
    sqrt = math.sqrt
    
    lon1,lat1=coord1
    lon2,lat2=coord2
    R=6371000 #metres
    phi1=lat1 * (3.1415 / 180)
    phi2=lat2 * (3.1415 / 180)
    Dphi= phi2 - phi1
    Dlambda = (lon2 -lon1) *  (3.1415 / 180)

    a = sin(Dphi / 2) ** 2 + cos(phi1)*cos(phi2) *sin(Dlambda/2)**2
    c = 2 * atan2(sqrt(a),sqrt(1-a))
    d = R*c
    return d
    

def speeds(polyline):
    polyline = literal_eval(polyline)
    N=len(polyline)
    v = [0.]*N
    if N == 0:
        return []
    
    for i in range(N - 1):
        v[i] = haversine(polyline[i],polyline[i+1]) / 15.
    
    v[N-1]  = haversine(polyline[N-1],polyline[N-2]) / 15.
    
    return v


def onSpeed(v):
    N=len(v)
    
    if N < 44:
        return 660.0
    
    if v[N-1] < 3:
        return (N-1)*15
    
    return (N*1.5 - 1) * 15

In [4]:
train.head(2)


Out[4]:
TRIP_ID POLYLINE
0 1372636858620000589 [[-8.618643,41.141412],[-8.618499,41.141376],[...
1 1372637303620000596 [[-8.639847,41.159826],[-8.640351,41.159871],[...

In [ ]:
# from string to list of coords
# train['POLYLINE'] = train['POLYLINE'].apply(literal_eval)

In [5]:
train['SPEED'] = train.POLYLINE.apply(speeds)

In [6]:
train.head(2)


Out[6]:
TRIP_ID POLYLINE SPEED
0 1372636858620000589 [[-8.618643,41.141412],[-8.618499,41.141376],[... [0.847027397221, 13.2169490312, 14.0570802149,...
1 1372637303620000596 [[-8.639847,41.159826],[-8.640351,41.159871],[... [2.83253801148, 10.4533049131, 12.9150340069, ...

In [7]:
train['travel_time'] = train['SPEED'].apply(onSpeed)

In [8]:
train.head(2)


Out[8]:
TRIP_ID POLYLINE SPEED travel_time
0 1372636858620000589 [[-8.618643,41.141412],[-8.618499,41.141376],[... [0.847027397221, 13.2169490312, 14.0570802149,... 660
1 1372637303620000596 [[-8.639847,41.159826],[-8.640351,41.159871],[... [2.83253801148, 10.4533049131, 12.9150340069, ... 660

In [10]:
test = pd.read_csv('test.csv', usecols=['POLYLINE','TRIP_ID'])

test['SPEED'] = test.POLYLINE.apply(speeds)
test['travel_time'] = test['SPEED'].apply(onSpeed)

# Create your submission file
submission = pd.DataFrame({"TRIP_ID": test['TRIP_ID'], "TRAVEL_TIME": test['travel_time']})
submission.to_csv("submission.csv", index=False)

In [13]:
submission.head(2)


Out[13]:
TRAVEL_TIME TRIP_ID
0 660 T1
1 660 T2

In [14]:
# kaggle LB
# 0.60680 131st/278

In [15]:
# try
# https://www.kaggle.com/gshguru/pkdd-15-taxi-trip-time-prediction-ii/beat-the-benchmark-for-200-trips

In [2]:
import json
import numpy as np

In [4]:
# read test
test = pd.read_csv('test.csv', usecols=['TRIP_ID', 'POLYLINE'])

In [5]:
test['snapshots'] = test['POLYLINE'].apply(lambda x: len(eval(x)))

In [9]:
test.count()


Out[9]:
TRIP_ID      320
POLYLINE     320
snapshots    320
dtype: int64

In [20]:
test['LONGITUDE'] = test.POLYLINE.apply(lambda x: eval(x)[0][0])
test['LATITUDE'] = test.POLYLINE.apply(lambda x: eval(x)[0][1])
test.drop('POLYLINE', axis=1, inplace=True)

In [22]:
test['TRAVEL_TIME'] = 0

In [23]:
test.head(2)


Out[23]:
TRIP_ID snapshots LONGITUDE LATITUDE TRAVEL_TIME
0 T1 11 -8.585676 41.148522 0
1 T2 40 -8.610876 41.145570 0

In [10]:
# read train
train = pd.read_csv('train.csv', usecols=['POLYLINE'])

In [11]:
train['snapshots'] = train['POLYLINE'].apply(lambda x: len(eval(x)))

In [12]:
train.count()


Out[12]:
POLYLINE     1710670
snapshots    1710670
dtype: int64

In [13]:
train = train[train.snapshots > 25]
train.count()


Out[13]:
POLYLINE     1356583
snapshots    1356583
dtype: int64

In [24]:
train['LONGITUDE'] = train.POLYLINE.apply(lambda x: eval(x)[0][0])
train['LATITUDE'] = train.POLYLINE.apply(lambda x: eval(x)[0][1])
train.drop('POLYLINE', axis=1, inplace=True)

In [25]:
train.head(2)


Out[25]:
snapshots LONGITUDE LATITUDE
2 65 -8.612964 41.140359
3 43 -8.574678 41.151951

In [49]:
### Control the number of closest trips used to calculate trip duration
N_trips = 200

### Get Haversine distance
def get_dist(x, lonlat2_lon, lonlat2_lat):
    
    lonlat1 = [x[0], x[1]]
    lonlat2 = [lonlat2_lon, lonlat2_lat]
    
    lon_diff = np.abs(lonlat1[0]-lonlat2[0])*np.pi/360.0
    lat_diff = np.abs(lonlat1[1]-lonlat2[1])*np.pi/360.0
    a = np.sin(lat_diff)**2 + np.cos(lonlat1[1]*np.pi/180.0) * np.cos(lonlat2[1]*np.pi/180.0) * np.sin(lon_diff)**2  
    d = 2*6371*np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return(d)

In [71]:
# 7시간정도 걸림
for row, lon, lat in zip(range(len(test)), test['LONGITUDE'], test['LATITUDE']):
    
    d = train[['LONGITUDE', 'LATITUDE']].apply(lambda x: get_dist(x, lon, lat), axis=1)
    i = np.argpartition(d, N_trips)[0:N_trips]
    w = np.maximum(d.iloc[i], 0.01)
    s = train.iloc[i]['snapshots']
    j = np.argpartition(s, int(N_trips*.95))[0:int(N_trips*.95)]
    test.loc[row, 'TRAVEL_TIME'] = 15*np.maximum(test.loc[row, 'snapshots'], np.average(s.iloc[j], weights=1/w.iloc[j]**2))

In [72]:
test.head(2)


Out[72]:
TRIP_ID snapshots LONGITUDE LATITUDE TRAVEL_TIME
0 T1 11 -8.585676 41.148522 797.131579
1 T2 40 -8.610876 41.145570 737.052632

In [73]:
test['TRAVEL_TIME'] = test['TRAVEL_TIME'].astype(int)
test[['TRIP_ID', 'TRAVEL_TIME']].to_csv('submission.csv', index=False)