notebook.community

Edit and run



In [2]:

    
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sampleSubmission.csv')



In [3]:

    
train.head(2)









    Out[3]:






  
    
      
      TRIP_ID
      CALL_TYPE
      ORIGIN_CALL
      ORIGIN_STAND
      TAXI_ID
      TIMESTAMP
      DAY_TYPE
      MISSING_DATA
      POLYLINE
    
  
  
    
      0
       1372636858620000589
       C
      NaN
      NaN
       20000589
       1372636858
       A
       False
       [[-8.618643,41.141412],[-8.618499,41.141376],[...
    
    
      1
       1372637303620000596
       B
      NaN
        7
       20000596
       1372637303
       A
       False
       [[-8.639847,41.159826],[-8.640351,41.159871],[...



In [4]:

    
test.head(2)









    Out[4]:






  
    
      
      TRIP_ID
      CALL_TYPE
      ORIGIN_CALL
      ORIGIN_STAND
      TAXI_ID
      TIMESTAMP
      DAY_TYPE
      MISSING_DATA
      POLYLINE
    
  
  
    
      0
       T1
       B
      NaN
       15
       20000542
       1408039037
       A
       False
       [[-8.585676,41.148522],[-8.585712,41.148639],[...
    
    
      1
       T2
       B
      NaN
       57
       20000108
       1408038611
       A
       False
       [[-8.610876,41.14557],[-8.610858,41.145579],[-...



In [6]:

    
sample.head(2)









    Out[6]:






  
    
      
      TRIP_ID
      TRAVEL_TIME
    
  
  
    
      0
       T1
       660
    
    
      1
       T2
       660



In [10]:

    
print train.isnull().sum()
print 
print test.isnull().sum()









    



TRIP_ID               0
CALL_TYPE             0
ORIGIN_CALL     1345900
ORIGIN_STAND     904091
TAXI_ID               0
TIMESTAMP             0
DAY_TYPE              0
MISSING_DATA          0
POLYLINE              0
dtype: int64

TRIP_ID           0
CALL_TYPE         0
ORIGIN_CALL     248
ORIGIN_STAND    197
TAXI_ID           0
TIMESTAMP         0
DAY_TYPE          0
MISSING_DATA      0
POLYLINE          0
dtype: int64



In [29]:

    
# 대부분의 GPS정보는 온전히 기록되지는 않음
pd.value_counts(train['MISSING_DATA'].values.ravel())









    Out[29]:





False    1710660
True          10
dtype: int64



In [1]:

    
import pandas as pd
from ast import literal_eval



In [2]:

    
train = pd.read_csv('train.csv', usecols=['POLYLINE','TRIP_ID'])



In [4]:

    
#formula to calculate distance among two gps points
def haversine(coord1,coord2):
    import math
    sin = math.sin
    cos = math.cos
    atan2 = math.atan2
    sqrt = math.sqrt
    
    lon1,lat1=coord1
    lon2,lat2=coord2
    R=6371000 #metres
    phi1=lat1 * (3.1415 / 180)
    phi2=lat2 * (3.1415 / 180)
    Dphi= phi2 - phi1
    Dlambda = (lon2 -lon1) *  (3.1415 / 180)

    a = sin(Dphi / 2) ** 2 + cos(phi1)*cos(phi2) *sin(Dlambda/2)**2
    c = 2 * atan2(sqrt(a),sqrt(1-a))
    d = R*c
    return d
    

def speeds(polyline):
    polyline = literal_eval(polyline)
    N=len(polyline)
    v = [0.]*N
    if N == 0:
        return []
    
    for i in range(N - 1):
        v[i] = haversine(polyline[i],polyline[i+1]) / 15.
    
    v[N-1]  = haversine(polyline[N-1],polyline[N-2]) / 15.
    
    return v


def onSpeed(v):
    N=len(v)
    
    if N < 44:
        return 660.0
    
    if v[N-1] < 3:
        return (N-1)*15
    
    return (N*1.5 - 1) * 15



In [4]:

    
train.head(2)









    Out[4]:






  
    
      
      TRIP_ID
      POLYLINE
    
  
  
    
      0
       1372636858620000589
       [[-8.618643,41.141412],[-8.618499,41.141376],[...
    
    
      1
       1372637303620000596
       [[-8.639847,41.159826],[-8.640351,41.159871],[...



In [ ]:

    
# from string to list of coords
# train['POLYLINE'] = train['POLYLINE'].apply(literal_eval)



In [5]:

    
train['SPEED'] = train.POLYLINE.apply(speeds)



In [6]:

    
train.head(2)









    Out[6]:






  
    
      
      TRIP_ID
      POLYLINE
      SPEED
    
  
  
    
      0
       1372636858620000589
       [[-8.618643,41.141412],[-8.618499,41.141376],[...
       [0.847027397221, 13.2169490312, 14.0570802149,...
    
    
      1
       1372637303620000596
       [[-8.639847,41.159826],[-8.640351,41.159871],[...
       [2.83253801148, 10.4533049131, 12.9150340069, ...



In [7]:

    
train['travel_time'] = train['SPEED'].apply(onSpeed)



In [8]:

    
train.head(2)









    Out[8]:






  
    
      
      TRIP_ID
      POLYLINE
      SPEED
      travel_time
    
  
  
    
      0
       1372636858620000589
       [[-8.618643,41.141412],[-8.618499,41.141376],[...
       [0.847027397221, 13.2169490312, 14.0570802149,...
       660
    
    
      1
       1372637303620000596
       [[-8.639847,41.159826],[-8.640351,41.159871],[...
       [2.83253801148, 10.4533049131, 12.9150340069, ...
       660



In [10]:

    
test = pd.read_csv('test.csv', usecols=['POLYLINE','TRIP_ID'])

test['SPEED'] = test.POLYLINE.apply(speeds)
test['travel_time'] = test['SPEED'].apply(onSpeed)

# Create your submission file
submission = pd.DataFrame({"TRIP_ID": test['TRIP_ID'], "TRAVEL_TIME": test['travel_time']})
submission.to_csv("submission.csv", index=False)



In [13]:

    
submission.head(2)









    Out[13]:






  
    
      
      TRAVEL_TIME
      TRIP_ID
    
  
  
    
      0
       660
       T1
    
    
      1
       660
       T2



In [14]:

    
# kaggle LB
# 0.60680 131st/278



In [15]:

    
# try
# https://www.kaggle.com/gshguru/pkdd-15-taxi-trip-time-prediction-ii/beat-the-benchmark-for-200-trips



In [2]:

    
import json
import numpy as np



In [4]:

    
# read test
test = pd.read_csv('test.csv', usecols=['TRIP_ID', 'POLYLINE'])



In [5]:

    
test['snapshots'] = test['POLYLINE'].apply(lambda x: len(eval(x)))



In [9]:

    
test.count()









    Out[9]:





TRIP_ID      320
POLYLINE     320
snapshots    320
dtype: int64



In [20]:

    
test['LONGITUDE'] = test.POLYLINE.apply(lambda x: eval(x)[0][0])
test['LATITUDE'] = test.POLYLINE.apply(lambda x: eval(x)[0][1])
test.drop('POLYLINE', axis=1, inplace=True)



In [22]:

    
test['TRAVEL_TIME'] = 0



In [23]:

    
test.head(2)









    Out[23]:






  
    
      
      TRIP_ID
      snapshots
      LONGITUDE
      LATITUDE
      TRAVEL_TIME
    
  
  
    
      0
       T1
       11
      -8.585676
       41.148522
       0
    
    
      1
       T2
       40
      -8.610876
       41.145570
       0



In [10]:

    
# read train
train = pd.read_csv('train.csv', usecols=['POLYLINE'])



In [11]:

    
train['snapshots'] = train['POLYLINE'].apply(lambda x: len(eval(x)))



In [12]:

    
train.count()









    Out[12]:





POLYLINE     1710670
snapshots    1710670
dtype: int64



In [13]:

    
train = train[train.snapshots > 25]
train.count()









    Out[13]:





POLYLINE     1356583
snapshots    1356583
dtype: int64



In [24]:

    
train['LONGITUDE'] = train.POLYLINE.apply(lambda x: eval(x)[0][0])
train['LATITUDE'] = train.POLYLINE.apply(lambda x: eval(x)[0][1])
train.drop('POLYLINE', axis=1, inplace=True)



In [25]:

    
train.head(2)



In [49]:

    
### Control the number of closest trips used to calculate trip duration
N_trips = 200

### Get Haversine distance
def get_dist(x, lonlat2_lon, lonlat2_lat):
    
    lonlat1 = [x[0], x[1]]
    lonlat2 = [lonlat2_lon, lonlat2_lat]
    
    lon_diff = np.abs(lonlat1[0]-lonlat2[0])*np.pi/360.0
    lat_diff = np.abs(lonlat1[1]-lonlat2[1])*np.pi/360.0
    a = np.sin(lat_diff)**2 + np.cos(lonlat1[1]*np.pi/180.0) * np.cos(lonlat2[1]*np.pi/180.0) * np.sin(lon_diff)**2  
    d = 2*6371*np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return(d)



In [71]:

    
# 7시간정도 걸림
for row, lon, lat in zip(range(len(test)), test['LONGITUDE'], test['LATITUDE']):
    
    d = train[['LONGITUDE', 'LATITUDE']].apply(lambda x: get_dist(x, lon, lat), axis=1)
    i = np.argpartition(d, N_trips)[0:N_trips]
    w = np.maximum(d.iloc[i], 0.01)
    s = train.iloc[i]['snapshots']
    j = np.argpartition(s, int(N_trips*.95))[0:int(N_trips*.95)]
    test.loc[row, 'TRAVEL_TIME'] = 15*np.maximum(test.loc[row, 'snapshots'], np.average(s.iloc[j], weights=1/w.iloc[j]**2))



In [72]:

    
test.head(2)









    Out[72]:






  
    
      
      TRIP_ID
      snapshots
      LONGITUDE
      LATITUDE
      TRAVEL_TIME
    
  
  
    
      0
       T1
       11
      -8.585676
       41.148522
       797.131579
    
    
      1
       T2
       40
      -8.610876
       41.145570
       737.052632



In [73]:

    
test['TRAVEL_TIME'] = test['TRAVEL_TIME'].astype(int)
test[['TRIP_ID', 'TRAVEL_TIME']].to_csv('submission.csv', index=False)

	TRIP_ID	CALL_TYPE	ORIGIN_CALL	ORIGIN_STAND	TAXI_ID	TIMESTAMP	DAY_TYPE	MISSING_DATA	POLYLINE
0	1372636858620000589	C	NaN	NaN	20000589	1372636858	A	False	[[-8.618643,41.141412],[-8.618499,41.141376],[...
1	1372637303620000596	B	NaN	7	20000596	1372637303	A	False	[[-8.639847,41.159826],[-8.640351,41.159871],[...

	TRIP_ID	CALL_TYPE	ORIGIN_CALL	ORIGIN_STAND	TAXI_ID	TIMESTAMP	DAY_TYPE	MISSING_DATA	POLYLINE
0	T1	B	NaN	15	20000542	1408039037	A	False	[[-8.585676,41.148522],[-8.585712,41.148639],[...
1	T2	B	NaN	57	20000108	1408038611	A	False	[[-8.610876,41.14557],[-8.610858,41.145579],[-...

	TRIP_ID	POLYLINE	SPEED
0	1372636858620000589	[[-8.618643,41.141412],[-8.618499,41.141376],[...	[0.847027397221, 13.2169490312, 14.0570802149,...
1	1372637303620000596	[[-8.639847,41.159826],[-8.640351,41.159871],[...	[2.83253801148, 10.4533049131, 12.9150340069, ...

	TRIP_ID	snapshots	LONGITUDE	LATITUDE	TRAVEL_TIME
0	T1	11	-8.585676	41.148522	797.131579
1	T2	40	-8.610876	41.145570	737.052632