In [2]:
# pandas is imported here so this first cell runs on a fresh kernel; in file
# order the notebook's own `import pandas as pd` only appears in a later cell.
import pandas as pd

# Load the training set, the test set and the sample submission file.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sampleSubmission.csv')
In [3]:
# Preview the first two rows of the training frame.
train.head(2)
Out[3]:
In [4]:
# Preview the first two rows of the test frame.
test.head(2)
Out[4]:
In [6]:
# Preview the first two rows of the sample submission.
sample.head(2)
Out[6]:
In [10]:
# Count missing values per column in the train and test frames.
# A parenthesised single-argument print produces the same output on Python 2
# and Python 3; print('') emits the blank separator line on both.
print(train.isnull().sum())
print('')
print(test.isnull().sum())
In [29]:
# Author's note (translated): most of the GPS information is not recorded completely.
# Tally how often each MISSING_DATA value occurs; Series.value_counts replaces
# the deprecated top-level pd.value_counts helper.
train['MISSING_DATA'].value_counts()
Out[29]:
In [1]:
import pandas as pd
from ast import literal_eval
In [2]:
# Re-read train.csv keeping only the POLYLINE and TRIP_ID columns; this rebinds `train`.
train = pd.read_csv('train.csv', usecols=['POLYLINE','TRIP_ID'])
In [4]:
#formula to calculate distance among two gps points
def haversine(coord1, coord2):
    """Return the great-circle distance in metres between two GPS points.

    Parameters
    ----------
    coord1, coord2 : sequence of two numbers
        Each point is unpacked as ``(longitude, latitude)`` in degrees.

    Returns
    -------
    float
        Haversine distance using an Earth radius of 6,371,000 m.
    """
    import math

    lon1, lat1 = coord1
    lon2, lat2 = coord2
    R = 6371000  # metres

    # math.radians uses full-precision pi; the previous hand-written 3.1415
    # constant under-estimated every distance by roughly 3 m per degree.
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    dphi = phi2 - phi1
    dlambda = math.radians(lon2 - lon1)

    a = (math.sin(dphi / 2) ** 2
         + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2) ** 2)
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c
def speeds(polyline):
    """Turn a polyline string into a list with one speed value per point.

    ``polyline`` is parsed with ``literal_eval`` into a sequence of
    coordinate pairs. Entry ``k`` is the haversine distance from point ``k``
    to point ``k + 1`` divided by 15 (presumably the sampling interval in
    seconds -- confirm against the data description). The final entry uses
    the distance between the last two points, so it repeats the previous
    segment. An empty polyline yields an empty list.
    """
    points = literal_eval(polyline)
    count = len(points)
    if count == 0:
        return []

    # One value per consecutive pair of points.
    result = [haversine(start, end) / 15.
              for start, end in zip(points, points[1:])]
    # Last point: measured against its predecessor (itself when count == 1).
    result.append(haversine(points[count - 1], points[count - 2]) / 15.)
    return result
def onSpeed(v):
    """Map a list of per-point speeds to a travel-time estimate.

    * fewer than 44 entries        -> 660.0
    * last entry below 3           -> (len(v) - 1) * 15
    * otherwise                    -> (len(v) * 1.5 - 1) * 15

    The factor 15 is presumably the sampling interval in seconds -- confirm
    against the data description.
    """
    count = len(v)
    if count >= 44:
        final_speed = v[count - 1]
        if final_speed < 3:
            # Final speed under the threshold: use the observed length as-is.
            return (count - 1) * 15
        # Otherwise extrapolate to 1.5x the observed number of points.
        return (count * 1.5 - 1) * 15
    return 660.0
In [4]:
# Preview the reduced training frame (POLYLINE and TRIP_ID only).
train.head(2)
Out[4]:
In [ ]:
# from string to list of coords
# train['POLYLINE'] = train['POLYLINE'].apply(literal_eval)
In [5]:
# Apply speeds() to each POLYLINE string, giving one list of speed values per trip.
train['SPEED'] = train.POLYLINE.apply(speeds)
In [6]:
# Preview the training frame with the new SPEED column.
train.head(2)
Out[6]:
In [7]:
# Apply the onSpeed() rule to each trip's speed list to get a travel-time estimate.
train['travel_time'] = train['SPEED'].apply(onSpeed)
In [8]:
# Preview the training frame with the new travel_time column.
train.head(2)
Out[8]:
In [10]:
# Run the same speed -> travel-time pipeline on the test set.
test = pd.read_csv('test.csv', usecols=['POLYLINE','TRIP_ID'])
test['SPEED'] = test.POLYLINE.apply(speeds)
test['travel_time'] = test['SPEED'].apply(onSpeed)
# Create your submission file
# The column order is fixed explicitly: a DataFrame built from a dict is
# ordered by sorted key on older Python/pandas and by insertion order on
# newer ones, so the CSV layout would otherwise depend on the environment.
submission = pd.DataFrame(
    {"TRIP_ID": test['TRIP_ID'], "TRAVEL_TIME": test['travel_time']},
    columns=["TRIP_ID", "TRAVEL_TIME"],
)
submission.to_csv("submission.csv", index=False)
In [13]:
# Preview the first two rows of the submission frame.
submission.head(2)
Out[13]:
In [14]:
# Kaggle leaderboard result for this submission:
# score 0.60680, rank 131 of 278
In [15]:
# try
# https://www.kaggle.com/gshguru/pkdd-15-taxi-trip-time-prediction-ii/beat-the-benchmark-for-200-trips
In [2]:
import json
import numpy as np
In [4]:
# Re-read the test set, keeping only the TRIP_ID and POLYLINE columns; this rebinds `test`.
test = pd.read_csv('test.csv', usecols=['TRIP_ID', 'POLYLINE'])
In [5]:
# Number of coordinate pairs in each test trip's polyline.
# literal_eval (already used by speeds() on this same column) only parses
# Python literals, whereas eval() would execute any expression found in the CSV.
test['snapshots'] = test['POLYLINE'].apply(lambda x: len(literal_eval(x)))
In [9]:
# Non-null count per column of the test frame.
test.count()
Out[9]:
In [20]:
# Take the first coordinate pair of every test polyline as (LONGITUDE, LATITUDE).
# Each string is parsed once, with literal_eval instead of eval(), so text from
# the CSV is never executed and the parsing work is not done twice per row.
# NOTE: an empty polyline ("[]") raises IndexError here, as it did before.
test_first_points = test['POLYLINE'].apply(lambda x: literal_eval(x)[0])
test['LONGITUDE'] = test_first_points.apply(lambda point: point[0])
test['LATITUDE'] = test_first_points.apply(lambda point: point[1])
# The raw polyline text is no longer needed after this point.
test.drop('POLYLINE', axis=1, inplace=True)
In [22]:
# Placeholder for the predictions. Initialised as float because the prediction
# loop writes float values into this column; starting from an integer column
# relies on silent dtype upcasting, which recent pandas versions deprecate.
test['TRAVEL_TIME'] = 0.0
In [23]:
# Preview the test frame with the start coordinates and the TRAVEL_TIME placeholder.
test.head(2)
Out[23]:
In [10]:
# Re-read the training set, keeping only the POLYLINE column; this rebinds `train`.
train = pd.read_csv('train.csv', usecols=['POLYLINE'])
In [11]:
# Number of coordinate pairs in each training trip's polyline.
# literal_eval (already used by speeds() on this same column) only parses
# Python literals, whereas eval() would execute any expression found in the CSV.
train['snapshots'] = train['POLYLINE'].apply(lambda x: len(literal_eval(x)))
In [12]:
# Non-null count per column of the training frame before filtering.
train.count()
Out[12]:
In [13]:
# Keep only training trips with more than 25 snapshots.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the original frame
# (which triggers SettingWithCopyWarning).
train = train[train.snapshots > 25].copy()
train.count()
Out[13]:
In [24]:
# Take the first coordinate pair of every training polyline as (LONGITUDE, LATITUDE).
# Each string is parsed once, with literal_eval instead of eval(), so text from
# the CSV is never executed and the parsing work is not done twice per row.
train_first_points = train['POLYLINE'].apply(lambda x: literal_eval(x)[0])
train['LONGITUDE'] = train_first_points.apply(lambda point: point[0])
train['LATITUDE'] = train_first_points.apply(lambda point: point[1])
# The raw polyline text is no longer needed after this point.
train.drop('POLYLINE', axis=1, inplace=True)
In [25]:
# Preview the filtered training frame with the start coordinates.
train.head(2)
Out[25]:
In [49]:
### Control the number of closest trips used to calculate trip duration
# Used by the prediction loop as the neighbour count passed to np.argpartition.
N_trips = 200
### Get Haversine distance
def get_dist(x, lonlat2_lon, lonlat2_lat):
    """Haversine distance between point ``x`` and a second point.

    ``x[0]`` / ``x[1]`` are read as longitude / latitude in degrees, and
    ``lonlat2_lon`` / ``lonlat2_lat`` are the second point's longitude and
    latitude. The result is scaled by a radius of 6371, i.e. kilometres.
    Only NumPy elementwise functions are used.
    """
    lon1, lat1 = x[0], x[1]

    # Half of each angular difference, in radians (degrees * pi / 360).
    half_dlon = np.abs(lon1 - lonlat2_lon) * np.pi / 360.0
    half_dlat = np.abs(lat1 - lonlat2_lat) * np.pi / 360.0

    hav = np.sin(half_dlat) ** 2 + np.cos(lat1 * np.pi / 180.0) * np.cos(lonlat2_lat * np.pi / 180.0) * np.sin(half_dlon) ** 2
    return 2 * 6371 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
In [71]:
# Author's note (translated): the original version of this cell took about 7 hours.
# That version ran DataFrame.apply(axis=1) over every training row for every
# test row. get_dist uses only elementwise NumPy operations, so passing whole
# coordinate arrays computes all distances for one test trip in a single call,
# which should be substantially faster.
train_lon = train['LONGITUDE'].values
train_lat = train['LATITUDE'].values
train_snapshots = train['snapshots'].values
# Number of neighbours kept out of the N_trips closest (95% of them).
n_kept = int(N_trips * .95)

travel_times = []
for lon, lat, seen in zip(test['LONGITUDE'], test['LATITUDE'], test['snapshots']):
    # Distance from this test trip's start point to every training start point.
    d = get_dist([train_lon, train_lat], lon, lat)
    # Positions of the N_trips closest training trips (unordered).
    i = np.argpartition(d, N_trips)[0:N_trips]
    # Floor distances at 0.01 so the inverse-square weights stay finite.
    w = np.maximum(d[i], 0.01)
    s = train_snapshots[i]
    # Among those neighbours keep the n_kept with the fewest snapshots,
    # i.e. drop the longest 5%.
    j = np.argpartition(s, n_kept)[0:n_kept]
    # Inverse-square-distance weighted mean of the neighbours' snapshot counts,
    # never below the snapshots already observed for this trip. The factor 15
    # is presumably seconds per snapshot -- confirm against the data description.
    travel_times.append(15 * np.maximum(seen, np.average(s[j], weights=1 / w[j] ** 2)))

# Assign the whole column at once: this does not depend on the test frame's
# index labels and avoids writing floats cell-by-cell into an existing column.
test['TRAVEL_TIME'] = travel_times
In [72]:
# Preview the test frame with the predicted TRAVEL_TIME values.
test.head(2)
Out[72]:
In [73]:
# Cast the predictions to int, then write TRIP_ID and TRAVEL_TIME to submission.csv without the index.
test['TRAVEL_TIME'] = test['TRAVEL_TIME'].astype(int)
test[['TRIP_ID', 'TRAVEL_TIME']].to_csv('submission.csv', index=False)