In [2]:
# Import pandas in the first cell that needs it so the notebook still works
# after "Restart Kernel -> Run All" (cells are executed top to bottom).
import pandas as pd

# Load the three competition files from the current working directory:
# the training trips, the test trips and the sample submission.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sampleSubmission.csv')

In [7]:
# Peek at the first two training rows to see which columns are available.
train.iloc[:2]


Out[7]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE
0 1372636858620000589 C NaN NaN 20000589 1372636858 A False [[-8.618643,41.141412],[-8.618499,41.141376],[...
1 1372637303620000596 B NaN 7 20000596 1372637303 A False [[-8.639847,41.159826],[-8.640351,41.159871],[...

In [8]:
# Peek at the first two test rows; the displayed columns match the training set.
test.iloc[:2]


Out[8]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE
0 T1 B NaN 15 20000542 1408039037 A False [[-8.585676,41.148522],[-8.585712,41.148639],[...
1 T2 B NaN 57 20000108 1408038611 A False [[-8.610876,41.14557],[-8.610858,41.145579],[-...

In [6]:
# Peek at the first two rows of the sample submission to see the expected
# output layout (TRIP_ID, LATITUDE, LONGITUDE).
sample.iloc[:2]


Out[6]:
TRIP_ID LATITUDE LONGITUDE
0 T1 41.146504 -8.611317
1 T2 41.146504 -8.611317

In [5]:
# Taxi-stand metadata: one row per stand with its ID, name (Descricao)
# and GPS position (Latitude, Longitude).
gps = pd.read_csv(
    filepath_or_buffer='metaData_taxistandsID_name_GPSlocation.csv',
)

In [10]:
# Show the first five taxi stands. The parenthesized form prints the same
# text under Python 2 and is also valid syntax under Python 3.
print(gps[:5])


   ID  Descricao       Latitude  Longitude
0   1       Agra  41.1771457135  -8.609670
1   2    Alameda    41.15618964  -8.591064
2   3     Aldoar  41.1705249231  -8.665876
3   4  Alfândega  41.1437639911  -8.621803
4   5      Amial  41.1835097223  -8.612726

In [4]:
import pandas as pd

In [11]:
# Baseline idea: skip machine learning entirely and just compute the
# nearest location, then submit that.
# Start again from freshly loaded copies of the taxi-stand table and the test set.
gps = pd.read_csv(filepath_or_buffer='metaData_taxistandsID_name_GPSlocation.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [15]:
import ast

# POLYLINE is stored as text: a serialized list of [longitude, latitude] pairs.
tmp = test.POLYLINE[0]

# Parse the text once with ast.literal_eval, which only accepts Python
# literals, instead of eval(), which would execute arbitrary code found
# in the CSV file.
tmp_points = ast.literal_eval(tmp)

print(tmp_points)        # every recorded point of the first trip
print(tmp_points[-1])    # the last recorded point of that trip


[[-8.585676, 41.148522], [-8.585712, 41.148639], [-8.585685, 41.148855], [-8.58573, 41.148927], [-8.585982, 41.148963], [-8.586396, 41.148954], [-8.586072, 41.14872], [-8.586324, 41.147847], [-8.586999, 41.14746], [-8.586576, 41.147154], [-8.584884, 41.146623]]
[-8.584884, 41.146623]

In [22]:
import ast

# Re-read the test set keeping only TRIP_ID and POLYLINE. The converter
# replaces each POLYLINE string with the last point of the parsed list,
# so the column ends up holding one [longitude, latitude] pair per trip.
# ast.literal_eval parses literals only, unlike eval(), which would run any
# code embedded in the CSV. An empty polyline ("[]") still raises IndexError
# here, exactly as before.
test = pd.read_csv('test.csv', usecols=['POLYLINE', 'TRIP_ID'],
                   converters={'POLYLINE': lambda x: ast.literal_eval(x)[-1]})

In [19]:
# Sanity check of the converted frame: POLYLINE should now hold a single
# two-element point per trip, and element 0 of that point is the longitude.
# print(...) with one argument prints the same text on Python 2 and Python 3.
print(test.head(2))
print(test.POLYLINE[0][0])


  TRIP_ID                POLYLINE
0      T1  [-8.584884, 41.146623]
1      T2  [-8.601894, 41.163597]
-8.584884

In [23]:
# Each POLYLINE cell is a [longitude, latitude] pair (the last point of the
# trip), so split it into the two coordinate columns.
test['LONGITUDE'] = test.POLYLINE.apply(lambda x: x[0])
test['LATITUDE'] = test.POLYLINE.apply(lambda x: x[1])

# Create your submission file
# Pin the column order to the sampleSubmission layout. Without `columns=`
# the order of a dict-built DataFrame depends on the Python/pandas version.
submission = pd.DataFrame(
    {"TRIP_ID": test['TRIP_ID'],
     "LATITUDE": test['LATITUDE'],
     "LONGITUDE": test['LONGITUDE']},
    columns=["TRIP_ID", "LATITUDE", "LONGITUDE"],
)
submission.to_csv("submission.csv", index=False)

In [25]:
# Kaggle leaderboard result for this submission: score 3.31766, rank 211/326

In [27]:
# Show the kernel's current working directory (relies on IPython's automagic
# for %pwd); the relative CSV paths used in this notebook resolve against it.
pwd


Out[27]:
u'/home/ubuntu/taxia'