In [1]:
import ast

import pandas as pd

import datetime

from keras.layers import Input, Dense, Embedding, merge, Flatten, Merge, BatchNormalization
from keras.models import Model, load_model
from keras.regularizers import l2
import keras.backend as K
from keras.optimizers import SGD
import numpy as np

from sklearn.cluster import MeanShift, estimate_bandwidth

import utils

import data

from sklearn.model_selection import train_test_split

from bcolz_array_iterator import BcolzArrayIterator

import bcolz

from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import ModelCheckpoint


Using Theano backend.
Using gpu device 1: GeForce GTX TITAN X (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5110)
/home/bckenstler/anaconda3/envs/py36/lib/python3.6/site-packages/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.
  warnings.warn(warn)

Below path is a shared directory, swap to own


In [2]:
data_path = "/data/datasets/taxi/"

Replication of 'csv_to_hdf5.py'

Original repo used some bizarre tuple method of reading in data to save in a hdf5 file using fuel. The following does the same approach in that module, only using pandas and saving in a bcolz format (w/ training data as example)


In [3]:
meta = pd.read_csv(data_path+'metaData_taxistandsID_name_GPSlocation.csv', header=0)

In [66]:
meta.head()


Out[66]:
ID Descricao Latitude Longitude
0 1 Agra 41.177146 -8.609670
1 2 Alameda 41.156190 -8.591064
2 3 Aldoar 41.170525 -8.665876
3 4 Alfândega 41.143764 -8.621803
4 5 Amial 41.183510 -8.612726

In [85]:
train = pd.read_csv(data_path+'train/train.csv', header=0)

In [5]:
train.head()


Out[5]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE
0 1372636858620000589 C NaN NaN 20000589 1372636858 A False [[-8.618643,41.141412],[-8.618499,41.141376],[...
1 1372637303620000596 B NaN 7.0 20000596 1372637303 A False [[-8.639847,41.159826],[-8.640351,41.159871],[...
2 1372636951620000320 C NaN NaN 20000320 1372636951 A False [[-8.612964,41.140359],[-8.613378,41.14035],[-...
3 1372636854620000520 C NaN NaN 20000520 1372636854 A False [[-8.574678,41.151951],[-8.574705,41.151942],[...
4 1372637091620000337 C NaN NaN 20000337 1372637091 A False [[-8.645994,41.18049],[-8.645949,41.180517],[-...

In [6]:
train['ORIGIN_CALL'] = pd.Series(pd.factorize(train['ORIGIN_CALL'])[0]) + 1

In [7]:
train['ORIGIN_STAND']=pd.Series([0 if pd.isnull(x) or x=='' else int(x) for x in train["ORIGIN_STAND"]])

In [8]:
train['TAXI_ID'] = pd.Series(pd.factorize(train['TAXI_ID'])[0]) + 1

In [9]:
train['DAY_TYPE'] = pd.Series([ord(x[0]) - ord('A') for x in train['DAY_TYPE']])

The array of long/lat coordinates per trip (row) is read in as a string. The function ast.literal_eval(x) evaluates the string into the expression it represents (safely). This happens below


In [138]:
polyline = pd.Series([ast.literal_eval(x) for x in train['POLYLINE']])

Split into latitude/longitude


In [148]:
train['LATITUDE'] = pd.Series([np.array([point[1] for point in poly],dtype=np.float32) for poly in polyline])

In [150]:
train['LONGITUDE'] = pd.Series([np.array([point[0] for point in poly],dtype=np.float32) for poly in polyline])

In [157]:
utils.save_array(data_path+'train/train.bc', train.as_matrix())

In [158]:
utils.save_array(data_path+'train/meta_train.bc', meta.as_matrix())

Further Feature Engineering

After converting 'csv_to_hdf5.py' functionality to pandas, I saved that array and then simply constructed the rest of the features as specified in the paper using pandas. I didn't bother seeing how the author did it as it was extremely obtuse and involved the fuel module.


In [424]:
train = pd.DataFrame(utils.load_array(data_path+'train/train.bc'), columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE'])

In [425]:
train.head()


Out[425]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE LATITUDE LONGITUDE
0 1372636858620000589 C 0 0 1 1372636858 0 False [[-8.618643,41.141412],[-8.618499,41.141376],[... [41.1414, 41.1414, 41.1425, 41.1438, 41.1444, ... [-8.61864, -8.6185, -8.62033, -8.62215, -8.623...
1 1372637303620000596 B 0 7 2 1372637303 0 False [[-8.639847,41.159826],[-8.640351,41.159871],[... [41.1598, 41.1599, 41.1601, 41.1605, 41.1609, ... [-8.63985, -8.64035, -8.6422, -8.64445, -8.646...
2 1372636951620000320 C 0 0 3 1372636951 0 False [[-8.612964,41.140359],[-8.613378,41.14035],[-... [41.1404, 41.1404, 41.1403, 41.1404, 41.1404, ... [-8.61296, -8.61338, -8.61421, -8.61477, -8.61...
3 1372636854620000520 C 0 0 4 1372636854 0 False [[-8.574678,41.151951],[-8.574705,41.151942],[... [41.152, 41.1519, 41.1519, 41.152, 41.1519, 41... [-8.57468, -8.57471, -8.5747, -8.57466, -8.574...
4 1372637091620000337 C 0 0 5 1372637091 0 False [[-8.645994,41.18049],[-8.645949,41.180517],[-... [41.1805, 41.1805, 41.18, 41.1789, 41.1785, 41... [-8.64599, -8.64595, -8.64605, -8.6468, -8.649...

The paper discusses how many categorical variables there are per category. The following all check out


In [426]:
train['ORIGIN_CALL'].max()


Out[426]:
57105

In [427]:
train['ORIGIN_STAND'].max()


Out[427]:
63

In [428]:
train['TAXI_ID'].max()


Out[428]:
448

Self-explanatory


In [429]:
train['DAY_OF_WEEK'] = pd.Series([datetime.datetime.fromtimestamp(t).weekday() for t in train['TIMESTAMP']])

Quarter hour of the day, i.e. 1 of the 4*24 = 96 quarter hours of the day


In [430]:
train['QUARTER_HOUR'] = pd.Series([int((datetime.datetime.fromtimestamp(t).hour*60 + datetime.datetime.fromtimestamp(t).minute)/15)
                                   for t in train['TIMESTAMP']])

Self-explanatory


In [431]:
train['WEEK_OF_YEAR'] = pd.Series([datetime.datetime.fromtimestamp(t).isocalendar()[1] for t in train['TIMESTAMP']])

Target coords are the last in the sequence (final position). If there are no positions, or only 1, then mark as invalid w/ nan in order to drop later


In [433]:
train['TARGET'] = pd.Series([[l[1][0][-1], l[1][1][-1]] if len(l[1][0]) > 1 else numpy.nan for l in train[['LONGITUDE','LATITUDE']].iterrows()])

This function creates the continuous inputs, which are the concatened k first and k last coords in a sequence, as discussed in the paper.

If there aren't at least 2* k coords excluding the target, then the k first and k last overlap. In this case the sequence (excluding target) is padded at the end with the last coord in the sequence. The paper mentioned they padded front and back but didn't specify in what manner.

Also marks any invalid w/ na's


In [437]:
def start_stop_inputs(k):
    result = []
    for l in train[['LONGITUDE','LATITUDE']].iterrows():
        if len(l[1][0]) < 2 or len(l[1][1]) < 2:
            result.append(numpy.nan)
        elif len(l[1][0][:-1]) >= 2*k:
            result.append(numpy.concatenate([l[1][0][0:k],l[1][0][-(k+1):-1],l[1][1][0:k],l[1][1][-(k+1):-1]]).flatten())
        else:
            l1 = numpy.lib.pad(l[1][0][:-1], (0,20-len(l[1][0][:-1])), mode='edge')
            l2 = numpy.lib.pad(l[1][1][:-1], (0,20-len(l[1][1][:-1])), mode='edge')
            result.append(numpy.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
    return pd.Series(result)

In [438]:
train['COORD_FEATURES'] = start_stop_inputs(5)

In [442]:
train.shape


Out[442]:
(1710670, 16)

In [441]:
train.dropna().shape


Out[441]:
(1674160, 16)

Drop na's


In [443]:
train = train.dropna()

In [446]:
utils.save_array(data_path+'train/train_features.bc', train.as_matrix())

End to end feature transformation


In [155]:
train = pd.read_csv(data_path+'train/train.csv', header=0)

In [ ]:
test = pd.read_csv(data_path+'test/test.csv', header=0)

In [139]:
def start_stop_inputs(k, data, test):
    result = []
    for l in data[['LONGITUDE','LATITUDE']].iterrows():
        if not test:
            if len(l[1][0]) < 2 or len(l[1][1]) < 2:
                result.append(np.nan)
            elif len(l[1][0][:-1]) >= 2*k:
                result.append(np.concatenate([l[1][0][0:k],l[1][0][-(k+1):-1],l[1][1][0:k],l[1][1][-(k+1):-1]]).flatten())
            else:
                l1 = np.lib.pad(l[1][0][:-1], (0,4*k-len(l[1][0][:-1])), mode='edge')
                l2 = np.lib.pad(l[1][1][:-1], (0,4*k-len(l[1][1][:-1])), mode='edge')
                result.append(np.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
        else:
            if len(l[1][0]) < 1 or len(l[1][1]) < 1:
                result.append(np.nan)
            elif len(l[1][0]) >= 2*k:
                result.append(np.concatenate([l[1][0][0:k],l[1][0][-k:],l[1][1][0:k],l[1][1][-k:]]).flatten())
            else:
                l1 = np.lib.pad(l[1][0], (0,4*k-len(l[1][0])), mode='edge')
                l2 = np.lib.pad(l[1][1], (0,4*k-len(l[1][1])), mode='edge')
                result.append(np.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
    return pd.Series(result)

Pre-calculated below on train set


In [143]:
lat_mean = 41.15731
lat_std = 0.074120656
long_mean = -8.6161413
long_std = 0.057200309

In [ ]:
def feature_ext(data, test=False):   
    
    data['ORIGIN_CALL'] = pd.Series(pd.factorize(data['ORIGIN_CALL'])[0]) + 1

    data['ORIGIN_STAND']=pd.Series([0 if pd.isnull(x) or x=='' else int(x) for x in data["ORIGIN_STAND"]])

    data['TAXI_ID'] = pd.Series(pd.factorize(data['TAXI_ID'])[0]) + 1

    data['DAY_TYPE'] = pd.Series([ord(x[0]) - ord('A') for x in data['DAY_TYPE']])

    polyline = pd.Series([ast.literal_eval(x) for x in data['POLYLINE']])

    data['LATITUDE'] = pd.Series([np.array([point[1] for point in poly],dtype=np.float32) for poly in polyline])

    data['LONGITUDE'] = pd.Series([np.array([point[0] for point in poly],dtype=np.float32) for poly in polyline])
    
    if not test:
    
        data['TARGET'] = pd.Series([[l[1][0][-1], l[1][1][-1]] if len(l[1][0]) > 1 else np.nan for l in data[['LONGITUDE','LATITUDE']].iterrows()])

    
    data['LATITUDE'] = pd.Series([(t-lat_mean)/lat_std for t in data['LATITUDE']])
    
    data['LONGITUDE'] = pd.Series([(t-long_mean)/long_std for t in data['LONGITUDE']])
    
    data['COORD_FEATURES'] = start_stop_inputs(5, data, test)

    data['DAY_OF_WEEK'] = pd.Series([datetime.datetime.fromtimestamp(t).weekday() for t in data['TIMESTAMP']])

    data['QUARTER_HOUR'] = pd.Series([int((datetime.datetime.fromtimestamp(t).hour*60 + datetime.datetime.fromtimestamp(t).minute)/15)
                                       for t in data['TIMESTAMP']])

    data['WEEK_OF_YEAR'] = pd.Series([datetime.datetime.fromtimestamp(t).isocalendar()[1] for t in data['TIMESTAMP']])
    
        
    data = data.dropna()

    return data

In [ ]:
train = feature_ext(train)

In [ ]:
test = feature_ext(test, test=True)

In [161]:
test.head()


Out[161]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE LATITUDE LONGITUDE COORD_FEATURES DAY_OF_WEEK QUARTER_HOUR WEEK_OF_YEAR
0 T1 B 0 15 1 1408039037 0 False [[-8.585676,41.148522],[-8.585712,41.148639],[... [-0.118578, -0.116982, -0.1141, -0.113122, -0.... [0.532604, 0.531971, 0.532454, 0.531671, 0.527... [0.532604, 0.531971, 0.532454, 0.531671, 0.527... 3 43 33
1 T2 B 0 57 2 1408038611 0 False [[-8.610876,41.14557],[-8.610858,41.145579],[-... [-0.158413, -0.158258, -0.155736, -0.150024, -... [0.0920491, 0.0923659, 0.0915823, 0.0996017, 0... [0.0920491, 0.0923659, 0.0915823, 0.0996017, 0... 3 43 33
2 T3 B 0 15 3 1408038568 0 False [[-8.585739,41.148558],[-8.58573,41.148828],[-... [-0.118063, -0.11446, -0.112505, -0.111887, -0... [0.531504, 0.531671, 0.531821, 0.5219, 0.52490... [0.531504, 0.531671, 0.531821, 0.5219, 0.52490... 3 43 33
3 T4 B 0 53 4 1408039090 0 False [[-8.613963,41.141169],[-8.614125,41.141124],[... [-0.217753, -0.21837, -0.221047, -0.222488, -0... [0.0380801, 0.0352457, 0.0184065, 0.0151053, 0... [0.0380801, 0.0352457, 0.0184065, 0.0151053, 0... 3 43 33
4 T5 B 0 18 5 1408039177 0 False [[-8.619903,41.148036],[-8.619894,41.148036]] [-0.125114, -0.125114] [-0.0657565, -0.0656064] [-0.0657565, -0.0656064, -0.0656064, -0.065606... 3 43 33

In [162]:
utils.save_array(data_path+'train/train_features.bc', train.as_matrix())

In [163]:
utils.save_array(data_path+'test/test_features.bc', test.as_matrix())

In [164]:
train.head()


Out[164]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE LATITUDE LONGITUDE TARGET COORD_FEATURES DAY_OF_WEEK QUARTER_HOUR WEEK_OF_YEAR
0 1372636858620000589 C 0 0 1 1372636858 0 False [[-8.618643,41.141412],[-8.618499,41.141376],[... [-0.21451, -0.214974, -0.199688, -0.182087, -0... [-0.0437321, -0.0412145, -0.0731591, -0.105104... [-8.63084, 41.1545] [-0.0437321, -0.0412145, -0.0731591, -0.105104... 6 68 26
1 1372637303620000596 B 0 7 2 1372637303 0 False [[-8.639847,41.159826],[-8.640351,41.159871],[... [0.0339161, 0.0345337, 0.0378275, 0.0429227, 0... [-0.414429, -0.423249, -0.455494, -0.494991, -... [-8.66574, 41.1707] [-0.414429, -0.423249, -0.455494, -0.494991, -... 6 68 26
2 1372636951620000320 C 0 0 3 1372636951 0 False [[-8.612964,41.140359],[-8.613378,41.14035],[-... [-0.228715, -0.228818, -0.229796, -0.228561, -... [0.0555529, 0.048317, 0.0336785, 0.0239251, 0.... [-8.61597, 41.1405] [0.0555529, 0.048317, 0.0336785, 0.0239251, 0.... 6 68 26
3 1372636854620000520 C 0 0 4 1372636854 0 False [[-8.574678,41.151951],[-8.574705,41.151942],[... [-0.0723098, -0.0724127, -0.0725671, -0.072206... [0.724872, 0.724405, 0.724572, 0.725189, 0.724... [-8.608, 41.1429] [0.724872, 0.724405, 0.724572, 0.725189, 0.724... 6 68 26
4 1372637091620000337 C 0 0 5 1372637091 0 False [[-8.645994,41.18049],[-8.645949,41.180517],[-... [0.312708, 0.313068, 0.306789, 0.291092, 0.285... [-0.5219, -0.521117, -0.522834, -0.536055, -0.... [-8.68727, 41.1781] [-0.5219, -0.521117, -0.522834, -0.536055, -0.... 6 68 26

MEANSHIFT

Meanshift clustering as performed in the paper


In [ ]:
train = pd.DataFrame(utils.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'DAY_OF_WEEK',
                            'QUARTER_HOUR', "WEEK_OF_YEAR", "TARGET", "COORD_FEATURES"])

Clustering performed on the targets


In [532]:
y_targ = np.vstack(train["TARGET"].as_matrix())

In [524]:
from sklearn.cluster import MeanShift, estimate_bandwidth

Can use the commented out code for a estimate of bandwidth, which causes clustering to converge much quicker.

This is not mentioned in the paper but is included in the code. In order to get results similar to the paper's, they manually chose the uncommented bandwidth


In [533]:
#bw = estimate_bandwidth(y_targ, quantile=.1, n_samples=1000)
bw = 0.001

This takes some time


In [545]:
ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
ms.fit(y_targ)


Out[545]:
MeanShift(bandwidth=0.001, bin_seeding=True, cluster_all=True, min_bin_freq=5,
     n_jobs=1, seeds=None)

In [546]:
cluster_centers = ms.cluster_centers_

This is very close to the number of clusters mentioned in the paper


In [547]:
cluster_centers.shape


Out[547]:
(3421, 2)

In [548]:
utils.save_array(data_path+"cluster_centers_bw_001.bc", cluster_centers)

Formatting Features for Bcolz iterator / garbage


In [ ]:
train = pd.DataFrame(utils.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [ ]:
cluster_centers = utils.load_array(data_path+"cluster_centers_bw_001.bc")

In [50]:
long = np.array([c[0] for c in cluster_centers])
lat = np.array([c[1] for c in cluster_centers])

In [ ]:
X_train, X_val = train_test_split(train, test_size=0.2, random_state=42)

In [11]:
def get_features(data):
    return [np.vstack(data['COORD_FEATURES'].as_matrix()), np.vstack(data['ORIGIN_CALL'].as_matrix()), 
           np.vstack(data['TAXI_ID'].as_matrix()), np.vstack(data['ORIGIN_STAND'].as_matrix()),
           np.vstack(data['QUARTER_HOUR'].as_matrix()), np.vstack(data['DAY_OF_WEEK'].as_matrix()), 
           np.vstack(data['WEEK_OF_YEAR'].as_matrix()), np.array([long for i in range(0,data.shape[0])]),
               np.array([lat for i in range(0,data.shape[0])])]

In [7]:
def get_target(data):
    return np.vstack(data["TARGET"].as_matrix())

In [ ]:
X_train_features = get_features(X_train)

In [14]:
X_train_target = get_target(X_train)

In [13]:
utils.save_array(data_path+'train/X_train_features.bc', get_features(X_train))


Out[13]:
(1339328, 20)

MODEL

Load training data and cluster centers


In [16]:
train = pd.DataFrame(utils.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

Validation cuts


In [17]:
cuts = [
    1376503200, # 2013-08-14 18:00
    1380616200, # 2013-10-01 08:30
    1381167900, # 2013-10-07 17:45
    1383364800, # 2013-11-02 04:00
    1387722600  # 2013-12-22 14:30
]

In [41]:
print(datetime.datetime.fromtimestamp(1376503200))


2013-08-14 11:00:00

In [22]:
train.shape


Out[22]:
(1674160, 16)

In [24]:
val_indices = []
index = 0
for index, row in train.iterrows():
    time = row['TIMESTAMP']
    latitude = row['LATITUDE']
    for ts in cuts:
        if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
            val_indices.append(index)
            break
    index += 1

In [60]:
X_valid = train.iloc[val_indices]

In [53]:
valid.head()


Out[53]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE LATITUDE LONGITUDE TARGET COORD_FEATURES DAY_OF_WEEK QUARTER_HOUR WEEK_OF_YEAR
200153 1376502576620000126 B 0 36 247 1376502576 0 False [[-8.649504,41.15421],[-8.649684,41.154201],[-... [-0.0418419, -0.0419448, -0.0449813, -0.046422... [-0.583255, -0.586407, -0.59711, -0.589074, -0... [-8.61122, 41.1463] [-0.583255, -0.586407, -0.59711, -0.589074, -0... 2 43 33
200186 1376503146620000161 B 0 35 19 1376503146 0 False [[-8.649621,41.167323],[-8.64963,41.167251],[-... [0.135098, 0.134121, 0.126709, 0.125371, 0.124... [-0.585306, -0.585456, -0.589241, -0.588774, -... [-8.64504, 41.1586] [-0.585306, -0.585456, -0.589241, -0.588774, -... 2 43 33
200200 1376502942620000500 B 0 15 428 1376502942 0 False [[-8.585694,41.148522],[-8.585712,41.148801],[... [-0.118578, -0.114821, -0.112402, -0.116982, -... [0.532287, 0.531971, 0.523018, 0.524735, 0.524... [-8.61524, 41.1418] [0.532287, 0.531971, 0.523018, 0.524735, 0.524... 2 43 33
200202 1376502604620000105 C 0 0 87 1376502604 0 False [[-8.61093,41.145498],[-8.610939,41.145516],[-... [-0.15939, -0.159133, -0.153883, -0.145392, -0... [0.0910987, 0.0909487, 0.093783, 0.108572, 0.1... [-8.64832, 41.1648] [0.0910987, 0.0909487, 0.093783, 0.108572, 0.1... 2 43 33
200227 1376502611620000022 C 0 0 304 1376502611 0 False [[-8.591301,41.162715],[-8.591004,41.162562],[... [0.0729274, 0.0708687, 0.0587228, 0.0539879, 0... [0.43427, 0.439455, 0.42735, 0.423566, 0.41539... [-8.60977, 41.1512] [0.43427, 0.439455, 0.42735, 0.423566, 0.41539... 2 43 33

In [35]:
for d in valid['TIMESTAMP']:
    print(datetime.datetime.fromtimestamp(d))


2013-08-14 10:49:36
2013-08-14 10:59:06
2013-08-14 10:55:42
2013-08-14 10:50:04
2013-08-14 10:50:11
2013-08-14 10:56:57
2013-08-14 10:36:51
2013-08-14 10:44:15
2013-08-14 10:55:50
2013-08-14 10:50:35
2013-08-14 10:50:27
2013-08-14 10:43:57
2013-08-14 10:16:48
2013-08-14 10:40:47
2013-08-14 10:45:55
2013-08-14 10:43:00
2013-08-14 10:53:22
2013-08-14 10:50:03
2013-08-14 10:26:22
2013-08-14 10:59:15
2013-08-14 10:50:17
2013-08-14 10:56:34
2013-08-14 10:53:42
2013-08-14 10:47:46
2013-08-14 10:58:46
2013-08-14 10:24:23
2013-08-14 10:55:19
2013-08-14 10:57:03
2013-08-14 10:56:11
2013-08-14 10:56:52
2013-08-14 10:57:57
2013-08-14 10:08:15
2013-08-14 10:51:14
2013-08-14 10:58:31
2013-08-14 10:47:31
2013-08-14 10:30:36
2013-08-14 10:17:59
2013-08-14 10:48:03
2013-08-14 10:55:52
2013-08-14 10:49:06
2013-08-14 10:58:55
2013-08-14 10:51:24
2013-08-14 10:54:12
2013-08-14 10:54:26
2013-08-14 10:51:18
2013-08-14 10:59:56
2013-08-14 10:48:31
2013-08-14 10:51:56
2013-08-14 10:39:22
2013-08-14 10:57:25
2013-08-14 10:57:28
2013-08-14 10:57:40
2013-08-14 10:39:01
2013-08-14 10:50:39
2013-08-14 09:48:19
2013-10-01 01:16:12
2013-10-01 01:28:04
2013-10-01 01:18:37
2013-10-01 01:24:48
2013-10-01 01:23:39
2013-10-01 01:28:37
2013-10-01 01:20:16
2013-10-01 01:23:49
2013-10-01 01:27:11
2013-10-01 01:06:20
2013-10-01 01:28:08
2013-10-01 01:29:02
2013-10-01 01:24:44
2013-10-01 01:24:44
2013-10-01 01:19:06
2013-10-01 00:28:33
2013-10-01 01:29:28
2013-10-01 01:27:31
2013-10-01 01:22:13
2013-10-01 01:26:03
2013-10-01 01:28:55
2013-10-01 01:18:10
2013-10-01 01:22:13
2013-10-01 01:14:30
2013-10-01 01:24:41
2013-10-01 01:22:16
2013-10-01 01:25:35
2013-10-01 01:21:27
2013-10-01 01:11:33
2013-10-01 01:10:18
2013-10-01 01:09:33
2013-10-01 01:01:15
2013-10-01 01:17:58
2013-10-01 01:18:00
2013-10-01 01:13:26
2013-10-01 01:18:01
2013-10-01 01:25:54
2013-10-01 01:21:20
2013-10-01 01:25:31
2013-10-01 01:25:54
2013-10-01 01:23:40
2013-10-01 01:26:46
2013-10-01 01:23:31
2013-10-01 01:17:09
2013-10-01 01:21:57
2013-10-01 00:29:09
2013-10-01 01:14:47
2013-10-01 01:04:25
2013-10-01 01:14:09
2013-10-01 01:16:59
2013-10-01 01:27:16
2013-10-01 01:16:26
2013-10-01 01:23:18
2013-10-01 01:16:05
2013-10-01 01:27:43
2013-10-01 01:08:13
2013-10-01 01:19:21
2013-10-01 01:21:19
2013-10-01 01:24:20
2013-10-01 01:26:45
2013-10-01 01:18:28
2013-10-01 01:19:45
2013-10-01 01:28:10
2013-10-01 01:22:20
2013-10-01 01:18:42
2013-10-01 01:19:52
2013-10-01 01:18:44
2013-10-01 01:15:11
2013-10-01 01:19:24
2013-10-01 01:23:58
2013-10-01 01:28:50
2013-10-01 01:13:24
2013-10-01 01:28:38
2013-10-01 01:24:50
2013-10-01 01:14:19
2013-10-01 01:10:05
2013-10-01 01:26:31
2013-10-01 01:28:01
2013-09-30 23:44:16
2013-10-01 01:21:43
2013-10-01 01:26:57
2013-10-01 01:25:25
2013-10-01 01:25:36
2013-10-01 01:16:34
2013-10-01 01:26:40
2013-10-01 01:14:56
2013-10-01 01:13:10
2013-10-01 01:28:34
2013-10-01 01:19:08
2013-10-01 01:24:57
2013-10-01 00:52:43
2013-10-01 01:25:28
2013-10-01 01:22:54
2013-10-01 01:28:49
2013-10-01 00:13:25
2013-10-07 10:34:47
2013-10-07 10:38:08
2013-10-07 10:31:10
2013-10-07 10:35:12
2013-10-07 10:41:50
2013-10-07 10:34:31
2013-10-07 10:42:02
2013-10-07 10:39:05
2013-10-07 10:31:43
2013-10-07 10:34:27
2013-10-07 10:31:48
2013-10-07 10:42:24
2013-10-07 10:38:37
2013-10-07 10:29:02
2013-10-07 10:33:55
2013-10-07 10:17:07
2013-10-07 10:44:31
2013-10-07 10:42:52
2013-10-07 10:26:05
2013-10-07 10:34:07
2013-10-07 10:40:59
2013-10-07 10:41:36
2013-10-07 10:33:47
2013-10-07 10:30:59
2013-10-07 10:38:59
2013-10-07 10:28:56
2013-10-07 10:41:24
2013-10-07 10:41:49
2013-10-07 10:42:47
2013-10-07 10:34:09
2013-10-07 10:40:31
2013-10-07 10:21:34
2013-10-07 10:43:52
2013-10-07 10:18:11
2013-10-07 10:41:47
2013-10-07 10:33:04
2013-10-07 10:40:53
2013-10-07 10:36:38
2013-10-07 10:41:46
2013-10-07 10:03:36
2013-10-07 10:44:45
2013-10-07 10:21:42
2013-10-07 10:24:07
2013-10-07 10:40:35
2013-10-07 10:41:00
2013-10-07 10:43:10
2013-10-07 10:23:55
2013-10-07 10:43:30
2013-10-07 10:25:24
2013-10-07 10:35:07
2013-10-07 10:43:33
2013-10-07 10:39:30
2013-10-07 10:31:42
2013-10-07 10:39:17
2013-10-07 10:42:47
2013-10-07 10:39:20
2013-10-07 10:44:41
2013-10-07 10:24:22
2013-10-07 10:12:39
2013-10-07 10:37:25
2013-10-07 10:42:55
2013-10-07 10:14:35
2013-10-07 10:37:12
2013-10-07 10:32:29
2013-10-07 10:42:37
2013-10-07 10:26:52
2013-10-07 10:31:19
2013-10-07 10:44:58
2013-11-01 20:47:37
2013-11-01 20:54:00
2013-11-01 20:58:53
2013-11-01 20:56:37
2013-11-01 20:56:09
2013-11-01 20:51:05
2013-11-01 20:50:58
2013-11-01 20:55:26
2013-11-01 20:53:43
2013-11-01 20:53:46
2013-11-01 20:54:55
2013-11-01 20:59:28
2013-11-01 20:56:54
2013-11-01 20:50:37
2013-11-01 20:48:40
2013-11-01 20:55:46
2013-11-01 20:45:20
2013-11-01 20:46:22
2013-11-01 20:48:25
2013-11-01 20:47:19
2013-11-01 20:57:31
2013-11-01 20:58:14
2013-11-01 20:49:30
2013-11-01 20:43:31
2013-11-01 20:59:00
2013-11-01 20:54:23
2013-11-01 20:51:01
2013-11-01 20:38:12
2013-11-01 20:59:31
2013-11-01 20:56:46
2013-11-01 20:53:51
2013-11-01 20:48:00
2013-11-01 20:58:04
2013-11-01 20:52:50
2013-11-01 20:58:12
2013-11-01 20:57:37
2013-11-01 20:53:33
2013-11-01 20:54:11
2013-11-01 20:48:49
2013-11-01 20:42:56
2013-11-01 20:55:36
2013-11-01 20:51:36
2013-11-01 20:48:45
2013-11-01 20:49:17
2013-11-01 20:53:50
2013-11-01 20:45:28
2013-11-01 20:45:04
2013-11-01 20:52:17
2013-11-01 20:52:10
2013-11-01 20:59:16
2013-11-01 20:51:37
2013-11-01 20:50:10
2013-12-22 06:24:50
2013-12-22 06:04:12
2013-12-22 06:16:27
2013-12-22 06:23:06
2013-12-22 06:24:04
2013-12-22 06:17:33
2013-12-22 06:22:55
2013-12-22 06:24:35
2013-12-22 06:21:56
2013-12-22 06:22:49
2013-12-22 06:25:31
2013-12-22 06:21:31
2013-12-22 06:27:31
2013-12-22 06:29:45
2013-12-22 06:26:09
2013-12-22 06:17:08
2013-12-22 06:26:00
2013-12-22 06:20:56
2013-12-22 06:23:09
2013-12-22 06:22:31
2013-12-22 06:29:59
2013-12-22 06:27:43
2013-12-22 06:23:04
2013-12-22 06:25:30
2013-12-22 06:19:16
2013-12-22 06:23:06
2013-12-22 06:26:01
2013-12-22 06:19:45
2013-12-22 02:34:23
2013-12-22 06:29:54
2013-12-22 06:28:39
2013-12-22 06:27:43
2013-12-22 06:16:23
2013-12-22 06:17:26

In [58]:
X_train = train.drop(train.index[[val_indices]])

In [5]:
cluster_centers = utils.load_array(data_path+"/data/cluster_centers_bw_001.bc")

In [6]:
long = np.array([c[0] for c in cluster_centers])
lat = np.array([c[1] for c in cluster_centers])

In [62]:
utils.save_array(data_path+'train/X_train.bc', X_train.as_matrix())

In [64]:
utils.save_array(data_path+'valid/X_val.bc', X_valid.as_matrix())

In [24]:
X_train = pd.DataFrame(utils.load_array(data_path+'train/X_train.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [25]:
X_val = pd.DataFrame(utils.load_array(data_path+'valid/X_val.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [ ]:


In [ ]:

The equirectangular loss function mentioned in the paper.

Note: Very important that y[0] is longitude and y[1] is latitude.

Omitted the radius of the earth constant "R" as it does not affect minimization and units were not given in the paper.


In [7]:
def equirectangular_loss(y_true, y_pred):
    deg2rad = 3.141592653589793 / 180
    long_1 = y_true[:,0]*deg2rad
    long_2 = y_pred[:,0]*deg2rad
    lat_1 = y_true[:,1]*deg2rad
    lat_2 = y_pred[:,1]*deg2rad
    return 6371*K.sqrt(K.square((long_1 - long_2)*K.cos((lat_1 + lat_2)/2.))
                       +K.square(lat_1 - lat_2))

In [9]:
def embedding_input(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg))(inp)

The following returns a fully-connected model as mentioned in the paper. Takes as input k as defined before, and the cluster centers.

Inputs: Embeddings for each category, concatenated w/ the 4*k continous variable representing the first/last k coords as mentioned above.

Embeddings have no regularization, as it was not mentioned in paper, though are easily equipped to include.

Paper mentions global normalization. Didn't specify exactly how they did that, whether thay did it sequentially or whatnot. I just included a batchnorm layer for the continuous inputs.

After concatenation, 1 hidden layer of 500 neurons as called for in paper.

Finally, output layer has as many outputs as there are cluster centers, w/ a softmax activation. Call this output P.

The prediction is the weighted sum of each cluster center c_i w/ corresponding predicted prob P_i.

To facilitate this, dotted output w/ cluster latitudes and longitudes separately. (this happens at variable y), then concatenated into single tensor.

NOTE!!: You will see that I have the cluster center coords as inputs. Ideally, This function should store the cluster longs/lats as a constant to be used in the model, but I could not figure out. As a consequence, I pass them in as a repeated input.


In [67]:
def taxi_mlp(k, cluster_centers):
    shp = cluster_centers.shape[0]
    nums = Input(shape=(4*k,))

    center_longs = Input(shape=(shp,))
    center_lats = Input(shape=(shp,))

    emb_names = ['client_ID', 'taxi_ID', "stand_ID", "quarter_hour", "day_of_week", "week_of_year"]
    emb_ins = [57106, 448, 64, 96, 7, 52]
    emb_outs = [10 for i in range(0,6)]
    regs = [0 for i in range(0,6)]

    embs = [embedding_input(e[0], e[1]+1, e[2], e[3]) for e in zip(emb_names, emb_ins, emb_outs, regs)]

    x = merge([nums] + [Flatten()(e[1]) for e in embs], mode='concat')

    x = Dense(500, activation='relu')(x)

    x = Dense(shp, activation='softmax')(x)

    y = merge([merge([x, center_longs], mode='dot'), merge([x, center_lats], mode='dot')], mode='concat')

    return Model(input = [nums]+[e[0] for e in embs] + [center_longs, center_lats], output = y)

As mentioned, construction of repeated cluster longs/lats for input

Iterator for in memory train pandas dataframe. I did this as opposed to bcolz iterator due to the pre-processing


In [43]:
def data_iter(data, batch_size, cluster_centers):
    long = [c[0] for c in cluster_centers]
    lat = [c[1] for c in cluster_centers]
    i = 0
    N = data.shape[0]
    while True:
        yield ([np.vstack(data['COORD_FEATURES'][i:i+batch_size].as_matrix()), np.vstack(data['ORIGIN_CALL'][i:i+batch_size].as_matrix()), 
           np.vstack(data['TAXI_ID'][i:i+batch_size].as_matrix()), np.vstack(data['ORIGIN_STAND'][i:i+batch_size].as_matrix()),
           np.vstack(data['QUARTER_HOUR'][i:i+batch_size].as_matrix()), np.vstack(data['DAY_OF_WEEK'][i:i+batch_size].as_matrix()), 
           np.vstack(data['WEEK_OF_YEAR'][i:i+batch_size].as_matrix()), np.array([long for i in range(0,batch_size)]),
               np.array([lat for i in range(0,batch_size)])], np.vstack(data["TARGET"][i:i+batch_size].as_matrix()))
        i += batch_size

In [ ]:
x=Lambda(thing)([x,long,lat])

Of course, k in the model needs to match k from feature construction. We again use 5 as they did in the paper


In [68]:
model = taxi_mlp(5, cluster_centers)

Paper used SGD opt w/ following paramerters


In [69]:
model.compile(optimizer=SGD(0.01, momentum=0.9), loss=equirectangular_loss, metrics=['mse'])

In [73]:
X_train_feat = get_features(X_train)

In [74]:
X_train_target = get_target(X_train)

In [76]:
X_val_feat = get_features(X_valid)

In [77]:
X_val_target = get_target(X_valid)

In [78]:
tqdm = TQDMNotebookCallback()

In [79]:
checkpoint = ModelCheckpoint(filepath=data_path+'models/tmp/weights.{epoch:03d}.{val_loss:.8f}.hdf5', save_best_only=True)

In [80]:
batch_size=256

original


In [84]:
model.fit(X_train_feat, X_train_target, nb_epoch=1, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


5272/|/[loss: 0.469, mean_squared_error: 0.000] 100%|| 5272/5273 [01:54<00:00, 47.14it/s]
Out[84]:
<keras.callbacks.History at 0x7fb2bb8a19e8>

In [ ]:
model.fit(X_train_feat, X_train_target, nb_epoch=30, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


5272/|/[loss: 0.107, mean_squared_error: 0.000] 100%|| 5272/5273 [01:54<00:00, 49.65it/s]

In [20]:
model = load_model(data_path+'models/weights.0.0799.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})

In [42]:
model.fit(X_train_feat, X_train_target, nb_epoch=100, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


5231/|/[loss: 0.074, mean_squared_error: 0.000] 100%|| 5231/5232 [01:58<00:00, 50.19it/s]
Out[42]:
<keras.callbacks.History at 0x7fced25954a8>

In [43]:
model.save(data_path+'models/current_model.hdf5')

new valid


In [81]:
model.fit(X_train_feat, X_train_target, nb_epoch=1, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)



Out[81]:
<keras.callbacks.History at 0x7f82d815c550>

In [ ]:
model.fit(X_train_feat, X_train_target, nb_epoch=400, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)

In [102]:
model.save(data_path+'/models/current_model.hdf5')

In [84]:
len(X_val_feat[0])


Out[84]:
304

It works, but it seems to converge unrealistically quick and the loss values are not the same. The paper does not mention what it's using as "error" in it's results. I assume the same equirectangular? Not very clear. The difference in values could be due to the missing Earth-radius factor

Kaggle Entry


In [23]:
best_model = load_model(data_path+'models/weights.308.0.03373993.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})

In [104]:
best_model.evaluate(X_val_feat, X_val_target)


 32/304 [==>...........................] - ETA: 0s
Out[104]:
[0.033743755401749363, 2.5798687967213293e-07]

In [61]:
test = pd.DataFrame(utils.load_array(data_path+'test/test_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [62]:
test['ORIGIN_CALL'] = pd.read_csv(data_path+'real_origin_call.csv', header=None)

In [63]:
test['TAXI_ID'] = pd.read_csv(data_path+'real_taxi_id.csv',header=None)

In [64]:
X_test = get_features(test)

In [65]:
b = np.sort(X_test[1],axis=None)

In [67]:
test_preds = np.round(best_model.predict(X_test), decimals=6)

In [68]:
d = {0:test['TRIP_ID'], 1:test_preds[:,1], 2:test_preds[:,0]}
kaggle_out = pd.DataFrame(data=d)

In [121]:
kaggle_out.to_csv(data_path+'submission.csv', header=['TRIP_ID','LATITUDE', 'LONGITUDE'], index=False)

In [117]:
def hdist(a, b):
    deg2rad = 3.141592653589793 / 180

    lat1 = a[:, 1] * deg2rad
    lon1 = a[:, 0] * deg2rad
    lat2 = b[:, 1] * deg2rad
    lon2 = b[:, 0] * deg2rad

    dlat = abs(lat1-lat2)
    dlon = abs(lon1-lon2)

    al = np.sin(dlat/2)**2  + np.cos(lat1) * np.cos(lat2) * (np.sin(dlon/2)**2)
    d = np.arctan2(np.sqrt(al), np.sqrt(1-al))

    hd = 2 * 6371 * d

    return hd

In [118]:
val_preds = best_model.predict(X_val_feat)

In [88]:
trn_preds = model.predict(X_train_feat)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-88-7606f80b50cf> in <module>()
----> 1 trn_preds = model.predict(X_train_feat)

/home/bckenstler/anaconda3/envs/py36/lib/python3.6/site-packages/keras/engine/training.py in predict(self, x, batch_size, verbose)
   1270         f = self.predict_function
   1271         return self._predict_loop(f, ins,
-> 1272                                   batch_size=batch_size, verbose=verbose)
   1273 
   1274     def train_on_batch(self, x, y,

/home/bckenstler/anaconda3/envs/py36/lib/python3.6/site-packages/keras/engine/training.py in _predict_loop(self, f, ins, batch_size, verbose)
    943                 ins_batch = slice_X(ins, batch_ids)
    944 
--> 945             batch_outs = f(ins_batch)
    946             if not isinstance(batch_outs, list):
    947                 batch_outs = [batch_outs]

/home/bckenstler/anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/theano_backend.py in __call__(self, inputs)
    957     def __call__(self, inputs):
    958         assert isinstance(inputs, (list, tuple))
--> 959         return self.function(*inputs)
    960 
    961 

/home/bckenstler/anaconda3/envs/py36/lib/python3.6/site-packages/theano/compile/function_module.py in __call__(self, *args, **kwargs)
    857         t0_fn = time.time()
    858         try:
--> 859             outputs = self.fn()
    860         except Exception:
    861             if hasattr(self.fn, 'position_of_error'):

KeyboardInterrupt: 

In [119]:
er = hdist(val_preds, X_val_target)

In [120]:
er.mean()


Out[120]:
0.033741556

In [ ]:


In [ ]:
K.equal()

To-do: simple to extend to validation data

Uh oh... training data not representative of test


In [67]:
cuts = [
    1376503200, # 2013-08-14 18:00
    1380616200, # 2013-10-01 08:30
    1381167900, # 2013-10-07 17:45
    1383364800, # 2013-11-02 04:00
    1387722600  # 2013-12-22 14:30
]

In [86]:
np.any([train['TIMESTAMP'].map(lambda x: x in cuts)])


Out[86]:
False

In [87]:
train['TIMESTAMP']


Out[87]:
0          1372636858
1          1372637303
2          1372636951
3          1372636854
4          1372637091
5          1372636965
6          1372637210
7          1372637299
8          1372637274
9          1372637905
10         1372636875
11         1372637984
12         1372637343
13         1372638595
14         1372638151
15         1372637610
16         1372638481
17         1372639135
18         1372637482
19         1372639181
20         1372638161
21         1372637254
22         1372638502
23         1372639960
24         1372637658
25         1372639092
26         1372639535
27         1372640499
28         1372639635
29         1372640555
              ...    
1710640    1404151621
1710641    1404152121
1710642    1404170192
1710643    1386603894
1710644    1401596832
1710645    1404151410
1710646    1404172198
1710647    1404155241
1710648    1404171548
1710649    1404151498
1710650    1404168899
1710651    1404153627
1710652    1401475142
1710653    1403935197
1710654    1404166892
1710655    1404143157
1710656    1404014448
1710657    1380123541
1710658    1373986578
1710659    1403941536
1710660    1384165182
1710661    1404164723
1710662    1404155105
1710663    1388660427
1710664    1390403767
1710665    1404171463
1710666    1404171367
1710667    1388745716
1710668    1404141826
1710669    1404157147
Name: TIMESTAMP, dtype: int64

In [90]:
np.any(train['TIMESTAMP']==1381167900)


Out[90]:
False

In [91]:
times = train['TIMESTAMP'].as_matrix()

In [98]:
X_train.columns


Out[98]:
Index(['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE',
       'LONGITUDE', 'TARGET', 'COORD_FEATURES', 'DAY_OF_WEEK', 'QUARTER_HOUR',
       'WEEK_OF_YEAR'],
      dtype='object')

In [92]:
times


Out[92]:
array([1372636858, 1372637303, 1372636951, ..., 1388745716, 1404141826, 1404157147])

In [102]:
count = 0
for index, row in X_val.iterrows():
    for ts in cuts:
        time = row['TIMESTAMP']
        latitude = row['LATITUDE']
        if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
            count += 1

In [101]:
one = count

In [104]:
count + one


Out[104]:
304

In [6]:
import h5py

In [7]:
h = h5py.File(data_path+'original/data.hdf5', 'r')

In [15]:
evrData=h['/Configure:0000/Run:0000/CalibCycle:0000/EvrData::DataV3/NoDetector.0:Evr.0/data']


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-15-4bc7106cf938> in <module>()
----> 1 evrData=h['/Configure:0000/Run:0000/CalibCycle:0000/EvrData::DataV3/NoDetector.0:Evr.0/data']

h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/home/ilan/minonda/conda-bld/h5py_1482475225177/work/h5py/_objects.c:2856)()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/home/ilan/minonda/conda-bld/h5py_1482475225177/work/h5py/_objects.c:2814)()

/home/bckenstler/anaconda3/envs/py36/lib/python3.6/site-packages/h5py/_hl/group.py in __getitem__(self, name)
    164                 raise ValueError("Invalid HDF5 object reference")
    165         else:
--> 166             oid = h5o.open(self.id, self._e(name), lapl=self._lapl)
    167 
    168         otype = h5i.get_type(oid)

h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/home/ilan/minonda/conda-bld/h5py_1482475225177/work/h5py/_objects.c:2856)()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/home/ilan/minonda/conda-bld/h5py_1482475225177/work/h5py/_objects.c:2814)()

h5py/h5o.pyx in h5py.h5o.open (/home/ilan/minonda/conda-bld/h5py_1482475225177/work/h5py/h5o.c:3742)()

KeyError: 'Unable to open object (Component not found)'

In [13]:
c = np.load(data_path+'original/arrival-clusters.pkl')


---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
/home/bckenstler/anaconda3/envs/py36/lib/python3.6/site-packages/numpy/lib/npyio.py in load(file, mmap_mode, allow_pickle, fix_imports, encoding)
    412             try:
--> 413                 return pickle.load(fid, **pickle_kwargs)
    414             except:

UnicodeDecodeError: 'ascii' codec can't decode byte 0xf7 in position 0: ordinal not in range(128)

During handling of the above exception, another exception occurred:

OSError                                   Traceback (most recent call last)
<ipython-input-13-2213758ffef0> in <module>()
----> 1 c = np.load(data_path+'original/arrival-clusters.pkl')

/home/bckenstler/anaconda3/envs/py36/lib/python3.6/site-packages/numpy/lib/npyio.py in load(file, mmap_mode, allow_pickle, fix_imports, encoding)
    414             except:
    415                 raise IOError(
--> 416                     "Failed to interpret file %s as a pickle" % repr(file))
    417     finally:
    418         if own_fid:

OSError: Failed to interpret file '/data/bckenstler/data/taxi/original/arrival-clusters.pkl' as a pickle

hd5f files


In [10]:
from fuel.utils import find_in_data_path
from fuel.datasets import H5PYDataset

In [7]:
original_path = '/data/bckenstler/data/taxi/original/'

In [33]:
train_set = H5PYDataset(original_path+'data.hdf5', which_sets=('train',),load_in_memory=True)

In [48]:
valid_set = H5PYDataset(original_path+'valid.hdf5', which_sets=('cuts/test_times_0',),load_in_memory=True)

In [34]:
print(train_set.num_examples)


1710670

In [28]:
print(valid_set.num_examples)


304

In [37]:
data = train_set.data_sources

In [44]:
data[0]


Out[44]:
array([2, 1, 2, ..., 2, 1, 1], dtype=int8)

In [49]:
valid_data = valid_set.data_sources

In [89]:
valid_data[4][0]


Out[89]:
array([ 41.1542,  41.1542,  41.154 ,  41.1539,  41.1542,  41.1544,  41.1542,  41.1538,  41.1533,
        41.1528,  41.1525,  41.1525,  41.1527,  41.1527,  41.1527,  41.1526,  41.1524,  41.1526,
        41.1526,  41.1522,  41.1508,  41.1507,  41.1497,  41.1489,  41.1489,  41.1486,  41.1479,
        41.1475,  41.1468,  41.1461,  41.1463,  41.1464,  41.146 ,  41.1449,  41.1451,  41.1454,
        41.1458,  41.1459,  41.1458,  41.1459,  41.146 ,  41.146 ], dtype=float32)

In [77]:
stamps = valid_data[-3]

In [99]:
stamps[0]


Out[99]:
1376502576

In [115]:
for i in range(0,304):    
    print(np.any([t==int(stamps[i]) for t in X_val['TIMESTAMP']]))


False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False

In [101]:
type(X_train['TIMESTAMP'][0])


Out[101]:
int

In [83]:
type(stamps[0])


Out[83]:
numpy.int32

In [ ]:


In [78]:
check = [s in stamps for s in X_val['TIMESTAMP']]

In [86]:
for s in X_val['TIMESTAMP']:
    print(datetime.datetime.fromtimestamp(s))


2013-08-14 10:07:32
2013-08-14 10:14:21
2013-08-14 10:28:47
2013-08-14 10:36:23
2013-08-14 10:25:13
2013-08-14 10:31:23
2013-08-14 10:14:21
2013-08-14 10:14:13
2013-08-14 10:03:40
2013-08-14 11:06:08
2013-08-14 11:00:40
2013-08-14 11:18:32
2013-08-14 10:51:01
2013-08-14 10:15:37
2013-08-14 10:42:00
2013-08-14 09:15:51
2013-08-14 10:35:23
2013-08-14 11:05:51
2013-08-14 11:16:11
2013-08-14 11:47:27
2013-08-14 11:35:11
2013-08-14 11:43:53
2013-08-14 12:01:14
2013-08-14 11:09:23
2013-08-14 10:26:21
2013-08-14 11:22:43
2013-08-14 12:07:18
2013-08-14 10:29:38
2013-08-14 11:57:18
2013-08-14 11:23:06
2013-08-14 12:15:02
2013-08-14 11:06:17
2013-08-14 12:33:55
2013-08-13 22:42:40
2013-08-14 12:07:26
2013-08-14 09:02:36
2013-08-14 13:08:03
2013-08-14 07:25:36
2013-08-14 13:37:10
2013-08-14 13:52:50
2013-08-14 14:24:04
2013-08-14 15:15:05
2013-08-14 15:41:34
2013-08-14 19:15:39
2013-08-14 20:28:13
2013-08-14 19:58:07
2013-08-14 21:43:57
2013-08-14 21:41:07
2013-08-14 22:46:27
2013-08-14 23:11:28
2013-08-15 00:01:02
2013-08-15 01:40:11
2013-08-15 01:31:05
2013-08-15 04:04:21
2013-08-29 01:54:35
2013-09-30 07:58:58
2013-10-01 00:57:30
2013-10-01 01:14:21
2013-10-01 01:07:59
2013-10-01 01:12:46
2013-10-01 01:15:23
2013-10-01 00:56:55
2013-10-01 01:34:44
2013-09-30 10:05:15
2013-10-01 01:13:14
2013-10-01 00:50:05
2013-10-01 01:12:57
2013-10-01 01:34:34
2013-09-30 08:35:34
2013-10-01 01:39:09
2013-10-01 00:58:29
2013-10-01 00:53:42
2013-09-30 08:50:42
2013-10-01 00:59:11
2013-10-01 01:46:21
2013-10-01 00:57:02
2013-10-01 01:30:39
2013-10-01 00:40:31
2013-10-01 01:49:07
2013-10-01 01:52:21
2013-10-01 00:43:41
2013-10-01 02:06:20
2013-10-01 01:54:00
2013-10-01 01:13:36
2013-10-01 00:55:21
2013-10-01 02:00:42
2013-09-30 16:02:13
2013-10-01 01:55:31
2013-10-01 01:20:36
2013-09-30 14:18:09
2013-10-01 02:12:26
2013-10-01 01:46:34
2013-10-01 01:58:42
2013-10-01 01:59:55
2013-10-01 01:48:49
2013-10-01 01:50:59
2013-10-01 00:33:27
2013-09-30 13:02:58
2013-10-01 01:20:56
2013-10-01 02:05:00
2013-10-01 01:42:57
2013-10-01 01:37:42
2013-10-01 01:51:28
2013-10-01 01:40:01
2013-10-01 01:53:52
2013-10-01 02:18:16
2013-10-01 02:20:50
2013-10-01 02:22:04
2013-10-01 01:38:33
2013-10-01 01:53:27
2013-10-01 01:59:50
2013-10-01 00:59:27
2013-10-01 01:53:45
2013-10-01 02:11:18
2013-10-01 01:51:55
2013-10-01 01:46:14
2013-10-01 01:49:47
2013-10-01 02:17:16
2013-10-01 01:57:39
2013-10-01 02:09:57
2013-10-01 02:36:04
2013-10-01 01:51:49
2013-10-01 02:10:14
2013-10-01 02:15:34
2013-10-01 02:03:47
2013-10-01 02:01:06
2013-10-01 02:02:54
2013-10-01 02:39:46
2013-09-30 14:47:45
2013-10-01 02:34:19
2013-10-01 01:55:35
2013-10-01 02:04:15
2013-10-01 02:25:37
2013-10-01 02:53:51
2013-10-01 02:21:52
2013-10-01 02:17:23
2013-10-01 02:52:09
2013-10-01 03:10:34
2013-10-01 02:50:11
2013-10-01 02:17:02
2013-10-01 02:51:34
2013-10-01 02:47:29
2013-10-01 02:47:58
2013-10-01 02:48:11
2013-10-01 02:44:48
2013-10-01 02:55:34
2013-10-01 03:06:12
2013-10-01 04:22:22
2013-10-01 03:55:25
2013-10-01 09:55:50
2013-10-07 09:39:25
2013-10-07 10:22:21
2013-10-07 04:17:58
2013-10-07 10:25:18
2013-10-07 07:28:48
2013-10-07 09:53:31
2013-10-07 10:28:40
2013-10-07 09:43:36
2013-10-07 11:33:33
2013-10-07 09:47:13
2013-10-07 10:45:36
2013-10-07 11:36:41
2013-10-07 12:02:04
2013-10-07 11:37:48
2013-10-07 11:52:38
2013-10-07 12:06:22
2013-10-07 11:34:34
2013-10-07 10:18:22
2013-10-07 11:31:49
2013-10-07 11:54:39
2013-10-07 11:15:50
2013-10-07 11:25:14
2013-10-07 12:22:42
2013-10-07 11:58:31
2013-10-07 11:56:48
2013-10-07 11:58:08
2013-10-07 11:59:03
2013-10-07 06:53:29
2013-10-07 08:41:29
2013-10-07 12:23:19
2013-10-07 12:13:27
2013-10-07 12:52:41
2013-10-07 10:52:23
2013-10-07 11:12:36
2013-10-07 12:53:53
2013-10-07 12:45:15
2013-10-07 12:54:38
2013-10-07 10:46:32
2013-10-07 11:54:15
2013-10-07 11:52:09
2013-10-07 12:01:28
2013-10-07 11:35:00
2013-10-07 12:24:21
2013-10-07 13:07:04
2013-10-07 13:40:22
2013-10-07 13:47:05
2013-10-07 10:10:45
2013-10-07 13:28:27
2013-10-07 12:35:05
2013-10-07 13:09:15
2013-10-07 11:44:18
2013-10-07 14:42:34
2013-10-07 13:24:59
2013-10-07 13:11:00
2013-10-07 14:10:43
2013-10-07 15:09:55
2013-10-07 22:16:07
2013-10-07 21:46:40
2013-10-07 23:43:29
2013-10-07 09:15:06
2013-10-07 19:40:37
2013-10-08 00:10:51
2013-10-07 12:39:02
2013-10-07 13:55:44
2013-10-08 00:31:15
2013-10-07 23:57:18
2013-10-08 01:08:20
2013-10-08 04:09:15
2013-11-01 21:32:47
2013-11-01 21:14:53
2013-11-01 21:39:50
2013-11-01 21:30:52
2013-11-01 21:20:27
2013-11-01 21:09:21
2013-11-01 21:22:48
2013-11-01 21:38:38
2013-11-01 21:05:58
2013-11-01 21:38:29
2013-11-01 20:24:41
2013-11-01 21:45:04
2013-11-01 21:32:47
2013-11-01 21:06:05
2013-11-01 21:32:46
2013-11-01 21:40:51
2013-11-01 21:37:10
2013-11-01 20:36:02
2013-11-01 21:45:05
2013-11-01 21:33:28
2013-11-01 21:49:08
2013-11-01 21:37:25
2013-11-01 21:51:12
2013-11-01 21:13:05
2013-11-01 21:33:50
2013-11-01 21:35:31
2013-11-01 21:46:46
2013-11-01 21:37:35
2013-11-01 21:42:36
2013-11-01 21:53:26
2013-11-01 22:01:40
2013-11-01 21:38:20
2013-11-01 21:36:27
2013-11-01 22:05:16
2013-11-01 21:59:10
2013-11-01 18:00:02
2013-11-01 22:09:29
2013-11-01 21:58:45
2013-11-01 22:16:30
2013-11-01 21:06:47
2013-11-01 22:21:46
2013-11-01 22:12:47
2013-11-01 22:10:46
2013-11-01 22:20:50
2013-11-01 21:52:14
2013-11-01 22:12:02
2013-11-01 22:12:30
2013-11-01 22:59:32
2013-11-01 22:11:17
2013-11-01 23:35:01
2013-11-01 23:27:56
2013-11-02 09:37:04
2013-12-22 06:39:00
2013-12-22 06:39:18
2013-12-22 06:56:09
2013-12-22 07:57:34
2013-12-22 07:19:53
2013-12-22 07:33:46
2013-12-22 08:01:08
2013-12-22 08:01:17
2013-12-22 08:29:30
2013-12-22 08:01:29
2013-12-22 07:45:23
2013-12-22 08:08:20
2013-12-22 08:30:08
2013-12-21 13:07:37
2013-12-22 07:51:17
2013-12-22 07:11:40
2013-12-22 08:57:33
2013-12-22 08:49:51
2013-12-22 06:49:38
2013-12-22 09:00:47
2013-12-22 09:36:42
2013-12-22 09:02:56
2013-12-22 08:21:05
2013-12-22 10:05:26
2013-12-22 04:01:53
2013-12-22 10:02:21
2013-12-22 08:54:18
2013-12-22 10:31:35
2013-12-22 10:37:30
2013-12-22 11:28:57
2013-12-22 11:56:01
2013-12-22 15:40:59
2013-12-22 10:02:07
2013-12-23 00:48:48

In [85]:
for s in stamps:
    print(datetime.datetime.fromtimestamp(s))


2013-08-14 10:49:36
2013-08-14 10:59:06
2013-08-14 10:55:42
2013-08-14 10:50:04
2013-08-14 10:50:11
2013-08-14 10:56:57
2013-08-14 10:36:51
2013-08-14 10:44:15
2013-08-14 10:55:50
2013-08-14 10:50:35
2013-08-14 10:50:27
2013-08-14 10:43:57
2013-08-14 10:16:48
2013-08-14 10:40:47
2013-08-14 10:45:55
2013-08-14 10:43:00
2013-08-14 10:53:22
2013-08-14 10:50:03
2013-08-14 10:26:22
2013-08-14 10:59:15
2013-08-14 10:50:17
2013-08-14 10:56:34
2013-08-14 10:53:42
2013-08-14 10:47:46
2013-08-14 10:58:46
2013-08-14 10:24:23
2013-08-14 10:55:19
2013-08-14 10:57:03
2013-08-14 10:56:11
2013-08-14 10:56:52
2013-08-14 10:57:57
2013-08-14 10:08:15
2013-08-14 10:51:14
2013-08-14 10:58:31
2013-08-14 10:47:31
2013-08-14 10:30:36
2013-08-14 10:17:59
2013-08-14 10:48:03
2013-08-14 10:55:52
2013-08-14 10:49:06
2013-08-14 10:58:55
2013-08-14 10:51:24
2013-08-14 10:54:12
2013-08-14 10:54:26
2013-08-14 10:51:18
2013-08-14 10:59:56
2013-08-14 10:48:31
2013-08-14 10:51:56
2013-08-14 10:39:22
2013-08-14 10:57:25
2013-08-14 10:57:28
2013-08-14 10:57:40
2013-08-14 10:39:01
2013-08-14 10:50:39
2013-08-14 09:48:19
2013-10-01 01:16:12
2013-10-01 01:28:04
2013-10-01 01:18:37
2013-10-01 01:24:48
2013-10-01 01:23:39
2013-10-01 01:28:37
2013-10-01 01:20:16
2013-10-01 01:23:49
2013-10-01 01:27:11
2013-10-01 01:06:20
2013-10-01 01:28:08
2013-10-01 01:29:02
2013-10-01 01:24:44
2013-10-01 01:24:44
2013-10-01 01:19:06
2013-10-01 00:28:33
2013-10-01 01:29:28
2013-10-01 01:27:31
2013-10-01 01:22:13
2013-10-01 01:26:03
2013-10-01 01:28:55
2013-10-01 01:18:10
2013-10-01 01:22:13
2013-10-01 01:14:30
2013-10-01 01:24:41
2013-10-01 01:22:16
2013-10-01 01:25:35
2013-10-01 01:21:27
2013-10-01 01:11:33
2013-10-01 01:10:18
2013-10-01 01:09:33
2013-10-01 01:01:15
2013-10-01 01:17:58
2013-10-01 01:18:00
2013-10-01 01:13:26
2013-10-01 01:18:01
2013-10-01 01:25:54
2013-10-01 01:21:20
2013-10-01 01:25:31
2013-10-01 01:25:54
2013-10-01 01:23:40
2013-10-01 01:26:46
2013-10-01 01:23:31
2013-10-01 01:17:09
2013-10-01 01:21:57
2013-10-01 00:29:09
2013-10-01 01:14:47
2013-10-01 01:04:25
2013-10-01 01:14:09
2013-10-01 01:16:59
2013-10-01 01:27:16
2013-10-01 01:16:26
2013-10-01 01:23:18
2013-10-01 01:16:05
2013-10-01 01:27:43
2013-10-01 01:08:13
2013-10-01 01:19:21
2013-10-01 01:21:19
2013-10-01 01:24:20
2013-10-01 01:26:45
2013-10-01 01:18:28
2013-10-01 01:19:45
2013-10-01 01:28:10
2013-10-01 01:22:20
2013-10-01 01:18:42
2013-10-01 01:19:52
2013-10-01 01:18:44
2013-10-01 01:15:11
2013-10-01 01:19:24
2013-10-01 01:23:58
2013-10-01 01:28:50
2013-10-01 01:13:24
2013-10-01 01:28:38
2013-10-01 01:24:50
2013-10-01 01:14:19
2013-10-01 01:10:05
2013-10-01 01:26:31
2013-10-01 01:28:01
2013-09-30 23:44:16
2013-10-01 01:21:43
2013-10-01 01:26:57
2013-10-01 01:25:25
2013-10-01 01:25:36
2013-10-01 01:16:34
2013-10-01 01:26:40
2013-10-01 01:14:56
2013-10-01 01:13:10
2013-10-01 01:28:34
2013-10-01 01:19:08
2013-10-01 01:24:57
2013-10-01 00:52:43
2013-10-01 01:25:28
2013-10-01 01:22:54
2013-10-01 01:28:49
2013-10-01 00:13:25
2013-10-07 10:34:47
2013-10-07 10:38:08
2013-10-07 10:31:10
2013-10-07 10:35:12
2013-10-07 10:41:50
2013-10-07 10:34:31
2013-10-07 10:42:02
2013-10-07 10:39:05
2013-10-07 10:31:43
2013-10-07 10:34:27
2013-10-07 10:31:48
2013-10-07 10:42:24
2013-10-07 10:38:37
2013-10-07 10:29:02
2013-10-07 10:33:55
2013-10-07 10:17:07
2013-10-07 10:44:31
2013-10-07 10:42:52
2013-10-07 10:26:05
2013-10-07 10:34:07
2013-10-07 10:40:59
2013-10-07 10:41:36
2013-10-07 10:33:47
2013-10-07 10:30:59
2013-10-07 10:38:59
2013-10-07 10:28:56
2013-10-07 10:41:24
2013-10-07 10:41:49
2013-10-07 10:42:47
2013-10-07 10:34:09
2013-10-07 10:40:31
2013-10-07 10:21:34
2013-10-07 10:43:52
2013-10-07 10:18:11
2013-10-07 10:41:47
2013-10-07 10:33:04
2013-10-07 10:40:53
2013-10-07 10:36:38
2013-10-07 10:41:46
2013-10-07 10:03:36
2013-10-07 10:44:45
2013-10-07 10:21:42
2013-10-07 10:24:07
2013-10-07 10:40:35
2013-10-07 10:41:00
2013-10-07 10:43:10
2013-10-07 10:23:55
2013-10-07 10:43:30
2013-10-07 10:25:24
2013-10-07 10:35:07
2013-10-07 10:43:33
2013-10-07 10:39:30
2013-10-07 10:31:42
2013-10-07 10:39:17
2013-10-07 10:42:47
2013-10-07 10:39:20
2013-10-07 10:44:41
2013-10-07 10:24:22
2013-10-07 10:12:39
2013-10-07 10:37:25
2013-10-07 10:42:55
2013-10-07 10:14:35
2013-10-07 10:37:12
2013-10-07 10:32:29
2013-10-07 10:42:37
2013-10-07 10:26:52
2013-10-07 10:31:19
2013-10-07 10:44:58
2013-11-01 20:47:37
2013-11-01 20:54:00
2013-11-01 20:58:53
2013-11-01 20:56:37
2013-11-01 20:56:09
2013-11-01 20:51:05
2013-11-01 20:50:58
2013-11-01 20:55:26
2013-11-01 20:53:43
2013-11-01 20:53:46
2013-11-01 20:54:55
2013-11-01 20:59:28
2013-11-01 20:56:54
2013-11-01 20:50:37
2013-11-01 20:48:40
2013-11-01 20:55:46
2013-11-01 20:45:20
2013-11-01 20:46:22
2013-11-01 20:48:25
2013-11-01 20:47:19
2013-11-01 20:57:31
2013-11-01 20:58:14
2013-11-01 20:49:30
2013-11-01 20:43:31
2013-11-01 20:59:00
2013-11-01 20:54:23
2013-11-01 20:51:01
2013-11-01 20:38:12
2013-11-01 20:59:31
2013-11-01 20:56:46
2013-11-01 20:53:51
2013-11-01 20:48:00
2013-11-01 20:58:04
2013-11-01 20:52:50
2013-11-01 20:58:12
2013-11-01 20:57:37
2013-11-01 20:53:33
2013-11-01 20:54:11
2013-11-01 20:48:49
2013-11-01 20:42:56
2013-11-01 20:55:36
2013-11-01 20:51:36
2013-11-01 20:48:45
2013-11-01 20:49:17
2013-11-01 20:53:50
2013-11-01 20:45:28
2013-11-01 20:45:04
2013-11-01 20:52:17
2013-11-01 20:52:10
2013-11-01 20:59:16
2013-11-01 20:51:37
2013-11-01 20:50:10
2013-12-22 06:24:50
2013-12-22 06:04:12
2013-12-22 06:16:27
2013-12-22 06:23:06
2013-12-22 06:24:04
2013-12-22 06:17:33
2013-12-22 06:22:55
2013-12-22 06:24:35
2013-12-22 06:21:56
2013-12-22 06:22:49
2013-12-22 06:25:31
2013-12-22 06:21:31
2013-12-22 06:27:31
2013-12-22 06:29:45
2013-12-22 06:26:09
2013-12-22 06:17:08
2013-12-22 06:26:00
2013-12-22 06:20:56
2013-12-22 06:23:09
2013-12-22 06:22:31
2013-12-22 06:29:59
2013-12-22 06:27:43
2013-12-22 06:23:04
2013-12-22 06:25:30
2013-12-22 06:19:16
2013-12-22 06:23:06
2013-12-22 06:26:01
2013-12-22 06:19:45
2013-12-22 02:34:23
2013-12-22 06:29:54
2013-12-22 06:28:39
2013-12-22 06:27:43
2013-12-22 06:16:23
2013-12-22 06:17:26

In [71]:
ids = valid_data[-1]

In [74]:
type(ids[0])


Out[74]:
numpy.bytes_

In [70]:
ids


Out[70]:
["b'1376502576620000126'",
 "b'1376503146620000161'",
 "b'1376502942620000500'",
 "b'1376502604620000105'",
 "b'1376502611620000022'",
 "b'1376503017620000272'",
 "b'1376501811620000617'",
 "b'1376502255620000663'",
 "b'1376502950620000005'",
 "b'1376502635620000276'",
 "b'1376502627620000596'",
 "b'1376502237620000675'",
 "b'1376500608620000409'",
 "b'1376502047620000574'",
 "b'1376502355620000338'",
 "b'1376502180620000080'",
 "b'1376502802620000680'",
 "b'1376502603620000142'",
 "b'1376501182620000651'",
 "b'1376503155620000026'",
 "b'1376502617620000657'",
 "b'1376502994620000604'",
 "b'1376502822620000093'",
 "b'1376502466620000561'",
 "b'1376503126620000410'",
 "b'1376501063620000343'",
 "b'1376502919620000166'",
 "b'1376503023620000010'",
 "b'1376502971620000517'",
 "b'1376503012620000273'",
 "b'1376503077620000470'",
 "b'1376500095620000569'",
 "b'1376502674620000426'",
 "b'1376503111620000674'",
 "b'1376502451620000310'",
 "b'1376501436620000344'",
 "b'1376500679620000108'",
 "b'1376502483620000356'",
 "b'1376502952620000687'",
 "b'1376502546620000254'",
 "b'1376503135620000053'",
 "b'1376502684620000503'",
 "b'1376502852620000321'",
 "b'1376502866620000421'",
 "b'1376502678620000460'",
 "b'1376503196620000386'",
 "b'1376502511620000480'",
 "b'1376502716620000224'",
 "b'1376501962620000507'",
 "b'1376503045620000633'",
 "b'1376503048620000349'",
 "b'1376503060620000049'",
 "b'1376501941620000667'",
 "b'1376502639620000281'",
 "b'1376498899620000172'",
 "b'1380615372620000303'",
 "b'1380616084620000260'",
 "b'1380615517620000372'",
 "b'1380615888620000588'",
 "b'1380615819620000042'",
 "b'1380616117620000325'",
 "b'1380615616620000040'",
 "b'1380615829620000682'",
 "b'1380616031620000001'",
 "b'1380614780620000352'",
 "b'1380616088620000513'",
 "b'1380616142620000289'",
 "b'1380615884620000166'",
 "b'1380615884620000671'",
 "b'1380615546620000187'",
 "b'1380612513620000172'",
 "b'1380616168620000472'",
 "b'1380616051620000597'",
 "b'1380615733620000105'",
 "b'1380615963620000137'",
 "b'1380616135620000672'",
 "b'1380615490620000574'",
 "b'1380615733620000051'",
 "b'1380615270620000612'",
 "b'1380615881620000031'",
 "b'1380615736620000246'",
 "b'1380615935620000367'",
 "b'1380615687620000577'",
 "b'1380615093620000272'",
 "b'1380615018620000632'",
 "b'1380614973620000258'",
 "b'1380614475620000032'",
 "b'1380615478620000138'",
 "b'1380615480620000381'",
 "b'1380615206620000397'",
 "b'1380615481620000077'",
 "b'1380615954620000546'",
 "b'1380615680620000192'",
 "b'1380615931620000068'",
 "b'1380615954620000395'",
 "b'1380615820620000482'",
 "b'1380616006620000080'",
 "b'1380615811620000431'",
 "b'1380615429620000602'",
 "b'1380615717620000497'",
 "b'1380612549620000161'",
 "b'1380615287620000675'",
 "b'1380614665620000458'",
 "b'1380615249620000222'",
 "b'1380615419620000487'",
 "b'1380616036620000669'",
 "b'1380615386620000476'",
 "b'1380615798620000523'",
 "b'1380615365620000215'",
 "b'1380616063620000065'",
 "b'1380614893620000011'",
 "b'1380615561620000391'",
 "b'1380615679620000004'",
 "b'1380615860620000429'",
 "b'1380616005620000695'",
 "b'1380615508620000361'",
 "b'1380615585620000665'",
 "b'1380616090620000562'",
 "b'1380615740620000398'",
 "b'1380615522620000156'",
 "b'1380615592620000674'",
 "b'1380615524620000279'",
 "b'1380615311620000540'",
 "b'1380615564620000216'",
 "b'1380615838620000324'",
 "b'1380616130620000356'",
 "b'1380615204620000387'",
 "b'1380616118620000649'",
 "b'1380615890620000159'",
 "b'1380615259620000393'",
 "b'1380615005620000249'",
 "b'1380615991620000589'",
 "b'1380616081620000633'",
 "b'1380609856620000609'",
 "b'1380615703620000410'",
 "b'1380616017620000470'",
 "b'1380615925620000177'",
 "b'1380615936620000547'",
 "b'1380615394620000400'",
 "b'1380616000620000140'",
 "b'1380615296620000020'",
 "b'1380615190620000477'",
 "b'1380616114620000151'",
 "b'1380615548620000247'",
 "b'1380615897620000616'",
 "b'1380613963620000005'",
 "b'1380615928620000449'",
 "b'1380615774620000158'",
 "b'1380616129620000281'",
 "b'1380611605620000351'",
 "b'1381167287620000123'",
 "b'1381167488620000626'",
 "b'1381167070620000142'",
 "b'1381167312620000337'",
 "b'1381167710620000684'",
 "b'1381167271620000159'",
 "b'1381167722620000624'",
 "b'1381167545620000419'",
 "b'1381167103620000114'",
 "b'1381167267620000668'",
 "b'1381167108620000307'",
 "b'1381167744620000051'",
 "b'1381167517620000356'",
 "b'1381166942620000518'",
 "b'1381167235620000529'",
 "b'1381166227620000901'",
 "b'1381167871620000463'",
 "b'1381167772620000495'",
 "b'1381166765620000008'",
 "b'1381167247620000345'",
 "b'1381167659620000235'",
 "b'1381167696620000085'",
 "b'1381167227620000156'",
 "b'1381167059620000004'",
 "b'1381167539620000256'",
 "b'1381166936620000426'",
 "b'1381167684620000621'",
 "b'1381167709620000249'",
 "b'1381167767620000094'",
 "b'1381167249620000675'",
 "b'1381167631620000116'",
 "b'1381166494620000480'",
 "b'1381167832620000074'",
 "b'1381166291620000326'",
 "b'1381167707620000653'",
 "b'1381167184620000560'",
 "b'1381167653620000295'",
 "b'1381167398620000686'",
 "b'1381167706620000321'",
 "b'1381165416620000697'",
 "b'1381167885620000280'",
 "b'1381166502620000297'",
 "b'1381166647620000657'",
 "b'1381167635620000662'",
 "b'1381167660620000594'",
 "b'1381167790620000093'",
 "b'1381166635620000195'",
 "b'1381167810620000431'",
 "b'1381166724620000311'",
 "b'1381167307620000591'",
 "b'1381167813620000267'",
 "b'1381167570620000648'",
 "b'1381167102620000525'",
 "b'1381167557620000424'",
 "b'1381167767620000160'",
 "b'1381167560620000633'",
 "b'1381167881620000391'",
 "b'1381166662620000189'",
 "b'1381165959620000138'",
 "b'1381167445620000344'",
 "b'1381167775620000049'",
 "b'1381166075620000068'",
 "b'1381167432620000001'",
 "b'1381167149620000257'",
 "b'1381167757620000324'",
 "b'1381166812620000595'",
 "b'1381167079620000535'",
 "b'1381167898620000667'",
 "b'1383364057620000066'",
 "b'1383364440620000010'",
 "b'1383364733620000009'",
 "b'1383364597620000601'",
 "b'1383364569620000356'",
 "b'1383364265620000007'",
 "b'1383364258620000574'",
 "b'1383364526620000108'",
 "b'1383364423620000015'",
 "b'1383364426620000632'",
 "b'1383364495620000611'",
 "b'1383364768620000388'",
 "b'1383364614620000372'",
 "b'1383364237620000455'",
 "b'1383364120620000403'",
 "b'1383364546620000041'",
 "b'1383363920620000020'",
 "b'1383363982620000591'",
 "b'1383364105620000665'",
 "b'1383364039620000618'",
 "b'1383364651620000513'",
 "b'1383364694620000364'",
 "b'1383364170620000239'",
 "b'1383363811620000031'",
 "b'1383364740620000252'",
 "b'1383364463620000345'",
 "b'1383364261620000436'",
 "b'1383363492620000672'",
 "b'1383364771620000320'",
 "b'1383364606620000508'",
 "b'1383364431620000233'",
 "b'1383364080620000527'",
 "b'1383364684620000005'",
 "b'1383364370620000140'",
 "b'1383364692620000118'",
 "b'1383364657620000570'",
 "b'1383364413620000492'",
 "b'1383364451620000309'",
 "b'1383364129620000013'",
 "b'1383363776620000434'",
 "b'1383364536620000217'",
 "b'1383364296620000112'",
 "b'1383364125620000625'",
 "b'1383364157620000648'",
 "b'1383364430620000542'",
 "b'1383363928620000616'",
 "b'1383363904620000105'",
 "b'1383364337620000612'",
 "b'1383364330620000333'",
 "b'1383364756620000540'",
 "b'1383364297620000596'",
 "b'1383364210620000153'",
 "b'1387722290620000362'",
 "b'1387721052620000311'",
 "b'1387721787620000046'",
 "b'1387722186620000565'",
 "b'1387722244620000068'",
 "b'1387721853620000403'",
 "b'1387722175620000633'",
 "b'1387722275620000172'",
 "b'1387722116620000187'",
 "b'1387722169620000060'",
 "b'1387722331620000058'",
 "b'1387722091620000607'",
 "b'1387722451620000540'",
 "b'1387722585620000430'",
 "b'1387722369620000120'",
 "b'1387721828620000123'",
 "b'1387722360620000391'",
 "b'1387722056620000089'",
 "b'1387722189620000480'",
 "b'1387722151620000184'",
 "b'1387722599620000137'",
 "b'1387722463620000314'",
 "b'1387722184620000057'",
 "b'1387722330620000171'",
 "b'1387721956620000373'",
 "b'1387722186620000197'",
 "b'1387722361620000697'",
 "b'1387721985620000173'",
 "b'1387708463620000329'",
 "b'1387722594620000900'",
 "b'1387722519620000482'",
 "b'1387722463620000481'",
 "b'1387721783620000030'",
 "b'1387721846620000247'"]

In [64]:
X_val


Out[64]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE LATITUDE LONGITUDE TARGET COORD_FEATURES DAY_OF_WEEK QUARTER_HOUR WEEK_OF_YEAR
0 1376500052620000184 C 0 0 115 1376500052 0 False [[-8.649891,41.154399],[-8.649981,41.154417],[... [-0.0392686, -0.0390627, -0.0440035, -0.049458... [-0.590024, -0.591592, -0.596627, -0.596793, -... [-8.61043, 41.1411] [-0.590024, -0.591592, -0.596627, -0.596793, -... 2 40 33
1 1376500461620000525 C 0 0 214 1376500461 0 False [[-8.610876,41.145759],[-8.610849,41.145759],[... [-0.155839, -0.155839, -0.151619, -0.14673, -0... [0.0920491, 0.0925159, 0.0985014, 0.105587, 0.... [-8.63072, 41.1547] [0.0920491, 0.0925159, 0.0985014, 0.105587, 0.... 2 40 33
2 1376501327620000095 B 0 11 367 1376501327 0 False [[-8.613243,41.166873],[-8.613252,41.166747],[... [0.129025, 0.127327, 0.125474, 0.118835, 0.104... [0.0506678, 0.0505178, 0.0497175, 0.0700247, 0... [-8.61534, 41.1407] [0.0506678, 0.0505178, 0.0497175, 0.0700247, 0... 2 41 33
3 1376501783620000173 B 0 10 39 1376501783 0 False [[-8.606988,41.15025],[-8.607213,41.150007],[-... [-0.0952637, -0.0985575, -0.112865, -0.113843,... [0.160023, 0.156088, 0.148386, 0.145868, 0.144... [-8.55426, 41.1628] [0.160023, 0.156088, 0.148386, 0.145868, 0.144... 2 42 33
4 1376501113620000252 B 0 13 364 1376501113 0 False [[-8.628273,41.157405],[-8.628255,41.157423],[... [0.00128665, 0.00149252, 0.00236744, 0.0135356... [-0.212091, -0.211775, -0.209724, -0.20894, -0... [-8.61928, 41.1786] [-0.212091, -0.211775, -0.209724, -0.20894, -0... 2 41 33
5 1376501483620000424 B 0 19 25 1376501483 0 False [[-8.605818,41.153391],[-8.607339,41.153427],[... [-0.0528556, -0.0523924, -0.0513116, -0.050694... [0.18048, 0.153888, 0.112506, 0.0797781, 0.071... [-8.64643, 41.1616] [0.18048, 0.153888, 0.112506, 0.0797781, 0.071... 2 42 33
6 1376500461620000326 B 0 14 240 1376500461 0 False [[-8.611137,41.149332],[-8.611263,41.149161],[... [-0.107667, -0.109931, -0.110086, -0.110086, -... [0.0874808, 0.08528, 0.0849633, 0.0848132, 0.0... [-8.61446, 41.1422] [0.0874808, 0.08528, 0.0849633, 0.0848132, 0.0... 2 40 33
7 1376500453620000263 C 0 0 407 1376500453 0 False [[-8.586396,41.149224],[-8.586378,41.149026],[... [-0.109108, -0.111784, -0.11199, -0.107873, -0... [0.520016, 0.520333, 0.513247, 0.49249, 0.4643... [-8.58591, 41.1486] [0.520016, 0.520333, 0.513247, 0.49249, 0.4643... 2 40 33
8 1376499820620000467 C 0 0 270 1376499820 0 False [[-8.625177,41.157333],[-8.625609,41.157405],[... [0.000308796, 0.00128665, 0.00494074, 0.006021... [-0.157972, -0.165525, -0.194935, -0.202171, -... [-8.64726, 41.1732] [-0.157972, -0.165525, -0.194935, -0.202171, -... 2 40 33
9 1376503568620000213 B 0 28 431 1376503568 0 False [[-8.584335,41.163111],[-8.585127,41.162922],[... [0.0782799, 0.0757066, 0.0835809, 0.0913522, 0... [0.556046, 0.542208, 0.51058, 0.479736, 0.4769... [-8.58525, 41.1689] [0.556046, 0.542208, 0.51058, 0.479736, 0.4769... 2 44 33
10 1376503240620000002 B 0 63 421 1376503240 0 False [[-8.609688,41.160348],[-8.609967,41.159277],[... [0.040967, 0.0265565, 0.00370556, 0.000669059,... [0.112823, 0.107938, 0.107938, 0.107471, 0.106... [-8.61071, 41.1456] [0.112823, 0.107938, 0.107938, 0.107471, 0.106... 2 44 33
11 1376504312620000617 C 0 0 199 1376504312 0 False [[-8.624502,41.179554],[-8.624511,41.179527],[... [0.300099, 0.299738, 0.299738, 0.299841, 0.299... [-0.146168, -0.146318, -0.146485, -0.146318, -... [-8.62455, 41.1796] [-0.146168, -0.146318, -0.146485, -0.146318, -... 2 45 33
12 1376502661620000400 B 0 29 117 1376502661 0 False [[-8.638443,41.170797],[-8.6382,41.170716],[-8... [0.181932, 0.180852, 0.184866, 0.192174, 0.200... [-0.389887, -0.385636, -0.36046, -0.330883, -0... [-8.6206, 41.1739] [-0.389887, -0.385636, -0.36046, -0.330883, -0... 2 43 33
13 1376500537620000246 B 0 13 318 1376500537 0 False [[-8.628147,41.157198],[-8.628156,41.157198],[... [-0.00149252, -0.00149252, -0.00128665, -0.001... [-0.209891, -0.210041, -0.20879, -0.208473, -0... [-8.61782, 41.1525] [-0.209891, -0.210041, -0.20879, -0.208473, -0... 2 41 33
14 1376502120620000557 B 0 32 245 1376502120 0 False [[-8.627643,41.157765],[-8.627958,41.1579],[-8... [0.00612446, 0.00797724, 0.0135356, 0.0206894,... [-0.201071, -0.206589, -0.20879, -0.228147, -0... [-8.61148, 41.1461] [-0.201071, -0.206589, -0.20879, -0.228147, -0... 2 42 33
15 1376496951620000012 A 7 0 79 1376496951 0 False [[-8.604045,41.182569],[-8.604135,41.182353],[... [0.340757, 0.337875, 0.316876, 0.295724, 0.278... [0.211474, 0.209907, 0.197003, 0.183148, 0.161... [-8.62064, 41.1643] [0.211474, 0.209907, 0.197003, 0.183148, 0.161... 2 37 33
16 1376501723620000554 B 0 53 183 1376501723 0 False [[-8.613945,41.141277],[-8.613972,41.141286],[... [-0.216312, -0.216209, -0.221047, -0.222642, -... [0.0383969, 0.03793, 0.0220411, 0.0168393, 0.0... [-8.63607, 41.1592] [0.0383969, 0.03793, 0.0220411, 0.0168393, 0.0... 2 42 33
17 1376503551620000376 B 0 34 246 1376503551 0 False [[-8.615556,41.14071],[-8.615565,41.140692],[-... [-0.22398, -0.224186, -0.22434, -0.22362, -0.2... [0.0102369, 0.0100702, 0.0100702, 0.010387, 0.... [-8.64072, 41.1612] [0.0102369, 0.0100702, 0.0100702, 0.010387, 0.... 2 44 33
18 1376504171620000146 B 0 10 338 1376504171 0 False [[-8.606979,41.150268],[-8.607285,41.150124],[... [-0.0950063, -0.096962, -0.0962415, -0.0962415... [0.160173, 0.154838, 0.148852, 0.148536, 0.128... [-8.61805, 41.1525] [0.160173, 0.154838, 0.148852, 0.148536, 0.128... 2 45 33
19 1376506047620000026 B 0 57 167 1376506047 0 False [[-8.610804,41.145741],[-8.610822,41.145768],[... [-0.156097, -0.155736, -0.155839, -0.151722, -... [0.0933162, 0.0929994, 0.0917323, 0.0961339, 0... [-8.60417, 41.1489] [0.0933162, 0.0929994, 0.0917323, 0.0961339, 0... 2 47 33
20 1376505311620000392 A 7 0 349 1376505311 0 False [[-8.583165,41.164713],[-8.583012,41.164407],[... [0.0998956, 0.0957268, 0.0964474, 0.105557, 0.... [0.576503, 0.579187, 0.580438, 0.580121, 0.597... [-8.6118, 41.1429] [0.576503, 0.579187, 0.580438, 0.580121, 0.597... 2 46 33
21 1376505833620000120 B 0 13 144 1376505833 0 False [[-8.628345,41.15763],[-8.628345,41.157576],[-... [0.00432315, 0.00360262, 0.00504367, 0.0026247... [-0.213342, -0.213342, -0.206906, -0.178896, -... [-8.61802, 41.1501] [-0.213342, -0.213342, -0.206906, -0.178896, -... 2 46 33
22 1376506874620000255 B 0 33 194 1376506874 0 False [[-8.600184,41.182686],[-8.600031,41.182758],[... [0.342352, 0.34333, 0.33736, 0.335559, 0.33314... [0.278965, 0.281649, 0.310276, 0.319096, 0.341... [-8.56627, 41.1814] [0.278965, 0.281649, 0.310276, 0.319096, 0.341... 2 48 33
23 1376503763620000015 B 0 60 48 1376503763 0 False [[-8.609706,41.151276],[-8.609679,41.151294],[... [-0.0814193, -0.081162, -0.0792063, -0.0644355... [0.112506, 0.112973, 0.115491, 0.107788, 0.108... [-8.61818, 41.1696] [0.112506, 0.112973, 0.115491, 0.107788, 0.108... 2 44 33
24 1376501181620000360 B 0 0 37 1376501181 0 False [[-8.598996,41.149026],[-8.598843,41.148873],[... [-0.111784, -0.113843, -0.115284, -0.119195, -... [0.299739, 0.302423, 0.303357, 0.305408, 0.304... [-8.60023, 41.1493] [0.299739, 0.302423, 0.303357, 0.305408, 0.304... 2 41 33
25 1376504563620000017 A 954 0 335 1376504563 0 False [[-8.618022,41.151519],[-8.618337,41.151447],[... [-0.0781255, -0.0791033, -0.0844558, -0.087132... [-0.0328782, -0.0383802, -0.0624553, -0.079444... [-8.59822, 41.1484] [-0.0328782, -0.0383802, -0.0624553, -0.079444... 2 45 33
26 1376507238620000114 C 0 0 165 1376507238 0 False [[-8.63028,41.157432],[-8.630505,41.157153],[-... [0.00164691, -0.00211011, -0.00452901, 0.01085... [-0.24717, -0.251105, -0.271246, -0.289819, -0... [-8.65056, 41.1615] [-0.24717, -0.251105, -0.271246, -0.289819, -0... 2 48 33
27 1376501378620000195 B 0 60 67 1376501378 0 False [[-8.609499,41.151294],[-8.609535,41.151312],[... [-0.081162, -0.0809046, -0.0778681, -0.0758095... [0.116124, 0.115491, 0.117375, 0.111556, 0.100... [-8.61674, 41.137] [0.116124, 0.115491, 0.117375, 0.111556, 0.100... 2 41 33
28 1376506638620000038 B 0 17 140 1376506638 0 False [[-8.632323,41.164326],[-8.632917,41.164065],[... [0.0946461, 0.0911464, 0.0867718, 0.093205, 0.... [-0.2829, -0.293287, -0.317345, -0.346305, -0.... [-8.65428, 41.181] [-0.2829, -0.293287, -0.317345, -0.346305, -0.... 2 47 33
29 1376504586620000608 B 0 18 310 1376504586 0 False [[-8.619921,41.148018],[-8.620218,41.147712],[... [-0.125371, -0.129489, -0.1176, -0.104013, -0.... [-0.0660733, -0.0712751, -0.0792946, -0.084179... [-8.61061, 41.1515] [-0.0660733, -0.0712751, -0.0792946, -0.084179... 2 45 33
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
274 1387725593620000440 B 0 42 233 1387725593 0 False [[-8.612145,41.172777],[-8.612568,41.172768],[... [0.208643, 0.20854, 0.216569, 0.226142, 0.2362... [0.0698579, 0.062472, 0.0542858, 0.0513014, 0.... [-8.58568, 41.1489] [0.0698579, 0.062472, 0.0542858, 0.0513014, 0.... 6 29 51
275 1387726426620000621 A 1602 0 34 1387726426 0 False [[-8.648964,41.179752],[-8.648982,41.179752],[... [0.302775, 0.302775, 0.314663, 0.316104, 0.316... [-0.573819, -0.574136, -0.57807, -0.576336, -0... [-8.63323, 41.1756] [-0.573819, -0.574136, -0.57807, -0.576336, -0... 6 30 51
276 1387728068620000012 A 3521 0 79 1387728068 0 False [[-8.658126,41.154876],[-8.657829,41.154579],[... [-0.0328353, -0.0368497, -0.0219245, -0.001286... [-0.733992, -0.728807, -0.743912, -0.769238, -... [-8.65425, 41.1809] [-0.733992, -0.728807, -0.743912, -0.769238, -... 6 32 51
277 1387728077620000502 B 0 54 116 1387728077 0 False [[-8.630316,41.15754],[-8.629668,41.157],[-8.6... [0.00308796, -0.00416875, -0.00844043, -0.0272... [-0.247804, -0.236483, -0.233182, -0.24497, -0... [-8.6304, 41.1554] [-0.247804, -0.236483, -0.233182, -0.24497, -0... 6 32 51
278 1387729770620000384 A 3184 0 225 1387729770 0 False [[-8.6121,41.158674],[-8.6121,41.158674],[-8.6... [0.0183734, 0.0183734, 0.00452901, -0.0170353,... [0.0706582, 0.0706582, 0.064356, 0.0511347, 0.... [-8.62106, 41.151] [0.0706582, 0.0706582, 0.064356, 0.0511347, 0.... 6 33 51
279 1387728089620000640 B 0 26 218 1387728089 0 False [[-8.580204,41.15934],[-8.580627,41.159241],[-... [0.0273799, 0.0260418, 0.015131, -0.005301, -0... [0.628271, 0.620869, 0.633156, 0.637241, 0.637... [-8.58601, 41.1486] [0.628271, 0.620869, 0.633156, 0.637241, 0.637... 6 32 51
280 1387727123620000055 B 0 7 352 1387727123 0 False [[-8.63991,41.15979],[-8.640693,41.159664],[-8... [0.0334529, 0.0317546, 0.00586713, -0.0161089,... [-0.41553, -0.429218, -0.438821, -0.447324, -0... [-8.6178, 41.1471] [-0.41553, -0.429218, -0.438821, -0.447324, -0... 6 31 51
281 1387728500620000271 B 0 57 234 1387728500 0 False [[-8.610885,41.14566],[-8.610885,41.145669],[-... [-0.157177, -0.157074, -0.157435, -0.156714, -... [0.091899, 0.091899, 0.0917323, 0.0923659, 0.0... [-8.66138, 41.1481] [0.091899, 0.091899, 0.0917323, 0.0923659, 0.0... 6 32 51
282 1387729808620000151 B 0 28 146 1387729808 0 False [[-8.584335,41.163156],[-8.584425,41.163102],[... [0.078846, 0.0781255, 0.0776623, 0.0841985, 0.... [0.556046, 0.554479, 0.533088, 0.506012, 0.478... [-8.6117, 41.16] [0.556046, 0.554479, 0.533088, 0.506012, 0.478... 6 34 51
283 1387660057620000026 B 0 57 167 1387660057 0 False [[-8.610768,41.145642],[-8.610759,41.145642],[... [-0.157435, -0.157435, -0.157074, -0.156354, -... [0.0939331, 0.0940998, 0.0942499, 0.0939331, 0... [-8.63085, 41.1466] [0.0939331, 0.0940998, 0.0942499, 0.0939331, 0... 5 52 51
284 1387727477620000513 B 0 53 366 1387727477 0 False [[-8.613972,41.141349],[-8.613963,41.141349],[... [-0.215334, -0.215334, -0.216929, -0.205607, -... [0.03793, 0.0380801, 0.029277, 0.0308442, 0.03... [-8.61403, 41.1499] [0.03793, 0.0380801, 0.029277, 0.0308442, 0.03... 6 31 51
285 1387725100620000157 A 254 0 390 1387725100 0 False [[-8.676234,41.15484],[-8.676198,41.154822],[-... [-0.03335, -0.0335559, -0.0335559, -0.0334529,... [-1.05057, -1.04994, -1.04994, -1.04962, -1.04... [-8.6488, 41.1486] [-1.05057, -1.04994, -1.04994, -1.04962, -1.04... 6 28 51
286 1387731453620000032 A 9559 0 371 1387731453 0 False [[-8.657946,41.148234],[-8.657937,41.148207],[... [-0.122438, -0.122798, -0.122798, -0.122695, -... [-0.730841, -0.730691, -0.730691, -0.730541, -... [-8.65648, 41.1532] [-0.730841, -0.730691, -0.730691, -0.730541, -... 6 35 51
287 1387730991620000217 A 20908 0 321 1387730991 0 False [[-8.569818,41.170158],[-8.569278,41.169996],[... [0.173338, 0.171125, 0.158052, 0.160934, 0.166... [0.809852, 0.819288, 0.845881, 0.854534, 0.858... [-8.572, 41.1629] [0.809852, 0.819288, 0.845881, 0.854534, 0.858... 6 35 51
288 1387723778620000364 B 0 21 419 1387723778 0 False [[-8.628867,41.160996],[-8.628849,41.160951],[... [0.0497162, 0.0490986, 0.0543481, 0.0718466, 0... [-0.222478, -0.222162, -0.209724, -0.202021, -... [-8.71435, 41.2082] [-0.222478, -0.222162, -0.209724, -0.202021, -... 6 27 51
289 1387731647620000129 B 0 57 265 1387731647 0 False [[-8.610759,41.145651],[-8.610768,41.145678],[... [-0.15728, -0.15692, -0.155839, -0.154244, -0.... [0.0940998, 0.0939331, 0.0936163, 0.0936163, 0... [-8.63835, 41.1592] [0.0940998, 0.0939331, 0.0936163, 0.0936163, 0... 6 36 51
290 1387733802620000364 B 0 21 419 1387733802 0 False [[-8.628786,41.161041],[-8.628579,41.160897],[... [0.0503338, 0.0483781, 0.0476576, 0.0474002, 0... [-0.221061, -0.217443, -0.21776, -0.21791, -0.... [-8.596, 41.1696] [-0.221061, -0.217443, -0.21776, -0.21791, -0.... 6 38 51
291 1387731776620000207 B 0 36 211 1387731776 0 False [[-8.649423,41.154345],[-8.6499,41.154273],[-8... [-0.0399891, -0.040967, -0.0452387, -0.0437976... [-0.581838, -0.590191, -0.59616, -0.579487, -0... [-8.57125, 41.1646] [-0.581838, -0.590191, -0.59616, -0.579487, -0... 6 36 51
292 1387729265620000068 B 0 0 185 1387729265 0 False [[-8.608779,41.147793],[-8.608734,41.147802],[... [-0.128408, -0.128305, -0.128305, -0.128408, -... [0.128712, 0.129496, 0.129812, 0.133114, 0.133... [-8.62051, 41.1651] [0.128712, 0.129496, 0.129812, 0.133114, 0.133... 6 33 51
293 1387735526620000023 C 0 0 404 1387735526 0 False [[-8.597673,41.142681],[-8.597682,41.142681]] [-0.197372, -0.197372] [0.322864, 0.322714] [-8.59768, 41.1427] [0.322864, 0.322864, 0.322864, 0.322864, 0.322... 6 40 51
294 1387713713620000255 A 34988 0 194 1387713713 0 False [[-8.594352,41.169375],[-8.594352,41.169375],[... [0.162787, 0.162787, 0.16289, 0.162993, 0.1631... [0.380934, 0.380934, 0.381084, 0.381084, 0.381... [-8.58298, 41.1704] [0.380934, 0.380934, 0.381084, 0.381084, 0.381... 6 16 51
295 1387735341620000216 B 0 12 331 1387735341 0 False [[-8.630766,41.154948],[-8.631414,41.15439],[-... [-0.0318575, -0.039423, -0.054554, -0.0752434,... [-0.255673, -0.267011, -0.283683, -0.29422, -0... [-8.63564, 41.1406] [-0.255673, -0.267011, -0.283683, -0.29422, -0... 6 40 51
296 1387731258620000486 C 0 0 75 1387731258 0 False [[-8.59698,41.171328],[-8.595054,41.172327],[-... [0.189138, 0.20257, 0.253367, 0.308848, 0.3575... [0.334985, 0.368663, 0.395873, 0.406426, 0.397... [-8.33168, 41.2035] [0.334985, 0.368663, 0.395873, 0.406426, 0.397... 6 35 51
297 1387737095620000217 A 495 0 321 1387737095 0 False [[-8.591688,41.159556],[-8.591625,41.159421],[... [0.0303135, 0.0284607, 0.0216672, 0.0165721, 0... [0.427501, 0.428601, 0.428134, 0.413496, 0.402... [-8.60578, 41.1498] [0.427501, 0.428601, 0.428134, 0.413496, 0.402... 6 42 51
298 1387737450620000384 B 0 52 225 1387737450 0 False [[-8.61327,41.154453],[-8.613297,41.154147],[-... [-0.0385481, -0.0426654, -0.0465768, -0.047657... [0.050201, 0.0497175, 0.0495675, 0.0564866, 0.... [-8.58762, 41.1885] [0.050201, 0.0497175, 0.0495675, 0.0564866, 0.... 6 42 51
299 1387740537620000657 B 0 47 17 1387740537 0 False [[-8.654796,41.173551],[-8.654526,41.173668],[... [0.219091, 0.220686, 0.236486, 0.239369, 0.233... [-0.675771, -0.671053, -0.652646, -0.632039, -... [-8.63023, 41.1584] [-0.675771, -0.671053, -0.652646, -0.632039, -... 6 45 51
300 1387742161620000503 C 0 0 33 1387742161 0 False [[-8.639487,41.167422],[-8.639424,41.16753],[-... [0.136436, 0.137878, 0.135819, 0.12393, 0.1178... [-0.408144, -0.407043, -0.402008, -0.397757, -... [-8.66577, 41.2102] [-0.408144, -0.407043, -0.402008, -0.397757, -... 6 47 51
301 1387755659620000372 A 481 0 27 1387755659 0 False [[-8.679753,41.156559],[-8.679717,41.156568],[... [-0.0101388, -0.0100359, -0.00308796, -0.00710... [-1.11209, -1.11146, -1.0954, -1.07763, -1.058... [-8.61165, 41.1461] [-1.11209, -1.11146, -1.0954, -1.07763, -1.058... 6 62 51
302 1387735327620000068 B 0 27 185 1387735327 0 False [[-8.608707,41.147811],[-8.608689,41.147829],[... [-0.12815, -0.127945, -0.128665, -0.13304, -0.... [0.129962, 0.130279, 0.13328, 0.129812, 0.1073... [-8.62782, 41.1698] [0.129962, 0.130279, 0.13328, 0.129812, 0.1073... 6 40 51
303 1387788528620000010 A 8312 0 26 1387788528 0 False [[-8.609247,41.155182],[-8.60922,41.155254],[-... [-0.0287181, -0.0277402, -0.0210496, -0.021409... [0.120526, 0.121009, 0.117541, 0.108105, 0.106... [-8.61635, 41.163] [0.120526, 0.121009, 0.117541, 0.108105, 0.106... 0 3 52

304 rows × 16 columns