In [1]:
import ast

import pandas as pd

import datetime

from keras.layers import Input, Dense, Embedding, concatenate, dot, Flatten, Merge, BatchNormalization, Lambda
from keras.models import Model, load_model
from keras.regularizers import l2
import keras.backend as K
from keras.optimizers import SGD
import numpy as np

from sklearn.cluster import MeanShift, estimate_bandwidth

import utils2

import data

from sklearn.model_selection import train_test_split

from bcolz_array_iterator import BcolzArrayIterator

import bcolz

from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam


Using TensorFlow backend.
/home/roebius/pj/p3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

The path below is a shared directory; swap in your own.


In [2]:
data_path = "data/taxi/"

Replication of 'csv_to_hdf5.py'

The original repo used a rather convoluted tuple-based method of reading in the data and saving it to an HDF5 file using fuel. The following takes the same approach as that module, only using pandas and saving in bcolz format (with the training data as the example).


In [ ]:
meta = pd.read_csv(data_path+'metaData_taxistandsID_name_GPSlocation.csv', header=0)

In [ ]:
meta.head()

In [ ]:
train = pd.read_csv(data_path+'train/train.csv', header=0)

In [ ]:
train.head()

In [ ]:
train['ORIGIN_CALL'] = pd.Series(pd.factorize(train['ORIGIN_CALL'])[0]) + 1

In [ ]:
train['ORIGIN_STAND']=pd.Series([0 if pd.isnull(x) or x=='' else int(x) for x in train["ORIGIN_STAND"]])

In [ ]:
train['TAXI_ID'] = pd.Series(pd.factorize(train['TAXI_ID'])[0]) + 1

In [ ]:
# train['DAY_TYPE'] = pd.Series([ord(x[0]) - ord('A') for x in train['DAY_TYPE']]) 
train['DAY_TYPE'] = pd.Series([(ord(x[0]) - ord('A')) for x in train['DAY_TYPE']])  # - correct

The array of long/lat coordinates for each trip (row) is read in as a string. ast.literal_eval(x) safely evaluates that string into the Python expression it represents. This happens below.
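
For illustration (a hypothetical polyline string, not taken from the dataset), literal_eval turns the raw text into a list of [longitude, latitude] pairs:


In [ ]:
# hypothetical example string, not from the dataset
s = "[[-8.618643,41.141412],[-8.618499,41.141376]]"
coords = ast.literal_eval(s)
print(coords[0])  # [-8.618643, 41.141412]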


In [ ]:
polyline = pd.Series([ast.literal_eval(x) for x in train['POLYLINE']])

Split into latitude/longitude


In [ ]:
train['LATITUDE'] = pd.Series([np.array([point[1] for point in poly],dtype=np.float32) for poly in polyline])

In [ ]:
train['LONGITUDE'] = pd.Series([np.array([point[0] for point in poly],dtype=np.float32) for poly in polyline])

In [ ]:
utils2.save_array(data_path+'train/train.bc', train.as_matrix())

In [ ]:
utils2.save_array(data_path+'train/meta_train.bc', meta.as_matrix())

Further Feature Engineering

After converting the 'csv_to_hdf5.py' functionality to pandas, I saved that array and then simply constructed the rest of the features as specified in the paper using pandas. I didn't bother working through how the author did it, as it was extremely convoluted and relied on the fuel module.


In [ ]:
train = pd.DataFrame(utils2.load_array(data_path+'train/train.bc'), columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE'])

In [ ]:
train.head()

The paper discusses how many distinct values there are for each categorical variable. The following all check out


In [ ]:
train['ORIGIN_CALL'].max()

In [ ]:
train['ORIGIN_STAND'].max()

In [ ]:
train['TAXI_ID'].max()

Self-explanatory


In [ ]:
train['DAY_OF_WEEK'] = pd.Series([datetime.datetime.fromtimestamp(t).weekday() for t in train['TIMESTAMP']])

Quarter hour of the day, i.e. 1 of the 4*24 = 96 quarter hours of the day
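
As a quick worked example: the first validation-cut timestamp used later (1376503200) prints as 2013-08-14 20:00 in this notebook's timezone, which maps to quarter-hour index 4*20 = 80:


In [ ]:
# worked example; the result depends on the local timezone (here 1376503200 -> 20:00)
dt = datetime.datetime.fromtimestamp(1376503200)
print(int((dt.hour*60 + dt.minute)/15))  # 80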


In [ ]:
train['QUARTER_HOUR'] = pd.Series([int((datetime.datetime.fromtimestamp(t).hour*60 + datetime.datetime.fromtimestamp(t).minute)/15)
                                   for t in train['TIMESTAMP']])

Self-explanatory


In [ ]:
train['WEEK_OF_YEAR'] = pd.Series([datetime.datetime.fromtimestamp(t).isocalendar()[1] for t in train['TIMESTAMP']])

The target coords are the last in the sequence (the final position). If a trip has no positions, or only one, mark it as invalid with NaN so it can be dropped later.


In [ ]:
train['TARGET'] = pd.Series([[l[1][0][-1], l[1][1][-1]] if len(l[1][0]) > 1 else np.nan for l in train[['LONGITUDE','LATITUDE']].iterrows()])

This function creates the continuous inputs, which are the concatenated first k and last k coords of a sequence, as discussed in the paper.

If there aren't at least 2*k coords excluding the target, then the first k and last k overlap. In this case the sequence (excluding the target) is padded at the end with the last coord in the sequence. The paper mentioned padding at the front and back but didn't specify in what manner.

Any invalid rows are also marked with NaN's.


In [ ]:
def start_stop_inputs(k):
    result = []
    for l in train[['LONGITUDE','LATITUDE']].iterrows():
        if len(l[1][0]) < 2 or len(l[1][1]) < 2:
            result.append(np.nan)
        elif len(l[1][0][:-1]) >= 2*k:
            result.append(np.concatenate([l[1][0][0:k],l[1][0][-(k+1):-1],l[1][1][0:k],l[1][1][-(k+1):-1]]).flatten())
        else:
            # pad to 4*k with the last coord ('edge') so the first-k/last-k slices below are always valid
            l1 = np.lib.pad(l[1][0][:-1], (0,4*k-len(l[1][0][:-1])), mode='edge')
            l2 = np.lib.pad(l[1][1][:-1], (0,4*k-len(l[1][1][:-1])), mode='edge')
            result.append(np.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
    return pd.Series(result)
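
A quick sanity check of the 'edge' padding on a hypothetical 4-point trip with k=5: the tail is filled with the last coordinate, so the first-k and last-k slices overlap on repeated values.


In [ ]:
# hypothetical short trip of 4 points, padded out to 4*k = 20 with the last value repeated
poly = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
padded = np.lib.pad(poly, (0, 20 - len(poly)), mode='edge')
print(padded[:5], padded[-5:])  # [1. 2. 3. 4. 4.] [4. 4. 4. 4. 4.]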

In [ ]:
train['COORD_FEATURES'] = start_stop_inputs(5)

In [ ]:
train.shape

In [ ]:
train.dropna().shape

Drop na's


In [ ]:
train = train.dropna()

In [ ]:
utils2.save_array(data_path+'train/train_features.bc', train.as_matrix())

End to end feature transformation


In [ ]:
train = pd.read_csv(data_path+'train/train.csv', header=0)

In [ ]:
test = pd.read_csv(data_path+'test/test.csv', header=0)

In [ ]:
def start_stop_inputs(k, data, test):
    result = []
    for l in data[['LONGITUDE','LATITUDE']].iterrows():
        if not test:
            if len(l[1][0]) < 2 or len(l[1][1]) < 2:
                result.append(np.nan)
            elif len(l[1][0][:-1]) >= 2*k:
                result.append(np.concatenate([l[1][0][0:k],l[1][0][-(k+1):-1],l[1][1][0:k],l[1][1][-(k+1):-1]]).flatten())
            else:
                l1 = np.lib.pad(l[1][0][:-1], (0,4*k-len(l[1][0][:-1])), mode='edge')
                l2 = np.lib.pad(l[1][1][:-1], (0,4*k-len(l[1][1][:-1])), mode='edge')
                result.append(np.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
        else:
            if len(l[1][0]) < 1 or len(l[1][1]) < 1:
                result.append(np.nan)
            elif len(l[1][0]) >= 2*k:
                result.append(np.concatenate([l[1][0][0:k],l[1][0][-k:],l[1][1][0:k],l[1][1][-k:]]).flatten())
            else:
                l1 = np.lib.pad(l[1][0], (0,4*k-len(l[1][0])), mode='edge')
                l2 = np.lib.pad(l[1][1], (0,4*k-len(l[1][1])), mode='edge')
                result.append(np.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
    return pd.Series(result)

The normalization statistics below were pre-calculated on the training set.


In [ ]:
lat_mean = 41.15731
lat_std = 0.074120656
long_mean = -8.6161413
long_std = 0.057200309
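
If you want to recompute these statistics yourself, a minimal sketch, assuming the un-normalized per-trip LATITUDE/LONGITUDE arrays built in the earlier section:


In [ ]:
# sketch: mean/std over all points of all training trips (assumes un-normalized coords)
all_lat = np.concatenate(train['LATITUDE'].values)
all_long = np.concatenate(train['LONGITUDE'].values)
lat_mean, lat_std = all_lat.mean(), all_lat.std()
long_mean, long_std = all_long.mean(), all_long.std()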

In [ ]:
def feature_ext(data, test=False):   
    
    data['ORIGIN_CALL'] = pd.Series(pd.factorize(data['ORIGIN_CALL'])[0]) + 1

    data['ORIGIN_STAND']=pd.Series([0 if pd.isnull(x) or x=='' else int(x) for x in data["ORIGIN_STAND"]])

    data['TAXI_ID'] = pd.Series(pd.factorize(data['TAXI_ID'])[0]) + 1

    data['DAY_TYPE'] = pd.Series([ord(x[0]) - ord('A') for x in data['DAY_TYPE']])

    polyline = pd.Series([ast.literal_eval(x) for x in data['POLYLINE']])

    data['LATITUDE'] = pd.Series([np.array([point[1] for point in poly],dtype=np.float32) for poly in polyline])

    data['LONGITUDE'] = pd.Series([np.array([point[0] for point in poly],dtype=np.float32) for poly in polyline])
    
    if not test:
    
        data['TARGET'] = pd.Series([[l[1][0][-1], l[1][1][-1]] if len(l[1][0]) > 1 else np.nan for l in data[['LONGITUDE','LATITUDE']].iterrows()])

    
    data['LATITUDE'] = pd.Series([(t-lat_mean)/lat_std for t in data['LATITUDE']])
    
    data['LONGITUDE'] = pd.Series([(t-long_mean)/long_std for t in data['LONGITUDE']])
    
    data['COORD_FEATURES'] = start_stop_inputs(5, data, test)

    data['DAY_OF_WEEK'] = pd.Series([datetime.datetime.fromtimestamp(t).weekday() for t in data['TIMESTAMP']])

    data['QUARTER_HOUR'] = pd.Series([int((datetime.datetime.fromtimestamp(t).hour*60 + datetime.datetime.fromtimestamp(t).minute)/15)
                                       for t in data['TIMESTAMP']])

    data['WEEK_OF_YEAR'] = pd.Series([datetime.datetime.fromtimestamp(t).isocalendar()[1] for t in data['TIMESTAMP']])
    
        
    data = data.dropna()

    return data

In [ ]:
train = feature_ext(train)

In [ ]:
# train["TARGET"]
train.head()

In [ ]:
test = feature_ext(test, test=True)

In [ ]:
test.head()

In [ ]:
utils2.save_array(data_path+'train/train_features.bc', train.as_matrix())

In [ ]:
utils2.save_array(data_path+'test/test_features.bc', test.as_matrix())

In [ ]:
train.head()

MEANSHIFT

Meanshift clustering as performed in the paper


In [ ]:
# train = pd.DataFrame(utils2.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
#        'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'DAY_OF_WEEK',
#                             'QUARTER_HOUR', "WEEK_OF_YEAR", "TARGET", "COORD_FEATURES"])

# - Correct column order to load the Bcolz array that was saved above
train = pd.DataFrame(utils2.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET', 'COORD_FEATURES', 'DAY_OF_WEEK',
                            'QUARTER_HOUR', 'WEEK_OF_YEAR'])

Clustering performed on the targets


In [ ]:
y_targ = np.vstack(train["TARGET"].as_matrix())

In [ ]:
from sklearn.cluster import MeanShift, estimate_bandwidth

The commented-out line gives an estimate of the bandwidth, which makes the clustering converge much more quickly.

This is not mentioned in the paper but is included in the original code. In order to get results similar to the paper's, they manually chose the uncommented bandwidth.


In [ ]:
#bw = estimate_bandwidth(y_targ, quantile=.1, n_samples=1000)
bw = 0.001

This takes some time


In [ ]:
ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
ms.fit(y_targ)

In [ ]:
cluster_centers = ms.cluster_centers_

This is very close to the number of clusters mentioned in the paper


In [ ]:
cluster_centers.shape

In [ ]:
utils2.save_array(data_path+"cluster_centers_bw_001.bc", cluster_centers)

Formatting Features for Bcolz iterator / garbage


In [3]:
train = pd.DataFrame(utils2.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', 'DAY_OF_WEEK', "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [4]:
cluster_centers = utils2.load_array(data_path+"cluster_centers_bw_001.bc")

In [5]:
long = np.array([c[0] for c in cluster_centers])
lat = np.array([c[1] for c in cluster_centers])

In [6]:
X_train, X_val = train_test_split(train, test_size=0.2, random_state=42)

In [7]:
def get_features(data):
    return [np.vstack(data['COORD_FEATURES'].as_matrix()), np.vstack(data['ORIGIN_CALL'].as_matrix()), 
           np.vstack(data['TAXI_ID'].as_matrix()), np.vstack(data['ORIGIN_STAND'].as_matrix()),
           np.vstack(data['QUARTER_HOUR'].as_matrix()), np.vstack(data['DAY_OF_WEEK'].as_matrix()), 
           np.vstack(data['WEEK_OF_YEAR'].as_matrix()), np.array([long for i in range(0,data.shape[0])]),
               np.array([lat for i in range(0,data.shape[0])])]

In [8]:
def get_target(data):
    return np.vstack(data["TARGET"].as_matrix())

In [9]:
X_train_features = get_features(X_train)

In [10]:
X_train_target = get_target(X_train)

In [11]:
# utils2.save_array(data_path+'train/X_train_features.bc', get_features(X_train))  # - doesn't work - needs an array, not a list

MODEL

Load training data and cluster centers


In [12]:
train = pd.DataFrame(utils2.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

Validation cuts (taken from the original repo): a trip is held out for validation if it was still in progress at one of the timestamps below.


In [13]:
cuts = [
    1376503200, # 2013-08-14 18:00
    1380616200, # 2013-10-01 08:30
    1381167900, # 2013-10-07 17:45
    1383364800, # 2013-11-02 04:00
    1387722600  # 2013-12-22 14:30
]

In [14]:
print(datetime.datetime.fromtimestamp(1376503200))


2013-08-14 20:00:00

In [15]:
train.shape


Out[15]:
(1674160, 16)

In [16]:
val_indices = []
for index, row in train.iterrows():
    time = row['TIMESTAMP']
    latitude = row['LATITUDE']
    for ts in cuts:
        # a trip is held out if it was still in progress at the cut time
        # (GPS points are 15 s apart, so it spans TIMESTAMP .. TIMESTAMP + 15*(n_points-1))
        if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
            val_indices.append(index)
            break

In [17]:
X_valid = train.iloc[val_indices]

In [18]:
X_valid.head()


Out[18]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE LATITUDE LONGITUDE TARGET COORD_FEATURES DAY_OF_WEEK QUARTER_HOUR WEEK_OF_YEAR
200153 1376502576620000126 B 0 36 247 1376502576 0 False [[-8.649504,41.15421],[-8.649684,41.154201],[-... [-0.0418419, -0.0419448, -0.0449813, -0.046422... [-0.583255, -0.586407, -0.59711, -0.589074, -0... [-8.61122, 41.1463] [-0.583255, -0.586407, -0.59711, -0.589074, -0... 2 79 33
200186 1376503146620000161 B 0 35 19 1376503146 0 False [[-8.649621,41.167323],[-8.64963,41.167251],[-... [0.135098, 0.134121, 0.126709, 0.125371, 0.124... [-0.585306, -0.585456, -0.589241, -0.588774, -... [-8.64504, 41.1586] [-0.585306, -0.585456, -0.589241, -0.588774, -... 2 79 33
200200 1376502942620000500 B 0 15 428 1376502942 0 False [[-8.585694,41.148522],[-8.585712,41.148801],[... [-0.118578, -0.114821, -0.112402, -0.116982, -... [0.532287, 0.531971, 0.523018, 0.524735, 0.524... [-8.61524, 41.1418] [0.532287, 0.531971, 0.523018, 0.524735, 0.524... 2 79 33
200202 1376502604620000105 C 0 0 87 1376502604 0 False [[-8.61093,41.145498],[-8.610939,41.145516],[-... [-0.15939, -0.159133, -0.153883, -0.145392, -0... [0.0910987, 0.0909487, 0.093783, 0.108572, 0.1... [-8.64832, 41.1648] [0.0910987, 0.0909487, 0.093783, 0.108572, 0.1... 2 79 33
200227 1376502611620000022 C 0 0 304 1376502611 0 False [[-8.591301,41.162715],[-8.591004,41.162562],[... [0.0729274, 0.0708687, 0.0587228, 0.0539879, 0... [0.43427, 0.439455, 0.42735, 0.423566, 0.41539... [-8.60977, 41.1512] [0.43427, 0.439455, 0.42735, 0.423566, 0.41539... 2 79 33

In [19]:
for d in X_valid['TIMESTAMP']:
    print(datetime.datetime.fromtimestamp(d))


2013-08-14 19:49:36
2013-08-14 19:59:06
2013-08-14 19:55:42
2013-08-14 19:50:04
2013-08-14 19:50:11
2013-08-14 19:56:57
2013-08-14 19:36:51
2013-08-14 19:44:15
2013-08-14 19:55:50
2013-08-14 19:50:35
2013-08-14 19:50:27
2013-08-14 19:43:57
2013-08-14 19:16:48
2013-08-14 19:40:47
2013-08-14 19:45:55
2013-08-14 19:43:00
2013-08-14 19:53:22
2013-08-14 19:50:03
2013-08-14 19:26:22
2013-08-14 19:59:15
2013-08-14 19:50:17
2013-08-14 19:56:34
2013-08-14 19:53:42
2013-08-14 19:47:46
2013-08-14 19:58:46
2013-08-14 19:24:23
2013-08-14 19:55:19
2013-08-14 19:57:03
2013-08-14 19:56:11
2013-08-14 19:56:52
2013-08-14 19:57:57
2013-08-14 19:08:15
2013-08-14 19:51:14
2013-08-14 19:58:31
2013-08-14 19:47:31
2013-08-14 19:30:36
2013-08-14 19:17:59
2013-08-14 19:48:03
2013-08-14 19:55:52
2013-08-14 19:49:06
2013-08-14 19:58:55
2013-08-14 19:51:24
2013-08-14 19:54:12
2013-08-14 19:54:26
2013-08-14 19:51:18
2013-08-14 19:59:56
2013-08-14 19:48:31
2013-08-14 19:51:56
2013-08-14 19:39:22
2013-08-14 19:57:25
2013-08-14 19:57:28
2013-08-14 19:57:40
2013-08-14 19:39:01
2013-08-14 19:50:39
2013-08-14 18:48:19
2013-10-01 10:16:12
2013-10-01 10:28:04
2013-10-01 10:18:37
2013-10-01 10:24:48
2013-10-01 10:23:39
2013-10-01 10:28:37
2013-10-01 10:20:16
2013-10-01 10:23:49
2013-10-01 10:27:11
2013-10-01 10:06:20
2013-10-01 10:28:08
2013-10-01 10:29:02
2013-10-01 10:24:44
2013-10-01 10:24:44
2013-10-01 10:19:06
2013-10-01 09:28:33
2013-10-01 10:29:28
2013-10-01 10:27:31
2013-10-01 10:22:13
2013-10-01 10:26:03
2013-10-01 10:28:55
2013-10-01 10:18:10
2013-10-01 10:22:13
2013-10-01 10:14:30
2013-10-01 10:24:41
2013-10-01 10:22:16
2013-10-01 10:25:35
2013-10-01 10:21:27
2013-10-01 10:11:33
2013-10-01 10:10:18
2013-10-01 10:09:33
2013-10-01 10:01:15
2013-10-01 10:17:58
2013-10-01 10:18:00
2013-10-01 10:13:26
2013-10-01 10:18:01
2013-10-01 10:25:54
2013-10-01 10:21:20
2013-10-01 10:25:31
2013-10-01 10:25:54
2013-10-01 10:23:40
2013-10-01 10:26:46
2013-10-01 10:23:31
2013-10-01 10:17:09
2013-10-01 10:21:57
2013-10-01 09:29:09
2013-10-01 10:14:47
2013-10-01 10:04:25
2013-10-01 10:14:09
2013-10-01 10:16:59
2013-10-01 10:27:16
2013-10-01 10:16:26
2013-10-01 10:23:18
2013-10-01 10:16:05
2013-10-01 10:27:43
2013-10-01 10:08:13
2013-10-01 10:19:21
2013-10-01 10:21:19
2013-10-01 10:24:20
2013-10-01 10:26:45
2013-10-01 10:18:28
2013-10-01 10:19:45
2013-10-01 10:28:10
2013-10-01 10:22:20
2013-10-01 10:18:42
2013-10-01 10:19:52
2013-10-01 10:18:44
2013-10-01 10:15:11
2013-10-01 10:19:24
2013-10-01 10:23:58
2013-10-01 10:28:50
2013-10-01 10:13:24
2013-10-01 10:28:38
2013-10-01 10:24:50
2013-10-01 10:14:19
2013-10-01 10:10:05
2013-10-01 10:26:31
2013-10-01 10:28:01
2013-10-01 08:44:16
2013-10-01 10:21:43
2013-10-01 10:26:57
2013-10-01 10:25:25
2013-10-01 10:25:36
2013-10-01 10:16:34
2013-10-01 10:26:40
2013-10-01 10:14:56
2013-10-01 10:13:10
2013-10-01 10:28:34
2013-10-01 10:19:08
2013-10-01 10:24:57
2013-10-01 09:52:43
2013-10-01 10:25:28
2013-10-01 10:22:54
2013-10-01 10:28:49
2013-10-01 09:13:25
2013-10-07 19:34:47
2013-10-07 19:38:08
2013-10-07 19:31:10
2013-10-07 19:35:12
2013-10-07 19:41:50
2013-10-07 19:34:31
2013-10-07 19:42:02
2013-10-07 19:39:05
2013-10-07 19:31:43
2013-10-07 19:34:27
2013-10-07 19:31:48
2013-10-07 19:42:24
2013-10-07 19:38:37
2013-10-07 19:29:02
2013-10-07 19:33:55
2013-10-07 19:17:07
2013-10-07 19:44:31
2013-10-07 19:42:52
2013-10-07 19:26:05
2013-10-07 19:34:07
2013-10-07 19:40:59
2013-10-07 19:41:36
2013-10-07 19:33:47
2013-10-07 19:30:59
2013-10-07 19:38:59
2013-10-07 19:28:56
2013-10-07 19:41:24
2013-10-07 19:41:49
2013-10-07 19:42:47
2013-10-07 19:34:09
2013-10-07 19:40:31
2013-10-07 19:21:34
2013-10-07 19:43:52
2013-10-07 19:18:11
2013-10-07 19:41:47
2013-10-07 19:33:04
2013-10-07 19:40:53
2013-10-07 19:36:38
2013-10-07 19:41:46
2013-10-07 19:03:36
2013-10-07 19:44:45
2013-10-07 19:21:42
2013-10-07 19:24:07
2013-10-07 19:40:35
2013-10-07 19:41:00
2013-10-07 19:43:10
2013-10-07 19:23:55
2013-10-07 19:43:30
2013-10-07 19:25:24
2013-10-07 19:35:07
2013-10-07 19:43:33
2013-10-07 19:39:30
2013-10-07 19:31:42
2013-10-07 19:39:17
2013-10-07 19:42:47
2013-10-07 19:39:20
2013-10-07 19:44:41
2013-10-07 19:24:22
2013-10-07 19:12:39
2013-10-07 19:37:25
2013-10-07 19:42:55
2013-10-07 19:14:35
2013-10-07 19:37:12
2013-10-07 19:32:29
2013-10-07 19:42:37
2013-10-07 19:26:52
2013-10-07 19:31:19
2013-10-07 19:44:58
2013-11-02 04:47:37
2013-11-02 04:54:00
2013-11-02 04:58:53
2013-11-02 04:56:37
2013-11-02 04:56:09
2013-11-02 04:51:05
2013-11-02 04:50:58
2013-11-02 04:55:26
2013-11-02 04:53:43
2013-11-02 04:53:46
2013-11-02 04:54:55
2013-11-02 04:59:28
2013-11-02 04:56:54
2013-11-02 04:50:37
2013-11-02 04:48:40
2013-11-02 04:55:46
2013-11-02 04:45:20
2013-11-02 04:46:22
2013-11-02 04:48:25
2013-11-02 04:47:19
2013-11-02 04:57:31
2013-11-02 04:58:14
2013-11-02 04:49:30
2013-11-02 04:43:31
2013-11-02 04:59:00
2013-11-02 04:54:23
2013-11-02 04:51:01
2013-11-02 04:38:12
2013-11-02 04:59:31
2013-11-02 04:56:46
2013-11-02 04:53:51
2013-11-02 04:48:00
2013-11-02 04:58:04
2013-11-02 04:52:50
2013-11-02 04:58:12
2013-11-02 04:57:37
2013-11-02 04:53:33
2013-11-02 04:54:11
2013-11-02 04:48:49
2013-11-02 04:42:56
2013-11-02 04:55:36
2013-11-02 04:51:36
2013-11-02 04:48:45
2013-11-02 04:49:17
2013-11-02 04:53:50
2013-11-02 04:45:28
2013-11-02 04:45:04
2013-11-02 04:52:17
2013-11-02 04:52:10
2013-11-02 04:59:16
2013-11-02 04:51:37
2013-11-02 04:50:10
2013-12-22 15:24:50
2013-12-22 15:04:12
2013-12-22 15:16:27
2013-12-22 15:23:06
2013-12-22 15:24:04
2013-12-22 15:17:33
2013-12-22 15:22:55
2013-12-22 15:24:35
2013-12-22 15:21:56
2013-12-22 15:22:49
2013-12-22 15:25:31
2013-12-22 15:21:31
2013-12-22 15:27:31
2013-12-22 15:29:45
2013-12-22 15:26:09
2013-12-22 15:17:08
2013-12-22 15:26:00
2013-12-22 15:20:56
2013-12-22 15:23:09
2013-12-22 15:22:31
2013-12-22 15:29:59
2013-12-22 15:27:43
2013-12-22 15:23:04
2013-12-22 15:25:30
2013-12-22 15:19:16
2013-12-22 15:23:06
2013-12-22 15:26:01
2013-12-22 15:19:45
2013-12-22 11:34:23
2013-12-22 15:29:54
2013-12-22 15:28:39
2013-12-22 15:27:43
2013-12-22 15:16:23
2013-12-22 15:17:26

In [20]:
X_train = train.drop(train.index[val_indices])

In [21]:
cluster_centers = utils2.load_array(data_path+"cluster_centers_bw_001.bc")

In [22]:
long = np.array([c[0] for c in cluster_centers])
lat = np.array([c[1] for c in cluster_centers])

In [23]:
utils2.save_array(data_path+'train/X_train.bc', X_train.as_matrix())

In [24]:
utils2.save_array(data_path+'valid/X_val.bc', X_valid.as_matrix())

In [25]:
X_train = pd.DataFrame(utils2.load_array(data_path+'train/X_train.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [26]:
X_valid = pd.DataFrame(utils2.load_array(data_path+'valid/X_val.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

The equirectangular loss function mentioned in the paper.

Note: it is very important that column 0 of y is longitude and column 1 is latitude.

The Earth-radius constant R appears here as 6371 (km); it only rescales the loss, so it does not affect minimization, and the paper did not specify units.


In [27]:
def equirectangular_loss(y_true, y_pred):
    deg2rad = 3.141592653589793 / 180
    long_1 = y_true[:,0]*deg2rad
    long_2 = y_pred[:,0]*deg2rad
    lat_1 = y_true[:,1]*deg2rad
    lat_2 = y_pred[:,1]*deg2rad
    return 6371*K.sqrt(K.square((long_1 - long_2)*K.cos((lat_1 + lat_2)/2.))
                       +K.square(lat_1 - lat_2))
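
As a sanity check outside of Keras, the same formula in NumPy on a made-up pair of nearby points (coordinates are illustrative, not from the dataset):


In [ ]:
# NumPy version of the same formula, for checking on made-up points
def equirectangular_np(y_true, y_pred):
    deg2rad = np.pi / 180
    dlong = (y_true[:, 0] - y_pred[:, 0]) * deg2rad
    dlat = (y_true[:, 1] - y_pred[:, 1]) * deg2rad
    mean_lat = (y_true[:, 1] + y_pred[:, 1]) / 2 * deg2rad
    return 6371 * np.sqrt((dlong * np.cos(mean_lat))**2 + dlat**2)

print(equirectangular_np(np.array([[-8.61, 41.15]]), np.array([[-8.62, 41.14]])))  # ~1.4 km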

In [28]:
def embedding_input(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, embeddings_regularizer=l2(reg))(inp)  # Keras 2

The following returns a fully-connected model as described in the paper. It takes as input k (as defined above) and the cluster centers.

Inputs: embeddings for each category, concatenated with the 4*k continuous variables representing the first/last k coords described above.

The embeddings have no regularization, as none was mentioned in the paper, though they are easily equipped to include it.

The paper mentions global normalization but doesn't specify exactly how it was done. Here the continuous inputs are standardized with the pre-computed mean/std; a BatchNormalization layer on the continuous inputs (imported above) would be an alternative, but isn't used in the model below.

After concatenation, there is one hidden layer of 500 neurons, as called for in the paper.

Finally, the output layer has as many outputs as there are cluster centers, with a softmax activation. Call this output P.

The prediction is the weighted sum of the cluster centers c_i with the corresponding predicted probabilities P_i.

To implement this, the softmax output is dotted with the cluster longitudes and latitudes separately (this happens at variable y), and the two results are concatenated into a single tensor.

NOTE: the cluster center coords are passed in as model inputs. Ideally this function would store the cluster longs/lats as constants inside the model, but I could not figure out how, so I pass them in as repeated inputs (one way to do it with backend constants is sketched after the model definition below).


In [29]:
def taxi_mlp(k, cluster_centers):
    shp = cluster_centers.shape[0]
    nums = Input(shape=(4*k,))

    center_longs = Input(shape=(shp,))
    center_lats = Input(shape=(shp,))

    emb_names = ['client_ID', 'taxi_ID', "stand_ID", "quarter_hour", "day_of_week", "week_of_year"]
    emb_ins = [57106, 448, 64, 96, 7, 52]
    emb_outs = [10 for i in range(0,6)]
    regs = [0 for i in range(0,6)]

    embs = [embedding_input(e[0], e[1]+1, e[2], e[3]) for e in zip(emb_names, emb_ins, emb_outs, regs)]

    x = concatenate([nums] + [Flatten()(e[1]) for e in embs])  # Keras 2

    x = Dense(500, activation='relu')(x)

    x = Dense(shp, activation='softmax')(x)

    y = concatenate([dot([x, center_longs], axes=1), dot([x, center_lats], axes=1)])  # Keras 2

    return Model(inputs = [nums]+[e[0] for e in embs] + [center_longs, center_lats], outputs = y)  # Keras 2
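
One way around passing the cluster centers as repeated inputs (see the note above) is to bake them into the graph as backend constants inside a Lambda layer. A minimal sketch, assuming Keras 2 with the TensorFlow backend; this is not the model used for the runs below:


In [ ]:
# sketch only (not used below): cluster centers as constants instead of model inputs
def taxi_mlp_const(k, cluster_centers):
    shp = cluster_centers.shape[0]
    center_longs = K.constant(np.array([c[0] for c in cluster_centers]).reshape(shp, 1), dtype='float32')
    center_lats = K.constant(np.array([c[1] for c in cluster_centers]).reshape(shp, 1), dtype='float32')

    nums = Input(shape=(4*k,))
    emb_names = ['client_ID', 'taxi_ID', 'stand_ID', 'quarter_hour', 'day_of_week', 'week_of_year']
    emb_ins = [57106, 448, 64, 96, 7, 52]
    embs = [embedding_input(name, n_in+1, 10, 0) for name, n_in in zip(emb_names, emb_ins)]

    x = concatenate([nums] + [Flatten()(e[1]) for e in embs])
    x = Dense(500, activation='relu')(x)
    x = Dense(shp, activation='softmax')(x)

    # weighted sum of cluster centers: (batch, shp) . (shp, 1) -> (batch, 1), then concat long/lat
    y = Lambda(lambda p: K.concatenate([K.dot(p, center_longs), K.dot(p, center_lats)], axis=1))(x)

    return Model(inputs=[nums] + [e[0] for e in embs], outputs=y)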

As mentioned, the repeated cluster longs/lats are constructed here to be passed in as inputs.

The iterator below works on the in-memory training DataFrame. I did this instead of using the bcolz iterator because of the pre-processing involved.


In [30]:
def data_iter(data, batch_size, cluster_centers):
    long = [c[0] for c in cluster_centers]
    lat = [c[1] for c in cluster_centers]
    i = 0
    N = data.shape[0]
    while True:
        yield ([np.vstack(data['COORD_FEATURES'][i:i+batch_size].as_matrix()), np.vstack(data['ORIGIN_CALL'][i:i+batch_size].as_matrix()), 
           np.vstack(data['TAXI_ID'][i:i+batch_size].as_matrix()), np.vstack(data['ORIGIN_STAND'][i:i+batch_size].as_matrix()),
           np.vstack(data['QUARTER_HOUR'][i:i+batch_size].as_matrix()), np.vstack(data['DAY_OF_WEEK'][i:i+batch_size].as_matrix()), 
           np.vstack(data['WEEK_OF_YEAR'][i:i+batch_size].as_matrix()), np.array([long for i in range(0,batch_size)]),
               np.array([lat for i in range(0,batch_size)])], np.vstack(data["TARGET"][i:i+batch_size].as_matrix()))
        i += batch_size
        if i >= N:
            # wrap around so the generator never yields empty batches
            i = 0
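
If you would rather train from this generator than from the in-memory feature arrays used below, a hedged sketch with Keras 2's fit_generator (batch size and step counts are illustrative, and it assumes the model built in the cells that follow):


In [ ]:
# sketch: training from the generator (assumes `model`, X_train and X_valid are defined)
gen_batch = 256
train_gen = data_iter(X_train, gen_batch, cluster_centers)
valid_gen = data_iter(X_valid, gen_batch, cluster_centers)
model.fit_generator(train_gen, steps_per_epoch=X_train.shape[0] // gen_batch,
                    validation_data=valid_gen, validation_steps=X_valid.shape[0] // gen_batch,
                    epochs=1, verbose=0)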

In [31]:
# x=Lambda(thing)([x,long,lat])

Of course, the k in the model needs to match the k used in feature construction. We again use 5, as in the paper.


In [50]:
del model
model = taxi_mlp(5, cluster_centers)

The paper used an SGD optimizer with the following parameters.


In [51]:
# Reduced the initial 0.001 learning rate to avoid NaN's
model.compile(optimizer=SGD(1e-6, momentum=0.9), loss=equirectangular_loss, metrics=['mse'])

# - Try also Adam optimizer
# optim = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
# model.compile(optimizer=optim, loss=equirectangular_loss, metrics=['mse'])

In [34]:
X_train_feat = get_features(X_train)

In [35]:
X_train_target = get_target(X_train)

In [36]:
X_val_feat = get_features(X_valid)

In [37]:
X_val_target = get_target(X_valid)

In [38]:
tqdm = TQDMNotebookCallback()

In [39]:
# - Added verbose=1 to track improvement through epochs
checkpoint = ModelCheckpoint(verbose=1, filepath=data_path+'models/weights.{epoch:03d}.{val_loss:.8f}.hdf5', save_best_only=True)

In [40]:
batch_size=256

original


In [55]:
model.fit(X_train_feat, X_train_target, epochs=1, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


Epoch 00000: val_loss improved from 4.31887 to 4.31829, saving model to data/taxi/models/weights.000.4.31829121.hdf5

Out[55]:
<keras.callbacks.History at 0x7fe5f041a4a8>

In [56]:
model.fit(X_train_feat, X_train_target, epochs=30, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


Epoch 00000: val_loss improved from 4.31829 to 4.31774, saving model to data/taxi/models/weights.000.4.31774099.hdf5
Epoch 00001: val_loss improved from 4.31774 to 4.31718, saving model to data/taxi/models/weights.001.4.31717622.hdf5
Epoch 00002: val_loss improved from 4.31718 to 4.31663, saving model to data/taxi/models/weights.002.4.31662745.hdf5
Epoch 00003: val_loss improved from 4.31663 to 4.31606, saving model to data/taxi/models/weights.003.4.31606152.hdf5
Epoch 00004: val_loss improved from 4.31606 to 4.31548, saving model to data/taxi/models/weights.004.4.31548033.hdf5
Epoch 00005: val_loss improved from 4.31548 to 4.31494, saving model to data/taxi/models/weights.005.4.31493714.hdf5
Epoch 00006: val_loss improved from 4.31494 to 4.31436, saving model to data/taxi/models/weights.006.4.31435751.hdf5
Epoch 00007: val_loss improved from 4.31436 to 4.31374, saving model to data/taxi/models/weights.007.4.31374216.hdf5
Epoch 00008: val_loss improved from 4.31374 to 4.31317, saving model to data/taxi/models/weights.008.4.31317252.hdf5
Epoch 00009: val_loss improved from 4.31317 to 4.31260, saving model to data/taxi/models/weights.009.4.31259594.hdf5
Epoch 00010: val_loss improved from 4.31260 to 4.31199, saving model to data/taxi/models/weights.010.4.31199114.hdf5
Epoch 00011: val_loss improved from 4.31199 to 4.31134, saving model to data/taxi/models/weights.011.4.31133757.hdf5
Epoch 00012: val_loss improved from 4.31134 to 4.31072, saving model to data/taxi/models/weights.012.4.31071869.hdf5
Epoch 00013: val_loss improved from 4.31072 to 4.31008, saving model to data/taxi/models/weights.013.4.31008419.hdf5
Epoch 00014: val_loss improved from 4.31008 to 4.30941, saving model to data/taxi/models/weights.014.4.30940849.hdf5
Epoch 00015: val_loss improved from 4.30941 to 4.30867, saving model to data/taxi/models/weights.015.4.30866721.hdf5
Epoch 00016: val_loss improved from 4.30867 to 4.30792, saving model to data/taxi/models/weights.016.4.30791654.hdf5
Epoch 00017: val_loss improved from 4.30792 to 4.30702, saving model to data/taxi/models/weights.017.4.30701692.hdf5
Epoch 00018: val_loss improved from 4.30702 to 4.30598, saving model to data/taxi/models/weights.018.4.30598387.hdf5
Epoch 00019: val_loss improved from 4.30598 to 4.30465, saving model to data/taxi/models/weights.019.4.30465317.hdf5
Epoch 00020: val_loss improved from 4.30465 to 4.30299, saving model to data/taxi/models/weights.020.4.30298820.hdf5
Epoch 00021: val_loss improved from 4.30299 to 4.30127, saving model to data/taxi/models/weights.021.4.30127284.hdf5
Epoch 00022: val_loss improved from 4.30127 to 4.29971, saving model to data/taxi/models/weights.022.4.29970756.hdf5
Epoch 00023: val_loss improved from 4.29971 to 4.29832, saving model to data/taxi/models/weights.023.4.29831633.hdf5
Epoch 00024: val_loss improved from 4.29832 to 4.29692, saving model to data/taxi/models/weights.024.4.29692163.hdf5
Epoch 00025: val_loss improved from 4.29692 to 4.29569, saving model to data/taxi/models/weights.025.4.29569347.hdf5
Epoch 00026: val_loss improved from 4.29569 to 4.29461, saving model to data/taxi/models/weights.026.4.29460874.hdf5
Epoch 00027: val_loss improved from 4.29461 to 4.29367, saving model to data/taxi/models/weights.027.4.29367482.hdf5
Epoch 00028: val_loss improved from 4.29367 to 4.29283, saving model to data/taxi/models/weights.028.4.29282813.hdf5
Epoch 00029: val_loss improved from 4.29283 to 4.29210, saving model to data/taxi/models/weights.029.4.29209712.hdf5

Out[56]:
<keras.callbacks.History at 0x7fe5f2e6f160>

In [58]:
# - Load the saved best model, otherwise the training would go on from the current model
# - which is not guaranteed to be the best one
# - (check the actual file name)
model = load_model(data_path+'models/weights.028.4.29282813.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})

In [69]:
# - trying also learning rate annealing
K.set_value(model.optimizer.lr, 5e-4)



In [60]:
model.fit(X_train_feat, X_train_target, epochs=100, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


Epoch 00000: val_loss improved from 4.29210 to 4.29207, saving model to data/taxi/models/weights.000.4.29207400.hdf5
Epoch 00001: val_loss improved from 4.29207 to 4.29137, saving model to data/taxi/models/weights.001.4.29137112.hdf5
Epoch 00002: val_loss improved from 4.29137 to 4.29064, saving model to data/taxi/models/weights.002.4.29063544.hdf5
Epoch 00003: val_loss improved from 4.29064 to 4.28992, saving model to data/taxi/models/weights.003.4.28991975.hdf5
Epoch 00004: val_loss improved from 4.28992 to 4.28912, saving model to data/taxi/models/weights.004.4.28912484.hdf5
Epoch 00005: val_loss improved from 4.28912 to 4.28833, saving model to data/taxi/models/weights.005.4.28833231.hdf5
Epoch 00006: val_loss improved from 4.28833 to 4.28753, saving model to data/taxi/models/weights.006.4.28752909.hdf5
Epoch 00007: val_loss improved from 4.28753 to 4.28669, saving model to data/taxi/models/weights.007.4.28669458.hdf5
Epoch 00008: val_loss improved from 4.28669 to 4.28584, saving model to data/taxi/models/weights.008.4.28584460.hdf5
Epoch 00009: val_loss improved from 4.28584 to 4.28503, saving model to data/taxi/models/weights.009.4.28503423.hdf5
Epoch 00010: val_loss improved from 4.28503 to 4.28416, saving model to data/taxi/models/weights.010.4.28415504.hdf5
Epoch 00011: val_loss improved from 4.28416 to 4.28327, saving model to data/taxi/models/weights.011.4.28326883.hdf5
Epoch 00012: val_loss improved from 4.28327 to 4.28235, saving model to data/taxi/models/weights.012.4.28234735.hdf5
Epoch 00013: val_loss improved from 4.28235 to 4.28138, saving model to data/taxi/models/weights.013.4.28138186.hdf5
Epoch 00014: val_loss improved from 4.28138 to 4.28039, saving model to data/taxi/models/weights.014.4.28039204.hdf5
Epoch 00015: val_loss improved from 4.28039 to 4.27937, saving model to data/taxi/models/weights.015.4.27936888.hdf5
Epoch 00016: val_loss improved from 4.27937 to 4.27843, saving model to data/taxi/models/weights.016.4.27843129.hdf5
Epoch 00017: val_loss improved from 4.27843 to 4.27752, saving model to data/taxi/models/weights.017.4.27751621.hdf5
Epoch 00018: val_loss improved from 4.27752 to 4.27680, saving model to data/taxi/models/weights.018.4.27680246.hdf5
Epoch 00019: val_loss improved from 4.27680 to 4.27610, saving model to data/taxi/models/weights.019.4.27610297.hdf5
Epoch 00020: val_loss improved from 4.27610 to 4.27546, saving model to data/taxi/models/weights.020.4.27546325.hdf5
Epoch 00021: val_loss improved from 4.27546 to 4.27483, saving model to data/taxi/models/weights.021.4.27482771.hdf5
Epoch 00022: val_loss improved from 4.27483 to 4.27427, saving model to data/taxi/models/weights.022.4.27427272.hdf5
Epoch 00023: val_loss improved from 4.27427 to 4.27365, saving model to data/taxi/models/weights.023.4.27365168.hdf5
Epoch 00024: val_loss improved from 4.27365 to 4.27306, saving model to data/taxi/models/weights.024.4.27306346.hdf5
Epoch 00025: val_loss improved from 4.27306 to 4.27245, saving model to data/taxi/models/weights.025.4.27245205.hdf5
Epoch 00026: val_loss improved from 4.27245 to 4.27180, saving model to data/taxi/models/weights.026.4.27180182.hdf5
Epoch 00027: val_loss improved from 4.27180 to 4.27114, saving model to data/taxi/models/weights.027.4.27114311.hdf5
Epoch 00028: val_loss improved from 4.27114 to 4.27049, saving model to data/taxi/models/weights.028.4.27049180.hdf5
Epoch 00029: val_loss improved from 4.27049 to 4.26976, saving model to data/taxi/models/weights.029.4.26975740.hdf5
Epoch 00030: val_loss improved from 4.26976 to 4.26907, saving model to data/taxi/models/weights.030.4.26907022.hdf5
Epoch 00031: val_loss improved from 4.26907 to 4.26839, saving model to data/taxi/models/weights.031.4.26839241.hdf5
Epoch 00032: val_loss improved from 4.26839 to 4.26764, saving model to data/taxi/models/weights.032.4.26763758.hdf5
Epoch 00033: val_loss improved from 4.26764 to 4.26691, saving model to data/taxi/models/weights.033.4.26690501.hdf5
Epoch 00034: val_loss improved from 4.26691 to 4.26616, saving model to data/taxi/models/weights.034.4.26615564.hdf5
Epoch 00035: val_loss improved from 4.26616 to 4.26541, saving model to data/taxi/models/weights.035.4.26540591.hdf5
Epoch 00036: val_loss improved from 4.26541 to 4.26463, saving model to data/taxi/models/weights.036.4.26462643.hdf5
Epoch 00037: val_loss improved from 4.26463 to 4.26388, saving model to data/taxi/models/weights.037.4.26387523.hdf5
Epoch 00038: val_loss improved from 4.26388 to 4.26311, saving model to data/taxi/models/weights.038.4.26311260.hdf5
Epoch 00039: val_loss improved from 4.26311 to 4.26232, saving model to data/taxi/models/weights.039.4.26231723.hdf5
Epoch 00040: val_loss improved from 4.26232 to 4.26153, saving model to data/taxi/models/weights.040.4.26152596.hdf5
Epoch 00041: val_loss improved from 4.26153 to 4.26076, saving model to data/taxi/models/weights.041.4.26075772.hdf5
Epoch 00042: val_loss improved from 4.26076 to 4.25994, saving model to data/taxi/models/weights.042.4.25994256.hdf5
Epoch 00043: val_loss improved from 4.25994 to 4.25914, saving model to data/taxi/models/weights.043.4.25913650.hdf5
Epoch 00044: val_loss improved from 4.25914 to 4.25829, saving model to data/taxi/models/weights.044.4.25829431.hdf5
Epoch 00045: val_loss improved from 4.25829 to 4.25749, saving model to data/taxi/models/weights.045.4.25748587.hdf5
Epoch 00046: val_loss improved from 4.25749 to 4.25669, saving model to data/taxi/models/weights.046.4.25668621.hdf5
Epoch 00047: val_loss improved from 4.25669 to 4.25585, saving model to data/taxi/models/weights.047.4.25585277.hdf5
Epoch 00048: val_loss improved from 4.25585 to 4.25503, saving model to data/taxi/models/weights.048.4.25503128.hdf5
Epoch 00049: val_loss improved from 4.25503 to 4.25423, saving model to data/taxi/models/weights.049.4.25422764.hdf5
Epoch 00050: val_loss improved from 4.25423 to 4.25340, saving model to data/taxi/models/weights.050.4.25340289.hdf5
Epoch 00051: val_loss improved from 4.25340 to 4.25257, saving model to data/taxi/models/weights.051.4.25257196.hdf5
Epoch 00052: val_loss improved from 4.25257 to 4.25175, saving model to data/taxi/models/weights.052.4.25175014.hdf5
Epoch 00053: val_loss improved from 4.25175 to 4.25094, saving model to data/taxi/models/weights.053.4.25094499.hdf5
Epoch 00054: val_loss improved from 4.25094 to 4.25016, saving model to data/taxi/models/weights.054.4.25016092.hdf5
Epoch 00055: val_loss improved from 4.25016 to 4.24938, saving model to data/taxi/models/weights.055.4.24938102.hdf5
Epoch 00056: val_loss improved from 4.24938 to 4.24858, saving model to data/taxi/models/weights.056.4.24857754.hdf5
Epoch 00057: val_loss improved from 4.24858 to 4.24785, saving model to data/taxi/models/weights.057.4.24785084.hdf5
Epoch 00058: val_loss improved from 4.24785 to 4.24709, saving model to data/taxi/models/weights.058.4.24709157.hdf5
Epoch 00059: val_loss improved from 4.24709 to 4.24633, saving model to data/taxi/models/weights.059.4.24633237.hdf5
Epoch 00060: val_loss improved from 4.24633 to 4.24565, saving model to data/taxi/models/weights.060.4.24564756.hdf5
Epoch 00061: val_loss improved from 4.24565 to 4.24495, saving model to data/taxi/models/weights.061.4.24494578.hdf5
Epoch 00062: val_loss improved from 4.24495 to 4.24427, saving model to data/taxi/models/weights.062.4.24427030.hdf5
Epoch 00063: val_loss improved from 4.24427 to 4.24364, saving model to data/taxi/models/weights.063.4.24363914.hdf5
Epoch 00064: val_loss improved from 4.24364 to 4.24298, saving model to data/taxi/models/weights.064.4.24298467.hdf5
Epoch 00065: val_loss improved from 4.24298 to 4.24235, saving model to data/taxi/models/weights.065.4.24235387.hdf5
Epoch 00066: val_loss improved from 4.24235 to 4.24173, saving model to data/taxi/models/weights.066.4.24172959.hdf5
Epoch 00067: val_loss improved from 4.24173 to 4.24112, saving model to data/taxi/models/weights.067.4.24111878.hdf5
Epoch 00068: val_loss improved from 4.24112 to 4.24047, saving model to data/taxi/models/weights.068.4.24047199.hdf5
Epoch 00069: val_loss improved from 4.24047 to 4.23987, saving model to data/taxi/models/weights.069.4.23986701.hdf5
Epoch 00070: val_loss improved from 4.23987 to 4.23925, saving model to data/taxi/models/weights.070.4.23924511.hdf5
Epoch 00071: val_loss improved from 4.23925 to 4.23864, saving model to data/taxi/models/weights.071.4.23863888.hdf5
Epoch 00072: val_loss improved from 4.23864 to 4.23801, saving model to data/taxi/models/weights.072.4.23801241.hdf5
Epoch 00073: val_loss improved from 4.23801 to 4.23742, saving model to data/taxi/models/weights.073.4.23742124.hdf5
Epoch 00074: val_loss improved from 4.23742 to 4.23681, saving model to data/taxi/models/weights.074.4.23681405.hdf5
Epoch 00075: val_loss improved from 4.23681 to 4.23622, saving model to data/taxi/models/weights.075.4.23622094.hdf5
Epoch 00076: val_loss improved from 4.23622 to 4.23559, saving model to data/taxi/models/weights.076.4.23558968.hdf5
Epoch 00077: val_loss improved from 4.23559 to 4.23499, saving model to data/taxi/models/weights.077.4.23499022.hdf5
Epoch 00078: val_loss improved from 4.23499 to 4.23439, saving model to data/taxi/models/weights.078.4.23439420.hdf5
Epoch 00079: val_loss improved from 4.23439 to 4.23375, saving model to data/taxi/models/weights.079.4.23374766.hdf5
Epoch 00080: val_loss improved from 4.23375 to 4.23311, saving model to data/taxi/models/weights.080.4.23310862.hdf5
Epoch 00081: val_loss improved from 4.23311 to 4.23246, saving model to data/taxi/models/weights.081.4.23245877.hdf5
Epoch 00082: val_loss improved from 4.23246 to 4.23182, saving model to data/taxi/models/weights.082.4.23181968.hdf5
Epoch 00083: val_loss improved from 4.23182 to 4.23121, saving model to data/taxi/models/weights.083.4.23121372.hdf5
Epoch 00084: val_loss improved from 4.23121 to 4.23057, saving model to data/taxi/models/weights.084.4.23057433.hdf5
Epoch 00085: val_loss improved from 4.23057 to 4.22994, saving model to data/taxi/models/weights.085.4.22994373.hdf5
Epoch 00086: val_loss improved from 4.22994 to 4.22928, saving model to data/taxi/models/weights.086.4.22927731.hdf5
Epoch 00087: val_loss improved from 4.22928 to 4.22862, saving model to data/taxi/models/weights.087.4.22862314.hdf5
Epoch 00088: val_loss improved from 4.22862 to 4.22799, saving model to data/taxi/models/weights.088.4.22799080.hdf5
Epoch 00089: val_loss improved from 4.22799 to 4.22734, saving model to data/taxi/models/weights.089.4.22733974.hdf5
Epoch 00090: val_loss improved from 4.22734 to 4.22672, saving model to data/taxi/models/weights.090.4.22672006.hdf5
Epoch 00091: val_loss improved from 4.22672 to 4.22605, saving model to data/taxi/models/weights.091.4.22605153.hdf5
Epoch 00092: val_loss improved from 4.22605 to 4.22537, saving model to data/taxi/models/weights.092.4.22537447.hdf5
Epoch 00093: val_loss improved from 4.22537 to 4.22466, saving model to data/taxi/models/weights.093.4.22465932.hdf5
Epoch 00094: val_loss improved from 4.22466 to 4.22399, saving model to data/taxi/models/weights.094.4.22399192.hdf5
Epoch 00095: val_loss improved from 4.22399 to 4.22327, saving model to data/taxi/models/weights.095.4.22327027.hdf5
Epoch 00096: val_loss improved from 4.22327 to 4.22250, saving model to data/taxi/models/weights.096.4.22249749.hdf5
Epoch 00097: val_loss improved from 4.22250 to 4.22179, saving model to data/taxi/models/weights.097.4.22179410.hdf5
Epoch 00098: val_loss improved from 4.22179 to 4.22109, saving model to data/taxi/models/weights.098.4.22108590.hdf5
Epoch 00099: val_loss improved from 4.22109 to 4.22030, saving model to data/taxi/models/weights.099.4.22030055.hdf5

Out[60]:
<keras.callbacks.History at 0x7fe5f04261d0>

In [61]:
model.save(data_path+'models/current_model.hdf5')

new valid


In [62]:
model.fit(X_train_feat, X_train_target, epochs=1, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


Epoch 00000: val_loss improved from 4.22030 to 4.21952, saving model to data/taxi/models/weights.000.4.21952278.hdf5

Out[62]:
<keras.callbacks.History at 0x7fe5f33ac940>

In [ ]:
# - Load again the saved best model, otherwise the training would go on from the current model
# - which is not guaranteed to be the best one
# - (check the actual file name)
model = load_model(data_path+'models/weights.000.0.73703137.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})

In [ ]:
model.fit(X_train_feat, X_train_target, epochs=400, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


Epoch 00000: val_loss improved from 4.21952 to 0.73703, saving model to data/taxi/models/weights.000.0.73703137.hdf5
Epoch 00001: val_loss did not improve
Epoch 00002: val_loss did not improve
Epoch 00003: val_loss did not improve
Epoch 00004: val_loss did not improve
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss did not improve
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss did not improve
Epoch 00012: val_loss did not improve
Epoch 00013: val_loss did not improve
Epoch 00014: val_loss did not improve
Epoch 00015: val_loss did not improve
Epoch 00016: val_loss did not improve
Epoch 00017: val_loss did not improve
Epoch 00018: val_loss did not improve
Epoch 00019: val_loss did not improve
Epoch 00020: val_loss did not improve
Epoch 00021: val_loss did not improve
Epoch 00022: val_loss did not improve
Epoch 00023: val_loss did not improve
Epoch 00024: val_loss did not improve

In [ ]:
model.save(data_path+'models/current_model.hdf5')

In [ ]:
len(X_val_feat[0])

It works, but it seems to converge unrealistically quickly, and the loss values are not the same as the paper's. The paper does not say what it uses as the "error" in its results; I assume the same equirectangular distance, but it isn't clear. The difference in values could also come down to how the Earth-radius factor and units are handled.

Kaggle Entry


In [ ]:
# - Use the filename of the best model
best_model = load_model(data_path+'models/weights.308.0.03373993.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})

In [ ]:
best_model.evaluate(X_val_feat, X_val_target)

In [ ]:
test = pd.DataFrame(utils2.load_array(data_path+'test/test_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [ ]:
# test['ORIGIN_CALL'] = pd.read_csv(data_path+'real_origin_call.csv', header=None)  # - file not available

In [ ]:
# test['TAXI_ID'] = pd.read_csv(data_path+'real_taxi_id.csv',header=None)  #  # - file not available

In [ ]:
X_test = get_features(test)

In [ ]:
b = np.sort(X_test[1],axis=None)

In [ ]:
test_preds = np.round(best_model.predict(X_test), decimals=6)

In [ ]:
d = {0:test['TRIP_ID'], 1:test_preds[:,1], 2:test_preds[:,0]}
kaggle_out = pd.DataFrame(data=d)

In [ ]:
kaggle_out.to_csv(data_path+'submission.csv', header=['TRIP_ID','LATITUDE', 'LONGITUDE'], index=False)

In [ ]:
def hdist(a, b):
    # haversine distance in km between arrays of [longitude, latitude] points
    deg2rad = 3.141592653589793 / 180

    lat1 = a[:, 1] * deg2rad
    lon1 = a[:, 0] * deg2rad
    lat2 = b[:, 1] * deg2rad
    lon2 = b[:, 0] * deg2rad

    dlat = abs(lat1-lat2)
    dlon = abs(lon1-lon2)

    al = np.sin(dlat/2)**2  + np.cos(lat1) * np.cos(lat2) * (np.sin(dlon/2)**2)
    d = np.arctan2(np.sqrt(al), np.sqrt(1-al))

    hd = 2 * 6371 * d

    return hd

In [ ]:
val_preds = best_model.predict(X_val_feat)

In [ ]:
trn_preds = model.predict(X_train_feat)

In [ ]:
er = hdist(val_preds, X_val_target)

In [ ]:
er.mean()


To-do: this is simple to extend to the validation data

Uh oh... training data not representative of test


In [ ]:
cuts = [
    1376503200, # 2013-08-14 18:00
    1380616200, # 2013-10-01 08:30
    1381167900, # 2013-10-07 17:45
    1383364800, # 2013-11-02 04:00
    1387722600  # 2013-12-22 14:30
]

In [ ]:
np.any([train['TIMESTAMP'].map(lambda x: x in cuts)])

In [ ]:
train['TIMESTAMP']

In [ ]:
np.any(train['TIMESTAMP']==1381167900)

In [ ]:
times = train['TIMESTAMP'].as_matrix()

In [ ]:
X_train.columns

In [ ]:
times

In [ ]:
count = 0
for index, row in X_val.iterrows():
    for ts in cuts:
        time = row['TIMESTAMP']
        latitude = row['LATITUDE']
        if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
            count += 1

In [ ]:
one = count

In [ ]:
count + one

In [ ]:
import h5py

In [ ]:
h = h5py.File(data_path+'original/data.hdf5', 'r')

In [ ]:
evrData=h['/Configure:0000/Run:0000/CalibCycle:0000/EvrData::DataV3/NoDetector.0:Evr.0/data']

In [ ]:
c = np.load(data_path+'original/arrival-clusters.pkl')

HDF5 files


In [ ]:
from fuel.utils import find_in_data_path
from fuel.datasets import H5PYDataset

In [ ]:
original_path = '/data/bckenstler/data/taxi/original/'

In [ ]:
train_set = H5PYDataset(original_path+'data.hdf5', which_sets=('train',),load_in_memory=True)

In [ ]:
valid_set = H5PYDataset(original_path+'valid.hdf5', which_sets=('cuts/test_times_0',),load_in_memory=True)

In [ ]:
print(train_set.num_examples)

In [ ]:
print(valid_set.num_examples)

In [ ]:
data = train_set.data_sources

In [ ]:
data[0]

In [ ]:
valid_data = valid_set.data_sources

In [ ]:
valid_data[4][0]

In [ ]:
stamps = valid_data[-3]

In [ ]:
stamps[0]

In [ ]:
for i in range(0,304):    
    print(np.any([t==int(stamps[i]) for t in X_val['TIMESTAMP']]))

In [ ]:
type(X_train['TIMESTAMP'][0])

In [ ]:
type(stamps[0])

In [ ]:
check = [s in stamps for s in X_val['TIMESTAMP']]

In [ ]:
for s in X_val['TIMESTAMP']:
    print(datetime.datetime.fromtimestamp(s))

In [ ]:
for s in stamps:
    print(datetime.datetime.fromtimestamp(s))

In [ ]:
ids = valid_data[-1]

In [ ]:
type(ids[0])

In [ ]:
ids

In [ ]:
X_val