In [1]:
import ast

import pandas as pd

import datetime

from keras.layers import Input, Dense, Embedding, concatenate, dot, Flatten, Merge, BatchNormalization, Lambda
from keras.models import Model, load_model
from keras.regularizers import l2
import keras.backend as K
from keras.optimizers import SGD
import numpy as np

from sklearn.cluster import MeanShift, estimate_bandwidth

import utils2

import data

from sklearn.model_selection import train_test_split

from bcolz_array_iterator import BcolzArrayIterator

import bcolz

from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam


Using TensorFlow backend.
/home/roebius/pj/p3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

The path below is a shared directory; swap in your own.


In [2]:
data_path = "data/taxi/"

Replication of 'csv_to_hdf5.py'

The original repo used a rather convoluted tuple-based method of reading in the data and saving it to an HDF5 file using fuel. The following takes the same approach as that module, only using pandas and saving in bcolz format (with the training data as the example).


In [ ]:
meta = pd.read_csv(data_path+'metaData_taxistandsID_name_GPSlocation.csv', header=0)

In [ ]:
meta.head()

In [ ]:
train = pd.read_csv(data_path+'train/train.csv', header=0)

In [ ]:
train.head()

In [ ]:
train['ORIGIN_CALL'] = pd.Series(pd.factorize(train['ORIGIN_CALL'])[0]) + 1

In [ ]:
train['ORIGIN_STAND']=pd.Series([0 if pd.isnull(x) or x=='' else int(x) for x in train["ORIGIN_STAND"]])

In [ ]:
train['TAXI_ID'] = pd.Series(pd.factorize(train['TAXI_ID'])[0]) + 1

In [ ]:
# train['DAY_TYPE'] = pd.Series([ord(x[0]) - ord('A') for x in train['DAY_TYPE']]) 
train['DAY_TYPE'] = pd.Series([(ord(x[0]) - ord('A')) for x in train['DAY_TYPE']])  # - correct

The array of long/lat coordinates for each trip (row) is read in as a string. ast.literal_eval(x) safely evaluates that string into the Python expression it represents. This happens below.
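
For illustration (a hypothetical polyline string, not taken from the dataset), literal_eval turns the raw text into a list of [longitude, latitude] pairs:


In [ ]:
# hypothetical example string, not from the dataset
s = "[[-8.618643,41.141412],[-8.618499,41.141376]]"
coords = ast.literal_eval(s)
print(coords[0])  # [-8.618643, 41.141412]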


In [ ]:
polyline = pd.Series([ast.literal_eval(x) for x in train['POLYLINE']])

Split into latitude/longitude


In [ ]:
train['LATITUDE'] = pd.Series([np.array([point[1] for point in poly],dtype=np.float32) for poly in polyline])

In [ ]:
train['LONGITUDE'] = pd.Series([np.array([point[0] for point in poly],dtype=np.float32) for poly in polyline])

In [ ]:
utils2.save_array(data_path+'train/train.bc', train.as_matrix())

In [ ]:
utils2.save_array(data_path+'train/meta_train.bc', meta.as_matrix())

Further Feature Engineering

After converting the 'csv_to_hdf5.py' functionality to pandas, I saved that array and then simply constructed the rest of the features as specified in the paper using pandas. I didn't bother working through how the author did it, as it was extremely convoluted and relied on the fuel module.


In [ ]:
train = pd.DataFrame(utils2.load_array(data_path+'train/train.bc'), columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE'])

In [ ]:
train.head()

The paper discusses how many distinct values there are for each categorical variable. The following all check out


In [ ]:
train['ORIGIN_CALL'].max()

In [ ]:
train['ORIGIN_STAND'].max()

In [ ]:
train['TAXI_ID'].max()

Self-explanatory


In [ ]:
train['DAY_OF_WEEK'] = pd.Series([datetime.datetime.fromtimestamp(t).weekday() for t in train['TIMESTAMP']])

Quarter hour of the day, i.e. 1 of the 4*24 = 96 quarter hours of the day
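
As a quick worked example: the first validation-cut timestamp used later (1376503200) prints as 2013-08-14 20:00 in this notebook's timezone, which maps to quarter-hour index 4*20 = 80:


In [ ]:
# worked example; the result depends on the local timezone (here 1376503200 -> 20:00)
dt = datetime.datetime.fromtimestamp(1376503200)
print(int((dt.hour*60 + dt.minute)/15))  # 80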


In [ ]:
train['QUARTER_HOUR'] = pd.Series([int((datetime.datetime.fromtimestamp(t).hour*60 + datetime.datetime.fromtimestamp(t).minute)/15)
                                   for t in train['TIMESTAMP']])

Self-explanatory


In [ ]:
train['WEEK_OF_YEAR'] = pd.Series([datetime.datetime.fromtimestamp(t).isocalendar()[1] for t in train['TIMESTAMP']])

The target coords are the last in the sequence (the final position). If a trip has no positions, or only one, mark it as invalid with NaN so it can be dropped later.


In [ ]:
train['TARGET'] = pd.Series([[l[1][0][-1], l[1][1][-1]] if len(l[1][0]) > 1 else np.nan for l in train[['LONGITUDE','LATITUDE']].iterrows()])

This function creates the continuous inputs, which are the concatenated first k and last k coords of a sequence, as discussed in the paper.

If there aren't at least 2*k coords excluding the target, then the first k and last k overlap. In this case the sequence (excluding the target) is padded at the end with the last coord in the sequence. The paper mentioned padding at the front and back but didn't specify in what manner.

Any invalid rows are also marked with NaN's.


In [ ]:
def start_stop_inputs(k):
    result = []
    for l in train[['LONGITUDE','LATITUDE']].iterrows():
        if len(l[1][0]) < 2 or len(l[1][1]) < 2:
            result.append(np.nan)
        elif len(l[1][0][:-1]) >= 2*k:
            result.append(np.concatenate([l[1][0][0:k],l[1][0][-(k+1):-1],l[1][1][0:k],l[1][1][-(k+1):-1]]).flatten())
        else:
            # pad to 4*k with the last coord ('edge') so the first-k/last-k slices below are always valid
            l1 = np.lib.pad(l[1][0][:-1], (0,4*k-len(l[1][0][:-1])), mode='edge')
            l2 = np.lib.pad(l[1][1][:-1], (0,4*k-len(l[1][1][:-1])), mode='edge')
            result.append(np.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
    return pd.Series(result)
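
A quick sanity check of the 'edge' padding on a hypothetical 4-point trip with k=5: the tail is filled with the last coordinate, so the first-k and last-k slices overlap on repeated values.


In [ ]:
# hypothetical short trip of 4 points, padded out to 4*k = 20 with the last value repeated
poly = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
padded = np.lib.pad(poly, (0, 20 - len(poly)), mode='edge')
print(padded[:5], padded[-5:])  # [1. 2. 3. 4. 4.] [4. 4. 4. 4. 4.]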

In [ ]:
train['COORD_FEATURES'] = start_stop_inputs(5)

In [ ]:
train.shape

In [ ]:
train.dropna().shape

Drop na's


In [ ]:
train = train.dropna()

In [ ]:
utils2.save_array(data_path+'train/train_features.bc', train.as_matrix())

End to end feature transformation


In [ ]:
train = pd.read_csv(data_path+'train/train.csv', header=0)

In [ ]:
test = pd.read_csv(data_path+'test/test.csv', header=0)

In [ ]:
def start_stop_inputs(k, data, test):
    result = []
    for l in data[['LONGITUDE','LATITUDE']].iterrows():
        if not test:
            if len(l[1][0]) < 2 or len(l[1][1]) < 2:
                result.append(np.nan)
            elif len(l[1][0][:-1]) >= 2*k:
                result.append(np.concatenate([l[1][0][0:k],l[1][0][-(k+1):-1],l[1][1][0:k],l[1][1][-(k+1):-1]]).flatten())
            else:
                l1 = np.lib.pad(l[1][0][:-1], (0,4*k-len(l[1][0][:-1])), mode='edge')
                l2 = np.lib.pad(l[1][1][:-1], (0,4*k-len(l[1][1][:-1])), mode='edge')
                result.append(np.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
        else:
            if len(l[1][0]) < 1 or len(l[1][1]) < 1:
                result.append(np.nan)
            elif len(l[1][0]) >= 2*k:
                result.append(np.concatenate([l[1][0][0:k],l[1][0][-k:],l[1][1][0:k],l[1][1][-k:]]).flatten())
            else:
                l1 = np.lib.pad(l[1][0], (0,4*k-len(l[1][0])), mode='edge')
                l2 = np.lib.pad(l[1][1], (0,4*k-len(l[1][1])), mode='edge')
                result.append(np.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
    return pd.Series(result)

The normalization statistics below were pre-calculated on the training set.


In [ ]:
lat_mean = 41.15731
lat_std = 0.074120656
long_mean = -8.6161413
long_std = 0.057200309
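
If you want to recompute these statistics yourself, a minimal sketch, assuming the un-normalized per-trip LATITUDE/LONGITUDE arrays built in the earlier section:


In [ ]:
# sketch: mean/std over all points of all training trips (assumes un-normalized coords)
all_lat = np.concatenate(train['LATITUDE'].values)
all_long = np.concatenate(train['LONGITUDE'].values)
lat_mean, lat_std = all_lat.mean(), all_lat.std()
long_mean, long_std = all_long.mean(), all_long.std()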

In [ ]:
def feature_ext(data, test=False):   
    
    data['ORIGIN_CALL'] = pd.Series(pd.factorize(data['ORIGIN_CALL'])[0]) + 1

    data['ORIGIN_STAND']=pd.Series([0 if pd.isnull(x) or x=='' else int(x) for x in data["ORIGIN_STAND"]])

    data['TAXI_ID'] = pd.Series(pd.factorize(data['TAXI_ID'])[0]) + 1

    data['DAY_TYPE'] = pd.Series([ord(x[0]) - ord('A') for x in data['DAY_TYPE']])

    polyline = pd.Series([ast.literal_eval(x) for x in data['POLYLINE']])

    data['LATITUDE'] = pd.Series([np.array([point[1] for point in poly],dtype=np.float32) for poly in polyline])

    data['LONGITUDE'] = pd.Series([np.array([point[0] for point in poly],dtype=np.float32) for poly in polyline])
    
    if not test:
    
        data['TARGET'] = pd.Series([[l[1][0][-1], l[1][1][-1]] if len(l[1][0]) > 1 else np.nan for l in data[['LONGITUDE','LATITUDE']].iterrows()])

    
    data['LATITUDE'] = pd.Series([(t-lat_mean)/lat_std for t in data['LATITUDE']])
    
    data['LONGITUDE'] = pd.Series([(t-long_mean)/long_std for t in data['LONGITUDE']])
    
    data['COORD_FEATURES'] = start_stop_inputs(5, data, test)

    data['DAY_OF_WEEK'] = pd.Series([datetime.datetime.fromtimestamp(t).weekday() for t in data['TIMESTAMP']])

    data['QUARTER_HOUR'] = pd.Series([int((datetime.datetime.fromtimestamp(t).hour*60 + datetime.datetime.fromtimestamp(t).minute)/15)
                                       for t in data['TIMESTAMP']])

    data['WEEK_OF_YEAR'] = pd.Series([datetime.datetime.fromtimestamp(t).isocalendar()[1] for t in data['TIMESTAMP']])
    
        
    data = data.dropna()

    return data

In [ ]:
train = feature_ext(train)

In [ ]:
# train["TARGET"]
train.head()

In [ ]:
test = feature_ext(test, test=True)

In [ ]:
test.head()

In [ ]:
utils2.save_array(data_path+'train/train_features.bc', train.as_matrix())

In [ ]:
utils2.save_array(data_path+'test/test_features.bc', test.as_matrix())

In [ ]:
train.head()

MEANSHIFT

Meanshift clustering as performed in the paper


In [ ]:
# train = pd.DataFrame(utils2.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
#        'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'DAY_OF_WEEK',
#                             'QUARTER_HOUR', "WEEK_OF_YEAR", "TARGET", "COORD_FEATURES"])

# - Correct column order to load the Bcolz array that was saved above
train = pd.DataFrame(utils2.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET', 'COORD_FEATURES', 'DAY_OF_WEEK',
                            'QUARTER_HOUR', 'WEEK_OF_YEAR'])

Clustering performed on the targets


In [ ]:
y_targ = np.vstack(train["TARGET"].as_matrix())

In [ ]:
from sklearn.cluster import MeanShift, estimate_bandwidth

The commented-out line gives an estimate of the bandwidth, which makes the clustering converge much more quickly.

This is not mentioned in the paper but is included in the original code. In order to get results similar to the paper's, they manually chose the uncommented bandwidth.


In [ ]:
#bw = estimate_bandwidth(y_targ, quantile=.1, n_samples=1000)
bw = 0.001

This takes some time


In [ ]:
ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
ms.fit(y_targ)

In [ ]:
cluster_centers = ms.cluster_centers_

This is very close to the number of clusters mentioned in the paper


In [ ]:
cluster_centers.shape

In [ ]:
utils2.save_array(data_path+"cluster_centers_bw_001.bc", cluster_centers)

Formatting Features for Bcolz iterator / garbage


In [3]:
train = pd.DataFrame(utils2.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', 'DAY_OF_WEEK', "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [4]:
cluster_centers = utils2.load_array(data_path+"cluster_centers_bw_001.bc")

In [5]:
long = np.array([c[0] for c in cluster_centers])
lat = np.array([c[1] for c in cluster_centers])

In [6]:
X_train, X_val = train_test_split(train, test_size=0.2, random_state=42)

In [7]:
def get_features(data):
    return [np.vstack(data['COORD_FEATURES'].as_matrix()), np.vstack(data['ORIGIN_CALL'].as_matrix()), 
           np.vstack(data['TAXI_ID'].as_matrix()), np.vstack(data['ORIGIN_STAND'].as_matrix()),
           np.vstack(data['QUARTER_HOUR'].as_matrix()), np.vstack(data['DAY_OF_WEEK'].as_matrix()), 
           np.vstack(data['WEEK_OF_YEAR'].as_matrix()), np.array([long for i in range(0,data.shape[0])]),
               np.array([lat for i in range(0,data.shape[0])])]

In [8]:
def get_target(data):
    return np.vstack(data["TARGET"].as_matrix())

In [9]:
X_train_features = get_features(X_train)

In [10]:
X_train_target = get_target(X_train)

In [11]:
# utils2.save_array(data_path+'train/X_train_features.bc', get_features(X_train))  # - doesn't work - needs an array, not a list

MODEL

Load training data and cluster centers


In [12]:
train = pd.DataFrame(utils2.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

Validation cuts (taken from the original repo): a trip is held out for validation if it was still in progress at one of the timestamps below.


In [13]:
cuts = [
    1376503200, # 2013-08-14 18:00
    1380616200, # 2013-10-01 08:30
    1381167900, # 2013-10-07 17:45
    1383364800, # 2013-11-02 04:00
    1387722600  # 2013-12-22 14:30
]

In [14]:
print(datetime.datetime.fromtimestamp(1376503200))


2013-08-14 20:00:00

In [15]:
train.shape


Out[15]:
(1674160, 16)

In [16]:
val_indices = []
for index, row in train.iterrows():
    time = row['TIMESTAMP']
    latitude = row['LATITUDE']
    for ts in cuts:
        # a trip is held out if it was still in progress at the cut time
        # (GPS points are 15 s apart, so it spans TIMESTAMP .. TIMESTAMP + 15*(n_points-1))
        if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
            val_indices.append(index)
            break

In [17]:
X_valid = train.iloc[val_indices]

In [18]:
X_valid.head()


Out[18]:
TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP DAY_TYPE MISSING_DATA POLYLINE LATITUDE LONGITUDE TARGET COORD_FEATURES DAY_OF_WEEK QUARTER_HOUR WEEK_OF_YEAR
200153 1376502576620000126 B 0 36 247 1376502576 0 False [[-8.649504,41.15421],[-8.649684,41.154201],[-... [-0.0418419, -0.0419448, -0.0449813, -0.046422... [-0.583255, -0.586407, -0.59711, -0.589074, -0... [-8.61122, 41.1463] [-0.583255, -0.586407, -0.59711, -0.589074, -0... 2 79 33
200186 1376503146620000161 B 0 35 19 1376503146 0 False [[-8.649621,41.167323],[-8.64963,41.167251],[-... [0.135098, 0.134121, 0.126709, 0.125371, 0.124... [-0.585306, -0.585456, -0.589241, -0.588774, -... [-8.64504, 41.1586] [-0.585306, -0.585456, -0.589241, -0.588774, -... 2 79 33
200200 1376502942620000500 B 0 15 428 1376502942 0 False [[-8.585694,41.148522],[-8.585712,41.148801],[... [-0.118578, -0.114821, -0.112402, -0.116982, -... [0.532287, 0.531971, 0.523018, 0.524735, 0.524... [-8.61524, 41.1418] [0.532287, 0.531971, 0.523018, 0.524735, 0.524... 2 79 33
200202 1376502604620000105 C 0 0 87 1376502604 0 False [[-8.61093,41.145498],[-8.610939,41.145516],[-... [-0.15939, -0.159133, -0.153883, -0.145392, -0... [0.0910987, 0.0909487, 0.093783, 0.108572, 0.1... [-8.64832, 41.1648] [0.0910987, 0.0909487, 0.093783, 0.108572, 0.1... 2 79 33
200227 1376502611620000022 C 0 0 304 1376502611 0 False [[-8.591301,41.162715],[-8.591004,41.162562],[... [0.0729274, 0.0708687, 0.0587228, 0.0539879, 0... [0.43427, 0.439455, 0.42735, 0.423566, 0.41539... [-8.60977, 41.1512] [0.43427, 0.439455, 0.42735, 0.423566, 0.41539... 2 79 33

In [19]:
for d in X_valid['TIMESTAMP']:
    print(datetime.datetime.fromtimestamp(d))


2013-08-14 19:49:36
2013-08-14 19:59:06
2013-08-14 19:55:42
2013-08-14 19:50:04
2013-08-14 19:50:11
2013-08-14 19:56:57
2013-08-14 19:36:51
2013-08-14 19:44:15
2013-08-14 19:55:50
2013-08-14 19:50:35
2013-08-14 19:50:27
2013-08-14 19:43:57
2013-08-14 19:16:48
2013-08-14 19:40:47
2013-08-14 19:45:55
2013-08-14 19:43:00
2013-08-14 19:53:22
2013-08-14 19:50:03
2013-08-14 19:26:22
2013-08-14 19:59:15
2013-08-14 19:50:17
2013-08-14 19:56:34
2013-08-14 19:53:42
2013-08-14 19:47:46
2013-08-14 19:58:46
2013-08-14 19:24:23
2013-08-14 19:55:19
2013-08-14 19:57:03
2013-08-14 19:56:11
2013-08-14 19:56:52
2013-08-14 19:57:57
2013-08-14 19:08:15
2013-08-14 19:51:14
2013-08-14 19:58:31
2013-08-14 19:47:31
2013-08-14 19:30:36
2013-08-14 19:17:59
2013-08-14 19:48:03
2013-08-14 19:55:52
2013-08-14 19:49:06
2013-08-14 19:58:55
2013-08-14 19:51:24
2013-08-14 19:54:12
2013-08-14 19:54:26
2013-08-14 19:51:18
2013-08-14 19:59:56
2013-08-14 19:48:31
2013-08-14 19:51:56
2013-08-14 19:39:22
2013-08-14 19:57:25
2013-08-14 19:57:28
2013-08-14 19:57:40
2013-08-14 19:39:01
2013-08-14 19:50:39
2013-08-14 18:48:19
2013-10-01 10:16:12
2013-10-01 10:28:04
2013-10-01 10:18:37
2013-10-01 10:24:48
2013-10-01 10:23:39
2013-10-01 10:28:37
2013-10-01 10:20:16
2013-10-01 10:23:49
2013-10-01 10:27:11
2013-10-01 10:06:20
2013-10-01 10:28:08
2013-10-01 10:29:02
2013-10-01 10:24:44
2013-10-01 10:24:44
2013-10-01 10:19:06
2013-10-01 09:28:33
2013-10-01 10:29:28
2013-10-01 10:27:31
2013-10-01 10:22:13
2013-10-01 10:26:03
2013-10-01 10:28:55
2013-10-01 10:18:10
2013-10-01 10:22:13
2013-10-01 10:14:30
2013-10-01 10:24:41
2013-10-01 10:22:16
2013-10-01 10:25:35
2013-10-01 10:21:27
2013-10-01 10:11:33
2013-10-01 10:10:18
2013-10-01 10:09:33
2013-10-01 10:01:15
2013-10-01 10:17:58
2013-10-01 10:18:00
2013-10-01 10:13:26
2013-10-01 10:18:01
2013-10-01 10:25:54
2013-10-01 10:21:20
2013-10-01 10:25:31
2013-10-01 10:25:54
2013-10-01 10:23:40
2013-10-01 10:26:46
2013-10-01 10:23:31
2013-10-01 10:17:09
2013-10-01 10:21:57
2013-10-01 09:29:09
2013-10-01 10:14:47
2013-10-01 10:04:25
2013-10-01 10:14:09
2013-10-01 10:16:59
2013-10-01 10:27:16
2013-10-01 10:16:26
2013-10-01 10:23:18
2013-10-01 10:16:05
2013-10-01 10:27:43
2013-10-01 10:08:13
2013-10-01 10:19:21
2013-10-01 10:21:19
2013-10-01 10:24:20
2013-10-01 10:26:45
2013-10-01 10:18:28
2013-10-01 10:19:45
2013-10-01 10:28:10
2013-10-01 10:22:20
2013-10-01 10:18:42
2013-10-01 10:19:52
2013-10-01 10:18:44
2013-10-01 10:15:11
2013-10-01 10:19:24
2013-10-01 10:23:58
2013-10-01 10:28:50
2013-10-01 10:13:24
2013-10-01 10:28:38
2013-10-01 10:24:50
2013-10-01 10:14:19
2013-10-01 10:10:05
2013-10-01 10:26:31
2013-10-01 10:28:01
2013-10-01 08:44:16
2013-10-01 10:21:43
2013-10-01 10:26:57
2013-10-01 10:25:25
2013-10-01 10:25:36
2013-10-01 10:16:34
2013-10-01 10:26:40
2013-10-01 10:14:56
2013-10-01 10:13:10
2013-10-01 10:28:34
2013-10-01 10:19:08
2013-10-01 10:24:57
2013-10-01 09:52:43
2013-10-01 10:25:28
2013-10-01 10:22:54
2013-10-01 10:28:49
2013-10-01 09:13:25
2013-10-07 19:34:47
2013-10-07 19:38:08
2013-10-07 19:31:10
2013-10-07 19:35:12
2013-10-07 19:41:50
2013-10-07 19:34:31
2013-10-07 19:42:02
2013-10-07 19:39:05
2013-10-07 19:31:43
2013-10-07 19:34:27
2013-10-07 19:31:48
2013-10-07 19:42:24
2013-10-07 19:38:37
2013-10-07 19:29:02
2013-10-07 19:33:55
2013-10-07 19:17:07
2013-10-07 19:44:31
2013-10-07 19:42:52
2013-10-07 19:26:05
2013-10-07 19:34:07
2013-10-07 19:40:59
2013-10-07 19:41:36
2013-10-07 19:33:47
2013-10-07 19:30:59
2013-10-07 19:38:59
2013-10-07 19:28:56
2013-10-07 19:41:24
2013-10-07 19:41:49
2013-10-07 19:42:47
2013-10-07 19:34:09
2013-10-07 19:40:31
2013-10-07 19:21:34
2013-10-07 19:43:52
2013-10-07 19:18:11
2013-10-07 19:41:47
2013-10-07 19:33:04
2013-10-07 19:40:53
2013-10-07 19:36:38
2013-10-07 19:41:46
2013-10-07 19:03:36
2013-10-07 19:44:45
2013-10-07 19:21:42
2013-10-07 19:24:07
2013-10-07 19:40:35
2013-10-07 19:41:00
2013-10-07 19:43:10
2013-10-07 19:23:55
2013-10-07 19:43:30
2013-10-07 19:25:24
2013-10-07 19:35:07
2013-10-07 19:43:33
2013-10-07 19:39:30
2013-10-07 19:31:42
2013-10-07 19:39:17
2013-10-07 19:42:47
2013-10-07 19:39:20
2013-10-07 19:44:41
2013-10-07 19:24:22
2013-10-07 19:12:39
2013-10-07 19:37:25
2013-10-07 19:42:55
2013-10-07 19:14:35
2013-10-07 19:37:12
2013-10-07 19:32:29
2013-10-07 19:42:37
2013-10-07 19:26:52
2013-10-07 19:31:19
2013-10-07 19:44:58
2013-11-02 04:47:37
2013-11-02 04:54:00
2013-11-02 04:58:53
2013-11-02 04:56:37
2013-11-02 04:56:09
2013-11-02 04:51:05
2013-11-02 04:50:58
2013-11-02 04:55:26
2013-11-02 04:53:43
2013-11-02 04:53:46
2013-11-02 04:54:55
2013-11-02 04:59:28
2013-11-02 04:56:54
2013-11-02 04:50:37
2013-11-02 04:48:40
2013-11-02 04:55:46
2013-11-02 04:45:20
2013-11-02 04:46:22
2013-11-02 04:48:25
2013-11-02 04:47:19
2013-11-02 04:57:31
2013-11-02 04:58:14
2013-11-02 04:49:30
2013-11-02 04:43:31
2013-11-02 04:59:00
2013-11-02 04:54:23
2013-11-02 04:51:01
2013-11-02 04:38:12
2013-11-02 04:59:31
2013-11-02 04:56:46
2013-11-02 04:53:51
2013-11-02 04:48:00
2013-11-02 04:58:04
2013-11-02 04:52:50
2013-11-02 04:58:12
2013-11-02 04:57:37
2013-11-02 04:53:33
2013-11-02 04:54:11
2013-11-02 04:48:49
2013-11-02 04:42:56
2013-11-02 04:55:36
2013-11-02 04:51:36
2013-11-02 04:48:45
2013-11-02 04:49:17
2013-11-02 04:53:50
2013-11-02 04:45:28
2013-11-02 04:45:04
2013-11-02 04:52:17
2013-11-02 04:52:10
2013-11-02 04:59:16
2013-11-02 04:51:37
2013-11-02 04:50:10
2013-12-22 15:24:50
2013-12-22 15:04:12
2013-12-22 15:16:27
2013-12-22 15:23:06
2013-12-22 15:24:04
2013-12-22 15:17:33
2013-12-22 15:22:55
2013-12-22 15:24:35
2013-12-22 15:21:56
2013-12-22 15:22:49
2013-12-22 15:25:31
2013-12-22 15:21:31
2013-12-22 15:27:31
2013-12-22 15:29:45
2013-12-22 15:26:09
2013-12-22 15:17:08
2013-12-22 15:26:00
2013-12-22 15:20:56
2013-12-22 15:23:09
2013-12-22 15:22:31
2013-12-22 15:29:59
2013-12-22 15:27:43
2013-12-22 15:23:04
2013-12-22 15:25:30
2013-12-22 15:19:16
2013-12-22 15:23:06
2013-12-22 15:26:01
2013-12-22 15:19:45
2013-12-22 11:34:23
2013-12-22 15:29:54
2013-12-22 15:28:39
2013-12-22 15:27:43
2013-12-22 15:16:23
2013-12-22 15:17:26

In [20]:
X_train = train.drop(train.index[val_indices])

In [21]:
cluster_centers = utils2.load_array(data_path+"cluster_centers_bw_001.bc")

In [22]:
long = np.array([c[0] for c in cluster_centers])
lat = np.array([c[1] for c in cluster_centers])

In [23]:
utils2.save_array(data_path+'train/X_train.bc', X_train.as_matrix())

In [24]:
utils2.save_array(data_path+'valid/X_val.bc', X_valid.as_matrix())

In [25]:
X_train = pd.DataFrame(utils2.load_array(data_path+'train/X_train.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [26]:
X_valid = pd.DataFrame(utils2.load_array(data_path+'valid/X_val.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

The equirectangular loss function mentioned in the paper.

Note: it is very important that column 0 of y is longitude and column 1 is latitude.

The Earth-radius constant R appears here as 6371 (km); it only rescales the loss, so it does not affect minimization, and the paper did not specify units.


In [27]:
def equirectangular_loss(y_true, y_pred):
    deg2rad = 3.141592653589793 / 180
    long_1 = y_true[:,0]*deg2rad
    long_2 = y_pred[:,0]*deg2rad
    lat_1 = y_true[:,1]*deg2rad
    lat_2 = y_pred[:,1]*deg2rad
    return 6371*K.sqrt(K.square((long_1 - long_2)*K.cos((lat_1 + lat_2)/2.))
                       +K.square(lat_1 - lat_2))
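
As a sanity check outside of Keras, the same formula in NumPy on a made-up pair of nearby points (coordinates are illustrative, not from the dataset):


In [ ]:
# NumPy version of the same formula, for checking on made-up points
def equirectangular_np(y_true, y_pred):
    deg2rad = np.pi / 180
    dlong = (y_true[:, 0] - y_pred[:, 0]) * deg2rad
    dlat = (y_true[:, 1] - y_pred[:, 1]) * deg2rad
    mean_lat = (y_true[:, 1] + y_pred[:, 1]) / 2 * deg2rad
    return 6371 * np.sqrt((dlong * np.cos(mean_lat))**2 + dlat**2)

print(equirectangular_np(np.array([[-8.61, 41.15]]), np.array([[-8.62, 41.14]])))  # ~1.4 km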

In [28]:
def embedding_input(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, embeddings_regularizer=l2(reg))(inp)  # Keras 2

The following returns a fully-connected model as described in the paper. It takes as input k (as defined above) and the cluster centers.

Inputs: embeddings for each category, concatenated with the 4*k continuous variables representing the first/last k coords described above.

The embeddings have no regularization, as none was mentioned in the paper, though they are easily equipped to include it.

The paper mentions global normalization but doesn't specify exactly how it was done. Here the continuous inputs are standardized with the pre-computed mean/std; a BatchNormalization layer on the continuous inputs (imported above) would be an alternative, but isn't used in the model below.

After concatenation, there is one hidden layer of 500 neurons, as called for in the paper.

Finally, the output layer has as many outputs as there are cluster centers, with a softmax activation. Call this output P.

The prediction is the weighted sum of the cluster centers c_i with the corresponding predicted probabilities P_i.

To implement this, the softmax output is dotted with the cluster longitudes and latitudes separately (this happens at variable y), and the two results are concatenated into a single tensor.

NOTE: the cluster center coords are passed in as model inputs. Ideally this function would store the cluster longs/lats as constants inside the model, but I could not figure out how, so I pass them in as repeated inputs (one way to do it with backend constants is sketched after the model definition below).


In [29]:
def taxi_mlp(k, cluster_centers):
    shp = cluster_centers.shape[0]
    nums = Input(shape=(4*k,))

    center_longs = Input(shape=(shp,))
    center_lats = Input(shape=(shp,))

    emb_names = ['client_ID', 'taxi_ID', "stand_ID", "quarter_hour", "day_of_week", "week_of_year"]
    emb_ins = [57106, 448, 64, 96, 7, 52]
    emb_outs = [10 for i in range(0,6)]
    regs = [0 for i in range(0,6)]

    embs = [embedding_input(e[0], e[1]+1, e[2], e[3]) for e in zip(emb_names, emb_ins, emb_outs, regs)]

    x = concatenate([nums] + [Flatten()(e[1]) for e in embs])  # Keras 2

    x = Dense(500, activation='relu')(x)

    x = Dense(shp, activation='softmax')(x)

    y = concatenate([dot([x, center_longs], axes=1), dot([x, center_lats], axes=1)])  # Keras 2

    return Model(inputs = [nums]+[e[0] for e in embs] + [center_longs, center_lats], outputs = y)  # Keras 2
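
One way around passing the cluster centers as repeated inputs (see the note above) is to bake them into the graph as backend constants inside a Lambda layer. A minimal sketch, assuming Keras 2 with the TensorFlow backend; this is not the model used for the runs below:


In [ ]:
# sketch only (not used below): cluster centers as constants instead of model inputs
def taxi_mlp_const(k, cluster_centers):
    shp = cluster_centers.shape[0]
    center_longs = K.constant(np.array([c[0] for c in cluster_centers]).reshape(shp, 1), dtype='float32')
    center_lats = K.constant(np.array([c[1] for c in cluster_centers]).reshape(shp, 1), dtype='float32')

    nums = Input(shape=(4*k,))
    emb_names = ['client_ID', 'taxi_ID', 'stand_ID', 'quarter_hour', 'day_of_week', 'week_of_year']
    emb_ins = [57106, 448, 64, 96, 7, 52]
    embs = [embedding_input(name, n_in+1, 10, 0) for name, n_in in zip(emb_names, emb_ins)]

    x = concatenate([nums] + [Flatten()(e[1]) for e in embs])
    x = Dense(500, activation='relu')(x)
    x = Dense(shp, activation='softmax')(x)

    # weighted sum of cluster centers: (batch, shp) . (shp, 1) -> (batch, 1), then concat long/lat
    y = Lambda(lambda p: K.concatenate([K.dot(p, center_longs), K.dot(p, center_lats)], axis=1))(x)

    return Model(inputs=[nums] + [e[0] for e in embs], outputs=y)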

As mentioned, the repeated cluster longs/lats are constructed here to be passed in as inputs.

The iterator below works on the in-memory training DataFrame. I did this instead of using the bcolz iterator because of the pre-processing involved.


In [30]:
def data_iter(data, batch_size, cluster_centers):
    long = [c[0] for c in cluster_centers]
    lat = [c[1] for c in cluster_centers]
    i = 0
    N = data.shape[0]
    while True:
        yield ([np.vstack(data['COORD_FEATURES'][i:i+batch_size].as_matrix()), np.vstack(data['ORIGIN_CALL'][i:i+batch_size].as_matrix()), 
           np.vstack(data['TAXI_ID'][i:i+batch_size].as_matrix()), np.vstack(data['ORIGIN_STAND'][i:i+batch_size].as_matrix()),
           np.vstack(data['QUARTER_HOUR'][i:i+batch_size].as_matrix()), np.vstack(data['DAY_OF_WEEK'][i:i+batch_size].as_matrix()), 
           np.vstack(data['WEEK_OF_YEAR'][i:i+batch_size].as_matrix()), np.array([long for i in range(0,batch_size)]),
               np.array([lat for i in range(0,batch_size)])], np.vstack(data["TARGET"][i:i+batch_size].as_matrix()))
        i += batch_size
        if i >= N:
            # wrap around so the generator never yields empty batches
            i = 0
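
If you would rather train from this generator than from the in-memory feature arrays used below, a hedged sketch with Keras 2's fit_generator (batch size and step counts are illustrative, and it assumes the model built in the cells that follow):


In [ ]:
# sketch: training from the generator (assumes `model`, X_train and X_valid are defined)
gen_batch = 256
train_gen = data_iter(X_train, gen_batch, cluster_centers)
valid_gen = data_iter(X_valid, gen_batch, cluster_centers)
model.fit_generator(train_gen, steps_per_epoch=X_train.shape[0] // gen_batch,
                    validation_data=valid_gen, validation_steps=X_valid.shape[0] // gen_batch,
                    epochs=1, verbose=0)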

In [31]:
# x=Lambda(thing)([x,long,lat])

Of course, the k in the model needs to match the k used in feature construction. We again use 5, as in the paper.


In [50]:
del model
model = taxi_mlp(5, cluster_centers)

The paper used an SGD optimizer with the following parameters.


In [51]:
# Reduced the initial 0.001 learning rate to avoid NaN's
model.compile(optimizer=SGD(1e-6, momentum=0.9), loss=equirectangular_loss, metrics=['mse'])

# - Try also Adam optimizer
# optim = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
# model.compile(optimizer=optim, loss=equirectangular_loss, metrics=['mse'])

In [34]:
X_train_feat = get_features(X_train)

In [35]:
X_train_target = get_target(X_train)

In [36]:
X_val_feat = get_features(X_valid)

In [37]:
X_val_target = get_target(X_valid)

In [38]:
tqdm = TQDMNotebookCallback()

In [39]:
# - Added verbose=1 to track improvement through epochs
checkpoint = ModelCheckpoint(verbose=1, filepath=data_path+'models/weights.{epoch:03d}.{val_loss:.8f}.hdf5', save_best_only=True)

In [40]:
batch_size=256

original


In [55]:
model.fit(X_train_feat, X_train_target, epochs=1, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


Epoch 00000: val_loss improved from 4.31887 to 4.31829, saving model to data/taxi/models/weights.000.4.31829121.hdf5

Out[55]:
<keras.callbacks.History at 0x7fe5f041a4a8>

In [56]:
model.fit(X_train_feat, X_train_target, epochs=30, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


Epoch 00000: val_loss improved from 4.31829 to 4.31774, saving model to data/taxi/models/weights.000.4.31774099.hdf5
Epoch 00001: val_loss improved from 4.31774 to 4.31718, saving model to data/taxi/models/weights.001.4.31717622.hdf5
Epoch 00002: val_loss improved from 4.31718 to 4.31663, saving model to data/taxi/models/weights.002.4.31662745.hdf5
Epoch 00003: val_loss improved from 4.31663 to 4.31606, saving model to data/taxi/models/weights.003.4.31606152.hdf5
Epoch 00004: val_loss improved from 4.31606 to 4.31548, saving model to data/taxi/models/weights.004.4.31548033.hdf5
Epoch 00005: val_loss improved from 4.31548 to 4.31494, saving model to data/taxi/models/weights.005.4.31493714.hdf5
Epoch 00006: val_loss improved from 4.31494 to 4.31436, saving model to data/taxi/models/weights.006.4.31435751.hdf5
Epoch 00007: val_loss improved from 4.31436 to 4.31374, saving model to data/taxi/models/weights.007.4.31374216.hdf5
Epoch 00008: val_loss improved from 4.31374 to 4.31317, saving model to data/taxi/models/weights.008.4.31317252.hdf5
Epoch 00009: val_loss improved from 4.31317 to 4.31260, saving model to data/taxi/models/weights.009.4.31259594.hdf5
Epoch 00010: val_loss improved from 4.31260 to 4.31199, saving model to data/taxi/models/weights.010.4.31199114.hdf5
Epoch 00011: val_loss improved from 4.31199 to 4.31134, saving model to data/taxi/models/weights.011.4.31133757.hdf5
Epoch 00012: val_loss improved from 4.31134 to 4.31072, saving model to data/taxi/models/weights.012.4.31071869.hdf5
Epoch 00013: val_loss improved from 4.31072 to 4.31008, saving model to data/taxi/models/weights.013.4.31008419.hdf5
Epoch 00014: val_loss improved from 4.31008 to 4.30941, saving model to data/taxi/models/weights.014.4.30940849.hdf5
Epoch 00015: val_loss improved from 4.30941 to 4.30867, saving model to data/taxi/models/weights.015.4.30866721.hdf5
Epoch 00016: val_loss improved from 4.30867 to 4.30792, saving model to data/taxi/models/weights.016.4.30791654.hdf5
Epoch 00017: val_loss improved from 4.30792 to 4.30702, saving model to data/taxi/models/weights.017.4.30701692.hdf5
Epoch 00018: val_loss improved from 4.30702 to 4.30598, saving model to data/taxi/models/weights.018.4.30598387.hdf5
Epoch 00019: val_loss improved from 4.30598 to 4.30465, saving model to data/taxi/models/weights.019.4.30465317.hdf5
Epoch 00020: val_loss improved from 4.30465 to 4.30299, saving model to data/taxi/models/weights.020.4.30298820.hdf5
Epoch 00021: val_loss improved from 4.30299 to 4.30127, saving model to data/taxi/models/weights.021.4.30127284.hdf5
Epoch 00022: val_loss improved from 4.30127 to 4.29971, saving model to data/taxi/models/weights.022.4.29970756.hdf5
Epoch 00023: val_loss improved from 4.29971 to 4.29832, saving model to data/taxi/models/weights.023.4.29831633.hdf5
Epoch 00024: val_loss improved from 4.29832 to 4.29692, saving model to data/taxi/models/weights.024.4.29692163.hdf5
Epoch 00025: val_loss improved from 4.29692 to 4.29569, saving model to data/taxi/models/weights.025.4.29569347.hdf5
Epoch 00026: val_loss improved from 4.29569 to 4.29461, saving model to data/taxi/models/weights.026.4.29460874.hdf5
Epoch 00027: val_loss improved from 4.29461 to 4.29367, saving model to data/taxi/models/weights.027.4.29367482.hdf5
Epoch 00028: val_loss improved from 4.29367 to 4.29283, saving model to data/taxi/models/weights.028.4.29282813.hdf5
Epoch 00029: val_loss improved from 4.29283 to 4.29210, saving model to data/taxi/models/weights.029.4.29209712.hdf5

Out[56]:
<keras.callbacks.History at 0x7fe5f2e6f160>

In [58]:
# - Load the saved best model, otherwise the training would go on from the current model
# - which is not guaranteed to be the best one
# - (check the actual file name)
model = load_model(data_path+'models/weights.028.4.29282813.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})

In [69]:
# - trying also learning rate annealing
K.set_value(model.optimizer.lr, 5e-4)



In [60]:
model.fit(X_train_feat, X_train_target, epochs=100, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


Epoch 00000: val_loss improved from 4.29210 to 4.29207, saving model to data/taxi/models/weights.000.4.29207400.hdf5
Epoch 00001: val_loss improved from 4.29207 to 4.29137, saving model to data/taxi/models/weights.001.4.29137112.hdf5
Epoch 00002: val_loss improved from 4.29137 to 4.29064, saving model to data/taxi/models/weights.002.4.29063544.hdf5
Epoch 00003: val_loss improved from 4.29064 to 4.28992, saving model to data/taxi/models/weights.003.4.28991975.hdf5
Epoch 00004: val_loss improved from 4.28992 to 4.28912, saving model to data/taxi/models/weights.004.4.28912484.hdf5
Epoch 00005: val_loss improved from 4.28912 to 4.28833, saving model to data/taxi/models/weights.005.4.28833231.hdf5
Epoch 00006: val_loss improved from 4.28833 to 4.28753, saving model to data/taxi/models/weights.006.4.28752909.hdf5
Epoch 00007: val_loss improved from 4.28753 to 4.28669, saving model to data/taxi/models/weights.007.4.28669458.hdf5
Epoch 00008: val_loss improved from 4.28669 to 4.28584, saving model to data/taxi/models/weights.008.4.28584460.hdf5
Epoch 00009: val_loss improved from 4.28584 to 4.28503, saving model to data/taxi/models/weights.009.4.28503423.hdf5
Epoch 00010: val_loss improved from 4.28503 to 4.28416, saving model to data/taxi/models/weights.010.4.28415504.hdf5
Epoch 00011: val_loss improved from 4.28416 to 4.28327, saving model to data/taxi/models/weights.011.4.28326883.hdf5
Epoch 00012: val_loss improved from 4.28327 to 4.28235, saving model to data/taxi/models/weights.012.4.28234735.hdf5
Epoch 00013: val_loss improved from 4.28235 to 4.28138, saving model to data/taxi/models/weights.013.4.28138186.hdf5
Epoch 00014: val_loss improved from 4.28138 to 4.28039, saving model to data/taxi/models/weights.014.4.28039204.hdf5
Epoch 00015: val_loss improved from 4.28039 to 4.27937, saving model to data/taxi/models/weights.015.4.27936888.hdf5
Epoch 00016: val_loss improved from 4.27937 to 4.27843, saving model to data/taxi/models/weights.016.4.27843129.hdf5
Epoch 00017: val_loss improved from 4.27843 to 4.27752, saving model to data/taxi/models/weights.017.4.27751621.hdf5
Epoch 00018: val_loss improved from 4.27752 to 4.27680, saving model to data/taxi/models/weights.018.4.27680246.hdf5
Epoch 00019: val_loss improved from 4.27680 to 4.27610, saving model to data/taxi/models/weights.019.4.27610297.hdf5
Epoch 00020: val_loss improved from 4.27610 to 4.27546, saving model to data/taxi/models/weights.020.4.27546325.hdf5
Epoch 00021: val_loss improved from 4.27546 to 4.27483, saving model to data/taxi/models/weights.021.4.27482771.hdf5
Epoch 00022: val_loss improved from 4.27483 to 4.27427, saving model to data/taxi/models/weights.022.4.27427272.hdf5
Epoch 00023: val_loss improved from 4.27427 to 4.27365, saving model to data/taxi/models/weights.023.4.27365168.hdf5
Epoch 00024: val_loss improved from 4.27365 to 4.27306, saving model to data/taxi/models/weights.024.4.27306346.hdf5
Epoch 00025: val_loss improved from 4.27306 to 4.27245, saving model to data/taxi/models/weights.025.4.27245205.hdf5
Epoch 00026: val_loss improved from 4.27245 to 4.27180, saving model to data/taxi/models/weights.026.4.27180182.hdf5
Epoch 00027: val_loss improved from 4.27180 to 4.27114, saving model to data/taxi/models/weights.027.4.27114311.hdf5
Epoch 00028: val_loss improved from 4.27114 to 4.27049, saving model to data/taxi/models/weights.028.4.27049180.hdf5
Epoch 00029: val_loss improved from 4.27049 to 4.26976, saving model to data/taxi/models/weights.029.4.26975740.hdf5
Epoch 00030: val_loss improved from 4.26976 to 4.26907, saving model to data/taxi/models/weights.030.4.26907022.hdf5
Epoch 00031: val_loss improved from 4.26907 to 4.26839, saving model to data/taxi/models/weights.031.4.26839241.hdf5
Epoch 00032: val_loss improved from 4.26839 to 4.26764, saving model to data/taxi/models/weights.032.4.26763758.hdf5
Epoch 00033: val_loss improved from 4.26764 to 4.26691, saving model to data/taxi/models/weights.033.4.26690501.hdf5
Epoch 00034: val_loss improved from 4.26691 to 4.26616, saving model to data/taxi/models/weights.034.4.26615564.hdf5
Epoch 00035: val_loss improved from 4.26616 to 4.26541, saving model to data/taxi/models/weights.035.4.26540591.hdf5
Epoch 00036: val_loss improved from 4.26541 to 4.26463, saving model to data/taxi/models/weights.036.4.26462643.hdf5
Epoch 00037: val_loss improved from 4.26463 to 4.26388, saving model to data/taxi/models/weights.037.4.26387523.hdf5
Epoch 00038: val_loss improved from 4.26388 to 4.26311, saving model to data/taxi/models/weights.038.4.26311260.hdf5
Epoch 00039: val_loss improved from 4.26311 to 4.26232, saving model to data/taxi/models/weights.039.4.26231723.hdf5
Epoch 00040: val_loss improved from 4.26232 to 4.26153, saving model to data/taxi/models/weights.040.4.26152596.hdf5
Epoch 00041: val_loss improved from 4.26153 to 4.26076, saving model to data/taxi/models/weights.041.4.26075772.hdf5
Epoch 00042: val_loss improved from 4.26076 to 4.25994, saving model to data/taxi/models/weights.042.4.25994256.hdf5
Epoch 00043: val_loss improved from 4.25994 to 4.25914, saving model to data/taxi/models/weights.043.4.25913650.hdf5
Epoch 00044: val_loss improved from 4.25914 to 4.25829, saving model to data/taxi/models/weights.044.4.25829431.hdf5
Epoch 00045: val_loss improved from 4.25829 to 4.25749, saving model to data/taxi/models/weights.045.4.25748587.hdf5
Epoch 00046: val_loss improved from 4.25749 to 4.25669, saving model to data/taxi/models/weights.046.4.25668621.hdf5
Epoch 00047: val_loss improved from 4.25669 to 4.25585, saving model to data/taxi/models/weights.047.4.25585277.hdf5
Epoch 00048: val_loss improved from 4.25585 to 4.25503, saving model to data/taxi/models/weights.048.4.25503128.hdf5
Epoch 00049: val_loss improved from 4.25503 to 4.25423, saving model to data/taxi/models/weights.049.4.25422764.hdf5
Epoch 00050: val_loss improved from 4.25423 to 4.25340, saving model to data/taxi/models/weights.050.4.25340289.hdf5
Epoch 00051: val_loss improved from 4.25340 to 4.25257, saving model to data/taxi/models/weights.051.4.25257196.hdf5
Epoch 00052: val_loss improved from 4.25257 to 4.25175, saving model to data/taxi/models/weights.052.4.25175014.hdf5
Epoch 00053: val_loss improved from 4.25175 to 4.25094, saving model to data/taxi/models/weights.053.4.25094499.hdf5
Epoch 00054: val_loss improved from 4.25094 to 4.25016, saving model to data/taxi/models/weights.054.4.25016092.hdf5
Epoch 00055: val_loss improved from 4.25016 to 4.24938, saving model to data/taxi/models/weights.055.4.24938102.hdf5
Epoch 00056: val_loss improved from 4.24938 to 4.24858, saving model to data/taxi/models/weights.056.4.24857754.hdf5
Epoch 00057: val_loss improved from 4.24858 to 4.24785, saving model to data/taxi/models/weights.057.4.24785084.hdf5
Epoch 00058: val_loss improved from 4.24785 to 4.24709, saving model to data/taxi/models/weights.058.4.24709157.hdf5
Epoch 00059: val_loss improved from 4.24709 to 4.24633, saving model to data/taxi/models/weights.059.4.24633237.hdf5
Epoch 00060: val_loss improved from 4.24633 to 4.24565, saving model to data/taxi/models/weights.060.4.24564756.hdf5
Epoch 00061: val_loss improved from 4.24565 to 4.24495, saving model to data/taxi/models/weights.061.4.24494578.hdf5
Epoch 00062: val_loss improved from 4.24495 to 4.24427, saving model to data/taxi/models/weights.062.4.24427030.hdf5
Epoch 00063: val_loss improved from 4.24427 to 4.24364, saving model to data/taxi/models/weights.063.4.24363914.hdf5
Epoch 00064: val_loss improved from 4.24364 to 4.24298, saving model to data/taxi/models/weights.064.4.24298467.hdf5
Epoch 00065: val_loss improved from 4.24298 to 4.24235, saving model to data/taxi/models/weights.065.4.24235387.hdf5
Epoch 00066: val_loss improved from 4.24235 to 4.24173, saving model to data/taxi/models/weights.066.4.24172959.hdf5
Epoch 00067: val_loss improved from 4.24173 to 4.24112, saving model to data/taxi/models/weights.067.4.24111878.hdf5
Epoch 00068: val_loss improved from 4.24112 to 4.24047, saving model to data/taxi/models/weights.068.4.24047199.hdf5
Epoch 00069: val_loss improved from 4.24047 to 4.23987, saving model to data/taxi/models/weights.069.4.23986701.hdf5
Epoch 00070: val_loss improved from 4.23987 to 4.23925, saving model to data/taxi/models/weights.070.4.23924511.hdf5
Epoch 00071: val_loss improved from 4.23925 to 4.23864, saving model to data/taxi/models/weights.071.4.23863888.hdf5
Epoch 00072: val_loss improved from 4.23864 to 4.23801, saving model to data/taxi/models/weights.072.4.23801241.hdf5
Epoch 00073: val_loss improved from 4.23801 to 4.23742, saving model to data/taxi/models/weights.073.4.23742124.hdf5
Epoch 00074: val_loss improved from 4.23742 to 4.23681, saving model to data/taxi/models/weights.074.4.23681405.hdf5
Epoch 00075: val_loss improved from 4.23681 to 4.23622, saving model to data/taxi/models/weights.075.4.23622094.hdf5
Epoch 00076: val_loss improved from 4.23622 to 4.23559, saving model to data/taxi/models/weights.076.4.23558968.hdf5
Epoch 00077: val_loss improved from 4.23559 to 4.23499, saving model to data/taxi/models/weights.077.4.23499022.hdf5
Epoch 00078: val_loss improved from 4.23499 to 4.23439, saving model to data/taxi/models/weights.078.4.23439420.hdf5
Epoch 00079: val_loss improved from 4.23439 to 4.23375, saving model to data/taxi/models/weights.079.4.23374766.hdf5
Epoch 00080: val_loss improved from 4.23375 to 4.23311, saving model to data/taxi/models/weights.080.4.23310862.hdf5
Epoch 00081: val_loss improved from 4.23311 to 4.23246, saving model to data/taxi/models/weights.081.4.23245877.hdf5
Epoch 00082: val_loss improved from 4.23246 to 4.23182, saving model to data/taxi/models/weights.082.4.23181968.hdf5
Epoch 00083: val_loss improved from 4.23182 to 4.23121, saving model to data/taxi/models/weights.083.4.23121372.hdf5
Epoch 00084: val_loss improved from 4.23121 to 4.23057, saving model to data/taxi/models/weights.084.4.23057433.hdf5
Epoch 00085: val_loss improved from 4.23057 to 4.22994, saving model to data/taxi/models/weights.085.4.22994373.hdf5
Epoch 00086: val_loss improved from 4.22994 to 4.22928, saving model to data/taxi/models/weights.086.4.22927731.hdf5
Epoch 00087: val_loss improved from 4.22928 to 4.22862, saving model to data/taxi/models/weights.087.4.22862314.hdf5
Epoch 00088: val_loss improved from 4.22862 to 4.22799, saving model to data/taxi/models/weights.088.4.22799080.hdf5
Epoch 00089: val_loss improved from 4.22799 to 4.22734, saving model to data/taxi/models/weights.089.4.22733974.hdf5
Epoch 00090: val_loss improved from 4.22734 to 4.22672, saving model to data/taxi/models/weights.090.4.22672006.hdf5
Epoch 00091: val_loss improved from 4.22672 to 4.22605, saving model to data/taxi/models/weights.091.4.22605153.hdf5
Epoch 00092: val_loss improved from 4.22605 to 4.22537, saving model to data/taxi/models/weights.092.4.22537447.hdf5
Epoch 00093: val_loss improved from 4.22537 to 4.22466, saving model to data/taxi/models/weights.093.4.22465932.hdf5
Epoch 00094: val_loss improved from 4.22466 to 4.22399, saving model to data/taxi/models/weights.094.4.22399192.hdf5
Epoch 00095: val_loss improved from 4.22399 to 4.22327, saving model to data/taxi/models/weights.095.4.22327027.hdf5
Epoch 00096: val_loss improved from 4.22327 to 4.22250, saving model to data/taxi/models/weights.096.4.22249749.hdf5
Epoch 00097: val_loss improved from 4.22250 to 4.22179, saving model to data/taxi/models/weights.097.4.22179410.hdf5
Epoch 00098: val_loss improved from 4.22179 to 4.22109, saving model to data/taxi/models/weights.098.4.22108590.hdf5
Epoch 00099: val_loss improved from 4.22109 to 4.22030, saving model to data/taxi/models/weights.099.4.22030055.hdf5

Out[60]:
<keras.callbacks.History at 0x7fe5f04261d0>

In [61]:
model.save(data_path+'models/current_model.hdf5')

new valid


In [62]:
model.fit(X_train_feat, X_train_target, epochs=1, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


Epoch 00000: val_loss improved from 4.22030 to 4.21952, saving model to data/taxi/models/weights.000.4.21952278.hdf5

Out[62]:
<keras.callbacks.History at 0x7fe5f33ac940>

In [ ]:
# - Load again the saved best model, otherwise the training would go on from the current model
# - which is not guaranteed to be the best one
# - (check the actual file name)
model = load_model(data_path+'models/weights.000.0.73703137.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})

In [ ]:
model.fit(X_train_feat, X_train_target, epochs=400, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


Epoch 00000: val_loss improved from 4.21952 to 0.73703, saving model to data/taxi/models/weights.000.0.73703137.hdf5
Epoch 00001: val_loss did not improve
Epoch 00002: val_loss did not improve
Epoch 00003: val_loss did not improve
Epoch 00004: val_loss did not improve
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss did not improve
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss did not improve
Epoch 00012: val_loss did not improve
Epoch 00013: val_loss did not improve
Epoch 00014: val_loss did not improve
Epoch 00015: val_loss did not improve
Epoch 00016: val_loss did not improve
Epoch 00017: val_loss did not improve
Epoch 00018: val_loss did not improve
Epoch 00019: val_loss did not improve
Epoch 00020: val_loss did not improve
Epoch 00021: val_loss did not improve
Epoch 00022: val_loss did not improve
Epoch 00023: val_loss did not improve
Epoch 00024: val_loss did not improve

In [ ]:
model.save(data_path+'models/current_model.hdf5')

In [ ]:
len(X_val_feat[0])

It works, but it seems to converge unrealistically quickly, and the loss values are not the same as the paper's. The paper does not say what it uses as the "error" in its results; I assume the same equirectangular distance, but it isn't clear. The difference in values could also come down to how the Earth-radius factor and units are handled.

Kaggle Entry


In [ ]:
# - Use the filename of the best model
best_model = load_model(data_path+'models/weights.308.0.03373993.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})

In [ ]:
best_model.evaluate(X_val_feat, X_val_target)

In [ ]:
test = pd.DataFrame(utils2.load_array(data_path+'test/test_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [ ]:
# test['ORIGIN_CALL'] = pd.read_csv(data_path+'real_origin_call.csv', header=None)  # - file not available

In [ ]:
# test['TAXI_ID'] = pd.read_csv(data_path+'real_taxi_id.csv',header=None)  #  # - file not available

In [ ]:
X_test = get_features(test)

In [ ]:
b = np.sort(X_test[1],axis=None)

In [ ]:
test_preds = np.round(best_model.predict(X_test), decimals=6)

In [ ]:
d = {0:test['TRIP_ID'], 1:test_preds[:,1], 2:test_preds[:,0]}
kaggle_out = pd.DataFrame(data=d)

In [ ]:
kaggle_out.to_csv(data_path+'submission.csv', header=['TRIP_ID','LATITUDE', 'LONGITUDE'], index=False)

In [ ]:
def hdist(a, b):
    # haversine distance in km between arrays of [longitude, latitude] points
    deg2rad = 3.141592653589793 / 180

    lat1 = a[:, 1] * deg2rad
    lon1 = a[:, 0] * deg2rad
    lat2 = b[:, 1] * deg2rad
    lon2 = b[:, 0] * deg2rad

    dlat = abs(lat1-lat2)
    dlon = abs(lon1-lon2)

    al = np.sin(dlat/2)**2  + np.cos(lat1) * np.cos(lat2) * (np.sin(dlon/2)**2)
    d = np.arctan2(np.sqrt(al), np.sqrt(1-al))

    hd = 2 * 6371 * d

    return hd

In [ ]:
val_preds = best_model.predict(X_val_feat)

In [ ]:
trn_preds = model.predict(X_train_feat)

In [ ]:
er = hdist(val_preds, X_val_target)

In [ ]:
er.mean()


To-do: this is simple to extend to the validation data

Uh oh... training data not representative of test


In [ ]:
cuts = [
    1376503200, # 2013-08-14 18:00
    1380616200, # 2013-10-01 08:30
    1381167900, # 2013-10-07 17:45
    1383364800, # 2013-11-02 04:00
    1387722600  # 2013-12-22 14:30
]

In [ ]:
np.any([train['TIMESTAMP'].map(lambda x: x in cuts)])

In [ ]:
train['TIMESTAMP']

In [ ]:
np.any(train['TIMESTAMP']==1381167900)

In [ ]:
times = train['TIMESTAMP'].as_matrix()

In [ ]:
X_train.columns

In [ ]:
times

In [ ]:
count = 0
for index, row in X_val.iterrows():
    for ts in cuts:
        time = row['TIMESTAMP']
        latitude = row['LATITUDE']
        if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
            count += 1

In [ ]:
one = count

In [ ]:
count + one

In [ ]:
import h5py

In [ ]:
h = h5py.File(data_path+'original/data.hdf5', 'r')

In [ ]:
evrData=h['/Configure:0000/Run:0000/CalibCycle:0000/EvrData::DataV3/NoDetector.0:Evr.0/data']

In [ ]:
c = np.load(data_path+'original/arrival-clusters.pkl')

HDF5 files


In [ ]:
from fuel.utils import find_in_data_path
from fuel.datasets import H5PYDataset

In [ ]:
original_path = '/data/bckenstler/data/taxi/original/'

In [ ]:
train_set = H5PYDataset(original_path+'data.hdf5', which_sets=('train',),load_in_memory=True)

In [ ]:
valid_set = H5PYDataset(original_path+'valid.hdf5', which_sets=('cuts/test_times_0',),load_in_memory=True)

In [ ]:
print(train_set.num_examples)

In [ ]:
print(valid_set.num_examples)

In [ ]:
data = train_set.data_sources

In [ ]:
data[0]

In [ ]:
valid_data = valid_set.data_sources

In [ ]:
valid_data[4][0]

In [ ]:
stamps = valid_data[-3]

In [ ]:
stamps[0]

In [ ]:
for i in range(0,304):    
    print(np.any([t==int(stamps[i]) for t in X_val['TIMESTAMP']]))

In [ ]:
type(X_train['TIMESTAMP'][0])

In [ ]:
type(stamps[0])

In [ ]:
check = [s in stamps for s in X_val['TIMESTAMP']]

In [ ]:
for s in X_val['TIMESTAMP']:
    print(datetime.datetime.fromtimestamp(s))

In [ ]:
for s in stamps:
    print(datetime.datetime.fromtimestamp(s))

In [ ]:
ids = valid_data[-1]

In [ ]:
type(ids[0])

In [ ]:
ids

In [ ]:
X_val