In [ ]:
import ast
import pandas as pd
import datetime
from keras.layers import Input, Dense, Embedding, merge, Flatten, Merge, BatchNormalization
from keras.models import Model, load_model
from keras.regularizers import l2
import keras.backend as K
from keras.optimizers import SGD
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
import utils
import data
from sklearn.model_selection import train_test_split
from bcolz_array_iterator import BcolzArrayIterator
import bcolz
from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import ModelCheckpoint
The path below is a shared data directory; swap in your own.
In [ ]:
data_path = "/data/datasets/taxi/"
The original repo used a bizarre tuple-based method of reading in the data and saving it to an HDF5 file with fuel. The following takes the same approach as that module, only using pandas and saving in bcolz format (training data shown as the example).
In [ ]:
meta = pd.read_csv(data_path+'metaData_taxistandsID_name_GPSlocation.csv', header=0)
In [ ]:
meta.head()
In [ ]:
train = pd.read_csv(data_path+'train/train.csv', header=0)
In [ ]:
train.head()
In [ ]:
train['ORIGIN_CALL'] = pd.Series(pd.factorize(train['ORIGIN_CALL'])[0]) + 1
In [ ]:
train['ORIGIN_STAND']=pd.Series([0 if pd.isnull(x) or x=='' else int(x) for x in train["ORIGIN_STAND"]])
In [ ]:
train['TAXI_ID'] = pd.Series(pd.factorize(train['TAXI_ID'])[0]) + 1
In [ ]:
train['DAY_TYPE'] = pd.Series([ord(x[0]) - ord('A') for x in train['DAY_TYPE']])
The array of longitude/latitude coordinates for each trip (row) is read in as a string. ast.literal_eval(x) safely evaluates the string into the Python expression it represents. This happens below.
In [ ]:
polyline = pd.Series([ast.literal_eval(x) for x in train['POLYLINE']])
Split into latitude/longitude
In [ ]:
train['LATITUDE'] = pd.Series([np.array([point[1] for point in poly],dtype=np.float32) for poly in polyline])
In [ ]:
train['LONGITUDE'] = pd.Series([np.array([point[0] for point in poly],dtype=np.float32) for poly in polyline])
In [ ]:
utils.save_array(data_path+'train/train.bc', train.as_matrix())
In [ ]:
utils.save_array(data_path+'train/meta_train.bc', meta.as_matrix())
After converting the 'csv_to_hdf5.py' functionality to pandas, I saved that array and then simply constructed the rest of the features specified in the paper using pandas. I didn't bother working out how the original author did it, as the code was extremely obtuse and tied to the fuel module.
In [ ]:
train = pd.DataFrame(utils.load_array(data_path+'train/train.bc'), columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE'])
In [ ]:
train.head()
The paper states how many distinct values each categorical variable has. The following all check out.
In [ ]:
train['ORIGIN_CALL'].max()
In [ ]:
train['ORIGIN_STAND'].max()
In [ ]:
train['TAXI_ID'].max()
Self-explanatory
In [ ]:
train['DAY_OF_WEEK'] = pd.Series([datetime.datetime.fromtimestamp(t).weekday() for t in train['TIMESTAMP']])
Quarter hour of the day, i.e. 1 of the 4*24 = 96 quarter hours of the day
In [ ]:
train['QUARTER_HOUR'] = pd.Series([int((datetime.datetime.fromtimestamp(t).hour*60 + datetime.datetime.fromtimestamp(t).minute)/15)
for t in train['TIMESTAMP']])
Self-explanatory
In [ ]:
train['WEEK_OF_YEAR'] = pd.Series([datetime.datetime.fromtimestamp(t).isocalendar()[1] for t in train['TIMESTAMP']])
The target coordinates are the last point in the sequence (the final position). If a trip has no positions, or only one, it is marked as invalid with NaN so it can be dropped later.
In [ ]:
train['TARGET'] = pd.Series([[l[1][0][-1], l[1][1][-1]] if len(l[1][0]) > 1 else np.nan for l in train[['LONGITUDE','LATITUDE']].iterrows()])
This function creates the continuous inputs, which are the concatenated k first and k last coordinates in a sequence, as discussed in the paper.
If there aren't at least 2*k coordinates excluding the target, then the k first and k last overlap. In that case the sequence (excluding the target) is padded at the end with its last coordinate. The paper mentions padding at the front and back but doesn't specify exactly how.
Invalid rows are again marked with NaN.
In [ ]:
def start_stop_inputs(k):
    result = []
    for l in train[['LONGITUDE','LATITUDE']].iterrows():
        if len(l[1][0]) < 2 or len(l[1][1]) < 2:
            result.append(np.nan)
        elif len(l[1][0][:-1]) >= 2*k:
            result.append(np.concatenate([l[1][0][0:k],l[1][0][-(k+1):-1],l[1][1][0:k],l[1][1][-(k+1):-1]]).flatten())
        else:
            # pad the sequence (excluding the target) at the end with its last coordinate
            l1 = np.lib.pad(l[1][0][:-1], (0,4*k-len(l[1][0][:-1])), mode='edge')
            l2 = np.lib.pad(l[1][1][:-1], (0,4*k-len(l[1][1][:-1])), mode='edge')
            result.append(np.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
    return pd.Series(result)
In [ ]:
train['COORD_FEATURES'] = start_stop_inputs(5)
In [ ]:
train.shape
In [ ]:
train.dropna().shape
Drop na's
In [ ]:
train = train.dropna()
In [ ]:
utils.save_array(data_path+'train/train_features.bc', train.as_matrix())
In [ ]:
train = pd.read_csv(data_path+'train/train.csv', header=0)
In [ ]:
test = pd.read_csv(data_path+'test/test.csv', header=0)
In [ ]:
def start_stop_inputs(k, data, test):
    result = []
    for l in data[['LONGITUDE','LATITUDE']].iterrows():
        if not test:
            # training data: the final coordinate is the target, so exclude it
            if len(l[1][0]) < 2 or len(l[1][1]) < 2:
                result.append(np.nan)
            elif len(l[1][0][:-1]) >= 2*k:
                result.append(np.concatenate([l[1][0][0:k],l[1][0][-(k+1):-1],l[1][1][0:k],l[1][1][-(k+1):-1]]).flatten())
            else:
                l1 = np.lib.pad(l[1][0][:-1], (0,4*k-len(l[1][0][:-1])), mode='edge')
                l2 = np.lib.pad(l[1][1][:-1], (0,4*k-len(l[1][1][:-1])), mode='edge')
                result.append(np.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
        else:
            # test data: there is no target, so the whole sequence is usable
            if len(l[1][0]) < 1 or len(l[1][1]) < 1:
                result.append(np.nan)
            elif len(l[1][0]) >= 2*k:
                result.append(np.concatenate([l[1][0][0:k],l[1][0][-k:],l[1][1][0:k],l[1][1][-k:]]).flatten())
            else:
                l1 = np.lib.pad(l[1][0], (0,4*k-len(l[1][0])), mode='edge')
                l2 = np.lib.pad(l[1][1], (0,4*k-len(l[1][1])), mode='edge')
                result.append(np.concatenate([l1[0:k],l1[-k:],l2[0:k],l2[-k:]]).flatten())
    return pd.Series(result)
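As a quick sanity check of the padding behaviour, here is a toy trip with made-up coordinates and k=2 (chosen only for brevity): in test mode its last point should be repeated so that the window still holds the first k and last k values of each coordinate.
In [ ]:
# hedged sanity check on hypothetical data: 3 GPS points, k=2, test mode
toy = pd.DataFrame({'LONGITUDE': [np.array([-8.61, -8.62, -8.63], dtype=np.float32)],
                    'LATITUDE':  [np.array([41.15, 41.16, 41.17], dtype=np.float32)]})
# expect [first 2 longs, last 2 longs, first 2 lats, last 2 lats],
# with the tail padded by repeating -8.63 / 41.17
start_stop_inputs(2, toy, test=True)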
The normalization statistics below were pre-calculated on the training set.
In [ ]:
lat_mean = 41.15731
lat_std = 0.074120656
long_mean = -8.6161413
long_std = 0.057200309
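For reference, these statistics could be recomputed roughly as follows. This is only a sketch and assumes the un-normalized per-trip LATITUDE/LONGITUDE arrays built earlier are available in `train`:
In [ ]:
# hedged sketch: pool every GPS point in the training set and take mean/std
# (assumes LATITUDE/LONGITUDE have not been normalized yet)
all_lat = np.concatenate(train['LATITUDE'].values)
all_long = np.concatenate(train['LONGITUDE'].values)
print(all_lat.mean(), all_lat.std(), all_long.mean(), all_long.std())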
In [ ]:
def feature_ext(data, test=False):
data['ORIGIN_CALL'] = pd.Series(pd.factorize(data['ORIGIN_CALL'])[0]) + 1
data['ORIGIN_STAND']=pd.Series([0 if pd.isnull(x) or x=='' else int(x) for x in data["ORIGIN_STAND"]])
data['TAXI_ID'] = pd.Series(pd.factorize(data['TAXI_ID'])[0]) + 1
data['DAY_TYPE'] = pd.Series([ord(x[0]) - ord('A') for x in data['DAY_TYPE']])
polyline = pd.Series([ast.literal_eval(x) for x in data['POLYLINE']])
data['LATITUDE'] = pd.Series([np.array([point[1] for point in poly],dtype=np.float32) for poly in polyline])
data['LONGITUDE'] = pd.Series([np.array([point[0] for point in poly],dtype=np.float32) for poly in polyline])
if not test:
data['TARGET'] = pd.Series([[l[1][0][-1], l[1][1][-1]] if len(l[1][0]) > 1 else np.nan for l in data[['LONGITUDE','LATITUDE']].iterrows()])
data['LATITUDE'] = pd.Series([(t-lat_mean)/lat_std for t in data['LATITUDE']])
data['LONGITUDE'] = pd.Series([(t-long_mean)/long_std for t in data['LONGITUDE']])
data['COORD_FEATURES'] = start_stop_inputs(5, data, test)
data['DAY_OF_WEEK'] = pd.Series([datetime.datetime.fromtimestamp(t).weekday() for t in data['TIMESTAMP']])
data['QUARTER_HOUR'] = pd.Series([int((datetime.datetime.fromtimestamp(t).hour*60 + datetime.datetime.fromtimestamp(t).minute)/15)
for t in data['TIMESTAMP']])
data['WEEK_OF_YEAR'] = pd.Series([datetime.datetime.fromtimestamp(t).isocalendar()[1] for t in data['TIMESTAMP']])
data = data.dropna()
return data
In [ ]:
train = feature_ext(train)
In [ ]:
test = feature_ext(test, test=True)
In [ ]:
test.head()
In [ ]:
utils.save_array(data_path+'train/train_features.bc', train.as_matrix())
In [ ]:
utils.save_array(data_path+'test/test_features.bc', test.as_matrix())
In [ ]:
train.head()
Mean shift clustering as performed in the paper.
In [ ]:
train = pd.DataFrame(utils.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'DAY_OF_WEEK',
'QUARTER_HOUR', "WEEK_OF_YEAR", "TARGET", "COORD_FEATURES"])
Clustering performed on the targets
In [ ]:
y_targ = np.vstack(train["TARGET"].as_matrix())
In [ ]:
from sklearn.cluster import MeanShift, estimate_bandwidth
The commented-out code can be used to estimate the bandwidth, which makes the clustering converge much more quickly.
This isn't mentioned in the paper but is included in the authors' code. To get results similar to the paper's, they manually chose the bandwidth left uncommented below.
In [ ]:
#bw = estimate_bandwidth(y_targ, quantile=.1, n_samples=1000)
bw = 0.001
This takes some time
In [ ]:
ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
ms.fit(y_targ)
In [ ]:
cluster_centers = ms.cluster_centers_
This is very close to the number of clusters mentioned in the paper
In [ ]:
cluster_centers.shape
In [ ]:
utils.save_array(data_path+"cluster_centers_bw_001.bc", cluster_centers)
In [ ]:
train = pd.DataFrame(utils.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])
In [ ]:
cluster_centers = utils.load_array(data_path+"cluster_centers_bw_001.bc")
In [ ]:
long = np.array([c[0] for c in cluster_centers])
lat = np.array([c[1] for c in cluster_centers])
In [ ]:
X_train, X_val = train_test_split(train, test_size=0.2, random_state=42)
In [ ]:
def get_features(data):
return [np.vstack(data['COORD_FEATURES'].as_matrix()), np.vstack(data['ORIGIN_CALL'].as_matrix()),
np.vstack(data['TAXI_ID'].as_matrix()), np.vstack(data['ORIGIN_STAND'].as_matrix()),
np.vstack(data['QUARTER_HOUR'].as_matrix()), np.vstack(data['DAY_OF_WEEK'].as_matrix()),
np.vstack(data['WEEK_OF_YEAR'].as_matrix()), np.array([long for i in range(0,data.shape[0])]),
np.array([lat for i in range(0,data.shape[0])])]
In [ ]:
def get_target(data):
return np.vstack(data["TARGET"].as_matrix())
In [ ]:
X_train_features = get_features(X_train)
In [ ]:
X_train_target = get_target(X_train)
In [ ]:
utils.save_array(data_path+'train/X_train_features.bc', get_features(X_train))
Load training data and cluster centers
In [ ]:
train = pd.DataFrame(utils.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])
Validation cuts: following the original repo, any trip still in progress at one of these five timestamps is assigned to the validation set.
In [ ]:
cuts = [
1376503200, # 2013-08-14 18:00
1380616200, # 2013-10-01 08:30
1381167900, # 2013-10-07 17:45
1383364800, # 2013-11-02 04:00
1387722600 # 2013-12-22 14:30
]
In [ ]:
print(datetime.datetime.fromtimestamp(1376503200))
In [ ]:
train.shape
In [ ]:
val_indices = []
for index, row in train.iterrows():
    time = row['TIMESTAMP']
    latitude = row['LATITUDE']
    # GPS points are sampled every 15 seconds, so a trip starting at `time`
    # is still in progress at a cut if it starts before it and ends after it
    for ts in cuts:
        if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
            val_indices.append(index)
            break
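A quick look at how many trips end up in the time-based validation set:
In [ ]:
len(val_indices)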
In [ ]:
X_valid = train.iloc[val_indices]
In [ ]:
X_valid.head()
In [ ]:
for d in X_valid['TIMESTAMP']:
print(datetime.datetime.fromtimestamp(d))
In [ ]:
X_train = train.drop(train.index[val_indices])
In [ ]:
cluster_centers = utils.load_array(data_path+"cluster_centers_bw_001.bc")
In [ ]:
long = np.array([c[0] for c in cluster_centers])
lat = np.array([c[1] for c in cluster_centers])
In [ ]:
utils.save_array(data_path+'train/X_train.bc', X_train.as_matrix())
In [ ]:
utils.save_array(data_path+'valid/X_val.bc', X_valid.as_matrix())
In [ ]:
X_train = pd.DataFrame(utils.load_array(data_path+'train/X_train.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])
In [ ]:
X_val = pd.DataFrame(utils.load_array(data_path+'valid/X_val.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])
The equirectangular loss function mentioned in the paper.
Note: it is very important that y[:,0] is longitude and y[:,1] is latitude.
The Earth-radius constant R (6371 km) only scales the distance and does not affect the minimization; it is included so the loss reads in kilometres (the paper does not give units).
In [ ]:
def equirectangular_loss(y_true, y_pred):
deg2rad = 3.141592653589793 / 180
long_1 = y_true[:,0]*deg2rad
long_2 = y_pred[:,0]*deg2rad
lat_1 = y_true[:,1]*deg2rad
lat_2 = y_pred[:,1]*deg2rad
return 6371*K.sqrt(K.square((long_1 - long_2)*K.cos((lat_1 + lat_2)/2.))
+K.square(lat_1 - lat_2))
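As a rough numeric check (plain NumPy, separate from the Keras loss, using made-up points near Porto), the equirectangular formula gives a sensible distance for nearby points:
In [ ]:
# hedged sanity check on hypothetical points; columns are [longitude, latitude]
a = np.array([[-8.61, 41.15]])
b = np.array([[-8.62, 41.16]])
deg2rad = np.pi / 180
dlon = (a[:, 0] - b[:, 0]) * deg2rad
dlat = (a[:, 1] - b[:, 1]) * deg2rad
mean_lat = (a[:, 1] + b[:, 1]) / 2 * deg2rad
print(6371 * np.sqrt((dlon * np.cos(mean_lat))**2 + dlat**2))  # roughly 1.4 km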
In [ ]:
def embedding_input(name, n_in, n_out, reg):
inp = Input(shape=(1,), dtype='int64', name=name)
return inp, Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg))(inp)
The following returns a fully-connected model as described in the paper. It takes as input k, as defined before, and the cluster centers.
Inputs: embeddings for each category, concatenated with the 4*k continuous variables representing the first/last k coordinates described above.
The embeddings have no regularization, as none was mentioned in the paper, though they are easily equipped to include it.
The paper mentions global normalization but doesn't specify exactly how it was done (sequentially or otherwise); I just included a batch-norm layer for the continuous inputs.
After concatenation, there is one hidden layer of 500 neurons, as called for in the paper.
Finally, the output layer has as many outputs as there are cluster centers, with a softmax activation. Call this output P.
The prediction is the weighted sum of each cluster center c_i with its corresponding predicted probability P_i.
To facilitate this, the output is dotted with the cluster latitudes and longitudes separately (this happens at variable y), then concatenated into a single tensor.
NOTE: you will see that I pass the cluster-center coordinates in as inputs. Ideally this function would store the cluster longitudes/latitudes as constants inside the model, but I could not figure out how to do that, so I pass them in as a repeated input instead (one possible alternative is sketched after the model definition below).
In [ ]:
def taxi_mlp(k, cluster_centers):
shp = cluster_centers.shape[0]
nums = Input(shape=(4*k,))
center_longs = Input(shape=(shp,))
center_lats = Input(shape=(shp,))
emb_names = ['client_ID', 'taxi_ID', "stand_ID", "quarter_hour", "day_of_week", "week_of_year"]
emb_ins = [57106, 448, 64, 96, 7, 52]
emb_outs = [10 for i in range(0,6)]
regs = [0 for i in range(0,6)]
embs = [embedding_input(e[0], e[1]+1, e[2], e[3]) for e in zip(emb_names, emb_ins, emb_outs, regs)]
x = merge([nums] + [Flatten()(e[1]) for e in embs], mode='concat')
x = Dense(500, activation='relu')(x)
x = Dense(shp, activation='softmax')(x)
y = merge([merge([x, center_longs], mode='dot'), merge([x, center_lats], mode='dot')], mode='concat')
return Model(input = [nums]+[e[0] for e in embs] + [center_longs, center_lats], output = y)
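One possible way around passing the cluster centers as repeated inputs would be to bake them into the graph as a backend constant inside a Lambda layer. This is only a sketch of that idea (it assumes the same Keras 1.x API used elsewhere in this notebook and is not what the model above does):
In [ ]:
from keras.layers import Lambda

def cluster_output(cluster_centers):
    # hedged sketch: hold the (n_clusters, 2) centers as a backend variable so the
    # softmax output p of shape (batch, n_clusters) can be dotted with them directly,
    # giving a (batch, 2) prediction of (longitude, latitude)
    centers = K.variable(cluster_centers.astype(np.float32))
    return Lambda(lambda p: K.dot(p, centers), output_shape=(2,))

# usage inside taxi_mlp would then be: y = cluster_output(cluster_centers)(x)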
Since the model above takes the cluster coordinates as inputs, the repeated cluster longitudes/latitudes have to be constructed as mentioned.
Below is an iterator over the in-memory training DataFrame. I used this rather than a bcolz iterator because of the pre-processing involved.
In [ ]:
def data_iter(data, batch_size, cluster_centers):
    long = [c[0] for c in cluster_centers]
    lat = [c[1] for c in cluster_centers]
    i = 0
    N = data.shape[0]
    while True:
        # wrap around once the end of the dataframe is reached
        if i >= N:
            i = 0
        yield ([np.vstack(data['COORD_FEATURES'][i:i+batch_size].as_matrix()), np.vstack(data['ORIGIN_CALL'][i:i+batch_size].as_matrix()),
                np.vstack(data['TAXI_ID'][i:i+batch_size].as_matrix()), np.vstack(data['ORIGIN_STAND'][i:i+batch_size].as_matrix()),
                np.vstack(data['QUARTER_HOUR'][i:i+batch_size].as_matrix()), np.vstack(data['DAY_OF_WEEK'][i:i+batch_size].as_matrix()),
                np.vstack(data['WEEK_OF_YEAR'][i:i+batch_size].as_matrix()), np.array([long for _ in range(0, batch_size)]),
                np.array([lat for _ in range(0, batch_size)])], np.vstack(data["TARGET"][i:i+batch_size].as_matrix()))
        i += batch_size
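If one wanted to train from this generator rather than from the in-memory arrays used below, it would plug into Keras 1's fit_generator roughly as follows. This is only a sketch: it assumes `model` has already been built and compiled, which happens a few cells further down.
In [ ]:
# hedged sketch: run only after the model below has been compiled;
# batch size and epoch count are illustrative
model.fit_generator(data_iter(X_train, 256, cluster_centers),
                    samples_per_epoch=X_train.shape[0], nb_epoch=1)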
Of course, k in the model needs to match the k used for feature construction. We again use 5, as in the paper.
In [ ]:
model = taxi_mlp(5, cluster_centers)
The paper used an SGD optimizer with the following parameters.
In [ ]:
model.compile(optimizer=SGD(0.01, momentum=0.9), loss=equirectangular_loss, metrics=['mse'])
In [ ]:
X_train_feat = get_features(X_train)
In [ ]:
X_train_target = get_target(X_train)
In [ ]:
X_val_feat = get_features(X_valid)
In [ ]:
X_val_target = get_target(X_valid)
In [ ]:
tqdm = TQDMNotebookCallback()
In [ ]:
checkpoint = ModelCheckpoint(filepath=data_path+'models/tmp/weights.{epoch:03d}.{val_loss:.8f}.hdf5', save_best_only=True)
In [ ]:
batch_size=256
In [ ]:
model.fit(X_train_feat, X_train_target, nb_epoch=1, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)
In [ ]:
model.fit(X_train_feat, X_train_target, nb_epoch=30, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)
In [ ]:
model = load_model(data_path+'models/weights.0.0799.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})
In [ ]:
model.fit(X_train_feat, X_train_target, nb_epoch=100, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)
In [ ]:
model.save(data_path+'models/current_model.hdf5')
In [ ]:
model.fit(X_train_feat, X_train_target, nb_epoch=1, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)
In [ ]:
model.fit(X_train_feat, X_train_target, nb_epoch=400, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)
In [ ]:
model.save(data_path+'/models/current_model.hdf5')
In [ ]:
len(X_val_feat[0])
It works, but it seems to converge unrealistically quickly, and the loss values don't match the paper's. The paper doesn't say exactly what it reports as "error" in its results; I assume the same equirectangular distance, but it isn't very clear. The difference in values could also come down to how the distance is scaled (the Earth-radius factor).
In [ ]:
best_model = load_model(data_path+'models/weights.308.0.03373993.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})
In [ ]:
best_model.evaluate(X_val_feat, X_val_target)
In [ ]:
test = pd.DataFrame(utils.load_array(data_path+'test/test_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE',
'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])
In [ ]:
test['ORIGIN_CALL'] = pd.read_csv(data_path+'real_origin_call.csv', header=None)
In [ ]:
test['TAXI_ID'] = pd.read_csv(data_path+'real_taxi_id.csv',header=None)
In [ ]:
X_test = get_features(test)
In [ ]:
b = np.sort(X_test[1],axis=None)
In [ ]:
test_preds = np.round(best_model.predict(X_test), decimals=6)
In [ ]:
d = {0:test['TRIP_ID'], 1:test_preds[:,1], 2:test_preds[:,0]}
kaggle_out = pd.DataFrame(data=d)
In [ ]:
kaggle_out.to_csv(data_path+'submission.csv', header=['TRIP_ID','LATITUDE', 'LONGITUDE'], index=False)
In [ ]:
def hdist(a, b):
    # haversine (great-circle) distance in km between arrays of [longitude, latitude] points
    deg2rad = 3.141592653589793 / 180
    lat1 = a[:, 1] * deg2rad
    lon1 = a[:, 0] * deg2rad
    lat2 = b[:, 1] * deg2rad
    lon2 = b[:, 0] * deg2rad
    dlat = abs(lat1-lat2)
    dlon = abs(lon1-lon2)
    al = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * (np.sin(dlon/2)**2)
    d = np.arctan2(np.sqrt(al), np.sqrt(1-al))
    hd = 2 * 6371 * d
    return hd
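A quick check on the same made-up pair used for the equirectangular loss above: for points this close, the haversine distance should come out essentially the same, around 1.4 km.
In [ ]:
# hedged check: haversine vs. the equirectangular approximation for nearby points
a = np.array([[-8.61, 41.15]])
b = np.array([[-8.62, 41.16]])
print(hdist(a, b))  # roughly 1.4 km, matching the equirectangular value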
In [ ]:
val_preds = best_model.predict(X_val_feat)
In [ ]:
trn_preds = model.predict(X_train_feat)
In [ ]:
er = hdist(val_preds, X_val_target)
In [ ]:
er.mean()
To-do: this would be simple to extend to the validation data.
In [ ]:
cuts = [
1376503200, # 2013-08-14 18:00
1380616200, # 2013-10-01 08:30
1381167900, # 2013-10-07 17:45
1383364800, # 2013-11-02 04:00
1387722600 # 2013-12-22 14:30
]
In [ ]:
np.any([train['TIMESTAMP'].map(lambda x: x in cuts)])
In [ ]:
train['TIMESTAMP']
In [ ]:
np.any(train['TIMESTAMP']==1381167900)
In [ ]:
times = train['TIMESTAMP'].as_matrix()
In [ ]:
X_train.columns
In [ ]:
times
In [ ]:
count = 0
for index, row in X_val.iterrows():
for ts in cuts:
time = row['TIMESTAMP']
latitude = row['LATITUDE']
if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
count += 1
In [ ]:
one = count
In [ ]:
count + one
In [ ]:
import h5py
In [ ]:
h = h5py.File(data_path+'original/data.hdf5', 'r')
In [ ]:
c = np.load(data_path+'original/arrival-clusters.pkl')
In [ ]:
from fuel.utils import find_in_data_path
from fuel.datasets import H5PYDataset
In [ ]:
original_path = '/data/bckenstler/data/taxi/original/'
In [ ]:
train_set = H5PYDataset(original_path+'data.hdf5', which_sets=('train',),load_in_memory=True)
In [ ]:
valid_set = H5PYDataset(original_path+'valid.hdf5', which_sets=('cuts/test_times_0',),load_in_memory=True)
In [ ]:
print(train_set.num_examples)
In [ ]:
print(valid_set.num_examples)
In [ ]:
data = train_set.data_sources
In [ ]:
data[0]
In [ ]:
valid_data = valid_set.data_sources
In [ ]:
valid_data[4][0]
In [ ]:
stamps = valid_data[-3]
In [ ]:
stamps[0]
In [ ]:
for i in range(0,304):
print(np.any([t==int(stamps[i]) for t in X_val['TIMESTAMP']]))
In [ ]:
type(X_train['TIMESTAMP'][0])
In [ ]:
type(stamps[0])
In [ ]:
check = [s in stamps for s in X_val['TIMESTAMP']]
In [ ]:
for s in X_val['TIMESTAMP']:
print(datetime.datetime.fromtimestamp(s))
In [ ]:
for s in stamps:
print(datetime.datetime.fromtimestamp(s))
In [ ]:
ids = valid_data[-1]
In [ ]:
type(ids[0])
In [ ]:
ids
In [ ]:
X_val