In [ ]:
import ast
import pandas as pd
import datetime
from keras.layers import Input, Dense, Embedding, merge, Flatten, Merge, BatchNormalization, Lambda
from keras.models import Model, load_model
from keras.regularizers import l2
import keras.backend as K
from keras.optimizers import SGD
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
import utils
import data
from sklearn.model_selection import train_test_split
from bcolz_array_iterator import BcolzArrayIterator
import bcolz
from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import ModelCheckpoint
import pickle
Data Path
In [ ]:
data_path = "/data/datasets/taxi/"
Need to check the process and see if the resulting data is the same.
This is the author's feature extraction process, modified so that it returns a tuple instead of saving an hdf5 file.
In [ ]:
import ast
import csv
import os
import sys
import h5py  # h5py/fuel imports are leftovers from the original hdf5-writing version
import numpy
from fuel.converters.base import fill_hdf5_file
import data

taxi_id_dict = {}
origin_call_dict = {0: 0}

def get_unique_taxi_id(val):
    if val in taxi_id_dict:
        return taxi_id_dict[val]
    else:
        taxi_id_dict[val] = len(taxi_id_dict)
        return len(taxi_id_dict) - 1

def get_unique_origin_call(val):
    if val in origin_call_dict:
        return origin_call_dict[val]
    else:
        origin_call_dict[val] = len(origin_call_dict)
        return len(origin_call_dict) - 1

def read_stands(input_directory, h5file):
    # h5file is kept from the original hdf5-writing signature; unused here
    stands_name = numpy.empty(shape=(data.stands_size,), dtype=('a', 24))
    stands_latitude = numpy.empty(shape=(data.stands_size,), dtype=numpy.float32)
    stands_longitude = numpy.empty(shape=(data.stands_size,), dtype=numpy.float32)
    stands_name[0] = 'None'
    stands_latitude[0] = stands_longitude[0] = 0
    with open(os.path.join(input_directory, 'metaData_taxistandsID_name_GPSlocation.csv'), 'r') as f:
        reader = csv.reader(f)
        next(reader)  # header
        for line in reader:
            id = int(line[0])
            stands_name[id] = line[1].encode('utf-8')
            stands_latitude[id] = float(line[2])
            stands_longitude[id] = float(line[3])
    return (('stands', 'stands_name', stands_name),
            ('stands', 'stands_latitude', stands_latitude),
            ('stands', 'stands_longitude', stands_longitude))

def read_taxis(input_directory, h5file, dataset):
    size = getattr(data, '%s_size' % dataset)
    trip_id = numpy.empty(shape=(size,), dtype='S19')
    call_type = numpy.empty(shape=(size,), dtype=numpy.int8)
    origin_call = numpy.empty(shape=(size,), dtype=numpy.int32)
    origin_stand = numpy.empty(shape=(size,), dtype=numpy.int8)
    taxi_id = numpy.empty(shape=(size,), dtype=numpy.int16)
    timestamp = numpy.empty(shape=(size,), dtype=numpy.int32)
    day_type = numpy.empty(shape=(size,), dtype=numpy.int8)
    missing_data = numpy.empty(shape=(size,), dtype=numpy.bool)
    latitude = numpy.empty(shape=(size,), dtype=data.Polyline)
    longitude = numpy.empty(shape=(size,), dtype=data.Polyline)
    with open(os.path.join(input_directory, '%s.csv' % dataset), 'r') as f:
        reader = csv.reader(f)
        next(reader)  # header
        id = 0
        for line in reader:
            trip_id[id] = line[0].encode('utf-8')
            call_type[id] = ord(line[1][0]) - ord('A')
            origin_call[id] = 0 if line[2] == 'NA' or line[2] == '' else get_unique_origin_call(int(line[2]))
            origin_stand[id] = 0 if line[3] == 'NA' or line[3] == '' else int(line[3])
            taxi_id[id] = get_unique_taxi_id(int(line[4]))
            timestamp[id] = int(line[5])
            day_type[id] = ord(line[6][0]) - ord('A')
            missing_data[id] = line[7][0] == 'T'
            polyline = ast.literal_eval(line[8])
            latitude[id] = numpy.array([point[1] for point in polyline], dtype=numpy.float32)
            longitude[id] = numpy.array([point[0] for point in polyline], dtype=numpy.float32)
            id += 1
    splits = ()
    for name in ['trip_id', 'call_type', 'origin_call', 'origin_stand', 'taxi_id',
                 'timestamp', 'day_type', 'missing_data', 'latitude', 'longitude']:
        splits += ((dataset, name, locals()[name]),)
    return splits

def unique(h5file):
    unique_taxi_id = numpy.empty(shape=(data.taxi_id_size,), dtype=numpy.int32)
    assert len(taxi_id_dict) == data.taxi_id_size
    for k, v in taxi_id_dict.items():
        unique_taxi_id[v] = k
    unique_origin_call = numpy.empty(shape=(data.origin_call_size,), dtype=numpy.int32)
    assert len(origin_call_dict) == data.origin_call_size
    for k, v in origin_call_dict.items():
        unique_origin_call[v] = k
    return (('unique_taxi_id', 'unique_taxi_id', unique_taxi_id),
            ('unique_origin_call', 'unique_origin_call', unique_origin_call))

def get_data(input_directory):
    split = ()
    split += read_stands(input_directory, None)
    split += read_taxis(input_directory, None, 'train')
    split += read_taxis(input_directory, None, 'test')
    split += unique(None)
    return split
Manually go through the data collection steps
In [ ]:
taxi_id_dict = {}
origin_call_dict = {0: 0}
In [ ]:
split = ()
In [ ]:
split += read_stands(data_path+'data', None)
In [ ]:
split += read_taxis(data_path+'data', None, 'train')
In [ ]:
split += read_taxis(data_path+'data', None, 'test')
In [ ]:
split += unique(None)
Data structure: a tuple of tuples. Each sub-tuple is ('dataset', 'column', data).
It also contains the stands metadata.
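For convenience, a small helper (not in the original notebook) to look up a column by (dataset, column) name in this structure:
In [ ]:
def get_column(split, dataset, column):
    # linear scan over the ('dataset', 'column', data) sub-tuples
    for ds, col, arr in split:
        if ds == dataset and col == column:
            return arr
    raise KeyError((dataset, column))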
Save tuple
In [ ]:
with open(data_path+'data/data_tuple.pickle', 'wb') as f:
    pickle.dump(split, f)
Load tuple
In [ ]:
with open(data_path+'data/data_tuple.pickle', 'rb') as f:
    split = pickle.load(f)
Time cuts used to carve out the validation set: a trip is kept if it was in progress at one of these timestamps
In [ ]:
cuts = [
    1376503200,  # 2013-08-14 18:00
    1380616200,  # 2013-10-01 08:30
    1381167900,  # 2013-10-07 17:45
    1383364800,  # 2013-11-02 04:00
    1387722600   # 2013-12-22 14:30
]
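A quick sanity check (assuming the timestamps are UTC):
In [ ]:
datetime.datetime.utcfromtimestamp(cuts[0])  # -> datetime.datetime(2013, 8, 14, 18, 0)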
In [ ]:
split[12][2][0]
In [ ]:
split
In [ ]:
split[7][1]
In [ ]:
def make_valid(split):
    valid = tuple([] for _ in range(13))
    for i in range(len(split[5][2])):
        trip_id = split[3][2][i]
        call_type = split[4][2][i]
        origin_call = split[5][2][i]
        origin_stand = split[6][2][i]
        taxi_id = split[7][2][i]
        time = split[8][2][i]
        day_type = split[9][2][i]
        missing_data = split[10][2][i]
        latitude = split[11][2][i]
        longitude = split[12][2][i]
        if len(latitude) == 0:
            continue
        for ts in cuts:
            # keep the trip if it was in progress at the cut: it started at or
            # before ts and (at 15s per GPS point) ended at or after ts
            if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
                valid[0].append(trip_id)
                valid[1].append(call_type)
                valid[2].append(origin_call)
                valid[3].append(origin_stand)
                valid[4].append(taxi_id)
                valid[5].append(time)
                valid[6].append(day_type)
                valid[7].append(missing_data)
                n = (ts - time) // 15 + 1  # number of points observed by the cut
                valid[8].append(latitude[:n])
                valid[9].append(longitude[:n])
                valid[10].append(latitude[-1])
                valid[11].append(longitude[-1])
                valid[12].append(15 * (len(latitude) - 1))
                break
    return (
        ('valid', 'trip_id', np.array(valid[0])),
        ('valid', 'call_type', np.array(valid[1])),
        ('valid', 'origin_call', np.array(valid[2])),
        ('valid', 'origin_stand', np.array(valid[3])),
        ('valid', 'taxi_id', np.array(valid[4])),
        ('valid', 'timestamp', np.array(valid[5])),
        ('valid', 'day_type', np.array(valid[6])),
        ('valid', 'missing_data', np.array(valid[7])),
        ('valid', 'latitude', np.array(valid[8])),
        ('valid', 'longitude', np.array(valid[9])),
        ('valid', 'destination_latitude', np.array(valid[10])),
        ('valid', 'destination_longitude', np.array(valid[11])),
        ('valid', 'travel_time', np.array(valid[12]))
    )
In [ ]:
valid_split = make_valid(split)
In [ ]:
valid_split
In [ ]:
stands = pd.read_csv(data_path+'data/metaData_taxistandsID_name_GPSlocation.csv', header=0)
In [ ]:
stands.head()
Compare columns
In [ ]:
stands['Descricao']
In [ ]:
len(stands['Descricao'].as_matrix())
In [ ]:
split[2][2]
In [ ]:
len(split[0][2])
The author's data has a "None" row with zeros. Add it to the DataFrame.
In [ ]:
stands = pd.DataFrame([['None',0.,0.]], columns=['Descricao', 'Latitude', 'Longitude']).append(stands)
In [ ]:
np.allclose(stands['Latitude'],split[1][2])
In [ ]:
np.allclose(stands['Longitude'],split[2][2])
Longs/Lats check out
In [ ]:
data = pd.read_csv(data_path+'data/train.csv', header=0)  # note: this shadows the imported `data` module
In [ ]:
data.columns
Check data same size
In [ ]:
data.shape
In [ ]:
len(split[4][2])
In [ ]:
split[4][1]
In [ ]:
call_type_f = lambda x:ord(x) - ord('A')
In [ ]:
np.allclose(data['CALL_TYPE'].apply(call_type_f),split[4][2])
In [ ]:
data['CALL_TYPE'] = data['CALL_TYPE'].apply(call_type_f)
Turn origin call into a categorical variable.
We can do this with pd.factorize. We want the nulls to be zero; factorize maps them to -1, so add 1.
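A tiny illustration of the trick (values here are made up):
In [ ]:
pd.factorize(pd.Series([np.nan, 7, 7, 12]))[0] + 1  # -> array([0, 1, 1, 2]): NaN becomes 0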
In [ ]:
np.allclose(pd.Series(pd.factorize(data['ORIGIN_CALL'])[0])+1,split[5][2])
And it is the same
In [ ]:
data['ORIGIN_CALL'] = pd.Series(pd.factorize(data['ORIGIN_CALL'])[0]) + 1
In [ ]:
num_origin_call = len(data['ORIGIN_CALL'].unique())
In [ ]:
origin_stand_f = lambda x: 0 if pd.isnull(x) or x=='' else int(x)
In [ ]:
split[6]
In [ ]:
np.allclose(data["ORIGIN_STAND"].apply(origin_stand_f),split[6][2])
In [ ]:
data["ORIGIN_STAND"] = data["ORIGIN_STAND"].apply(origin_stand_f)
In [ ]:
num_origin_stand = len(data['ORIGIN_STAND'].unique())
In [ ]:
num_origin_stand
In [ ]:
split[7][2]
In [ ]:
pd.factorize(data['TAXI_ID'])[0]
In [ ]:
np.allclose(pd.Series(pd.factorize(data['TAXI_ID'])[0]),split[7][2])
In [ ]:
data['TAXI_ID'] = pd.Series(pd.factorize(data['TAXI_ID'])[0])
In [ ]:
num_taxi_id = len(data['TAXI_ID'].unique())
In [ ]:
split[9]
In [ ]:
day_type_f = lambda x: ord(x[0]) - ord('A')
In [ ]:
np.allclose(data['DAY_TYPE'].apply(day_type_f),split[9][2])
In [ ]:
data['DAY_TYPE'] = data['DAY_TYPE'].apply(day_type_f)
In [ ]:
polyline_f = lambda x: ast.literal_eval(x)
In [ ]:
polyline = data['POLYLINE'].apply(polyline_f)
In [ ]:
polyline.to_pickle(data_path+'data/polylines.pkl')
In [ ]:
polyline = pd.read_pickle(data_path+'data/polylines.pkl')
In [ ]:
len(polyline)
In [ ]:
polyline
In [ ]:
lats = pd.Series([np.array([point[1] for point in poly],dtype=np.float32) for poly in polyline])
In [ ]:
split[11][2]
In [ ]:
np.alltrue([np.allclose(lats[i],split[11][2][i]) for i in range(len(lats))])
Latitudes check out
In [ ]:
longs = pd.Series([np.array([point[0] for point in poly],dtype=np.float32) for poly in polyline])
In [ ]:
split[12]
In [ ]:
np.alltrue([np.allclose(longs[i],split[12][2][i]) for i in range(len(longs))])
Longitudes check out
In [ ]:
data['LATITUDE'] = lats
In [ ]:
data['LONGITUDE'] = longs
In [ ]:
data
Save dicts
In [ ]:
np.save(data_path+'data/origin_call_dict.npy', origin_call_dict)
In [ ]:
np.save(data_path+'data/taxi_id_dict.npy', taxi_id_dict)
In [ ]:
test_data = pd.read_csv(data_path+'data/test.csv', header=0)
In [ ]:
test_data.columns
Check data same size
In [ ]:
test_data.shape
In [ ]:
len(split[13][2])
In [ ]:
call_type_f = lambda x:ord(x) - ord('A')
In [ ]:
np.allclose(test_data['CALL_TYPE'].apply(call_type_f),split[14][2])
In [ ]:
test_data['CALL_TYPE'] = test_data['CALL_TYPE'].apply(call_type_f)
Turn origin call into a categorical variable.
We can do this with pd.factorize. We want the nulls to be zero; factorize maps them to -1, so add 1.
In [ ]:
np.unique(split[15][2])
Hold up! We need to use our previous mapping.
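Why re-factorizing the test set would be wrong: factorize assigns codes by order of first appearance, so the same raw ID can get a different code than it got in the training set.
In [ ]:
pd.factorize(['b', 'a'])[0], pd.factorize(['a', 'b'])[0]  # both -> [0, 1]: 'a' gets code 1 in one and 0 in the other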
In [ ]:
import numpy as np
In [ ]:
taxi_id_dict = np.load(data_path+'data/taxi_id_dict.npy').item()
origin_call_dict = np.load(data_path+'data/origin_call_dict.npy').item()
In [ ]:
len(origin_call_dict)
In [ ]:
test_origin_call_f = lambda x: 0 if (np.isnan(x) or x=='' or x >= num_origin_call) else origin_call_dict[x]
In [ ]:
np.allclose(test_data['ORIGIN_CALL'].apply(test_origin_call_f),split[15][2])
In [ ]:
test_data['ORIGIN_CALL'] = test_data['ORIGIN_CALL'].apply(test_origin_call_f)
In [ ]:
origin_stand_f = lambda x: 0 if pd.isnull(x) or x=='' else int(x)
In [ ]:
split[16]
In [ ]:
np.allclose(test_data["ORIGIN_STAND"].apply(origin_stand_f),split[16][2])
In [ ]:
test_data["ORIGIN_STAND"] = test_data["ORIGIN_STAND"].apply(origin_stand_f)
In [ ]:
split[17][2]
In [ ]:
test_taxi_id_f = lambda x: taxi_id_dict[x]
In [ ]:
np.allclose(test_data['TAXI_ID'].apply(test_taxi_id_f),split[17][2])
In [ ]:
test_data['TAXI_ID'] = test_data['TAXI_ID'].apply(test_taxi_id_f)
In [ ]:
test_data['ORIGIN_CALL'].unique()
In [ ]:
test_data['TAXI_ID'].as_matrix()
In [ ]:
split[19]
In [ ]:
day_type_f = lambda x: ord(x[0]) - ord('A')
In [ ]:
np.allclose(test_data['DAY_TYPE'].apply(day_type_f),split[19][2])
In [ ]:
test_data['DAY_TYPE'] = test_data['DAY_TYPE'].apply(day_type_f)
In [ ]:
polyline_f = lambda x: ast.literal_eval(x)
In [ ]:
test_polyline = test_data['POLYLINE'].apply(polyline_f)
In [ ]:
test_polyline.to_pickle(data_path+'data/test_polylines.pkl')
In [ ]:
test_polyline
In [ ]:
lats = pd.Series([np.array([point[1] for point in poly],dtype=np.float32) for poly in test_polyline])
In [ ]:
split[21][2]
In [ ]:
np.alltrue([np.allclose(lats[i],split[21][2][i]) for i in range(len(lats))])
Latitudes check out
In [ ]:
longs = pd.Series([np.array([point[0] for point in poly],dtype=np.float32) for poly in test_polyline])
In [ ]:
split[22]
In [ ]:
np.alltrue([np.allclose(longs[i],split[22][2][i]) for i in range(len(longs))])
Longitudes check out
In [ ]:
test_data['LATITUDE'] = lats
In [ ]:
test_data['LONGITUDE'] = longs
In [ ]:
data.head()
In [ ]:
def make_valid_pandas(data):
    valid = tuple([] for _ in range(13))
    for row in data.itertuples():
        # positional indices follow the column order of `data`; row[0] is the
        # index and row[9] is the raw POLYLINE column, which is skipped
        trip_id = row[1]
        call_type = row[2]
        origin_call = row[3]
        origin_stand = row[4]
        taxi_id = row[5]
        time = row[6]
        day_type = row[7]
        missing_data = row[8]
        latitude = row[10]
        longitude = row[11]
        if len(latitude) == 0:
            continue
        for ts in cuts:
            # keep the trip if it was in progress at the cut
            if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
                valid[0].append(trip_id)
                valid[1].append(call_type)
                valid[2].append(origin_call)
                valid[3].append(origin_stand)
                valid[4].append(taxi_id)
                valid[5].append(time)
                valid[6].append(day_type)
                valid[7].append(missing_data)
                n = (ts - time) // 15 + 1
                valid[8].append(latitude[:n])
                valid[9].append(longitude[:n])
                valid[10].append(latitude[-1])
                valid[11].append(longitude[-1])
                valid[12].append(15 * (len(latitude) - 1))
                break
    return pd.DataFrame({
        'TRIP_ID': valid[0],
        'CALL_TYPE': valid[1],
        'ORIGIN_CALL': valid[2],
        'ORIGIN_STAND': valid[3],
        'TAXI_ID': valid[4],
        'TIMESTAMP': valid[5],
        'DAY_TYPE': valid[6],
        'MISSING_DATA': valid[7],
        'LATITUDE': valid[8],
        'LONGITUDE': valid[9],
        'DESTINATION_LATITUDE': valid[10],
        'DESTINATION_LONGITUDE': valid[11],
        'TRAVEL_TIME': valid[12]
    })
In [ ]:
valid_data = make_valid_pandas(data)
In [ ]:
np.allclose(valid_data['CALL_TYPE'],valid_split[1][2])
In [ ]:
np.allclose(valid_data['ORIGIN_CALL'],valid_split[2][2])
In [ ]:
np.allclose(valid_data['ORIGIN_STAND'],valid_split[3][2])
In [ ]:
np.allclose(valid_data['TAXI_ID'],valid_split[4][2])
In [ ]:
np.allclose(valid_data['TIMESTAMP'],valid_split[5][2])
In [ ]:
np.allclose(valid_data['DAY_TYPE'],valid_split[6][2])
In [ ]:
np.allclose(valid_data['MISSING_DATA'],valid_split[7][2])
In [ ]:
np.alltrue([np.allclose(valid_data['LATITUDE'][i], valid_split[8][2][i]) for i in range(0,len(valid_data['LATITUDE']))])
In [ ]:
np.alltrue([np.allclose(valid_data['LONGITUDE'][i], valid_split[9][2][i]) for i in range(0,len(valid_data['LATITUDE']))])
In [ ]:
np.allclose(valid_data['DESTINATION_LATITUDE'],valid_split[10][2])
In [ ]:
np.allclose(valid_data['DESTINATION_LONGITUDE'],valid_split[11][2])
In [ ]:
np.allclose(valid_data['TRAVEL_TIME'],valid_split[12][2])
Values check out. Yay
In [ ]:
from sklearn.cluster import MeanShift, estimate_bandwidth
In [ ]:
split[11][1]
In [ ]:
dests = []
for i in range(0, len(split[5][2])):
    if len(split[11][2][i]) == 0: continue
    dests.append([split[11][2][i][-1], split[12][2][i][-1]])
pts = numpy.array(dests)
In [ ]:
pts.shape
In [ ]:
bw = 0.001
In [ ]:
ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
ms.fit(pts)
cluster_centers = ms.cluster_centers_
In [ ]:
cluster_centers.shape
In [ ]:
dests = []
for row in data[['LATITUDE','LONGITUDE']].itertuples():
    if len(row[1]) == 0: continue
    dests.append([row[1][-1], row[2][-1]])
pts2 = numpy.array(dests)
In [ ]:
np.allclose(pts, pts2)
The points are the same. Hooray!
In [ ]:
np.save(data_path+'data/cluster_centers.npy', cluster_centers)
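Round-trip check that the centers reload cleanly:
In [ ]:
np.load(data_path+'data/cluster_centers.npy').shape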
In [ ]:
(1,2)+(3,)
In [ ]:
split[8][1]
In [ ]:
valid_split[5][2]
In [ ]:
def get_date_data(split):
    # derives (week of year, weekday, quarter-hour of day) for train, test and
    # validation; uses the global valid_split in addition to the split argument
    tmp = tuple([] for _ in range(9))
    for ts in split[8][2]:
        date = datetime.datetime.utcfromtimestamp(ts)
        yearweek = date.isocalendar()[1] - 1
        tmp[0].append(numpy.int8(51 if yearweek == 52 else yearweek))
        tmp[1].append(numpy.int8(date.weekday()))
        tmp[2].append(numpy.int8(date.hour * 4 + date.minute // 15))
    for ts in split[18][2]:
        date = datetime.datetime.utcfromtimestamp(ts)
        yearweek = date.isocalendar()[1] - 1
        tmp[3].append(numpy.int8(51 if yearweek == 52 else yearweek))
        tmp[4].append(numpy.int8(date.weekday()))
        tmp[5].append(numpy.int8(date.hour * 4 + date.minute // 15))
    for ts in valid_split[5][2]:
        date = datetime.datetime.utcfromtimestamp(ts)
        yearweek = date.isocalendar()[1] - 1
        tmp[6].append(numpy.int8(51 if yearweek == 52 else yearweek))
        tmp[7].append(numpy.int8(date.weekday()))
        tmp[8].append(numpy.int8(date.hour * 4 + date.minute // 15))
    return (
        ('train', 'yearweek', np.array(tmp[0])),
        ('train', 'weekday', np.array(tmp[1])),
        ('train', 'quarterhour', np.array(tmp[2])),
        ('test', 'yearweek', np.array(tmp[3])),
        ('test', 'weekday', np.array(tmp[4])),
        ('test', 'quarterhour', np.array(tmp[5])),
        ('valid', 'yearweek', np.array(tmp[6])),
        ('valid', 'weekday', np.array(tmp[7])),
        ('valid', 'quarterhour', np.array(tmp[8]))
    )
In [ ]:
dates = get_date_data(split)
In [ ]:
dates[3][2].shape
In [ ]:
dates[2][2].shape[0]
In [ ]:
trn_size = dates[2][2].shape[0]
In [ ]:
test_size = dates[4][2].shape[0]
In [ ]:
val_size= dates[6][2].shape[0]
In [ ]:
train_gps_mean = [np.concatenate([split[11][2][i] for i in range(trn_size)]).mean(),
np.concatenate([split[12][2][i] for i in range(trn_size)]).mean()]
In [ ]:
train_gps_std = [np.concatenate([split[11][2][i] for i in range(trn_size)]).std(),
np.concatenate([split[12][2][i] for i in range(trn_size)]).std()]
In [ ]:
def at_least_k(k, v, pad_at_begin, is_longitude):
    # pad the coordinate sequence v to length at least k by repeating its first
    # or last value; empty sequences are seeded with the training GPS mean
    if len(v) == 0:
        v = numpy.array([train_gps_mean[1 if is_longitude else 0]])
    if len(v) < k:
        if pad_at_begin:
            v = numpy.concatenate((numpy.full((k - len(v),), v[0]), v))
        else:
            v = numpy.concatenate((v, numpy.full((k - len(v),), v[-1])))
    return v
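Quick check of the padding behavior:
In [ ]:
at_least_k(5, numpy.array([1., 2.]), False, False)  # -> [1., 2., 2., 2., 2.]
In [ ]:
at_least_k(5, numpy.array([1., 2.]), True, False)   # -> [1., 1., 1., 1., 2.]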
In [ ]:
valid_split[9]
In [ ]:
import theano
In [ ]:
def get_first_last_k(split, k):
    # uses the globals trn_size/test_size/val_size and valid_split
    first_k = tuple([] for _ in range(6))
    last_k = tuple([] for _ in range(6))
    for i in range(trn_size):
        first_k[0].append(np.array(at_least_k(k, split[11][2][i], False, False)[:k]))
        first_k[1].append(np.array(at_least_k(k, split[12][2][i], False, True)[:k]))
        last_k[0].append(np.array(at_least_k(k, split[11][2][i], True, False)[-k:]))
        last_k[1].append(np.array(at_least_k(k, split[12][2][i], True, True)[-k:]))
    for i in range(test_size):
        first_k[2].append(np.array(at_least_k(k, split[21][2][i], False, False)[:k]))
        first_k[3].append(np.array(at_least_k(k, split[22][2][i], False, True)[:k]))
        last_k[2].append(np.array(at_least_k(k, split[21][2][i], True, False)[-k:]))
        last_k[3].append(np.array(at_least_k(k, split[22][2][i], True, True)[-k:]))
    for i in range(val_size):
        first_k[4].append(np.array(at_least_k(k, valid_split[8][2][i], False, False)[:k]))
        first_k[5].append(np.array(at_least_k(k, valid_split[9][2][i], False, True)[:k]))
        last_k[4].append(np.array(at_least_k(k, valid_split[8][2][i], True, False)[-k:]))
        last_k[5].append(np.array(at_least_k(k, valid_split[9][2][i], True, True)[-k:]))
    return (
        ('train', 'first_latitude', np.array(first_k[0])),
        ('train', 'first_longitude', np.array(first_k[1])),
        ('train', 'last_latitude', np.array(last_k[0])),
        ('train', 'last_longitude', np.array(last_k[1])),
        ('test', 'first_latitude', np.array(first_k[2])),
        ('test', 'first_longitude', np.array(first_k[3])),
        ('test', 'last_latitude', np.array(last_k[2])),
        ('test', 'last_longitude', np.array(last_k[3])),
        ('valid', 'first_latitude', np.array(first_k[4])),
        ('valid', 'first_longitude', np.array(first_k[5])),
        ('valid', 'last_latitude', np.array(last_k[4])),
        ('valid', 'last_longitude', np.array(last_k[5])))
In [ ]:
import warnings
warnings.filterwarnings("ignore")
In [ ]:
coords = get_first_last_k(split, 5)
In [ ]:
coords[0][2].shape
In [ ]:
coords[4][2].shape
In [ ]:
coords[4]
In [ ]:
valid_split[8][2][0][-1]
In [ ]:
valid_split[10][2][0]
In [ ]:
np.sum([len(l) for l in data['LATITUDE']])
In [ ]:
data.to_pickle(data_path+'data/train_data.pkl')
In [ ]:
data['DAY_OF_WEEK'] = data['TIMESTAMP'].apply(lambda t:datetime.datetime.fromtimestamp(t).weekday())
In [ ]:
data['QUARTER_HOUR'] = data['TIMESTAMP'].apply(lambda t:int((datetime.datetime.fromtimestamp(t).hour*60 + datetime.datetime.fromtimestamp(t).minute)/15))
In [ ]:
data['WEEK_OF_YEAR'] = data['TIMESTAMP'].apply(lambda t:datetime.datetime.fromtimestamp(t).isocalendar()[1])
In [ ]:
data['DESTINATION_LATITUDE'] = data['LATITUDE'].apply(lambda l: l[-1] if len(l) > 0 else np.nan)
In [ ]:
data['DESTINATION_LONGITUDE'] = data['LONGITUDE'].apply(lambda l: l[-1] if len(l) > 0 else np.nan)
In [ ]:
data = data.dropna()
In [ ]:
train_gps_mean
In [ ]:
data['LATITUDE'] = data['LATITUDE'].apply(lambda l: (l-train_gps_mean[0])/train_gps_std[0])
In [ ]:
data['LONGITUDE'] = data['LONGITUDE'].apply(lambda l: (l-train_gps_mean[1])/train_gps_std[1])
In [ ]:
data.columns
In [ ]:
data['CALL_TYPE'].unique()
In [ ]:
k = 5
In [ ]:
origin_call = []
origin_stand = []
taxi_id = []
day_of_week = []
quarter_hour = []
week_of_year = []
day_type = []
first_latitude = []
first_longitude = []
last_latitude = []
last_longitude = []
destination_latitude = []
destination_longitude = []
for i in data.index:
    latitude = data['LATITUDE'][i][:-1]   # drop the destination point
    longitude = data['LONGITUDE'][i][:-1]
    l = len(latitude)
    if l == 0:
        continue
    # one training example per observed prefix, capped at 100 randomly
    # chosen prefixes for long trips
    if l < 100:
        js = range(l)
    else:
        js = np.random.choice(range(l), 100, replace=False)
    for j in js:
        first_latitude.append(np.array(at_least_k(k, latitude[:j+1], False, False)[:k]))
        first_longitude.append(np.array(at_least_k(k, longitude[:j+1], False, True)[:k]))
        # note: pad_at_begin=False here, unlike get_first_last_k above, which
        # uses True for the last_* features
        last_latitude.append(np.array(at_least_k(k, latitude[:j+1], False, False)[-k:]))
        last_longitude.append(np.array(at_least_k(k, longitude[:j+1], False, True)[-k:]))
        origin_call.append(data['ORIGIN_CALL'][i])
        origin_stand.append(data['ORIGIN_STAND'][i])
        taxi_id.append(data['TAXI_ID'][i])
        day_of_week.append(data['DAY_OF_WEEK'][i])
        quarter_hour.append(data['QUARTER_HOUR'][i])
        week_of_year.append(data['WEEK_OF_YEAR'][i])
        day_type.append(data['DAY_TYPE'][i])
        destination_latitude.append(data['DESTINATION_LATITUDE'][i])
        destination_longitude.append(data['DESTINATION_LONGITUDE'][i])
In [ ]:
print("I'm finished")
In [ ]:
len(origin_stand)
In [ ]:
data['LATITUDE'][0]
In [ ]:
first_longitude[:22]
In [ ]:
last_longitude[:22]
In [ ]:
first_latitude[:22]
In [ ]:
last_latitude[:22]
In [ ]:
valid_data.to_pickle(data_path+'data/valid_data.pkl')
In [ ]:
test_data.to_pickle(data_path+'data/test_data.pkl')
In [ ]:
n_origin_call = len(data['ORIGIN_CALL'].unique())
n_taxi_id = len(data['TAXI_ID'].unique())
n_origin_stand = len(data['ORIGIN_STAND'].unique())
n_quarter_hour = len(data['QUARTER_HOUR'].unique())
n_day_of_week = len(data['DAY_OF_WEEK'].unique())
n_week_of_year = len(data['WEEK_OF_YEAR'].unique())
n_day_type = 3
In [ ]:
# probability-weighted average of the cluster centers (column 0 = latitude,
# column 1 = longitude); K.dot keeps this symbolic for the Lambda layers below
latitude_sum = lambda x: K.dot(x, K.variable(cluster_centers[:, 0:1]))
longitude_sum = lambda x: K.dot(x, K.variable(cluster_centers[:, 1:2]))
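taxi_mlp below calls embedding_input, which isn't defined anywhere in this section. A minimal sketch consistent with how it's called here (name, vocabulary size, embedding size, l2 regularization strength), modeled on the fast.ai helper of the same name:
In [ ]:
def embedding_input(name, n_in, n_out, reg):
    # returns the integer Input tensor and the Embedding output applied to it
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg))(inp)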
In [ ]:
def taxi_mlp(k, shp=cluster_centers.shape[0]):
    first_lat_in = Input(shape=(k,))
    last_lat_in = Input(shape=(k,))
    first_long_in = Input(shape=(k,))
    last_long_in = Input(shape=(k,))
    center_lats = Input(shape=(shp,))
    center_longs = Input(shape=(shp,))
    emb_names = ['origin_call', 'taxi_ID', 'origin_stand', 'quarter_hour', 'day_of_week', 'week_of_year', 'day_type']
    emb_ins = [n_origin_call + 1, n_taxi_id + 1, n_origin_stand + 1, n_quarter_hour + 1, n_day_of_week + 1, n_week_of_year + 1, n_day_type + 1]
    emb_outs = [10 for i in range(7)]
    regs = [0 for i in range(7)]
    # (emb_ins already adds 1; the extra +1 below is kept from the original cell)
    embs = [embedding_input(e[0], e[1]+1, e[2], e[3]) for e in zip(emb_names, emb_ins, emb_outs, regs)]
    x = merge([first_lat_in, last_lat_in, first_long_in, last_long_in] + [Flatten()(e[1]) for e in embs], mode='concat')
    x = Dense(500, activation='relu')(x)
    # softmax over the mean-shift cluster centers; the prediction is the
    # probability-weighted average of the centers
    x = Dense(shp, activation='softmax')(x)
    #CHECK ON CLUSTERS!!!!
    y_latitude = Lambda(latitude_sum, (1,))(x)
    y_longitude = Lambda(longitude_sum, (1,))(x)
    # center_lats/center_longs are currently unused by the graph: the centers
    # enter through the Lambda closures instead
    return Model(input=[first_lat_in, last_lat_in, first_long_in, last_long_in] + [e[0] for e in embs] + [center_longs, center_lats],
                 output=[y_latitude, y_longitude])
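A hypothetical next step, not from the original notebook: build and compile the model. The optimizer and loss here are placeholder assumptions (the competition metric is a haversine-style distance, not MSE), and the unused center inputs flagged above may need to be dropped first.
In [ ]:
# sketch only; loss/optimizer choices are assumptions
model = taxi_mlp(5)
model.compile(optimizer=SGD(lr=0.01, momentum=0.9), loss='mse')
model.summary()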