In [ ]:
import pandas as pd
import numpy as np
from dateutil import parser
import time
import math
In [ ]:
# choose whether to study pickups or dropoffs
target = "pickup"
# target = "dropoff"
In [ ]:
start = time.time()
filename = '-yellow_tripdata_2016-06.csv'
nlinesfile = 11135470 # total number of data rows in the file (excluding the header)
nlinesrandomsample = 5000000 # number of rows to import; set this based on RAM capacity (reduce if you get out-of-memory errors)
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.choice.html
skip = np.random.choice(np.arange(1,nlinesfile+1), (nlinesfile-nlinesrandomsample), replace=False)
df = pd.read_csv(filename,
                 usecols=['tpep_pickup_datetime',
                          'tpep_dropoff_datetime',
                          'pickup_longitude',
                          'pickup_latitude',
                          'dropoff_longitude',
                          'dropoff_latitude'],
                 skiprows=skip,
                 error_bad_lines=False)
print "load file:", (time.time() - start), "sec"
In [ ]:
print df[:5]
In [ ]:
# cull by geography (bounding box around Manhattan)
print "starting entries:", len(df)
df = df[(df[target+'_latitude'] >= 40.699) & (df[target+'_latitude'] <= 40.875)]
df = df[(df[target+'_longitude'] >= -74.025) & (df[target+'_longitude'] <= -73.904)]
print "final entries:", len(df)
In [ ]:
# chunk geography by rounding latitude and longitude to 3 decimal places
df[target+'_longitude'] = df[target+'_longitude'].round(3)
df[target+'_latitude'] = df[target+'_latitude'].round(3)
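Rounding to three decimal places bins trips into grid cells roughly 100 m on a side. The back-of-envelope check below is an added illustration (the 111,320 m-per-degree figure and the 40.8° N latitude are assumptions for Manhattan), not part of the processing pipeline.
In [ ]:
# rough size of a 0.001-degree cell near 40.8 N: about 111 m north-south,
# and smaller east-west because longitude lines converge toward the poles
lat_m = 0.001 * 111320
lon_m = 0.001 * 111320 * math.cos(math.radians(40.8))
print "approx cell size: %.0f m x %.0f m" % (lat_m, lon_m)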
In [ ]:
# convert pickup and dropoff timestamps to pandas' datetime format
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], format='%Y-%m-%d %H:%M:%S')
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'], format='%Y-%m-%d %H:%M:%S')
In [ ]:
# encode day of week and hour of day as cyclical features with a sine transform
timeSeries = df['tpep_'+ target +'_datetime']
dow = timeSeries.dt.dayofweek
df[target+'_dow_s'] = dow.apply(lambda x: math.sin((2 * math.pi) / 7 * x))
tod = timeSeries.dt.hour
df[target+'_tod_s'] = tod.apply(lambda x: math.sin((2 * math.pi) / 24 * x))
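The sine transform keeps the day and hour features cyclical, so hour 23 lands next to hour 0 instead of at the far end of the scale. The loop below is only an added illustration of the encoding; a cosine companion feature is often paired with the sine to disambiguate mirror-image hours (e.g. 3 and 9), but the original feature set here uses the sine component alone.
In [ ]:
# illustrate the cyclical hour encoding: 23 and 0 map to nearby values,
# while 3 and 9 happen to share the same sine value
for h in [0, 1, 3, 9, 12, 18, 23]:
    print "hour %2d -> %+.3f" % (h, math.sin((2 * math.pi) / 24 * h))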
In [ ]:
print df[:5]
In [ ]:
# group by features to get counts at areas of same geography and time
# http://stackoverflow.com/questions/10373660/converting-a-pandas-groupby-object-to-dataframe
count_data = pd.DataFrame({'count': df.groupby([target+'_longitude', target+'_latitude',
                                                target+'_dow_s', target+'_tod_s']).size()}).reset_index()
print count_data[:5]
print 'number of samples:', len(count_data)
print 'min number of ' + target + 's in any sample:', count_data['count'].min()
print 'max number of ' + target + 's in any sample:', count_data['count'].max()
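The DataFrame({'count': ...size()}).reset_index() idiom can look opaque; the toy example below (made-up coordinates, added purely for illustration) shows that reset_index turns the grouped keys back into ordinary columns and leaves the group sizes as a trailing 'count' column, which is why the last column is later used as the regression target.
In [ ]:
# toy version of the groupby/size/reset_index idiom with made-up coordinates
toy = pd.DataFrame({'lon': [-73.991, -73.991, -73.982],
                    'lat': [40.750, 40.750, 40.760]})
print pd.DataFrame({'count': toy.groupby(['lon', 'lat']).size()}).reset_index()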
In [ ]:
# convert DataFrame to numpy array and shuffle it
data = count_data.as_matrix()
np.random.shuffle(data)
# create X (feature) and y (target) data sets
X = data[:,:-1]
y = data[:,-1]
print 'Data set:', X.shape, y.shape
In [ ]:
# use the scikit-learn library to standardize the feature data to mean 0, variance 1
import pickle
from sklearn import preprocessing
# fit scaling function to X data
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
scaler = preprocessing.StandardScaler().fit(X)
# save out scaling function for later use
with open("-scaler_"+target+".pkl", "wb") as f:
pickle.dump(scaler, f)
X_scaled = scaler.transform(X)
# check to make sure data was scaled correctly
# print "Training feature data -- mean:", X_train_scaled.mean(), "std:", X_train_scaled.std()
# print "Test feature data -- mean:", X_test_scaled.mean(), "std:", X_test_scaled.std()
print "mean:", X_scaled.mean(), "<-- should be around 0.0"
print "std:", X_scaled.std(), "<-- should be around 1.0"
In [ ]:
# plot correlation matrix between features and target
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
corr_df = pd.DataFrame(data=X_scaled, columns=[target+'_longitude', target+'_latitude', target+'_dow_s', target+'_tod_s'])
corr_df['count'] = y
corrmat = corr_df.corr()
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(9, 6))
# Draw the heatmap using seaborn
sns.set_context("notebook", font_scale=0.7, rc={"lines.linewidth": 1.5})
sns.heatmap(corrmat, annot=True, square=True)
f.tight_layout()
In [ ]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.regularizers import l2
from keras.callbacks import ModelCheckpoint
# model hyperparameters
batch_size = 256
nb_epoch = 5
num_hidden_1 = 1024
num_hidden_2 = 1024
num_hidden_3 = 1024
dropout = 0.15
In [ ]:
model = Sequential()
model.add(Dense(output_dim=num_hidden_1, input_dim=X.shape[1], W_regularizer=l2(0.0005)))
model.add(Activation("tanh"))
model.add(Dropout(dropout))
model.add(Dense(num_hidden_2, W_regularizer=l2(0.0005)))
model.add(Activation("tanh"))
model.add(Dropout(dropout))
model.add(Dense(num_hidden_3, W_regularizer=l2(0.0005)))
model.add(Activation("tanh"))
model.add(Dropout(dropout))
model.add(Dense(1)) # single neuron in output layer for regression problem
# save out model each time it performs better than previous epochs
checkpoint_name = "-model_"+target+".hdf5"
checkpointer = ModelCheckpoint(checkpoint_name, verbose=0, save_best_only=True)
# use mean squared logarithmic error as the loss for this regression problem
model.compile(loss='mean_squared_logarithmic_error', optimizer='adam')
# fit model using a 20% validation split (keras will automatically split the data into training and validation sets)
history = model.fit(X_scaled, y, validation_split=0.2, batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=1, callbacks=[checkpointer])
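Because save_best_only keeps only the weights from the best validation epoch, predictions are normally made from the checkpoint file rather than from the model as it stands after the final epoch. The sketch below assumes a Keras version that provides keras.models.load_model for reading the HDF5 checkpoint written by ModelCheckpoint.
In [ ]:
# sketch: reload the best checkpoint and predict counts for a few scaled samples
from keras.models import load_model
best_model = load_model(checkpoint_name)
print best_model.predict(X_scaled[:5])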
In [ ]:
# list all data in history
print(history.history.keys())
In [ ]:
# plot history of loss in training and validation data
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()