Exploratory Data Analysis


In [1]:
import pandas as pd 
import numpy as np 
import matplotlib as mpl
import matplotlib.pyplot as plt 
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

#import xgboost as xgb
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

Load the data


In [2]:
# Load training data as train
trainDF = pd.read_csv('input/train.csv')

# Load testing data as test
testDF = pd.read_csv('input/test.csv')

In [3]:
# Print size as well as the top 5 observation of training dataset
# Report dataset dimensions; the test split lacks dropoff info and the target.
for split_name, frame in (('TRAINING', trainDF), ('TEST', testDF)):
    print('Size of the {} set is: {} rows and {} columns'.format(split_name, *frame.shape))


Size of the TRAINING set is: 1458644 rows and 11 columns
Size of the TEST set is: 625134 rows and 9 columns

In [4]:
# Peek at the first rows to confirm the schema parsed correctly.
trainDF.head()


Out[4]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N 455
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 N 663
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 N 2124
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 N 429
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 N 435

In [5]:
# Same preview for the test split (no dropoff_datetime / trip_duration columns).
testDF.head()


Out[5]:
id vendor_id pickup_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag
0 id3004672 1 2016-06-30 23:59:58 1 -73.988129 40.732029 -73.990173 40.756680 N
1 id3505355 1 2016-06-30 23:59:53 1 -73.964203 40.679993 -73.959808 40.655403 N
2 id1217141 1 2016-06-30 23:59:47 1 -73.997437 40.737583 -73.986160 40.729523 N
3 id2150126 2 2016-06-30 23:59:41 1 -73.956070 40.771900 -73.986427 40.730469 N
4 id1598245 1 2016-06-30 23:59:33 1 -73.970215 40.761475 -73.961510 40.755890 N

Understanding Data


In [6]:
# Summary statistics; the extreme lat/lng and trip_duration values hint at outliers.
trainDF.describe()


Out[6]:
vendor_id passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude trip_duration
count 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06
mean 1.534950e+00 1.664530e+00 -7.397349e+01 4.075092e+01 -7.397342e+01 4.075180e+01 9.594923e+02
std 4.987772e-01 1.314242e+00 7.090186e-02 3.288119e-02 7.064327e-02 3.589056e-02 5.237432e+03
min 1.000000e+00 0.000000e+00 -1.219333e+02 3.435970e+01 -1.219333e+02 3.218114e+01 1.000000e+00
25% 1.000000e+00 1.000000e+00 -7.399187e+01 4.073735e+01 -7.399133e+01 4.073588e+01 3.970000e+02
50% 2.000000e+00 1.000000e+00 -7.398174e+01 4.075410e+01 -7.397975e+01 4.075452e+01 6.620000e+02
75% 2.000000e+00 2.000000e+00 -7.396733e+01 4.076836e+01 -7.396301e+01 4.076981e+01 1.075000e+03
max 2.000000e+00 9.000000e+00 -6.133553e+01 5.188108e+01 -6.133553e+01 4.392103e+01 3.526282e+06

In [7]:
# Test-split summary for comparison with the training distribution.
testDF.describe()


Out[7]:
vendor_id passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
count 625134.000000 625134.000000 625134.000000 625134.000000 625134.000000 625134.000000
mean 1.534884 1.661765 -73.973614 40.750927 -73.973458 40.751816
std 0.498782 1.311293 0.073389 0.029848 0.072565 0.035824
min 1.000000 0.000000 -121.933128 37.389587 -121.933327 36.601322
25% 1.000000 1.000000 -73.991852 40.737392 -73.991318 40.736000
50% 2.000000 1.000000 -73.981743 40.754093 -73.979774 40.754543
75% 2.000000 2.000000 -73.967400 40.768394 -73.963013 40.769852
max 2.000000 9.000000 -69.248917 42.814938 -67.496796 48.857597

Feature Engineering


In [8]:
import datetime as dt 
# Timestamp of the analysis run (provenance only; not used as a feature).
# The `dt` alias is reused later for timing the clustering cell.
now = dt.datetime.now()
now


Out[8]:
datetime.datetime(2017, 9, 15, 0, 53, 37, 185406)

In [9]:
# Switch to short working aliases and drop the originals so only one
# reference to each frame survives.
train, test = trainDF, testDF
del trainDF, testDF

In [10]:
# Parse timestamps once up front so the .dt accessors are available downstream.
train['pickup_datetime'] = pd.to_datetime(train.pickup_datetime)
test['pickup_datetime'] = pd.to_datetime(test.pickup_datetime)

train.loc[:, 'pickup_date'] = train['pickup_datetime'].dt.date
test.loc[:, 'pickup_date'] = test['pickup_datetime'].dt.date

train['dropoff_datetime'] = pd.to_datetime(train.dropoff_datetime)

# Encode the Y/N store-and-forward flag as 1/0.
train['store_and_fwd_flag'] = 1 * (train.store_and_fwd_flag.values == 'Y')
test['store_and_fwd_flag'] = 1 * (test.store_and_fwd_flag.values == 'Y')

# Sanity check: recompute the duration from the timestamps and compare with
# the provided trip_duration, allowing 1 second of rounding slack.
# (.dt.total_seconds() is the vectorized form of the original row-wise map.)
train['check_trip_duration'] = (train['dropoff_datetime'] - train['pickup_datetime']).dt.total_seconds()
duration_difference = train[np.abs(train['check_trip_duration'].values - train['trip_duration'].values) > 1]

# log1p is the numerically stable form of log(x + 1); the competition metric
# (RMSLE) is evaluated on log(duration + 1).
train['log_trip_duration'] = np.log1p(train['trip_duration'].values)

# Plain if/else instead of a ternary used only for its print side effect.
if len(duration_difference) == 0:
    print('Trip_duration and datetimes are ok.')
else:
    print('Ooops.')
train.describe()


Trip_duration and datetimes are ok.
Out[10]:
vendor_id passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration check_trip_duration log_trip_duration
count 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06
mean 1.534950e+00 1.664530e+00 -7.397349e+01 4.075092e+01 -7.397342e+01 4.075180e+01 5.515396e-03 9.594923e+02 9.594923e+02 6.466978e+00
std 4.987772e-01 1.314242e+00 7.090186e-02 3.288119e-02 7.064327e-02 3.589056e-02 7.406066e-02 5.237432e+03 5.237432e+03 7.957595e-01
min 1.000000e+00 0.000000e+00 -1.219333e+02 3.435970e+01 -1.219333e+02 3.218114e+01 0.000000e+00 1.000000e+00 1.000000e+00 6.931472e-01
25% 1.000000e+00 1.000000e+00 -7.399187e+01 4.073735e+01 -7.399133e+01 4.073588e+01 0.000000e+00 3.970000e+02 3.970000e+02 5.986452e+00
50% 2.000000e+00 1.000000e+00 -7.398174e+01 4.075410e+01 -7.397975e+01 4.075452e+01 0.000000e+00 6.620000e+02 6.620000e+02 6.496775e+00
75% 2.000000e+00 2.000000e+00 -7.396733e+01 4.076836e+01 -7.396301e+01 4.076981e+01 0.000000e+00 1.075000e+03 1.075000e+03 6.981006e+00
max 2.000000e+00 9.000000e+00 -6.133553e+01 5.188108e+01 -6.133553e+01 4.392103e+01 1.000000e+00 3.526282e+06 3.526282e+06 1.507575e+01

In [11]:
# Feature Extraction
# Feature Extraction: project pickup/dropoff coordinates onto the principal
# axes of ALL coordinates (train + test) so both splits share one PCA basis.
coords = np.vstack((train[['pickup_latitude', 'pickup_longitude']].values,
                    train[['dropoff_latitude', 'dropoff_longitude']].values,
                    test[['pickup_latitude', 'pickup_longitude']].values,
                    test[['dropoff_latitude', 'dropoff_longitude']].values))

pca = PCA().fit(coords)
# Transform each coordinate pair once and unpack both components — the
# original called pca.transform twice per pair, doubling the work.
for frame in (train, test):
    pickup_proj = pca.transform(frame[['pickup_latitude', 'pickup_longitude']])
    dropoff_proj = pca.transform(frame[['dropoff_latitude', 'dropoff_longitude']])
    frame['pickup_pca0'] = pickup_proj[:, 0]
    frame['pickup_pca1'] = pickup_proj[:, 1]
    frame['dropoff_pca0'] = dropoff_proj[:, 0]
    frame['dropoff_pca1'] = dropoff_proj[:, 1]

In [12]:
# Confirm the four PCA columns were appended.
train.head()


Out[12]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration pickup_date check_trip_duration log_trip_duration pickup_pca0 pickup_pca1 dropoff_pca0 dropoff_pca1
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 0 455 2016-03-14 455.0 6.122493 0.007691 0.017053 -0.009666 0.013695
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 0 663 2016-06-12 663.0 6.498282 0.007677 -0.012371 0.027145 -0.018652
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 0 2124 2016-01-19 2124.0 7.661527 0.004803 0.012879 0.034222 -0.039337
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 0 429 2016-04-06 429.0 6.063785 0.038342 -0.029194 0.041343 -0.042293
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 0 435 2016-03-26 435.0 6.077642 -0.002877 0.041749 -0.002380 0.031071

In [13]:
# Variance captured by each principal axis of the lat/lng cloud.
pca.explained_variance_


Out[13]:
array([ 0.00511756,  0.00114191])

In [14]:
# Distance
def haversine_array(lat1, lng1, lat2, lng2):
    """Great-circle distance in km between two (lat, lng) points.

    Arguments are in degrees; scalars or broadcastable numpy arrays.
    """
    AVG_EARTH_RADIUS = 6371  # mean Earth radius, km
    lat1, lng1, lat2, lng2 = (np.radians(v) for v in (lat1, lng1, lat2, lng2))
    dlat = lat2 - lat1
    dlng = lng2 - lng1
    # Haversine formula: hav(theta) = sin^2(dlat/2) + cos(lat1)cos(lat2)sin^2(dlng/2)
    hav = np.sin(dlat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlng * 0.5) ** 2
    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(hav))

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Approximate L1 (taxicab) distance in km: east-west leg plus north-south leg."""
    ew_leg = haversine_array(lat1, lng1, lat1, lng2)  # same latitude, varying longitude
    ns_leg = haversine_array(lat1, lng1, lat2, lng1)  # same longitude, varying latitude
    return ew_leg + ns_leg

def bearing_array(lat1, lng1, lat2, lng2):
    """Initial compass bearing from point 1 to point 2, in degrees (-180, 180].

    0 = due north, 90 = due east. Arguments in degrees; vectorized over
    numpy arrays. (Removed the unused AVG_EARTH_RADIUS local — bearing does
    not depend on the Earth's radius.)
    """
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [15]:
# Trip geometry features, computed identically for both splits.
for frame in (train, test):
    pick_lat = frame['pickup_latitude'].values
    pick_lng = frame['pickup_longitude'].values
    drop_lat = frame['dropoff_latitude'].values
    drop_lng = frame['dropoff_longitude'].values

    frame.loc[:, 'distance_haversine'] = haversine_array(pick_lat, pick_lng, drop_lat, drop_lng)
    frame.loc[:, 'distance_dummy_manhattan'] = dummy_manhattan_distance(pick_lat, pick_lng, drop_lat, drop_lng)
    frame.loc[:, 'direction'] = bearing_array(pick_lat, pick_lng, drop_lat, drop_lng)
    # Manhattan distance in PCA space (rotated axes roughly follow the street grid).
    frame.loc[:, 'pca_manhattan'] = np.abs(frame['dropoff_pca1'] - frame['pickup_pca1']) + np.abs(frame['dropoff_pca0'] - frame['pickup_pca0'])

    # Trip midpoint, used later for spatial binning.
    frame.loc[:, 'center_latitude'] = (pick_lat + drop_lat) / 2
    frame.loc[:, 'center_longitude'] = (pick_lng + drop_lng) / 2

train.describe()


Out[15]:
vendor_id passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration check_trip_duration log_trip_duration pickup_pca0 pickup_pca1 dropoff_pca0 dropoff_pca1 distance_haversine distance_dummy_manhattan direction pca_manhattan center_latitude center_longitude
count 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06
mean 1.534950e+00 1.664530e+00 -7.397349e+01 4.075092e+01 -7.397342e+01 4.075180e+01 5.515396e-03 9.594923e+02 9.594923e+02 6.466978e+00 3.560932e-05 -4.414311e-04 -8.615863e-05 4.315364e-04 3.440864e+00 4.446860e+00 -1.566892e+01 4.601049e-02 4.075136e+01 -7.397345e+01
std 4.987772e-01 1.314242e+00 7.090186e-02 3.288119e-02 7.064327e-02 3.589056e-02 7.406066e-02 5.237432e+03 5.237432e+03 7.957595e-01 7.084957e-02 3.299371e-02 7.081532e-02 3.554987e-02 4.296538e+00 5.665930e+00 1.044915e+02 6.009216e-02 2.972943e-02 6.683386e-02
min 1.000000e+00 0.000000e+00 -1.219333e+02 3.435970e+01 -1.219333e+02 3.218114e+01 0.000000e+00 1.000000e+00 1.000000e+00 6.931472e-01 -1.244609e+01 -6.857210e+00 -1.244609e+01 -8.563596e+00 0.000000e+00 0.000000e+00 -1.799927e+02 0.000000e+00 3.344669e+01 -1.219333e+02
25% 1.000000e+00 1.000000e+00 -7.399187e+01 4.073735e+01 -7.399133e+01 4.073588e+01 0.000000e+00 3.970000e+02 3.970000e+02 5.986452e+00 -7.260550e-03 -1.295075e-02 -1.132004e-02 -1.457879e-02 1.231837e+00 1.570939e+00 -1.252736e+02 1.612315e-02 4.073715e+01 -7.399012e+01
50% 2.000000e+00 1.000000e+00 -7.398174e+01 4.075410e+01 -7.397975e+01 4.075452e+01 0.000000e+00 6.620000e+02 6.620000e+02 6.496775e+00 7.727282e-03 3.063014e-03 6.083989e-03 3.359148e-03 2.093717e+00 2.688798e+00 8.199996e+00 2.742471e-02 4.075323e+01 -7.397973e+01
75% 2.000000e+00 2.000000e+00 -7.396733e+01 4.076836e+01 -7.396301e+01 4.076981e+01 0.000000e+00 1.075000e+03 1.075000e+03 6.981006e+00 1.890063e-02 1.588978e-02 1.825289e-02 1.725344e-02 3.875337e+00 4.998990e+00 5.340495e+01 5.067191e-02 4.076726e+01 -7.396552e+01
max 2.000000e+00 9.000000e+00 -6.133553e+01 5.188108e+01 -6.133553e+01 4.392103e+01 1.000000e+00 3.526282e+06 3.526282e+06 1.507575e+01 4.807449e+01 1.104230e+01 4.807444e+01 3.489436e+00 1.240909e+03 1.318480e+03 1.800000e+02 1.301444e+01 4.631584e+01 -6.133553e+01

In [16]:
# Datetime features
# Datetime features, one pass per split. Both pickup_dt clocks are anchored
# to the TRAINING minimum so train and test share the same time origin.
reference_start = train['pickup_datetime'].min()
for frame in (train, test):
    frame.loc[:, 'pickup_weekday'] = frame['pickup_datetime'].dt.weekday
    frame.loc[:, 'pickup_hour_weekofyear'] = frame['pickup_datetime'].dt.weekofyear
    frame.loc[:, 'pickup_hour'] = frame['pickup_datetime'].dt.hour
    frame.loc[:, 'pickup_minute'] = frame['pickup_datetime'].dt.minute
    frame.loc[:, 'pickup_dt'] = (frame['pickup_datetime'] - reference_start).dt.total_seconds()
    frame.loc[:, 'pickup_week_hour'] = frame['pickup_weekday'] * 24 + frame['pickup_hour']
    # Fractional position within the week, in days (0.0 = Monday 00:00).
    frame.loc[:, 'week_delta'] = frame['pickup_datetime'].dt.weekday + \
        ((frame['pickup_datetime'].dt.hour + (frame['pickup_datetime'].dt.minute / 60.0)) / 24.0)

In [17]:
# Spot-check: the .dt.month accessor vs. the raw timestamp for the first trip.
train['pickup_datetime'].dt.month[0] , train['pickup_datetime'][0]


Out[17]:
(3, Timestamp('2016-03-14 17:24:55'))

In [18]:
# Inspect the new datetime-derived columns.
train.head()


Out[18]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... pca_manhattan center_latitude center_longitude pickup_weekday pickup_hour_weekofyear pickup_hour pickup_minute pickup_dt pickup_week_hour week_delta
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 0 ... 0.020716 40.766769 -73.973392 0 11 17 24 6369878.0 17 0.725000
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 0 ... 0.025749 40.734858 -73.989948 6 23 0 43 14085798.0 144 6.029861
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 0 ... 0.081636 40.737013 -73.992180 1 3 11 35 1596907.0 35 1.482639
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 0 ... 0.016100 40.713345 -74.011154 2 14 19 32 8364734.0 67 2.813889
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 0 ... 0.011175 40.787865 -73.972988 5 12 13 30 7392638.0 133 5.562500

5 rows × 31 columns


In [19]:
# Make time features cyclic
train.loc[:,'week_delta_sin'] = np.sin((train['week_delta'] / 7) * np.pi)**2
train.loc[:,'hour_sin'] = np.sin((train['pickup_hour'] / 24) * np.pi)**2

test.loc[:,'week_delta_sin'] = np.sin((test['week_delta'] / 7) * np.pi)**2
test.loc[:,'hour_sin'] = np.sin((test['pickup_hour'] / 24) * np.pi)**2

train.head()


Out[19]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... center_longitude pickup_weekday pickup_hour_weekofyear pickup_hour pickup_minute pickup_dt pickup_week_hour week_delta week_delta_sin hour_sin
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 0 ... -73.973392 0 11 17 24 6369878.0 17 0.725000 0.102188 0.629410
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 0 ... -73.989948 6 23 0 43 14085798.0 144 6.029861 0.177891 0.000000
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 0 ... -73.992180 1 3 11 35 1596907.0 35 1.482639 0.381157 0.982963
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 0 ... -74.011154 2 14 19 32 8364734.0 67 2.813889 0.908141 0.370590
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 0 ... -73.972988 5 12 13 30 7392638.0 133 5.562500 0.361582 0.982963

5 rows × 33 columns


In [20]:
# Speed
train.loc[:, 'avg_speed_h'] = 1000 * train['distance_haversine'] / train['trip_duration']
train.loc[:, 'avg_speed_m'] = 1000 * train['distance_dummy_manhattan'] / train['trip_duration']

train.loc[:, 'pickup_lat_bin'] = np.round(train['pickup_latitude'], 3)
train.loc[:, 'pickup_long_bin'] = np.round(train['pickup_longitude'], 3)
train.head()


Out[20]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... pickup_minute pickup_dt pickup_week_hour week_delta week_delta_sin hour_sin avg_speed_h avg_speed_m pickup_lat_bin pickup_long_bin
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 0 ... 24 6369878.0 17 0.725000 0.102188 0.629410 3.293452 3.814139 40.768 -73.982
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 0 ... 43 14085798.0 144 6.029861 0.177891 0.000000 2.723239 3.665922 40.739 -73.980
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 0 ... 35 1596907.0 35 1.482639 0.381157 0.982963 3.006167 3.862323 40.764 -73.979
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 0 ... 32 8364734.0 67 2.813889 0.908141 0.370590 3.462700 3.872567 40.720 -74.010
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 0 ... 30 7392638.0 133 5.562500 0.361582 0.982963 2.732387 2.757372 40.793 -73.973

5 rows × 37 columns


In [21]:
# Average speed for regions
gby_cols = ['pickup_lat_bin', 'pickup_long_bin']
coord_speed = train.groupby(gby_cols).mean()[['avg_speed_h']].reset_index()
coord_count = train.groupby(gby_cols).count()[['id']].reset_index()
coord_stats = pd.merge(coord_speed, coord_count, on=gby_cols)
coord_stats = coord_stats[coord_stats['id'] > 100]
coord_stats.head()


Out[21]:
pickup_lat_bin pickup_long_bin avg_speed_h id
547 40.641 -73.789 7.582097 253
548 40.641 -73.788 7.808076 453
570 40.642 -73.789 7.461802 530
571 40.642 -73.788 7.797803 885
600 40.643 -73.790 6.812635 517

In [22]:
# Re-bin at 2 decimal places (≈1.1 km cells) and bucket pickup time into
# 3-hour windows, for both splits.
for frame in (train, test):
    frame.loc[:, 'pickup_lat_bin'] = np.round(frame['pickup_latitude'], 2)
    frame.loc[:, 'pickup_long_bin'] = np.round(frame['pickup_longitude'], 2)
    frame.loc[:, 'center_lat_bin'] = np.round(frame['center_latitude'], 2)
    frame.loc[:, 'center_long_bin'] = np.round(frame['center_longitude'], 2)
    frame.loc[:, 'pickup_dt_bin'] = frame['pickup_dt'] // (3 * 3600)

train.head()


Out[22]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... week_delta week_delta_sin hour_sin avg_speed_h avg_speed_m pickup_lat_bin pickup_long_bin center_lat_bin center_long_bin pickup_dt_bin
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 0 ... 0.725000 0.102188 0.629410 3.293452 3.814139 40.77 -73.98 40.77 -73.97 589.0
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 0 ... 6.029861 0.177891 0.000000 2.723239 3.665922 40.74 -73.98 40.73 -73.99 1304.0
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 0 ... 1.482639 0.381157 0.982963 3.006167 3.862323 40.76 -73.98 40.74 -73.99 147.0
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 0 ... 2.813889 0.908141 0.370590 3.462700 3.872567 40.72 -74.01 40.71 -74.01 774.0
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 0 ... 5.562500 0.361582 0.982963 2.732387 2.757372 40.79 -73.97 40.79 -73.97 684.0

5 rows × 40 columns


In [23]:
# Clustering
# Clustering: MiniBatchKMeans over all coordinates (train + test).
t0 = dt.datetime.now()

# Fix the seeds so cluster assignments are reproducible across kernel
# restarts — the original used an unseeded permutation and estimator.
rng = np.random.RandomState(42)
sample_ind = rng.permutation(len(coords))  # shuffle so mini-batches are well mixed
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000,
                         random_state=42).fit(coords[sample_ind])

for frame in (train, test):
    frame.loc[:, 'pickup_cluster'] = kmeans.predict(frame[['pickup_latitude', 'pickup_longitude']])
    frame.loc[:, 'dropoff_cluster'] = kmeans.predict(frame[['dropoff_latitude', 'dropoff_longitude']])

t1 = dt.datetime.now()
print('Time for clustering: %i seconds' % (t1 - t0).seconds)
train.head()


Time for clustering: 10 seconds
Out[23]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... hour_sin avg_speed_h avg_speed_m pickup_lat_bin pickup_long_bin center_lat_bin center_long_bin pickup_dt_bin pickup_cluster dropoff_cluster
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 0 ... 0.629410 3.293452 3.814139 40.77 -73.98 40.77 -73.97 589.0 16 94
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 0 ... 0.000000 2.723239 3.665922 40.74 -73.98 40.73 -73.99 1304.0 23 83
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 0 ... 0.982963 3.006167 3.862323 40.76 -73.98 40.74 -73.99 147.0 78 44
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 0 ... 0.370590 3.462700 3.872567 40.72 -74.01 40.71 -74.01 774.0 81 4
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 0 ... 0.982963 2.732387 2.757372 40.79 -73.97 40.79 -73.97 684.0 31 18

5 rows × 42 columns


In [24]:
# Temporal and geospatial aggregation
for gby_col in ['pickup_hour', 'pickup_date', 'pickup_dt_bin',
               'pickup_week_hour', 'pickup_cluster', 'dropoff_cluster']:
    gby = train.groupby(gby_col).mean()[['avg_speed_h', 'avg_speed_m', 'log_trip_duration']]
    gby.columns = ['%s_gby_%s' % (col, gby_col) for col in gby.columns]
    train = pd.merge(train, gby, how='left', left_on=gby_col, right_index=True)
    test = pd.merge(test, gby, how='left', left_on=gby_col, right_index=True)

for gby_cols in [['center_lat_bin', 'center_long_bin'],
                 ['pickup_hour', 'center_lat_bin', 'center_long_bin'],
                 ['pickup_hour', 'pickup_cluster'],  
                 ['pickup_hour', 'dropoff_cluster'],
                 ['pickup_cluster', 'dropoff_cluster']]:
    coord_speed = train.groupby(gby_cols).mean()[['avg_speed_h']].reset_index()
    coord_count = train.groupby(gby_cols).count()[['id']].reset_index()
    coord_stats = pd.merge(coord_speed, coord_count, on=gby_cols)
    coord_stats = coord_stats[coord_stats['id'] > 100]
    coord_stats.columns = gby_cols + ['avg_speed_h_%s' % '_'.join(gby_cols), 'cnt_%s' %  '_'.join(gby_cols)]
    train = pd.merge(train, coord_stats, how='left', on=gby_cols)
    test = pd.merge(test, coord_stats, how='left', on=gby_cols)

group_freq = '60min'
df_all = pd.concat((train, test))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
train.loc[:, 'pickup_datetime_group'] = train['pickup_datetime'].dt.round(group_freq)
test.loc[:, 'pickup_datetime_group'] = test['pickup_datetime'].dt.round(group_freq)

train.head()


Out[24]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... cnt_center_lat_bin_center_long_bin avg_speed_h_pickup_hour_center_lat_bin_center_long_bin cnt_pickup_hour_center_lat_bin_center_long_bin avg_speed_h_pickup_hour_pickup_cluster cnt_pickup_hour_pickup_cluster avg_speed_h_pickup_hour_dropoff_cluster cnt_pickup_hour_dropoff_cluster avg_speed_h_pickup_cluster_dropoff_cluster cnt_pickup_cluster_dropoff_cluster pickup_datetime_group
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 0 ... 57030.0 3.085869 3451.0 3.152372 1901.0 3.121137 1219.0 3.057220 778.0 2016-03-14 17:00:00
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 0 ... 53601.0 3.553433 2701.0 4.774890 722.0 3.889440 949.0 2.867844 473.0 2016-06-12 01:00:00
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 0 ... 82646.0 2.933729 3795.0 3.023724 2210.0 4.186092 578.0 4.710144 235.0 2016-01-19 12:00:00
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 0 ... 9309.0 2.971749 558.0 4.369606 1383.0 4.433726 712.0 3.225647 827.0 2016-04-06 20:00:00
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 0 ... 19077.0 4.312554 990.0 4.129998 1515.0 3.843676 1555.0 4.294911 2487.0 2016-03-26 14:00:00

5 rows × 71 columns


In [25]:
# Preview the generated column names for each group key (inspection only —
# nothing is merged here; `gby` is reused by the next cell).
for key in ['pickup_hour', 'pickup_date', 'pickup_dt_bin',
            'pickup_week_hour', 'pickup_cluster', 'dropoff_cluster']:
    gby = train.groupby(key).mean()[['avg_speed_h', 'avg_speed_m', 'log_trip_duration']]
    renamed = ['%s_gby_%s' % (col, key) for col in gby.columns]
    gby.columns = renamed
    print(gby.columns)


Index(['avg_speed_h_gby_pickup_hour', 'avg_speed_m_gby_pickup_hour',
       'log_trip_duration_gby_pickup_hour'],
      dtype='object')
Index(['avg_speed_h_gby_pickup_date', 'avg_speed_m_gby_pickup_date',
       'log_trip_duration_gby_pickup_date'],
      dtype='object')
Index(['avg_speed_h_gby_pickup_dt_bin', 'avg_speed_m_gby_pickup_dt_bin',
       'log_trip_duration_gby_pickup_dt_bin'],
      dtype='object')
Index(['avg_speed_h_gby_pickup_week_hour', 'avg_speed_m_gby_pickup_week_hour',
       'log_trip_duration_gby_pickup_week_hour'],
      dtype='object')
Index(['avg_speed_h_gby_pickup_cluster', 'avg_speed_m_gby_pickup_cluster',
       'log_trip_duration_gby_pickup_cluster'],
      dtype='object')
Index(['avg_speed_h_gby_dropoff_cluster', 'avg_speed_m_gby_dropoff_cluster',
       'log_trip_duration_gby_dropoff_cluster'],
      dtype='object')

In [26]:
# `gby` still holds the last loop iteration: the dropoff_cluster aggregates.
gby.head()


Out[26]:
avg_speed_h_gby_dropoff_cluster avg_speed_m_gby_dropoff_cluster log_trip_duration_gby_dropoff_cluster
dropoff_cluster
0 3.222116 4.126587 6.526590
1 3.390453 4.286948 6.495371
2 8.137941 11.160425 7.362408
3 4.594309 6.044043 6.249759
4 4.537347 5.992313 6.744443

In [27]:
# Count trips over 60min
# Count trips over 60min
# Rolling count of trips in the preceding `group_freq` (60-minute) window,
# computed over train+test together so the counts see all traffic.
df_counts = df_all.set_index('pickup_datetime')[['id']].sort_index()
# .isnull() yields a same-shape boolean frame with no NaNs, so
# .rolling(group_freq).count() simply counts every row in the time window.
df_counts['count_60min'] = df_counts.isnull().rolling(group_freq).count()['id']
# NOTE(review): merging back by 'id' assumes trip ids are unique across
# both splits — appears to hold for this dataset; verify.
train = train.merge(df_counts, on='id', how='left')
test = test.merge(df_counts, on='id', how='left')

train.head(30)


Out[27]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... avg_speed_h_pickup_hour_center_lat_bin_center_long_bin cnt_pickup_hour_center_lat_bin_center_long_bin avg_speed_h_pickup_hour_pickup_cluster cnt_pickup_hour_pickup_cluster avg_speed_h_pickup_hour_dropoff_cluster cnt_pickup_hour_dropoff_cluster avg_speed_h_pickup_cluster_dropoff_cluster cnt_pickup_cluster_dropoff_cluster pickup_datetime_group count_60min
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 0 ... 3.085869 3451.0 3.152372 1901.0 3.121137 1219.0 3.057220 778.0 2016-03-14 17:00:00 580.0
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 0 ... 3.553433 2701.0 4.774890 722.0 3.889440 949.0 2.867844 473.0 2016-06-12 01:00:00 652.0
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 0 ... 2.933729 3795.0 3.023724 2210.0 4.186092 578.0 4.710144 235.0 2016-01-19 12:00:00 547.0
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 0 ... 2.971749 558.0 4.369606 1383.0 4.433726 712.0 3.225647 827.0 2016-04-06 20:00:00 776.0
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 0 ... 4.312554 990.0 4.129998 1515.0 3.843676 1555.0 4.294911 2487.0 2016-03-26 14:00:00 609.0
5 id0801584 2 2016-01-30 22:01:40 2016-01-30 22:09:03 6 -73.982857 40.742195 -73.992081 40.749184 0 ... 3.575362 4844.0 3.862582 2367.0 3.544228 1537.0 2.247211 1326.0 2016-01-30 22:00:00 659.0
6 id1813257 1 2016-06-17 22:34:59 2016-06-17 22:40:40 4 -73.969017 40.757839 -73.957405 40.765896 0 ... 4.785684 978.0 4.132709 831.0 4.442024 1367.0 3.258810 752.0 2016-06-17 23:00:00 704.0
7 id1324603 2 2016-05-21 07:54:58 2016-05-21 08:20:49 1 -73.969276 40.797779 -73.922470 40.760559 0 ... 3.834210 939.0 4.653557 815.0 4.148504 149.0 NaN NaN 2016-05-21 08:00:00 222.0
8 id1301050 1 2016-05-27 23:12:23 2016-05-27 23:16:38 1 -73.999481 40.738400 -73.985786 40.732815 0 ... 3.918048 4133.0 4.144213 1432.0 4.142915 782.0 2.950334 487.0 2016-05-27 23:00:00 599.0
9 id0012891 2 2016-03-10 21:45:01 2016-03-10 22:05:26 1 -73.981049 40.744339 -73.973000 40.789989 0 ... 4.108588 2414.0 4.231687 2101.0 5.191671 1742.0 4.200690 197.0 2016-03-10 22:00:00 764.0
10 id1436371 2 2016-05-10 22:08:41 2016-05-10 22:29:55 1 -73.982651 40.763840 -74.002228 40.732990 0 ... 3.575362 4844.0 3.790353 2827.0 4.076291 1207.0 3.932262 300.0 2016-05-10 22:00:00 675.0
11 id1299289 2 2016-05-15 11:16:11 2016-05-15 11:34:59 4 -73.991531 40.749439 -73.956543 40.770630 0 ... 2.844484 3783.0 2.719137 2452.0 3.429599 1094.0 3.550410 536.0 2016-05-15 11:00:00 577.0
12 id1187965 2 2016-02-19 09:52:46 2016-02-19 10:11:20 2 -73.962982 40.756680 -73.984406 40.760719 0 ... 2.722634 3927.0 3.251457 1100.0 2.656152 2003.0 2.396857 287.0 2016-02-19 10:00:00 622.0
13 id0799785 2 2016-06-01 20:58:29 2016-06-01 21:02:49 1 -73.956306 40.767941 -73.966110 40.763000 0 ... 4.753583 3009.0 4.529184 1269.0 3.778637 552.0 2.830620 571.0 2016-06-01 21:00:00 685.0
14 id2900608 2 2016-05-27 00:43:36 2016-05-27 01:07:10 1 -73.992195 40.727226 -73.974655 40.783070 0 ... 3.867352 2381.0 4.223347 1658.0 5.677079 678.0 4.533535 136.0 2016-05-27 01:00:00 486.0
15 id3319787 1 2016-05-16 15:29:02 2016-05-16 15:32:33 1 -73.955513 40.768593 -73.948761 40.771545 0 ... 3.408137 831.0 3.237316 1413.0 3.440373 1455.0 5.504894 1547.0 2016-05-16 15:00:00 631.0
16 id3379579 2 2016-04-11 17:29:50 2016-04-11 18:08:26 1 -73.991165 40.755562 -73.999290 40.725353 0 ... 3.554611 2683.0 3.239811 1517.0 3.158781 1488.0 3.361056 476.0 2016-04-11 17:00:00 568.0
17 id1154431 1 2016-04-14 08:48:26 2016-04-14 09:00:37 1 -73.994255 40.745804 -73.999657 40.723343 0 ... 3.735414 2302.0 3.458586 823.0 3.550004 873.0 3.170458 433.0 2016-04-14 09:00:00 701.0
18 id3552682 1 2016-06-27 09:55:13 2016-06-27 10:17:10 1 -74.003983 40.713013 -73.979195 40.749924 0 ... 3.806191 2230.0 3.605195 535.0 2.585617 2433.0 4.720563 236.0 2016-06-27 10:00:00 557.0
19 id3390316 2 2016-06-05 13:47:23 2016-06-05 13:51:34 1 -73.983887 40.738197 -73.991203 40.727871 0 ... 3.328760 2198.0 3.308713 1022.0 3.323036 824.0 4.229128 846.0 2016-06-05 14:00:00 569.0
20 id2070428 1 2016-02-28 02:23:02 2016-02-28 02:31:08 1 -73.980370 40.742420 -73.962852 40.760635 0 ... 6.670052 624.0 5.622604 525.0 5.483900 268.0 4.876739 808.0 2016-02-28 02:00:00 727.0
21 id0809232 2 2016-04-01 12:12:25 2016-04-01 12:23:17 1 -73.979538 40.753361 -73.963997 40.763458 0 ... 2.731474 3907.0 3.036376 2295.0 2.813803 605.0 2.796585 259.0 2016-04-01 12:00:00 545.0
22 id2352683 1 2016-04-09 03:34:27 2016-04-09 03:41:30 1 -73.995865 40.758812 -73.993324 40.740322 0 ... 4.777298 1036.0 5.714316 342.0 4.607953 195.0 3.298622 366.0 2016-04-09 04:00:00 455.0
23 id1603037 1 2016-06-25 10:36:26 2016-06-25 10:55:49 1 -73.993553 40.747173 -74.006142 40.704384 0 ... 3.557703 2566.0 3.203422 853.0 4.471480 636.0 4.497097 116.0 2016-06-25 11:00:00 485.0
24 id3321406 2 2016-06-03 08:15:05 2016-06-03 08:56:30 1 -73.955231 40.777134 -73.788750 40.641472 0 ... NaN NaN 3.325140 2206.0 8.475760 221.0 NaN NaN 2016-06-03 08:00:00 661.0
25 id0129640 2 2016-02-14 13:27:56 2016-02-14 13:49:19 1 -73.956581 40.771358 -73.974968 40.732792 0 ... 3.722764 1240.0 3.578465 1407.0 3.487383 913.0 6.460617 341.0 2016-02-14 13:00:00 706.0
26 id3587298 1 2016-02-27 21:56:01 2016-02-27 22:14:51 1 -73.983765 40.749874 -73.958832 40.800961 0 ... 4.694206 1738.0 3.766723 1733.0 5.430098 783.0 NaN NaN 2016-02-27 22:00:00 650.0
27 id2104175 1 2016-06-20 23:07:16 2016-06-20 23:18:50 1 -73.958435 40.713192 -73.949539 40.680252 0 ... NaN NaN 4.183916 529.0 5.037164 449.0 5.056182 106.0 2016-06-20 23:00:00 513.0
28 id3973319 2 2016-06-13 21:57:27 2016-06-13 22:12:19 1 -73.994217 40.713306 -73.982849 40.692299 0 ... 4.606479 226.0 3.978923 212.0 4.841759 614.0 NaN NaN 2016-06-13 22:00:00 651.0
29 id1410897 1 2016-03-23 14:10:39 2016-03-23 14:49:30 1 -73.982117 40.756351 -73.865692 40.770988 0 ... 5.613759 589.0 2.997019 2305.0 5.428448 620.0 6.108822 328.0 2016-03-23 14:00:00 541.0

30 rows × 72 columns


In [28]:
# Demand features: smoothed trip counts into / out of each spatial cluster
# over time. `df_all` and `group_freq` are assumed to come from earlier
# cells -- TODO confirm.
# NOTE(review): pd.TimeGrouper was deprecated in pandas 0.21 and removed in
# 1.0; pd.Grouper(freq=...) is the drop-in replacement used here.

def _rolling_cluster_count(df, cluster_col, count_col):
    """Per-cluster trip counts in `group_freq` time bins, smoothed with a
    4-hour rolling mean and shifted back 2 hours so the smoothing window is
    centered on each bin.

    Returns a frame with columns ['pickup_datetime_group', cluster_col, count_col].
    """
    return df \
        .set_index('pickup_datetime') \
        .groupby([pd.Grouper(freq=group_freq), cluster_col]) \
        .agg({'id': 'count'}) \
        .reset_index().set_index('pickup_datetime') \
        .groupby(cluster_col).rolling('240min').mean() \
        .drop(cluster_col, axis=1) \
        .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
        .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': count_col})

def _attach_count(df, counts, cluster_col, count_col):
    """Left-join the smoothed counts onto `df`; bins with no trips get 0."""
    keys = ['pickup_datetime_group', cluster_col]
    return df[keys].merge(counts, on=keys, how='left')[count_col].fillna(0)

# Count how many trips are going to each cluster over time
dropoff_counts = _rolling_cluster_count(df_all, 'dropoff_cluster', 'dropoff_cluster_count')
train['dropoff_cluster_count'] = _attach_count(train, dropoff_counts, 'dropoff_cluster', 'dropoff_cluster_count')
test['dropoff_cluster_count'] = _attach_count(test, dropoff_counts, 'dropoff_cluster', 'dropoff_cluster_count')

# Count how many trips are going from each cluster over time
# (df_all is rebuilt here exactly as in the original cell, preserving order)
df_all = pd.concat((train, test))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
pickup_counts = _rolling_cluster_count(df_all, 'pickup_cluster', 'pickup_cluster_count')
train['pickup_cluster_count'] = _attach_count(train, pickup_counts, 'pickup_cluster', 'pickup_cluster_count')
test['pickup_cluster_count'] = _attach_count(test, pickup_counts, 'pickup_cluster', 'pickup_cluster_count')


train.head()


Out[28]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... avg_speed_h_pickup_hour_pickup_cluster cnt_pickup_hour_pickup_cluster avg_speed_h_pickup_hour_dropoff_cluster cnt_pickup_hour_dropoff_cluster avg_speed_h_pickup_cluster_dropoff_cluster cnt_pickup_cluster_dropoff_cluster pickup_datetime_group count_60min dropoff_cluster_count pickup_cluster_count
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 0 ... 3.152372 1901.0 3.121137 1219.0 3.057220 778.0 2016-03-14 17:00:00 580.0 12.00 25.50
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 0 ... 4.774890 722.0 3.889440 949.0 2.867844 473.0 2016-06-12 01:00:00 652.0 10.50 7.25
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 0 ... 3.023724 2210.0 4.186092 578.0 4.710144 235.0 2016-01-19 12:00:00 547.0 5.75 22.75
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 0 ... 4.369606 1383.0 4.433726 712.0 3.225647 827.0 2016-04-06 20:00:00 776.0 6.00 15.00
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 0 ... 4.129998 1515.0 3.843676 1555.0 4.294911 2487.0 2016-03-26 14:00:00 609.0 11.00 12.25

5 rows × 74 columns


In [29]:
# OSRM features: fastest-route distance, travel time and step count per trip,
# joined onto train/test by trip id (left join keeps all trips).
osrm_cols = ['id', 'total_distance', 'total_travel_time', 'number_of_steps']
fr1 = pd.read_csv('input/fastest_routes_train_part_1.csv', usecols=osrm_cols)
fr2 = pd.read_csv('input/fastest_routes_train_part_2.csv', usecols=osrm_cols)
test_street_info = pd.read_csv('input/fastest_routes_test.csv', usecols=osrm_cols)
train_street_info = pd.concat([fr1, fr2])
train = pd.merge(train, train_street_info, how='left', on='id')
test = pd.merge(test, test_street_info, how='left', on='id')
train_street_info.head()


Out[29]:
id total_distance total_travel_time number_of_steps
0 id2875421 2009.1 164.9 5
1 id2377394 2513.2 332.0 6
2 id3504673 1779.4 235.8 4
3 id2181028 1614.9 140.1 5
4 id0801584 1393.5 189.4 5

In [30]:
train.head()


Out[30]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... cnt_pickup_hour_dropoff_cluster avg_speed_h_pickup_cluster_dropoff_cluster cnt_pickup_cluster_dropoff_cluster pickup_datetime_group count_60min dropoff_cluster_count pickup_cluster_count total_distance total_travel_time number_of_steps
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 0 ... 1219.0 3.057220 778.0 2016-03-14 17:00:00 580.0 12.00 25.50 2009.1 164.9 5.0
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 0 ... 949.0 2.867844 473.0 2016-06-12 01:00:00 652.0 10.50 7.25 2513.2 332.0 6.0
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 0 ... 578.0 4.710144 235.0 2016-01-19 12:00:00 547.0 5.75 22.75 11060.8 767.6 16.0
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 0 ... 712.0 3.225647 827.0 2016-04-06 20:00:00 776.0 6.00 15.00 1779.4 235.8 4.0
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 0 ... 1555.0 4.294911 2487.0 2016-03-26 14:00:00 609.0 11.00 12.25 1614.9 140.1 5.0

5 rows × 77 columns


In [31]:
feature_names = list(train.columns)
# Columns present only in train (target and target-derived columns).
print(np.setdiff1d(train.columns, test.columns))
# Identifiers, raw datetimes, targets and leakage-prone derived columns
# must never feed the model.
do_not_use_for_training = ['id', 'log_trip_duration', 'pickup_datetime', 'dropoff_datetime', 'trip_duration', 'check_trip_duration',
                           'pickup_date', 'avg_speed_h', 'avg_speed_m', 'pickup_lat_bin', 'pickup_long_bin',
                           'center_lat_bin', 'center_long_bin', 'pickup_dt_bin', 'pickup_datetime_group']
feature_names = [f for f in train.columns if f not in do_not_use_for_training]
print(feature_names)
print('We have %i features.' % len(feature_names))
train[feature_names].count()
# np.log1p is the numerically stable form of log(x + 1); the target is
# log-transformed to match the RMSLE-style metric.
y = np.log1p(train['trip_duration'].values)

# `dt` (datetime alias) and `t0` are assumed to be defined in earlier
# cells -- TODO confirm; neither appears in this notebook section.
t1 = dt.datetime.now()
print('Feature extraction time: %i seconds' % (t1 - t0).seconds)


['avg_speed_h' 'avg_speed_m' 'check_trip_duration' 'dropoff_datetime'
 'log_trip_duration' 'trip_duration']
['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag', 'pickup_pca0', 'pickup_pca1', 'dropoff_pca0', 'dropoff_pca1', 'distance_haversine', 'distance_dummy_manhattan', 'direction', 'pca_manhattan', 'center_latitude', 'center_longitude', 'pickup_weekday', 'pickup_hour_weekofyear', 'pickup_hour', 'pickup_minute', 'pickup_dt', 'pickup_week_hour', 'week_delta', 'week_delta_sin', 'hour_sin', 'pickup_cluster', 'dropoff_cluster', 'avg_speed_h_gby_pickup_hour', 'avg_speed_m_gby_pickup_hour', 'log_trip_duration_gby_pickup_hour', 'avg_speed_h_gby_pickup_date', 'avg_speed_m_gby_pickup_date', 'log_trip_duration_gby_pickup_date', 'avg_speed_h_gby_pickup_dt_bin', 'avg_speed_m_gby_pickup_dt_bin', 'log_trip_duration_gby_pickup_dt_bin', 'avg_speed_h_gby_pickup_week_hour', 'avg_speed_m_gby_pickup_week_hour', 'log_trip_duration_gby_pickup_week_hour', 'avg_speed_h_gby_pickup_cluster', 'avg_speed_m_gby_pickup_cluster', 'log_trip_duration_gby_pickup_cluster', 'avg_speed_h_gby_dropoff_cluster', 'avg_speed_m_gby_dropoff_cluster', 'log_trip_duration_gby_dropoff_cluster', 'avg_speed_h_center_lat_bin_center_long_bin', 'cnt_center_lat_bin_center_long_bin', 'avg_speed_h_pickup_hour_center_lat_bin_center_long_bin', 'cnt_pickup_hour_center_lat_bin_center_long_bin', 'avg_speed_h_pickup_hour_pickup_cluster', 'cnt_pickup_hour_pickup_cluster', 'avg_speed_h_pickup_hour_dropoff_cluster', 'cnt_pickup_hour_dropoff_cluster', 'avg_speed_h_pickup_cluster_dropoff_cluster', 'cnt_pickup_cluster_dropoff_cluster', 'count_60min', 'dropoff_cluster_count', 'pickup_cluster_count', 'total_distance', 'total_travel_time', 'number_of_steps']
We have 62 features.
Feature extraction time: 86 seconds

Feature check before modeling


In [32]:
# Sanity-check the features before modeling: compare train vs. test means,
# stds and NaN rates. A large standardized mean gap flags a train/test
# distribution shift worth investigating.
train_vals = train[feature_names].values
test_vals = test[feature_names].values
feature_stats = pd.DataFrame({'feature': feature_names})
feature_stats.loc[:, 'train_mean'] = np.nanmean(train_vals, axis=0).round(4)
feature_stats.loc[:, 'test_mean'] = np.nanmean(test_vals, axis=0).round(4)
feature_stats.loc[:, 'train_std'] = np.nanstd(train_vals, axis=0).round(4)
feature_stats.loc[:, 'test_std'] = np.nanstd(test_vals, axis=0).round(4)
feature_stats.loc[:, 'train_nan'] = np.mean(np.isnan(train_vals), axis=0).round(3)
feature_stats.loc[:, 'test_nan'] = np.mean(np.isnan(test_vals), axis=0).round(3)
mean_gap = np.abs(feature_stats['train_mean'] - feature_stats['test_mean'])
spread = np.abs(feature_stats['train_std'] + feature_stats['test_std'])
feature_stats.loc[:, 'train_test_mean_diff'] = mean_gap / spread * 2
feature_stats.loc[:, 'train_test_nan_diff'] = np.abs(feature_stats['train_nan'] - feature_stats['test_nan'])
feature_stats = feature_stats.sort_values(by='train_test_mean_diff')
feature_stats[['feature', 'train_test_mean_diff']].tail()


Out[32]:
feature train_test_mean_diff
46 avg_speed_h_center_lat_bin_center_long_bin 0.002543
21 pickup_dt 0.002648
7 pickup_pca0 0.002774
10 dropoff_pca1 0.002833
18 pickup_hour_weekofyear 0.002872

Modeling


In [33]:
y[y.argsort()[:50]] # y is log :-)


Out[33]:
array([ 0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  1.09861229,  1.09861229,
        1.09861229,  1.09861229,  1.09861229,  1.09861229,  1.09861229,
        1.09861229,  1.09861229,  1.09861229,  1.09861229,  1.09861229,
        1.09861229,  1.09861229,  1.09861229,  1.09861229,  1.09861229])

In [34]:
# Inspect the 50 shortest trips: 1-2 second durations with (near-)identical
# pickup and dropoff coordinates -- clearly bogus records.
# NOTE(review): the original passed Series.argsort() (positional indices)
# into label-based .loc; that only works while train keeps its default
# RangeIndex. sort_values is index-safe and clearer.
train.sort_values('trip_duration')[['pickup_longitude',
                                    'pickup_latitude',
                                    'dropoff_longitude',
                                    'dropoff_latitude',
                                    'store_and_fwd_flag',
                                    'distance_haversine',
                                    'trip_duration']].head(50)


Out[34]:
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag distance_haversine trip_duration
207497 -73.819893 40.740822 -73.819885 40.740822 0 0.000643 1
1382872 -73.987991 40.724083 -73.987991 40.724079 0 0.000424 1
1360664 -73.991486 40.741940 -73.991478 40.741955 0 0.001814 1
346102 -73.985825 40.755760 -73.985901 40.755829 0 0.009980 1
1034341 -73.953728 40.670036 -73.953346 40.670021 0 0.032217 1
1439166 -73.975677 40.785488 -73.976372 40.785831 0 0.069815 1
35196 -73.940384 40.786423 -73.940300 40.786373 0 0.008963 1
918415 -74.004005 40.745125 -74.003998 40.745144 0 0.002216 1
767271 -73.982925 40.738781 -73.982925 40.738781 0 0.000000 1
810851 -73.946075 40.705254 -73.946075 40.705254 0 0.000000 1
600247 -73.861961 40.768559 -73.861961 40.768559 0 0.000000 1
1165514 -73.845016 40.874332 -73.845016 40.874332 0 0.000000 1
533799 -73.782417 40.644154 -73.782433 40.644169 0 0.002130 1
596136 -73.789658 40.647003 -73.789658 40.647003 0 0.000000 1
311745 -73.806152 40.660206 -73.806267 40.660049 0 0.019891 1
555237 -73.986282 40.750034 -73.986237 40.750088 0 0.007081 1
404610 -73.971954 40.750137 -73.971985 40.750130 0 0.002707 1
761666 -73.978676 40.715626 -73.978691 40.715626 0 0.001286 1
1063496 -73.977554 40.787083 -73.977554 40.787083 0 0.000000 1
1056854 -73.781754 40.644451 -73.781754 40.644459 0 0.000848 1
1306890 -73.949402 40.796833 -73.949486 40.796726 0 0.013819 1
467645 -73.787086 40.647388 -73.789780 40.646992 0 0.231459 1
1296239 -73.987595 40.719997 -73.987595 40.720005 0 0.000848 1
269418 -73.945999 40.792202 -73.947411 40.792740 0 0.133023 1
1125014 -73.968124 40.694622 -73.968124 40.694622 0 0.000000 1
1029695 -73.979240 40.765270 -73.979279 40.765236 0 0.004989 1
563953 -73.807213 40.658192 -73.807259 40.658051 0 0.016163 1
451396 -74.001610 40.737450 -74.001610 40.737450 0 0.000000 1
610159 -73.980560 40.754829 -73.980560 40.754829 0 0.000000 1
346959 -73.776512 40.646454 -73.776512 40.646454 0 0.000000 1
726971 -73.983154 40.767159 -73.983238 40.767097 0 0.009799 1
279493 -73.841423 40.695278 -73.841423 40.695274 0 0.000424 1
285635 -73.980690 40.744419 -73.980690 40.744419 0 0.000000 1
282028 -73.928391 40.808971 -73.928398 40.808975 0 0.000770 2
241947 -73.980919 40.759865 -73.980919 40.759865 0 0.000000 2
504757 -73.790138 40.646938 -73.790138 40.646938 0 0.000000 2
702422 -74.001785 40.740986 -74.001785 40.740986 0 0.000000 2
846297 -73.974747 40.752926 -73.974754 40.752930 0 0.000770 2
516136 -73.956703 40.803001 -73.956703 40.802990 0 0.001273 2
1152386 -73.789795 40.643040 -73.789795 40.643040 0 0.000000 2
315535 -73.989029 40.763493 -73.989098 40.763401 0 0.011708 2
68998 -73.928200 40.653770 -73.928200 40.653767 0 0.000424 2
1007642 -74.182350 40.687710 -74.182350 40.687710 0 0.000000 2
547818 -73.776550 40.646278 -73.776550 40.646282 0 0.000424 2
372538 -73.980385 40.744614 -73.980408 40.744610 0 0.001974 2
464384 -73.794342 40.657082 -73.794342 40.657082 0 0.000000 2
682599 -73.986237 40.757740 -73.987717 40.757290 0 0.134339 2
287664 -74.010735 40.728527 -74.010475 40.730782 0 0.251639 2
1197615 -73.965729 40.754471 -73.965729 40.754471 0 0.000000 2
866365 -73.980492 40.683014 -73.980484 40.682991 0 0.002625 2

In [35]:
# Inspect the 50 longest trips: a cluster near ~86400 s (exactly 24 h,
# suggesting a meter left running) plus four extreme outliers above 1.9M s.
# NOTE(review): the original passed Series.argsort() (positional indices)
# into label-based .loc; that only works while train keeps its default
# RangeIndex. sort_values is index-safe and clearer.
train.sort_values('trip_duration')[['pickup_longitude',
                                    'pickup_latitude',
                                    'dropoff_longitude',
                                    'dropoff_latitude',
                                    'store_and_fwd_flag',
                                    'distance_haversine',
                                    'trip_duration']].tail(50)


Out[35]:
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag distance_haversine trip_duration
913947 -73.980469 40.733913 -73.995087 40.743858 0 1.655186 86353
1298124 -74.005936 40.736198 -73.994583 40.750385 0 1.844787 86353
660239 -73.994003 40.724518 -73.985214 40.727486 0 0.810830 86354
98761 -74.009071 40.710686 -73.980965 40.764664 0 6.452339 86354
992593 -74.003242 40.732742 -74.006226 40.711880 0 2.333395 86354
447905 -74.000870 40.757580 -74.004807 40.748051 0 1.110267 86354
1107580 -73.991158 40.734909 -74.007851 40.714909 0 2.631501 86355
645313 -73.991150 40.750629 -73.987946 40.736725 0 1.569508 86356
836498 -74.001968 40.728039 -74.000465 40.742641 0 1.628673 86356
1296280 -73.992279 40.743641 -73.985779 40.747280 0 0.680896 86356
169857 -73.987442 40.753510 -73.996429 40.753166 0 0.758001 86356
612768 -73.989449 40.757431 -73.987381 40.765339 0 0.896392 86357
1437604 -73.991409 40.750252 -73.999252 40.728313 0 2.527341 86357
6513 -73.993744 40.727444 -74.001335 40.729244 0 0.670275 86357
883205 -73.962730 40.768219 -73.971992 40.791828 0 2.738605 86358
88508 -73.955414 40.779453 -73.963234 40.809166 0 3.368845 86358
351537 -73.984894 40.748360 -73.990868 40.734138 0 1.659483 86358
318008 -73.981255 40.780132 -73.983833 40.759827 0 2.268303 86360
1099093 -74.002815 40.760563 -73.988312 40.759232 0 1.230491 86361
1102514 -73.937073 40.764462 -73.937004 40.764519 0 0.008598 86361
1182032 -73.999023 40.734821 -73.978210 40.751495 0 2.551867 86361
556468 -73.985367 40.747066 -73.996208 40.732265 0 1.882269 86362
1258361 -73.972183 40.745804 -73.966812 40.753181 0 0.936853 86362
252615 -73.958870 40.761326 -73.979088 40.747147 0 2.320784 86362
755015 -73.991684 40.754745 -73.956551 40.766376 0 3.229359 86362
1399707 -73.983459 40.755875 -73.969604 40.768566 0 1.831177 86363
344777 -73.999870 40.725166 -73.975685 40.759354 0 4.313074 86364
290498 -73.968300 40.755257 -73.959572 40.762756 0 1.111687 86364
1282413 -73.986298 40.734531 -73.992271 40.749088 0 1.695088 86365
483960 -73.939819 40.798328 -73.924026 40.807671 0 1.687045 86366
1233528 -73.966072 40.773960 -73.983414 40.762760 0 1.919312 86367
1213613 -73.976395 40.739712 -73.984245 40.733238 0 0.977582 86367
1269316 -73.978783 40.731056 -73.997871 40.716831 0 2.256008 86369
1284067 -73.974281 40.762402 -73.988472 40.759548 0 1.236585 86369
779030 -73.992996 40.753071 -73.977242 40.765148 0 1.887932 86369
172662 -73.975540 40.789719 -73.978920 40.777836 0 1.351600 86369
1138915 -73.789696 40.643574 -74.001190 40.721466 0 19.825771 86377
66346 -74.000504 40.732121 -73.994202 40.712418 0 2.254313 86377
1221666 -73.982658 40.761963 -73.988976 40.743023 0 2.172213 86378
91717 -73.781990 40.644722 -73.980675 40.724918 0 18.978561 86378
753765 -74.006111 40.734680 -73.958809 40.815449 0 9.824692 86379
1360439 -73.782089 40.644806 -73.985016 40.666828 0 17.292454 86385
59891 -73.992279 40.749729 -73.962524 40.800770 0 6.203902 86387
73816 -73.996010 40.753220 -73.979027 40.740601 0 2.003924 86390
295382 -73.781952 40.644688 -73.993874 40.745926 0 21.117104 86391
1234291 -73.794525 40.644825 -73.991051 40.755573 0 20.642723 86392
355003 -73.789650 40.643559 -73.956810 40.773087 0 20.148664 1939736
680594 -73.921677 40.735252 -73.984749 40.759979 0 5.982487 2049578
924150 -73.983788 40.742325 -73.985489 40.727676 0 1.635128 2227612
978383 -73.783905 40.648632 -73.978271 40.750202 0 19.900661 3526282

In [36]:
train.shape  , train.loc[train.trip_duration < 1939735].shape


Out[36]:
((1458644, 77), (1458640, 77))

Additional data


In [38]:
# Load the externally augmented per-trip features (OSRM-style distance /
# duration plus road-class and intersection counts) for train and test.
train_augmented = pd.read_csv('nyc-taxi-trip-noisy/train_augmented.csv')
test_augmented = pd.read_csv('nyc-taxi-trip-noisy/test_augmented.csv')

In [39]:
train_augmented.shape , test_augmented.shape


Out[39]:
((1458643, 16), (625134, 16))

In [40]:
train_augmented.head()


Out[40]:
id distance duration motorway trunk primary secondary tertiary unclassified residential nTrafficSignals nCrossing nStop nIntersection srcCounty dstCounty
0 id2875421 2009.1 160.9 0.0 0.00000 0.0 0.000000 1.000000 0.0 0.000000 14 5 0 4 1.0 1.0
1 id2377394 2513.4 256.5 0.0 0.00000 0.0 0.348518 0.174776 0.0 0.143903 25 13 0 0 1.0 1.0
2 id3858529 9910.7 679.6 0.0 0.54282 0.0 0.372717 0.039806 0.0 0.006861 38 12 0 3 1.0 1.0
3 id3504673 1779.1 181.8 0.0 0.00000 0.0 0.000000 0.424452 0.0 0.039741 18 6 0 1 1.0 1.0
4 id2181028 1615.0 132.2 0.0 0.00000 0.0 0.637338 0.362663 0.0 0.000000 17 2 0 2 1.0 1.0

In [42]:
ti = np.intersect1d(ar1=train.id,ar2=train_augmented.id)

In [43]:
tie = np.intersect1d(ar1=train.id,ar2=test_augmented.id)

In [45]:
len(ti) , len(tie) , train.shape


Out[45]:
(1458643, 0, (1458644, 77))

In [62]:
train.loc[train['id']=='id0551003'].trip_duration , '--',train_augmented.loc[train_augmented['id']=='id0551003'].duration


Out[62]:
(66718    9451
 Name: trip_duration, dtype: int64, '--', 66718    1213.5
 Name: duration, dtype: float64)

In [54]:
train.loc[train['id']=='id2875421']


Out[54]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... cnt_pickup_hour_dropoff_cluster avg_speed_h_pickup_cluster_dropoff_cluster cnt_pickup_cluster_dropoff_cluster pickup_datetime_group count_60min dropoff_cluster_count pickup_cluster_count total_distance total_travel_time number_of_steps
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.96463 40.765602 0 ... 1219.0 3.05722 778.0 2016-03-14 17:00:00 580.0 12.0 25.5 2009.1 164.9 5.0

1 rows × 77 columns


In [50]:
train_augmented.loc[train_augmented['id']=='id2875421']


Out[50]:
id distance duration motorway trunk primary secondary tertiary unclassified residential nTrafficSignals nCrossing nStop nIntersection srcCounty dstCounty
0 id2875421 2009.1 160.9 0.0 0.0 0.0 0.0 1.0 0.0 0.0 14 5 0 4 1.0 1.0

In [55]:
# Check coverage: every test id should have a row in the augmented test table.
tie2 = np.intersect1d(test.id, test_augmented.id)
(len(tie2), test.shape)


Out[55]:
(625134, (625134, 71))

In [56]:
test_augmented.head()


Out[56]:
id distance duration motorway trunk primary secondary tertiary unclassified residential nTrafficSignals nCrossing nStop nIntersection srcCounty dstCounty
0 id3004672 3795.9 360.7 0.0 0.000000 0.0 0.522511 0.477489 0.0 0.000000 37 39 0 1 1.0 1.0
1 id3505355 2829.9 196.4 0.0 0.000000 0.0 0.255318 0.724963 0.0 0.000000 5 0 0 17 3.0 3.0
2 id1217141 1499.5 148.5 0.0 0.000000 0.0 0.428047 0.231395 0.0 0.000000 12 4 0 0 1.0 1.0
3 id2150126 6492.3 442.4 0.0 0.561299 0.0 0.213389 0.074967 0.0 0.078344 23 9 0 0 1.0 1.0
4 id1598245 1108.2 94.0 0.0 0.000000 0.0 0.638693 0.195813 0.0 0.000000 9 0 0 1 1.0 1.0

In [57]:
test.loc[test['id']=='id3004672']


Out[57]:
id vendor_id pickup_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag pickup_date ... cnt_pickup_hour_dropoff_cluster avg_speed_h_pickup_cluster_dropoff_cluster cnt_pickup_cluster_dropoff_cluster pickup_datetime_group count_60min dropoff_cluster_count pickup_cluster_count total_distance total_travel_time number_of_steps
0 id3004672 1 2016-06-30 23:59:58 1 -73.988129 40.732029 -73.990173 40.75668 0 2016-06-30 ... 1722.0 3.343171 369.0 2016-07-01 624.0 0.0 0.0 3795.9 424.6 4

1 rows × 71 columns


In [ ]:
train_augmented = pd.read_csv('nyc-taxi-trip-noisy/train_augmented.csv')