In [1]:

    
import urllib
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

 # for subsampling the data
from random import sample

# for plotting on a geographic map
import folium 
from folium import plugins 
import mplleaflet

Getting and inspecting the data

Downloading the data. At the moment, only data from June 2015 is considered.



In [2]:

    
# list containing the link(s) to the csv file(s)
data_links = ['https://storage.googleapis.com/tlc-trip-data/2015/yellow_tripdata_2015-06.csv']

filenames = []
for link in data_links:
    filenames.append(link.split('/')[-1])
    if not(os.path.isfile(filenames[-1])): # do not download file if it already exists
        urllib.urlretrieve(link, filename)

Loading the data into a pandas data frame and look at it:



In [3]:

    
df = pd.DataFrame()
for filename in filenames:
    df = df.append(pd.read_csv(filename), ignore_index=True)



In [4]:

    
df.head()









    Out[4]:






  
    
      
      VendorID
      tpep_pickup_datetime
      tpep_dropoff_datetime
      passenger_count
      trip_distance
      pickup_longitude
      pickup_latitude
      RateCodeID
      store_and_fwd_flag
      dropoff_longitude
      dropoff_latitude
      payment_type
      fare_amount
      extra
      mta_tax
      tip_amount
      tolls_amount
      improvement_surcharge
      total_amount
    
  
  
    
      0
      2
      2015-06-02 11:19:29
      2015-06-02 11:47:52
      1
      1.63
      -73.954430
      40.764141
      1
      N
      -73.974754
      40.754093
      2
      17.0
      0
      0.5
      0.00
      0
      0.3
      17.80
    
    
      1
      2
      2015-06-02 11:19:30
      2015-06-02 11:27:56
      1
      0.46
      -73.971443
      40.758942
      1
      N
      -73.978539
      40.761909
      1
      6.5
      0
      0.5
      1.00
      0
      0.3
      8.30
    
    
      2
      2
      2015-06-02 11:19:31
      2015-06-02 11:30:30
      1
      0.87
      -73.978111
      40.738434
      1
      N
      -73.990273
      40.745438
      1
      8.0
      0
      0.5
      2.20
      0
      0.3
      11.00
    
    
      3
      2
      2015-06-02 11:19:31
      2015-06-02 11:39:02
      1
      2.13
      -73.945892
      40.773529
      1
      N
      -73.971527
      40.760330
      1
      13.5
      0
      0.5
      2.86
      0
      0.3
      17.16
    
    
      4
      1
      2015-06-02 11:19:32
      2015-06-02 11:32:49
      1
      1.40
      -73.979088
      40.776772
      1
      N
      -73.982162
      40.758999
      2
      9.5
      0
      0.5
      0.00
      0
      0.3
      10.30



In [5]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 12324935 entries, 0 to 12324934
Data columns (total 19 columns):
VendorID                 int64
tpep_pickup_datetime     object
tpep_dropoff_datetime    object
passenger_count          int64
trip_distance            float64
pickup_longitude         float64
pickup_latitude          float64
RateCodeID               int64
store_and_fwd_flag       object
dropoff_longitude        float64
dropoff_latitude         float64
payment_type             int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtypes: float64(12), int64(4), object(3)
memory usage: 1.8+ GB



In [6]:

    
df.describe()









    Out[6]:






  
    
      
      VendorID
      passenger_count
      trip_distance
      pickup_longitude
      pickup_latitude
      RateCodeID
      dropoff_longitude
      dropoff_latitude
      payment_type
      fare_amount
      extra
      mta_tax
      tip_amount
      tolls_amount
      improvement_surcharge
      total_amount
    
  
  
    
      count
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
      12324935.000000
    
    
      mean
      1.523565
      1.681898
      11.829076
      -72.913847
      40.166867
      1.041832
      -72.944737
      40.184784
      1.379517
      13.204085
      0.323405
      0.497618
      1.736538
      0.316152
      0.299721
      16.378268
    
    
      std
      0.499444
      1.335180
      7678.550287
      8.796762
      4.843162
      0.581957
      8.669562
      4.777574
      0.499528
      106.076575
      0.480415
      0.042148
      2.637613
      1.542573
      0.012164
      106.382788
    
    
      min
      1.000000
      0.000000
      0.000000
      -759.233337
      -67.136963
      1.000000
      -754.166687
      -16.177874
      1.000000
      -300.000000
      -30.500000
      -0.500000
      -80.000000
      -14.000000
      -0.300000
      -300.000000
    
    
      25%
      1.000000
      1.000000
      1.010000
      -73.991898
      40.736141
      1.000000
      -73.991302
      40.734627
      1.000000
      6.500000
      0.000000
      0.500000
      0.000000
      0.000000
      0.300000
      8.760000
    
    
      50%
      2.000000
      1.000000
      1.750000
      -73.981537
      40.753235
      1.000000
      -73.979622
      40.753799
      1.000000
      9.500000
      0.000000
      0.500000
      1.160000
      0.000000
      0.300000
      12.300000
    
    
      75%
      2.000000
      2.000000
      3.230000
      -73.966461
      40.767929
      1.000000
      -73.962479
      40.768787
      2.000000
      15.000000
      0.500000
      0.500000
      2.350000
      0.000000
      0.300000
      18.300000
    
    
      max
      2.000000
      9.000000
      10083318.000000
      149.028534
      69.702576
      99.000000
      125.535568
      483.450012
      5.000000
      335413.690000
      652.420000
      60.350000
      980.910000
      900.970000
      0.700000
      335414.490000

Note that some of the numerical features like tip amount and fare amount actually contain negative values. Those invalid values will be deleted in the next section.

Data cleaning

1) Retaining only trips paid by credit card

Only the credit card tips are recorded in the data set. Therefore, let's only retain trips with credit card payment. This might introduce some bias (as credit card payers may have a different tipping behaviour than others).

As seen below, most of the trips are anyway paid by credit card (label "1", followed by cash payment, label "2").



In [7]:

    
df.groupby('payment_type').size().plot(kind='bar');

For some trips, people actually tipped with credit card, even though they did not pay with credit card:



In [8]:

    
np.sum((df.payment_type != 1) & (df.tip_amount != 0))









    Out[8]:





388

However, the number of those trips is negligible, so I ignore them here and only retain credit card trips. Then, the column "payment_type" can be removed:



In [9]:

    
df = df[df.payment_type == 1]
df.drop('payment_type', axis=1, inplace=True)
df.shape









    Out[9]:





(7719300, 18)

2) Checking for unfeasible values in numerical features

As seen above, some of the numerical features contained negative values. Let's have a closer look...



In [10]:

    
(df < 0).sum()









    Out[10]:





VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count                0
trip_distance                  0
pickup_longitude         7610164
pickup_latitude                2
RateCodeID                     0
store_and_fwd_flag             0
dropoff_longitude        7623507
dropoff_latitude               1
fare_amount                   10
extra                          4
mta_tax                        6
tip_amount                     8
tolls_amount                   0
improvement_surcharge         10
total_amount                  11
dtype: int64

...and remove the corresponding rows where negative values do not make any sense:



In [11]:

    
col_names = ['total_amount', 'improvement_surcharge', 'tip_amount', 'mta_tax', 'extra', 'fare_amount']

# this removes all rows where at least one value of the columns in col_names is < 0
rows_to_keep = (df[col_names] >= 0).sum(axis=1) == len(col_names)
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]

(df[col_names] < 0).sum() # check if it worked









    



removing 13 rows...






    Out[11]:





total_amount             0
improvement_surcharge    0
tip_amount               0
mta_tax                  0
extra                    0
fare_amount              0
dtype: int64

3) Deleting "invalid" trips

Inspecting trip distance



In [16]:

    
ax = df.loc[sample(df.index, 30000)].plot(y='trip_distance',kind='hist', bins=200)
ax.set_xlim([0,25]);

Delete trips that are longer than 50 miles...



In [18]:

    
rows_to_keep = df.trip_distance <= 50
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]









    



removing 247 rows...

...and shorter than 0.1 miles:



In [19]:

    
rows_to_keep = df.trip_distance >= 0.1
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]









    



removing 30086 rows...

Inspecting trip fare



In [29]:

    
ax = df.loc[sample(df.index, 300000)].plot(y='fare_amount',kind='hist', bins=200)
ax.set_xlim([0,102]);

There seem to be a decent amount of trips with a fixed rate of 50 USD (see spike above).

Now let's remove rows where the fare is below 1 USD:



In [38]:

    
rows_to_keep = df.fare_amount >= 1
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]









    



removing 274 rows...

Inspecting trip duration



In [39]:

    
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)



In [40]:

    
df['trip_duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime



In [41]:

    
df['trip_duration_minutes'] = df.trip_duration.dt.seconds/60



In [43]:

    
ax = df.loc[sample(df.index, 300000)].plot(y='trip_duration_minutes', kind='hist', bins=500)
ax.set_xlim([0,150]);

Remove trips that took less than half a minute...



In [45]:

    
rows_to_keep = df.trip_duration_minutes>0.5
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]









    



removing 3625 rows...

...as well as trips with a duration of more than 2 hours:



In [46]:

    
rows_to_keep = df.trip_duration_minutes<=2*60
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]









    



removing 5988 rows...

Inspecting passenger count



In [48]:

    
df.plot(y='passenger_count', kind='hist', bins=30);









    Out[48]:





<matplotlib.axes._subplots.AxesSubplot at 0x7ff6fc6c3990>

Remove trips with zero passenger count:



In [49]:

    
rows_to_keep = df.passenger_count > 0
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]









    



removing 784 rows...

Remove trips with a passenger count of more than 6:



In [50]:

    
rows_to_keep = df.passenger_count <= 6
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]









    



removing 17 rows...

Removing invalid location coordinates

Remove trips that obviously did not start in NY:



In [51]:

    
within_NY = (df.pickup_latitude > 40) & (df.pickup_latitude < 40.9) & \
            (df.pickup_longitude > -74.4) & (df.pickup_longitude < -73.4)

print 'removing '+ str((~within_NY).sum()) + ' rows...'
df = df[within_NY]









    



removing 100456 rows...

Plot the pickup locations to check if they look good. Choose a random sample of all trips, since plotting all trips would take quite a while.



In [53]:

    
fig, ax = plt.subplots(figsize=(15, 10))
df.loc[sample(df.index, 200000)].plot(x='pickup_longitude', y='pickup_latitude', 
                                      kind='scatter', ax=ax, alpha=0.3, s=3)
ax.set_xlim([-74.2, -73.7])
ax.set_ylim([40.6, 40.9]);

The above plot looks reasonable, you can clearly identify the geometry of New York. Let's plot a small subset of data points on a map. Next to central NY, one can identify small hotspots at the surrounding airports.



In [92]:

    
subdf = df.loc[sample(df.index, 10000)] # subsample df 
data = subdf[['pickup_latitude', 'pickup_longitude']].values

mapa = folium.Map([40.7, -73.9], zoom_start=11, tiles='stamentoner') # create heatmap
mapa.add_children(plugins.HeatMap(data, min_opacity=0.005, max_zoom=18,
                                  max_val=0.01, radius=3, blur=3))
mapa









    Out[92]:

Inspecting the tip

As the tip distribution below shows, people tend to tip whole numbers of dollars (see peaks at e.g. 1 and 2 dollars).



In [57]:

    
fig, ax = plt.subplots(figsize=(12,4))
ax = df.loc[sample(df.index, 100000)].plot(y='tip_amount', kind='hist',bins=1500, ax=ax)
ax.set_xlim([0,10.5])
ax.set_xticks(np.arange(0, 11, 0.5));

A useful metric for a taxi driver to compare tips is the percentage of tip given with respect to the total fare amount.



In [63]:

    
# check if the fares and fees sum up to total_amount
print pd.concat([df.tip_amount + df.fare_amount + df.tolls_amount + \
           df.extra + df.mta_tax  + df.improvement_surcharge, \
           df.total_amount], axis=1).head()

# calculate tip percentage
df['total_fare'] = df.total_amount - df.tip_amount
df['tip_percentage'] = df.tip_amount / df.total_fare * 100









    



       0  total_amount
1   8.30          8.30
2  11.00         11.00
3  17.16         17.16
5  10.55         10.55
6  16.30         16.30

The tip percentage distribution below shows that people mostly seem to tip 0, 20, 25 or 30%.



In [80]:

    
data = df.loc[sample(df.index, 100000)].tip_percentage.values
plt.hist(data, np.arange(min(data)-0.5, max(data)+1.5)) 
plt.gca().set_xlim([0,35])
plt.gca().set_xticks(np.arange(0, 51, 5));
plt.legend(['tip_percentage']);

Remove trips where a tip of more than 100% was recorded, regarding them as invalid outliers.



In [83]:

    
rows_to_keep = df.tip_percentage <= 100
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]









    



removing 5015 rows...



In [31]:

    
df.tip_percentage.mean()









    Out[31]:





18.455572081937966



In [32]:

    
df.tip_percentage.median()









    Out[32]:





19.999999999999996



In [33]:

    
df.tip_percentage.mode()









    Out[33]:





0    20
dtype: float64



In [189]:

    
df.tip_percentage.quantile(0.25)









    Out[189]:





15.384615384615385



In [50]:

    
# fig, ax = plt.subplots(figsize=(14,5))
# ax = df.loc[sample(df.index, 100000)].tip_percentage.plot(kind='hist',bins=2000, cumulative=True)
# ax.set_xlim([0,200])

Tip percentage by day of the week (Monday=0, Sunday=6). People tend to tip a little less on weekends (day 5-6).



In [90]:

    
fig, ax = plt.subplots(figsize=(12, 6))
for i in range(7):
    df[df.pickup_weekday==i].groupby('pickup_hour').mean().plot(y='tip_percentage', ax=ax)
plt.legend(['day ' + str(x) for x in range(7)])
ax.set_ylabel('average tip percentage')









    Out[90]:





<matplotlib.text.Text at 0x7f22daa24850>

Let's look at the number of trips per hour and day:



In [91]:

    
fig, ax = plt.subplots(figsize=(12, 6))
for i in range(7):
    df[df.pickup_weekday==i].groupby('pickup_hour').size().plot(ax=ax)
plt.legend(['day ' + str(x) for x in range(7)])
ax.set_ylabel('number of trips')









    Out[91]:





<matplotlib.text.Text at 0x7f22d6c417d0>

The tip percentage does seem to depend too much on the number of passengers:



In [70]:

    
fig, ax = plt.subplots(figsize=(8,7))
df.boxplot('tip_percentage', by='passenger_count', showmeans=True, ax=ax)
ax.set_ylim([15,21])









    Out[70]:





(15, 21)

Save the cleaned data frame to a file:



In [84]:

    
df.to_pickle('df.pickle')

	VendorID	tpep_pickup_datetime	tpep_dropoff_datetime	passenger_count	trip_distance	pickup_longitude	pickup_latitude	RateCodeID	store_and_fwd_flag	dropoff_longitude	dropoff_latitude	payment_type	fare_amount	mta_tax	tip_amount	improvement_surcharge	total_amount
0	2	2015-06-02 11:19:29	2015-06-02 11:47:52	1	1.63	-73.954430	40.764141	1	N	-73.974754	40.754093	2	17.0	0.5	0.00	0.3	17.80
1	2	2015-06-02 11:19:30	2015-06-02 11:27:56	1	0.46	-73.971443	40.758942	1	N	-73.978539	40.761909	1	6.5	0.5	1.00	0.3	8.30
2	2	2015-06-02 11:19:31	2015-06-02 11:30:30	1	0.87	-73.978111	40.738434	1	N	-73.990273	40.745438	1	8.0	0.5	2.20	0.3	11.00
3	2	2015-06-02 11:19:31	2015-06-02 11:39:02	1	2.13	-73.945892	40.773529	1	N	-73.971527	40.760330	1	13.5	0.5	2.86	0.3	17.16
4	1	2015-06-02 11:19:32	2015-06-02 11:32:49	1	1.40	-73.979088	40.776772	1	N	-73.982162	40.758999	2	9.5	0.5	0.00	0.3	10.30

	VendorID	passenger_count	trip_distance	pickup_longitude	pickup_latitude	RateCodeID	dropoff_longitude	dropoff_latitude	payment_type	fare_amount	extra	mta_tax	tip_amount	tolls_amount	improvement_surcharge	total_amount
count	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000	12324935.000000
mean	1.523565	1.681898	11.829076	-72.913847	40.166867	1.041832	-72.944737	40.184784	1.379517	13.204085	0.323405	0.497618	1.736538	0.316152	0.299721	16.378268
std	0.499444	1.335180	7678.550287	8.796762	4.843162	0.581957	8.669562	4.777574	0.499528	106.076575	0.480415	0.042148	2.637613	1.542573	0.012164	106.382788
min	1.000000	0.000000	0.000000	-759.233337	-67.136963	1.000000	-754.166687	-16.177874	1.000000	-300.000000	-30.500000	-0.500000	-80.000000	-14.000000	-0.300000	-300.000000
25%	1.000000	1.000000	1.010000	-73.991898	40.736141	1.000000	-73.991302	40.734627	1.000000	6.500000	0.000000	0.500000	0.000000	0.000000	0.300000	8.760000
50%	2.000000	1.000000	1.750000	-73.981537	40.753235	1.000000	-73.979622	40.753799	1.000000	9.500000	0.000000	0.500000	1.160000	0.000000	0.300000	12.300000
75%	2.000000	2.000000	3.230000	-73.966461	40.767929	1.000000	-73.962479	40.768787	2.000000	15.000000	0.500000	0.500000	2.350000	0.000000	0.300000	18.300000
max	2.000000	9.000000	10083318.000000	149.028534	69.702576	99.000000	125.535568	483.450012	5.000000	335413.690000	652.420000	60.350000	980.910000	900.970000	0.700000	335414.490000