In [1]:
import urllib
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# for subsampling the data
from random import sample
# for plotting on a geographic map
import folium
from folium import plugins
import mplleaflet
Download the data. At the moment, only data from June 2015 is considered.
In [2]:
# list containing the link(s) to the csv file(s)
data_links = ['https://storage.googleapis.com/tlc-trip-data/2015/yellow_tripdata_2015-06.csv']
filenames = []
for link in data_links:
    filenames.append(link.split('/')[-1])
    if not os.path.isfile(filenames[-1]):  # do not download file if it already exists
        urllib.urlretrieve(link, filenames[-1])
Load the data into a pandas data frame and take a look at it:
In [3]:
df = pd.DataFrame()
for filename in filenames:
    df = df.append(pd.read_csv(filename), ignore_index=True)
In [4]:
df.head()
Out[4]:
In [5]:
df.info()
In [6]:
df.describe()
Out[6]:
Note that some of the numerical features, such as the tip amount and the fare amount, contain negative values. These invalid values will be removed in the next section.
Only credit card tips are recorded in the data set, so let's retain only trips paid by credit card. This might introduce some bias, since credit card payers may have a different tipping behaviour than others.
As seen below, most trips are paid by credit card anyway (label "1"), followed by cash (label "2").
In [7]:
df.groupby('payment_type').size().plot(kind='bar');
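According to the TLC data dictionary, the payment codes are 1 = credit card, 2 = cash, 3 = no charge, 4 = dispute, 5 = unknown, 6 = voided trip. As an optional sketch, the bars above could be relabelled with these names (the mapping comes from the data dictionary, not from this data set):
# optional: relabel the payment_type bars (mapping assumed from the TLC data dictionary)
payment_labels = {1: 'credit card', 2: 'cash', 3: 'no charge', 4: 'dispute', 5: 'unknown', 6: 'voided trip'}
counts = df.groupby('payment_type').size()
ax = counts.plot(kind='bar')
ax.set_xticklabels([payment_labels.get(code, str(code)) for code in counts.index]);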
For some trips a credit card tip was recorded even though the payment type is not credit card:
In [8]:
np.sum((df.payment_type != 1) & (df.tip_amount != 0))
Out[8]:
However, the number of such trips is negligible, so I ignore them here and retain only credit card trips. The column "payment_type" can then be removed:
In [9]:
df = df[df.payment_type == 1]
df.drop('payment_type', axis=1, inplace=True)
df.shape
Out[9]:
Check which columns contain negative values...
In [10]:
(df < 0).sum()
Out[10]:
...and remove the corresponding rows where negative values do not make any sense:
In [11]:
col_names = ['total_amount', 'improvement_surcharge', 'tip_amount', 'mta_tax', 'extra', 'fare_amount']
# this removes all rows where at least one value of the columns in col_names is < 0
rows_to_keep = (df[col_names] >= 0).sum(axis=1) == len(col_names)
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]
(df[col_names] < 0).sum() # check if it worked
Out[11]:
Look at the distribution of the trip distance (on a random subsample):
In [16]:
ax = df.loc[sample(df.index, 30000)].plot(y='trip_distance',kind='hist', bins=200)
ax.set_xlim([0,25]);
Delete trips that are longer than 50 miles...
In [18]:
rows_to_keep = df.trip_distance <= 50
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]
...and shorter than 0.1 miles:
In [19]:
rows_to_keep = df.trip_distance >= 0.1
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]
Next, the distribution of the fare amount:
In [29]:
ax = df.loc[sample(df.index, 300000)].plot(y='fare_amount',kind='hist', bins=200)
ax.set_xlim([0,102]);
There seems to be a fair number of trips with a flat fare of about 50 USD (see the spike above).
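One way to check whether this spike corresponds to the JFK flat fare (rate code 2 in the TLC data dictionary) is to look at the rate codes of those trips. A minimal sketch, assuming the June 2015 schema contains a rate-code column named "RateCodeID" (the exact capitalization has varied between releases):
# count the rate codes of trips whose fare lies near the spike (assumes a 'RateCodeID' column)
near_flat_fare = df[(df.fare_amount >= 49) & (df.fare_amount <= 53)]
print near_flat_fare['RateCodeID'].value_counts()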
Now let's remove rows where the fare is below 1 USD:
In [38]:
rows_to_keep = df.fare_amount >= 1
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]
In [39]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
In [40]:
df['trip_duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
In [41]:
df['trip_duration_minutes'] = df.trip_duration.dt.total_seconds() / 60  # total_seconds() also handles durations of a day or more
Look at the distribution of the trip duration:
In [43]:
ax = df.loc[sample(df.index, 300000)].plot(y='trip_duration_minutes', kind='hist', bins=500)
ax.set_xlim([0,150]);
Remove trips that took less than half a minute...
In [45]:
rows_to_keep = df.trip_duration_minutes>0.5
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]
...as well as trips with a duration of more than 2 hours:
In [46]:
rows_to_keep = df.trip_duration_minutes<=2*60
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]
The distribution of the passenger count:
In [48]:
df.plot(y='passenger_count', kind='hist', bins=30);
Out[48]:
Remove trips with zero passenger count:
In [49]:
rows_to_keep = df.passenger_count > 0
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]
Remove trips with a passenger count of more than 6:
In [50]:
rows_to_keep = df.passenger_count <= 6
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]
Keep only trips whose pickup coordinates lie within the New York area:
In [51]:
within_NY = (df.pickup_latitude > 40) & (df.pickup_latitude < 40.9) & \
            (df.pickup_longitude > -74.4) & (df.pickup_longitude < -73.4)
print 'removing ' + str((~within_NY).sum()) + ' rows...'
df = df[within_NY]
Plot the pickup locations to check if they look good. Choose a random sample of all trips, since plotting all trips would take quite a while.
In [53]:
fig, ax = plt.subplots(figsize=(15, 10))
df.loc[sample(df.index, 200000)].plot(x='pickup_longitude', y='pickup_latitude',
kind='scatter', ax=ax, alpha=0.3, s=3)
ax.set_xlim([-74.2, -73.7])
ax.set_ylim([40.6, 40.9]);
The above plot looks reasonable; the geography of New York is clearly recognizable. Let's plot a small subset of the data points on a map. Besides central New York, small hotspots are visible at the surrounding airports.
In [92]:
subdf = df.loc[sample(df.index, 10000)] # subsample df
data = subdf[['pickup_latitude', 'pickup_longitude']].values
mapa = folium.Map([40.7, -73.9], zoom_start=11, tiles='stamentoner') # create heatmap
mapa.add_children(plugins.HeatMap(data, min_opacity=0.005, max_zoom=18,
max_val=0.01, radius=3, blur=3))
mapa
Out[92]:
The distribution of the tip amount:
In [57]:
fig, ax = plt.subplots(figsize=(12,4))
ax = df.loc[sample(df.index, 100000)].plot(y='tip_amount', kind='hist',bins=1500, ax=ax)
ax.set_xlim([0,10.5])
ax.set_xticks(np.arange(0, 11, 0.5));
A useful metric for a taxi driver to compare tips is the tip as a percentage of the total fare amount.
In [63]:
# check if the fares and fees sum up to total_amount
print pd.concat([df.tip_amount + df.fare_amount + df.tolls_amount + \
df.extra + df.mta_tax + df.improvement_surcharge, \
df.total_amount], axis=1).head()
# calculate tip percentage
df['total_fare'] = df.total_amount - df.tip_amount
df['tip_percentage'] = df.tip_amount / df.total_fare * 100
The tip percentage distribution below shows that people mostly seem to tip 0, 20, 25 or 30%.
In [80]:
data = df.loc[sample(df.index, 100000)].tip_percentage.values
plt.hist(data, np.arange(min(data)-0.5, max(data)+1.5))
plt.gca().set_xlim([0,35])
plt.gca().set_xticks(np.arange(0, 51, 5));
plt.legend(['tip_percentage']);
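To quantify this, one can round the tip percentage to whole percent and count the most frequent values; a minimal sketch (the exact counts depend on the cleaning steps above):
# most common (rounded) tip percentages
print df.tip_percentage.round().value_counts().head(10)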
Remove trips where a tip of more than 100% was recorded, regarding them as invalid outliers.
In [83]:
rows_to_keep = df.tip_percentage <= 100
print 'removing '+ str((~rows_to_keep).sum()) + ' rows...'
df = df[rows_to_keep]
Some summary statistics of the tip percentage:
In [31]:
df.tip_percentage.mean()
Out[31]:
In [32]:
df.tip_percentage.median()
Out[32]:
In [33]:
df.tip_percentage.mode()
Out[33]:
In [189]:
df.tip_percentage.quantile(0.25)
Out[189]:
In [50]:
# fig, ax = plt.subplots(figsize=(14,5))
# ax = df.loc[sample(df.index, 100000)].tip_percentage.plot(kind='hist',bins=2000, cumulative=True)
# ax.set_xlim([0,200])
Tip percentage by hour of the day, for each day of the week (Monday=0, Sunday=6). People tend to tip a little less on weekends (days 5 and 6).
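The cells below group by "pickup_weekday" and "pickup_hour", which are not created in the cells shown here. They can be derived from the pickup timestamp, for example as in this sketch (column names assumed to match those used below):
# derive weekday (Monday=0) and hour of day from the pickup timestamp
df['pickup_weekday'] = df.tpep_pickup_datetime.dt.weekday
df['pickup_hour'] = df.tpep_pickup_datetime.dt.hour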
In [90]:
fig, ax = plt.subplots(figsize=(12, 6))
for i in range(7):
    df[df.pickup_weekday==i].groupby('pickup_hour').mean().plot(y='tip_percentage', ax=ax)
plt.legend(['day ' + str(x) for x in range(7)])
ax.set_ylabel('average tip percentage')
Out[90]:
Let's look at the number of trips per hour and day:
In [91]:
fig, ax = plt.subplots(figsize=(12, 6))
for i in range(7):
    df[df.pickup_weekday==i].groupby('pickup_hour').size().plot(ax=ax)
plt.legend(['day ' + str(x) for x in range(7)])
ax.set_ylabel('number of trips')
Out[91]:
The tip percentage does not seem to depend much on the number of passengers:
In [70]:
fig, ax = plt.subplots(figsize=(8,7))
df.boxplot('tip_percentage', by='passenger_count', showmeans=True, ax=ax)
ax.set_ylim([15,21])
Out[70]:
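As a numeric complement to the boxplot, the mean tip percentage per passenger count can be computed directly; a small sketch:
# mean tip percentage for each passenger count
print df.groupby('passenger_count').tip_percentage.mean()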
Save the cleaned data frame to a file:
In [84]:
df.to_pickle('df.pickle')