This notebook will perform the process that was applied to the August's data in the previous notebook to the rest of months. So, if you're just reading the project you can skip this file and open the next notebook.
In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
In [2]:
import os
import numpy as np
import pandas as pd
In [3]:
# VARIABLES
# Paths.
faredata_path = '../data/faredata/trip_fare_{0}.csv'
tripdata_path = '../data/tripdata/trip_data_{0}.csv'
cleaned_data_path = '../data/cleaned/cleaned_{0}.csv'
# Tip percentage variables.
tip_perc_necesary_columns = ['fare_amount', 'surcharge', 'mta_tax', 'tip_amount']
tip_perc_column_name = 'tip_perc'
# Coordinates variables.
min_x = -74.361107
min_y = 40.459518
max_x = -71.903083
max_y = 41.175342
step_x = 0.003205 # 270 meters (885 feet) approximately.
step_y = 0.002429 # 270 meters (885 feet) approximately.
# Number of maximum areas per axis in the 'area matrix'.
max_area_x = 766 # int((max_x - min_x) / step_x)
max_area_y = 294 # int((max_y - min_y) / step_y)
# Number of columns of the 'area matrix'.
num_columns = 767 # max_area_x + 1
# The area at the top right corner in the 'area matrix'.
max_area = 226264 # (max_area_y * num_columns) + max_area_x
# Minimum number of pickups or dropoffs per area.
min_area_number = 20
# Columns for creating the pickup and dropoff area attributes.
area_column_names = ['pickup_area', 'dropoff_area']
area_necesary_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
# Order of the columns to apply it at before saving.
column_order = ['medallion', 'hack_license', 'vendor_id', 'pickup_datetime', 'fare_amount', 'surcharge', 'tip_amount',
'tip_perc', 'tolls_amount', 'total_amount', 'passenger_count', 'trip_time_in_secs', 'trip_distance',
'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
In [4]:
# FUNCTIONS
# Function for caculating the tip percentage of a trip.
def calculate_tip_perc(row):
subtotal = row.fare_amount + row.surcharge + row.mta_tax
tip = row.tip_amount / subtotal
tip_perc = tip * 100
return pd.Series({tip_perc_column_name: tip_perc})
# Functions for calculating. the pickup and dropoff areas in the 'area matrix'.
def get_1d_area(coor, min_coor, step_coor):
return int((coor - min_coor) / step_coor)
def get_area(area_x, area_y, columns):
return (area_y * columns) + area_x
def calculate_area(row):
pickup_x = get_1d_area(row.pickup_longitude, min_x, step_x)
pickup_y = get_1d_area(row.pickup_latitude, min_y, step_y)
pickup_area = get_area(pickup_x, pickup_y, num_columns)
dropoff_x = get_1d_area(row.dropoff_longitude, min_x, step_x)
dropoff_y = get_1d_area(row.dropoff_latitude, min_y, step_y)
dropoff_area = get_area(dropoff_x, dropoff_y, num_columns)
return pd.Series({
area_column_names[0]: pickup_area,
area_column_names[1]: dropoff_area
})
# This auxiliary function applies another one to every row in a DataFrame for creating new columns.
def iterate_and_apply(dataframe, function, necesary_columns):
perform = True
step = 100000
start = 0
to = step
while perform:
new_columns = dataframe[start:to][necesary_columns].apply(function, axis=1)
if len(new_columns) == 0:
perform = False
else:
dataframe.update(new_columns)
new_columns = None
start += step
to += step
return dataframe
In [5]:
# Creating the directory for saving the cleaned files if it doesn't exist.
if not os.path.exists('../data/cleaned/'):
os.makedirs('../data/cleaned/')
In [6]:
# The range of files to clean.
months = range(1, 8) + range(9, 13)
for month in months:
# Cleaning 'fare' data.
data = pd.read_csv(faredata_path.format(month), skipinitialspace=True)
payment_type = (data.payment_type == 'CRD')
fare_amount = ((data.fare_amount >= 3.0) & (data.fare_amount <= 200.0))
surcharge = ((data.surcharge == 0.0) | (data.surcharge == 0.5) | (data.surcharge == 1.0))
mta_tax = (data.mta_tax == 0.5)
tip_amount = ((data.tip_amount >= 0.0) & (data.tip_amount <= 100.0))
tolls_amount = ((data.tolls_amount >= 0.0) & (data.tolls_amount <= 30.0))
data = data[payment_type & fare_amount & surcharge & mta_tax & tip_amount & tolls_amount]
data.drop(['payment_type'], axis=1, inplace=True)
payment_type = None
fare_amount = None
surcharge = None
mta_tax = None
tip_amount = None
tolls_amount = None
# Calculating the tip percentage.
data[tip_perc_column_name] = np.nan
data = iterate_and_apply(data, calculate_tip_perc, tip_perc_necesary_columns)
data.drop(['mta_tax'], axis=1, inplace=True)
tip_perc = (data.tip_perc <= 50.0)
data = data[tip_perc]
tip_perc = None
# Adding the trip data file.
tripdata = pd.read_csv(tripdata_path.format(month), skipinitialspace=True, usecols=[3, 7, 8, 9, 10, 11, 12, 13])
for column in tripdata.columns:
data[column] = np.nan
'''
Careful! The next line use a lot of memory, a least more than 8GB.
It's showed here beacuse it's simplier that the another way I know to update the 'fare' DataFrame:
Read the 'trip' file with NumPy for creating a lot of little DataFrames and update them
into the data variable in a loop.
'''
data.update(tripdata)
tripdata = None
# Cleaning 'trip' data.
rate_code = (data.rate_code == 1.0)
passenger_count = ((data.passenger_count >= 1.0) & (data.passenger_count <= 6.0))
trip_time_in_secs = ((data.trip_time_in_secs > 0.0) & (data.trip_time_in_secs <= 3600.0))
trip_distance = ((data.trip_distance > 0.0) & (data.trip_distance <= 25.0))
pickup_latitude = ((data.pickup_latitude >= 40.459518) & (data.pickup_latitude <= 41.175342))
pickup_longitude = ((data.pickup_longitude >= -74.361107) & (data.pickup_longitude <= -71.903083))
dropoff_latitude = ((data.dropoff_latitude >= 40.459518) & (data.dropoff_latitude <= 41.175342))
dropoff_longitude = ((data.dropoff_longitude >= -74.361107) & (data.dropoff_longitude <= -71.903083))
data = data[rate_code & passenger_count & trip_time_in_secs & trip_distance & pickup_latitude & pickup_longitude
& dropoff_latitude & dropoff_longitude]
data.drop(['rate_code'], axis=1, inplace=True)
rate_code = None
passenger_count = None
trip_time_in_secs = None
trip_distance = None
pickup_latitude = None
pickup_longitude = None
dropoff_latitude = None
dropoff_longitude = None
# Filtering the coordinates of unusal places.
for column in area_column_names:
data[column] = np.nan
data = iterate_and_apply(data, calculate_area, area_necesary_columns)
aux_pickup = data.groupby(['pickup_area']).size()
aux_pickup = aux_pickup[aux_pickup >= min_area_number]
aux_pickup = data['pickup_area'].isin(aux_pickup.index)
aux_dropoff = data.groupby(['dropoff_area']).size()
aux_dropoff = aux_dropoff[aux_dropoff >= min_area_number]
aux_dropoff = data['dropoff_area'].isin(aux_dropoff.index)
data = data[aux_pickup & aux_dropoff]
data.drop(['pickup_area', 'dropoff_area'], axis=1, inplace=True)
aux_pickup = None
aux_dropoff = None
# Saving the file.
data = data.reindex_axis(column_order, axis=1)
data.to_csv(cleaned_data_path.format(month), index=True)
data = None