Uber Data Analysis

Version 0.1

Analysis of a user's Uber Trips

10 Questions Answered.


In [1]:
# General syntax to import specific functions in a library: 
##from (library) import (specific library function)
from pandas import DataFrame, read_csv

import numpy as np
import pandas as pd
import matplotlib as plt
import sys
import datetime as dt

# Enable inline plotting
%matplotlib inline

# Style the plots
plt.style.use('ggplot')

In [2]:
print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)
print('Matplotlib version ' + plt.__version__)


Python version 3.5.1 (default, Apr 18 2016, 11:46:32) 
[GCC 4.2.1 Compatible Apple LLVM 7.3.0 (clang-703.0.29)]
Pandas version 0.18.1
Matplotlib version 1.5.1

In [3]:
# Location of the exported receipts (a CSV file without a header row)
file_location = r'~/Projects/uber_receipts/_sample_data/sample_data.csv'
# Column names to assign, in file order
column_names = ['date', 'driver', 'fare', 'car_type', 'city', 'map']
raw_data = pd.read_csv(file_location, names=column_names, header=None)

In [4]:
#
# DATA CLEANUP
#
# Parse the 'date' column (expected as YYYY-MM-DD) and make it the index.
# The guard keeps this cell safe to re-run: once 'date' has become the index
# there is no 'date' column left to convert, so running the conversion a
# second time would raise a KeyError.
if raw_data.index.name != 'date':
    raw_data['date'] = pd.to_datetime(raw_data['date'], format='%Y-%m-%d')
    raw_data = raw_data.set_index('date')

In [5]:
# Remove the currency sign from the Fare
#
# Major assumption here is that the currency
# is in USD -- Change the value of
# currency_sign depending on the currency of your
# account
currency_sign = '$'
# Plain str.replace always matches the sign literally. '$' is a regex
# metacharacter, and whether Series.str.replace treats its pattern as a regex
# depends on the pandas version, so a sign such as 'R$' could silently fail
# to match. Non-string values (e.g. missing fares) are passed through as-is.
raw_data['fare'] = raw_data['fare'].map(
    lambda fare: fare.replace(currency_sign, '') if isinstance(fare, str) else fare
)

In [6]:
# Split the Paid rides / Canceled rides and Free rides
# into 3 separate data frames
#
# Paid rides: only the rows whose fare parses as a number. Non-numeric fares
# (such as the "Canceled" / "Free" markers) become NaN and are dropped, so
# they are neither counted as paid rides nor pull the fare averages to zero.
paid_rides = raw_data.copy(deep=True)
paid_rides['fare'] = pd.to_numeric(paid_rides['fare'], errors='coerce')
paid_rides = paid_rides.dropna(subset=['fare'])

In [7]:
# Rides whose fare field holds the "Canceled" marker
is_canceled = raw_data['fare'] == "Canceled"
canceled_rides = raw_data[is_canceled]

In [8]:
# Rides whose fare field holds the "Free" marker
is_free = raw_data['fare'] == "Free"
free_rides = raw_data[is_free]

#1) What is the total number of rides per month?


In [9]:
# Number of rides in each (year, month)
month_keys = [paid_rides.index.year, paid_rides.index.month]
rides_per_month = paid_rides['fare'].groupby(month_keys).count()
rides_per_month.plot.bar()


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c948be0>

#2) Top 10 drivers by number of rides completed?


In [10]:
# Rides per driver, largest first, limited to the ten busiest drivers
driver_names = paid_rides['driver']
rides_per_driver = driver_names.groupby(driver_names).count()
top_drivers = rides_per_driver.sort_values(ascending=False).head(10)
top_drivers.plot.bar()


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c9e73c8>

#3) Total cost of rides per month?


In [11]:
# Sum of fares in each (year, month)
month_keys = [paid_rides.index.year, paid_rides.index.month]
cost_per_month = paid_rides['fare'].groupby(month_keys).sum()
cost_per_month.plot.bar()


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cb00940>

#4) Total cost of rides per week?


In [12]:
# Sum of fares keyed by (year, month, ISO week number).
# The week number is taken from each timestamp's isocalendar() because the
# DatetimeIndex.week attribute is not available in current pandas releases.
# Missing timestamps map to NaN instead of raising.
iso_week = paid_rides.index.map(
    lambda ts: ts.isocalendar()[1] if pd.notnull(ts) else np.nan
)
cost_per_week = paid_rides['fare'].groupby(
    [paid_rides.index.year, paid_rides.index.month, iso_week]
).sum()
cost_per_week.plot(kind='bar')


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cc2a4e0>

#5) Average ride cost per year?


In [13]:
# Mean fare of the rides in each year
ride_year = paid_rides.index.year
avg_fare_per_year = paid_rides['fare'].groupby(ride_year).mean()
avg_fare_per_year.plot.bar()


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cdc1048>

#6) Average single ride cost per week?


In [14]:
# Mean fare keyed by (year, month, ISO week number).
# The week number is taken from each timestamp's isocalendar() because the
# DatetimeIndex.week attribute is not available in current pandas releases.
# Missing timestamps map to NaN instead of raising.
iso_week = paid_rides.index.map(
    lambda ts: ts.isocalendar()[1] if pd.notnull(ts) else np.nan
)
avg_fare_per_week = paid_rides['fare'].groupby(
    [paid_rides.index.year, paid_rides.index.month, iso_week]
).mean()
avg_fare_per_week.plot(kind='bar')


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ceb35c0>

#7) Most expensive ride per year?


In [15]:
# Highest single fare in each year
year_keys = [paid_rides.index.year]
max_fare_per_year = paid_rides['fare'].groupby(year_keys).max()
max_fare_per_year.plot.bar()


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cef53c8>

#8) Top 15 days with the most rides?


In [16]:
# Rides per calendar day, keyed by (year, month, day); keep the 15 busiest
day_keys = [paid_rides.index.year, paid_rides.index.month, paid_rides.index.day]
rides_per_day = paid_rides['fare'].groupby(day_keys).count()
busiest_days = rides_per_day.sort_values(ascending=False).head(15)
busiest_days.plot.bar()


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d120e48>

#9) Average number of rides per week per month?


In [17]:
# Average number of rides per week, reported per (year, month).
# Step 1: count the rides falling in each weekly bin (a week without rides
#         contributes a count of 0).
# Step 2: average those weekly counts within each (year, month); a week is
#         assigned to the month of its resample label.
# This uses resample/groupby rather than pd.TimeGrouper, which is not
# available in current pandas releases, and it averages ride counts -- not
# fares -- to match the question being asked.
rides_per_week = paid_rides['fare'].resample('W').count()
avg_weekly_rides = rides_per_week.groupby(
    [rides_per_week.index.year, rides_per_week.index.month]
).mean()
avg_weekly_rides.plot(kind='bar')


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d284470>