In [1]:
import pandas as pd # For working with tabular data
import numpy as np
import matplotlib.colors as colors # For nice colors and color scales
import matplotlib.cm as cm # More color scale stuff
from ggplot import * # For doing cool plots using grammar of graphics
#from fast_kde import fast_kde # For lightning fast 2d kernel density estimation
# `fast_kde.py` has to be in the same folder as this IPython notebook
%matplotlib inline
In [2]:
# df = pd.read_csv('Data/orig_data.csv',parse_dates=["starttime", "stoptime"])
In [3]:
# # Add day of week fields
# df['dayofweek'] = df['starttime'].apply(lambda x: x.weekday())
# def day_type(x):
# days = ['Mon','Tues','Wednes','Thurs','Fri','Satur','Sun']
# return days[x.weekday()] + "day"
# df['dayofweek_name'] = df['starttime'].apply(day_type)
# # Add week type field
# def week_type(x):
# if x < 5:
# return 'Weekday'
# else:
# return 'Weekend'
# df['weektype'] = df['dayofweek'].apply(week_type)
In [4]:
# # Calculate geodesic distance
# from math import radians, cos, sin, asin, sqrt
# def haversine(lon1, lat1, lon2, lat2):
# """
# Calculate the great circle distance between two points
# on the earth (specified in decimal degrees)
# """
# # convert decimal degrees to radians
# lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
# # haversine formula
# dlon = lon2 - lon1
# dlat = lat2 - lat1
# a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
# c = 2 * asin(sqrt(a))
# # 6367 km is the radius of the Earth
# km = 6367 * c
# return km
# def distance(row):
# return haversine(*row)
# df['dist'] = df[['start station longitude', 'start station latitude', 'end station longitude',\
# 'end station latitude']].apply(distance, axis=1)
# # Convert distance in km to feet
# df['dist_ft'] = df['dist'].apply(lambda x: x * 3280.84)
In [5]:
# df.to_csv('Data/working_data.csv', index=False)
In [2]:
# Load the pre-processed trip data; parse the timestamp columns as datetimes
# so hour/weekday access works downstream.
df = pd.read_csv('Data/working_data.csv', parse_dates=["starttime", "stoptime"])
In [5]:
# Attach borough and neighborhood attributes to both trip endpoints by
# joining the auxiliary station table once per endpoint id column.
aux_info = pd.read_csv('Data/station_aux_info.csv')[['stationid','borough','neighborhood']]
for station_col in ['start station id', 'end station id']:
    side = station_col.split(' ', 1)[0]  # 'start' or 'end'
    merged = pd.merge(df, aux_info, left_on=station_col, right_on='stationid')
    merged = merged.rename(columns={'borough': side + '_boro',
                                    'neighborhood': side + '_neigh'})
    # The join key from aux_info duplicates the station id column; drop it.
    merged = merged.drop('stationid', axis=1)
    df = merged
In [5]:
# Number of distinct start stations; unique() returns an ndarray, so
# shape[0] is the count of distinct station ids.
station_count = df['start station id'].unique().shape[0]
print 'As of May 2014, there are', station_count, 'bike stations.'
In [6]:
# Number of distinct bikes observed in the trip data.
bike_count = df.bikeid.unique().shape[0] # Shape gets the count of index
print 'As of May 2014, there are', bike_count, 'bikes.'
In [7]:
# Trip-length summary: convert seconds to minutes, then report the total
# number of trips and the mean trip duration.
df['tripmins'] = df['tripduration']/60 # Convert tripduration (in seconds) to minutes
avg_trip = df.tripmins.mean()
total_trip = df.shape[0]  # number of rows = number of trips
print 'As of May 2014, there were a total of', total_trip, 'trips, with an average of', avg_trip, 'mins per trip.'
Differences between one-time customers and annual subscribers are apparent, with one-time customers taking about 12.92 minutes longer per trip. This possibly reflects the difference between utilitarian and leisure uses.
In [8]:
# Compare user types: trip counts and mean trip length per usertype.
type_count = df.usertype.value_counts()
type_ave_trip = df.groupby('usertype').tripmins.mean()
# np.diff calculates the discrete difference along an array; [::-1] reverses the order of values
# (groupby orders the index alphabetically -- presumably Customer, Subscriber --
# so the reversed diff is Customer minus Subscriber; TODO confirm labels)
type_trip_diff = np.diff(type_ave_trip[::-1])
print 'User type count:\n', type_count
print '\nAverage trip by user type:\n', type_ave_trip
print '\nDifference in trip length:', type_trip_diff[0]
In [9]:
# There's a bug in ggplot for geom_density with log_scale, so this helper
# builds power-of-ten axis labels for log-scaled breaks instead.
def sci_note(num, decimals=0, precision=None, exponent=None):
    """Return a LaTeX power-of-ten label (e.g. "$10^{3}$") for `num`.

    Parameters:
        num: value whose order of magnitude is labelled; must be non-zero
            when `exponent` is not supplied.
        decimals, precision: unused; kept for backward compatibility with
            the original signature.
        exponent: optional explicit exponent; when None it is derived
            from num.
    """
    # `floor` and `log10` were referenced without being imported in the
    # original cell (NameError at call time); import them explicitly here.
    from math import floor, log10
    # Use `is None` so an explicitly passed exponent of 0 is honored
    # (the original `if not exponent:` would recompute it).
    if exponent is None:
        exponent = int(floor(log10(abs(num))))
    # The original also computed an unused coefficient/precision pair;
    # only the exponent ever reached the returned label, so that dead
    # code is removed.
    return r"$10^{{{0:d}}}$".format(exponent)
## Work on the log of trip length so the distribution looks more 'normal';
## store it in its own column so ggplot can reference it by name.
df['loglength'] = np.log(df.tripmins)
# Base plot: one density curve per user type, with axis labels.
p = ggplot(aes(x='loglength', color='usertype'), data=df) \
    + xlab('Trip Duration (log scale)') \
    + ylab('Density')
## Tick positions (in minutes) for the x axis, and the two-color palette.
vals = [1,10,100,1000,10000,100000]
color_list = ['#33cbff','#006bb6']
# Densities plus a continuous x scale whose breaks sit at log(vals) and
# whose labels come from the sci_note helper above.
fig_1 = (p
         + geom_density(fill=True, alpha=0.3)
         + scale_x_continuous(breaks=list(np.log(vals)),
                              labels=[sci_note(i) for i in vals])
         + scale_colour_manual(values=color_list))
fig_1
Out[9]:
In [10]:
# Save Figure 1
# (note: this filename contains a space, unlike the underscored names below)
ggsave(fig_1, 'Figures_Exploratory/Figure 1.png')
Usage level does not vary much across the days of the week, with only a very slight decrease during the weekend.
In [11]:
# Bar chart of trip counts per weekday (dayofweek is numeric; presumably
# 0 = Monday given the labels below -- from datetime.weekday()).
p = ggplot(aes("dayofweek"), data=df) \
    + xlab("Day of Week") \
    + ylab("Counts")
# Replace the numeric ticks with weekday names.
fig_2 = p + geom_bar(fill='#006bb6') + scale_x_continuous(
    labels=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
fig_2
In [12]:
#Save Figure 2
# Persist the day-of-week bar chart for the report.
ggsave(fig_2, 'Figures_Exploratory/Figure_2.png')
While overall usage does not vary much throughout the week, the mix of user types differs between weekdays and weekends, as shown in the density plot. This reflects utilitarian usage (i.e., commuting) on weekdays and leisure usage on weekends.
In [ ]:
# Density of trips across the week by user type: subscribers dominate the
# weekdays while customers peak on the weekend.
color_list = ['#33cbff','#006bb6']
p = ggplot(aes("dayofweek", color='usertype'), data=df) \
    + xlab("Day of Week") \
    + ylab("Density")
fig_3 = (p
         + geom_density(size=2)
         + scale_x_continuous(labels=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
         + scale_colour_manual(values=color_list))
fig_3
In [14]:
# Save Figure 3
# NOTE(review): the '//' in the path is redundant but harmless.
ggsave(fig_3, 'Figures_Exploratory//Figure_3.png')
In [15]:
# Count trips per (day of week, user type); `bikeid` is just a convenient
# column to count, renamed to 'count' afterwards.
counts = df.groupby(['dayofweek', 'usertype'], as_index=False).agg(len)
usertypeAgg = counts[['dayofweek', 'usertype', 'bikeid']]
usertypeAgg.columns = ['dayofweek', 'usertype', 'count']
In [22]:
# Stacked bar chart of daily trip counts split by user type.
usertypeAgg['dayofweek'] = usertypeAgg['dayofweek'].astype(str)
p = ggplot(aes('dayofweek', y='count', color='usertype', fill='usertype'), data=usertypeAgg)
p += xlab("Day of Week")
p += ylab("Counts")
# NOTE(review): scale_x_continuous is applied to a column just cast to str --
# confirm the weekday labels line up with the intended tick order.
stackedBar = p + geom_bar(stat="bar", color='steelblue') +\
scale_x_continuous(labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']) + scale_color_brewer()
stackedBar
Out[22]:
In [13]:
# Save the stacked bar chart.
ggsave(stackedBar, 'Figures_Exploratory//stackedbar.png')
In [16]:
# Trip-distance density split by user type (palette order is reversed
# relative to the earlier figures).
color_list = ['#006bb6','#33cbff']
p = ggplot(aes(x='dist_ft', color='usertype'), data=df) \
    + xlab('Distance (feet)') \
    + ylab('Density')
fig_4 = p + geom_density(fill=True, alpha=0.3) + scale_colour_manual(values=color_list)
fig_4
Out[16]:
In [17]:
# Save Figure 4 (trip-distance density by user type).
ggsave(fig_4, 'Figures_Exploratory//Figure_4.png')
In [18]:
# Recode the data using more semantically appropriate names
df.gender[df.gender==1] = 'Male'
df.gender[df.gender==2] = 'Female'
df.gender[df.gender==0] = 'Unknown'
#Gender counts
gender_counts = df.gender.value_counts()
# Gender proportions:
gender_prop = df.gender.value_counts() / df.gender.value_counts().sum()
print 'Gender counts:\n', gender_counts
print '\nProportions:\n', gender_prop
In [19]:
# Bar chart of trip counts by rider gender label.
p = ggplot(aes("gender"), data=df) \
    + xlab("Gender") \
    + ylab("Counts")
fig_5 = p + geom_bar(fill='#006bb6')
fig_5
Out[19]:
In [20]:
#Save Figure 5
# Persist the gender bar chart.
ggsave(fig_5, 'Figures_Exploratory//Figure_5.png')
In [21]:
# Let's look at differences in distance by gender
# (one density curve per gender label: Female / Male / Unknown).
color_list = ['#FFAAAA','#33cbff','#006bb6']
p = ggplot(aes(x='dist_ft', color='gender'), data=df) \
    + xlab('Distance (feet)') \
    + ylab('Density')
fig_6 = p + geom_density(size=2) + scale_colour_manual(values=color_list)
fig_6
Out[21]:
In [22]:
# Save Figure 6
# Persist the distance-by-gender density plot.
ggsave(fig_6, 'Figures_Exploratory//Figure_6.png')
In [23]:
#Create new field to store speed in miles/hour (convert km to miles and seconds to hour)
# dist is presumably in km (1.60934 km per mile) -- produced by the
# commented-out haversine step above; tripmins/60 converts minutes to hours.
df['speed'] = (df.dist/1.60934)/(df.tripmins/60)
In [24]:
# Mean speed across all trips, including zero-distance round trips.
avg_speed = df.speed.mean()
print 'Average speed per trip with round trips:', avg_speed, 'miles/hour'
Average daily Manhattan travel speed in 2012 was 10.2 mph. Weekday Manhattan central business district (CBD) from 8am-6pm in July-Aug 2012 average taxi speed was about 9.3 mph. Source: http://www.nyc.gov/html/dot/downloads/pdf/sustainable_streets_index_12.pdf
Average walking speed 3.1 mph.
In [25]:
# Trips with zero speed have zero geodesic distance, i.e. they start and end
# at the same station ("round trips"); exclude them from the speed average.
round_trip = df[df['speed'] == 0]
total_trip = df.shape[0]
round_trip_percent = round(((float(round_trip.shape[0])/total_trip)*100),2)
avg_speed = df[df['speed'] != 0].speed.mean()
print 'Total Round Trips:', round_trip.shape[0]
print 'Percent of Round Trips:', round_trip_percent
print 'Average speed per trip sans round trips:', avg_speed, 'miles/hour'
In [26]:
# Mean speed by user type, round trips excluded; the reversed diff gives
# the gap between the two user types (presumably Customer vs Subscriber --
# TODO confirm label order).
usertype_speed = df[df['speed'] != 0].groupby('usertype').speed.mean()
usertype_speed_diff = np.diff(usertype_speed[::-1])
print usertype_speed.sort_index(ascending = False)
print 'Difference in speed by usertype:', usertype_speed_diff[0]
In [27]:
# Mean speed by gender, comparing only the known genders (Female vs Male);
# the 'Unknown' group is excluded from the difference.
gender_speed = df[df['speed'] != 0].groupby('gender').speed.mean()
gender_speed_diff = np.diff(gender_speed[['Female', 'Male']][::-1])
print gender_speed.sort_index(ascending = False)
print 'Difference in speed by known gender:', gender_speed_diff[0]
In [28]:
# Mean speed per day of week and the spread between the fastest and
# slowest day (round trips excluded).
dayofwk_speed = df[df['speed'] != 0].groupby('dayofweek').speed.mean()
dayofwk_speed_diff = dayofwk_speed.max() - dayofwk_speed.min()
print 'Average speed for each day of week:', dayofwk_speed
print 'Difference in maximum and minimum day of week speed:', dayofwk_speed_diff
In [29]:
# Mean speed by week type (Weekday vs Weekend), round trips excluded.
wktype_speed = df[df['speed'] != 0].groupby('weektype').speed.mean()
wktype_speed_diff = np.diff(wktype_speed[::-1])
print 'Average speed by week type:', wktype_speed
print 'Difference in speed between weekday and weekend:', wktype_speed_diff[0]
In [54]:
# Aggregate the data by start location
# Count trips per start station; `bikeid` is only a column to count over.
starts = df.groupby('start station id', as_index=False).agg(len)[['start station id', 'bikeid']]
starts.columns = ['StartStationID', 'Count']
# index=False keeps the spurious integer index out of the CSV, matching
# how every other CSV in this notebook is written (the original omitted it
# here only, producing an extra unnamed column).
starts.to_csv('Data/StartStationCount.csv', index=False)
In [55]:
# Aggregate the data by end location
# Use a distinct name (`ends`) instead of reusing `starts` for end-station
# data, as the original did; no later cell reads either variable.
ends = df.groupby('end station id', as_index=False).agg(len)[['end station id', 'bikeid']]
ends.columns = ['EndStationID', 'Count']
# index=False for consistency with the other CSV writes in this notebook.
ends.to_csv('Data/EndStationCount.csv', index=False)
In [56]:
# Function to aggregate subset data into AM / midday / PM time windows.
def aggFunc(aggColList, col, name, data=None):
    """Split trips into AM (6-11), midday (12-16) and PM (17-20) hour
    windows and write one aggregated count CSV per window.

    Parameters:
        aggColList: list of column names to group by
            (e.g. ['start station id']).
        col: two-element list [timestamp column, column to count by].
        name: path prefix for the output files; 'am'/'mid'/'pm' + '.csv'
            is appended to it.
        data: optional DataFrame to aggregate; defaults to the
            notebook-global `df` for backward compatibility.
    """
    frame = df if data is None else data
    subset = frame[aggColList + col]
    # Hour of day of each trip's timestamp, used to bucket the windows.
    hour = subset[col[0]].apply(lambda x: x.hour)
    windows = [('am', (hour >= 6) & (hour <= 11)),
               ('mid', (hour > 11) & (hour < 17)),
               ('pm', (hour >= 17) & (hour <= 20))]
    for label, mask in windows:
        window = subset[mask]
        agg_df = window.groupby(aggColList, as_index=False).agg(len)[aggColList + [col[1]]]
        agg_df.columns = aggColList + ['count']
        agg_df.to_csv(name + label + '.csv', index=False)
    # NOTE: the original ended with a dangling `agg_df.to_csv` attribute
    # access that did nothing; it has been removed.
In [57]:
# Subsets of start locations by subscribers on weekdays at different timeframe
# (writes Data/subset_start_{am,mid,pm}.csv, grouped by station/weektype/usertype)
aggFunc(['start station id', 'weektype', 'usertype'], ['starttime', 'bikeid'], 'Data/subset_start_')
# Subsets of end locations by subscribers on weekdays at different timeframe
aggFunc(['end station id', 'weektype', 'usertype'], ['stoptime', 'bikeid'], 'Data/subset_end_')
In [58]:
# Start locations at different timeframe
# (writes Data/start_{am,mid,pm}.csv, grouped by start station only)
aggFunc(['start station id'], ['starttime', 'bikeid'], 'Data/start_')
# End locations at different timeframe
aggFunc(['end station id'], ['stoptime', 'bikeid'], 'Data/end_')