In [1]:
import pandas as pd # For working with tabular data
import numpy as np
import matplotlib.colors as colors # For nice colors and color scales
import matplotlib.cm as cm # More color scale stuff
from ggplot import * # For doing cool plots using grammar of graphics
#from fast_kde import fast_kde # For lightning fast 2d kernel density estimation
# `fast_kde.py` has to be in the same folder as this IPython notebook
%matplotlib inline
In [2]:
# df = pd.read_csv('Data/orig_data.csv',parse_dates=["starttime", "stoptime"])
In [3]:
# # Add day of week fields
# df['dayofweek'] = df['starttime'].apply(lambda x: x.weekday())
# def day_type(x):
# days = ['Mon','Tues','Wednes','Thurs','Fri','Satur','Sun']
# return days[x.weekday()] + "day"
# df['dayofweek_name'] = df['starttime'].apply(day_type)
# # Add week type field
# def week_type(x):
# if x < 5:
# return 'Weekday'
# else:
# return 'Weekend'
# df['weektype'] = df['dayofweek'].apply(week_type)
In [4]:
# # Calculate geodesic distance
# from math import radians, cos, sin, asin, sqrt
# def haversine(lon1, lat1, lon2, lat2):
# """
# Calculate the great circle distance between two points
# on the earth (specified in decimal degrees)
# """
# # convert decimal degrees to radians
# lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
# # haversine formula
# dlon = lon2 - lon1
# dlat = lat2 - lat1
# a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
# c = 2 * asin(sqrt(a))
# # 6367 km is the radius of the Earth
# km = 6367 * c
# return km
# def distance(row):
# return haversine(*row)
# df['dist'] = df[['start station longitude', 'start station latitude', 'end station longitude',\
# 'end station latitude']].apply(distance, axis=1)
# # Convert distance in km to feet
# df['dist_ft'] = df['dist'].apply(lambda x: x * 3280.84)
In [5]:
# df.to_csv('Data/working_data.csv', index=False)
In [2]:
# Load the pre-processed trip data; parse the timestamp columns as datetimes
# so hour/weekday access works downstream.
df = pd.read_csv('Data/working_data.csv', parse_dates=["starttime", "stoptime"])
In [5]:
# Attach borough and neighborhood attributes to both trip endpoints by
# joining the auxiliary station table once per endpoint id column.
aux_info = pd.read_csv('Data/station_aux_info.csv')[['stationid','borough','neighborhood']]
for station_col in ['start station id', 'end station id']:
    side = station_col.split(' ', 1)[0]  # 'start' or 'end'
    merged = pd.merge(df, aux_info, left_on=station_col, right_on='stationid')
    merged = merged.rename(columns={'borough': side + '_boro',
                                    'neighborhood': side + '_neigh'})
    # The join key from aux_info duplicates the station id column; drop it.
    merged = merged.drop('stationid', axis=1)
    df = merged
In [5]:
# Number of distinct start stations; unique() returns an ndarray, so
# shape[0] is the count of distinct station ids.
station_count = df['start station id'].unique().shape[0]
print 'As of May 2014, there are', station_count, 'bike stations.'
In [6]:
# Number of distinct bikes observed in the trip data.
bike_count = df.bikeid.unique().shape[0] # Shape gets the count of index
print 'As of May 2014, there are', bike_count, 'bikes.'
In [7]:
# Trip-length summary: convert seconds to minutes, then report the total
# number of trips and the mean trip duration.
df['tripmins'] = df['tripduration']/60 # Convert tripduration (in seconds) to minutes
avg_trip = df.tripmins.mean()
total_trip = df.shape[0]  # number of rows = number of trips
print 'As of May 2014, there were a total of', total_trip, 'trips, with an average of', avg_trip, 'mins per trip.'
Differences between one-time customers and annual subscribers are apparent, with one-time customers taking about 12.92 minutes longer per trip. This possibly reflects the difference between utilitarian and leisure uses.
In [8]:
# Compare user types: trip counts and mean trip length per usertype.
type_count = df.usertype.value_counts()
type_ave_trip = df.groupby('usertype').tripmins.mean()
# np.diff calculates the discrete difference along an array; [::-1] reverses the order of values
# (groupby orders the index alphabetically -- presumably Customer, Subscriber --
# so the reversed diff is Customer minus Subscriber; TODO confirm labels)
type_trip_diff = np.diff(type_ave_trip[::-1])
print 'User type count:\n', type_count
print '\nAverage trip by user type:\n', type_ave_trip
print '\nDifference in trip length:', type_trip_diff[0]
In [9]:
# There's a bug in ggplot for geom_density with log_scale, so this helper
# builds power-of-ten axis labels for log-scaled breaks instead.
def sci_note(num, decimals=0, precision=None, exponent=None):
    """Return a LaTeX power-of-ten label (e.g. "$10^{3}$") for `num`.

    Parameters:
        num: value whose order of magnitude is labelled; must be non-zero
            when `exponent` is not supplied.
        decimals, precision: unused; kept for backward compatibility with
            the original signature.
        exponent: optional explicit exponent; when None it is derived
            from num.
    """
    # `floor` and `log10` were referenced without being imported in the
    # original cell (NameError at call time); import them explicitly here.
    from math import floor, log10
    # Use `is None` so an explicitly passed exponent of 0 is honored
    # (the original `if not exponent:` would recompute it).
    if exponent is None:
        exponent = int(floor(log10(abs(num))))
    # The original also computed an unused coefficient/precision pair;
    # only the exponent ever reached the returned label, so that dead
    # code is removed.
    return r"$10^{{{0:d}}}$".format(exponent)
## Work on the log of trip length so the distribution looks more 'normal';
## store it in its own column so ggplot can reference it by name.
df['loglength'] = np.log(df.tripmins)
# Base plot: one density curve per user type, with axis labels.
p = ggplot(aes(x='loglength', color='usertype'), data=df) \
    + xlab('Trip Duration (log scale)') \
    + ylab('Density')
## Tick positions (in minutes) for the x axis, and the two-color palette.
vals = [1,10,100,1000,10000,100000]
color_list = ['#33cbff','#006bb6']
# Densities plus a continuous x scale whose breaks sit at log(vals) and
# whose labels come from the sci_note helper above.
fig_1 = (p
         + geom_density(fill=True, alpha=0.3)
         + scale_x_continuous(breaks=list(np.log(vals)),
                              labels=[sci_note(i) for i in vals])
         + scale_colour_manual(values=color_list))
fig_1
Out[9]:
In [10]:
# Save Figure 1
# (note: this filename contains a space, unlike the underscored names below)
ggsave(fig_1, 'Figures_Exploratory/Figure 1.png')
Usage level does not vary much across the days of the week, with only a very slight decrease during the weekend.
In [11]:
# Bar chart of trip counts per weekday (dayofweek is numeric; presumably
# 0 = Monday given the labels below -- from datetime.weekday()).
p = ggplot(aes("dayofweek"), data=df) \
    + xlab("Day of Week") \
    + ylab("Counts")
# Replace the numeric ticks with weekday names.
fig_2 = p + geom_bar(fill='#006bb6') + scale_x_continuous(
    labels=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
fig_2
In [12]:
#Save Figure 2
# Persist the day-of-week bar chart for the report.
ggsave(fig_2, 'Figures_Exploratory/Figure_2.png')
While overall usage does not vary much throughout the week, the mix of user types differs between weekdays and weekends, as shown in the density plot. This reflects utilitarian usage (i.e., commuting) on weekdays and leisure usage on weekends.
In [ ]:
# Density of trips across the week by user type: subscribers dominate the
# weekdays while customers peak on the weekend.
color_list = ['#33cbff','#006bb6']
p = ggplot(aes("dayofweek", color='usertype'), data=df) \
    + xlab("Day of Week") \
    + ylab("Density")
fig_3 = (p
         + geom_density(size=2)
         + scale_x_continuous(labels=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
         + scale_colour_manual(values=color_list))
fig_3
In [14]:
# Save Figure 3
# NOTE(review): the '//' in the path is redundant but harmless.
ggsave(fig_3, 'Figures_Exploratory//Figure_3.png')
In [15]:
# Count trips per (day of week, user type); `bikeid` is just a convenient
# column to count, renamed to 'count' afterwards.
counts = df.groupby(['dayofweek', 'usertype'], as_index=False).agg(len)
usertypeAgg = counts[['dayofweek', 'usertype', 'bikeid']]
usertypeAgg.columns = ['dayofweek', 'usertype', 'count']
In [22]:
# Stacked bar chart of daily trip counts split by user type.
usertypeAgg['dayofweek'] = usertypeAgg['dayofweek'].astype(str)
p = ggplot(aes('dayofweek', y='count', color='usertype', fill='usertype'), data=usertypeAgg)
p += xlab("Day of Week")
p += ylab("Counts")
# NOTE(review): scale_x_continuous is applied to a column just cast to str --
# confirm the weekday labels line up with the intended tick order.
stackedBar = p + geom_bar(stat="bar", color='steelblue') +\
scale_x_continuous(labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']) + scale_color_brewer()
stackedBar
Out[22]:
In [13]:
# Save the stacked bar chart.
ggsave(stackedBar, 'Figures_Exploratory//stackedbar.png')
In [16]:
# Trip-distance density split by user type (palette order is reversed
# relative to the earlier figures).
color_list = ['#006bb6','#33cbff']
p = ggplot(aes(x='dist_ft', color='usertype'), data=df) \
    + xlab('Distance (feet)') \
    + ylab('Density')
fig_4 = p + geom_density(fill=True, alpha=0.3) + scale_colour_manual(values=color_list)
fig_4
Out[16]:
In [17]:
# Save Figure 4 (trip-distance density by user type).
ggsave(fig_4, 'Figures_Exploratory//Figure_4.png')
In [18]:
# Recode the data using more semantically appropriate names
df.gender[df.gender==1] = 'Male'
df.gender[df.gender==2] = 'Female'
df.gender[df.gender==0] = 'Unknown'
#Gender counts
gender_counts = df.gender.value_counts()
# Gender proportions:
gender_prop = df.gender.value_counts() / df.gender.value_counts().sum()
print 'Gender counts:\n', gender_counts
print '\nProportions:\n', gender_prop
In [19]:
# Bar chart of trip counts by rider gender label.
p = ggplot(aes("gender"), data=df) \
    + xlab("Gender") \
    + ylab("Counts")
fig_5 = p + geom_bar(fill='#006bb6')
fig_5
Out[19]:
In [20]:
#Save Figure 5
# Persist the gender bar chart.
ggsave(fig_5, 'Figures_Exploratory//Figure_5.png')
In [21]:
# Let's look at differences in distance by gender
# (one density curve per gender label: Female / Male / Unknown).
color_list = ['#FFAAAA','#33cbff','#006bb6']
p = ggplot(aes(x='dist_ft', color='gender'), data=df) \
    + xlab('Distance (feet)') \
    + ylab('Density')
fig_6 = p + geom_density(size=2) + scale_colour_manual(values=color_list)
fig_6
Out[21]:
In [22]:
# Save Figure 6
# Persist the distance-by-gender density plot.
ggsave(fig_6, 'Figures_Exploratory//Figure_6.png')
In [23]:
#Create new field to store speed in miles/hour (convert km to miles and seconds to hour)
# dist is presumably in km (1.60934 km per mile) -- produced by the
# commented-out haversine step above; tripmins/60 converts minutes to hours.
df['speed'] = (df.dist/1.60934)/(df.tripmins/60)
In [24]:
# Mean speed across all trips, including zero-distance round trips.
avg_speed = df.speed.mean()
print 'Average speed per trip with round trips:', avg_speed, 'miles/hour'
Average daily Manhattan travel speed in 2012 was 10.2 mph. Weekday Manhattan central business district (CBD) from 8am-6pm in July-Aug 2012 average taxi speed was about 9.3 mph. Source: http://www.nyc.gov/html/dot/downloads/pdf/sustainable_streets_index_12.pdf
Average walking speed 3.1 mph.
In [25]:
# Trips with zero speed have zero geodesic distance, i.e. they start and end
# at the same station ("round trips"); exclude them from the speed average.
round_trip = df[df['speed'] == 0]
total_trip = df.shape[0]
round_trip_percent = round(((float(round_trip.shape[0])/total_trip)*100),2)
avg_speed = df[df['speed'] != 0].speed.mean()
print 'Total Round Trips:', round_trip.shape[0]
print 'Percent of Round Trips:', round_trip_percent
print 'Average speed per trip sans round trips:', avg_speed, 'miles/hour'
In [26]:
# Mean speed by user type, round trips excluded; the reversed diff gives
# the gap between the two user types (presumably Customer vs Subscriber --
# TODO confirm label order).
usertype_speed = df[df['speed'] != 0].groupby('usertype').speed.mean()
usertype_speed_diff = np.diff(usertype_speed[::-1])
print usertype_speed.sort_index(ascending = False)
print 'Difference in speed by usertype:', usertype_speed_diff[0]
In [27]:
# Mean speed by gender, comparing only the known genders (Female vs Male);
# the 'Unknown' group is excluded from the difference.
gender_speed = df[df['speed'] != 0].groupby('gender').speed.mean()
gender_speed_diff = np.diff(gender_speed[['Female', 'Male']][::-1])
print gender_speed.sort_index(ascending = False)
print 'Difference in speed by known gender:', gender_speed_diff[0]
In [28]:
# Mean speed per day of week and the spread between the fastest and
# slowest day (round trips excluded).
dayofwk_speed = df[df['speed'] != 0].groupby('dayofweek').speed.mean()
dayofwk_speed_diff = dayofwk_speed.max() - dayofwk_speed.min()
print 'Average speed for each day of week:', dayofwk_speed
print 'Difference in maximum and minimum day of week speed:', dayofwk_speed_diff
In [29]:
# Mean speed by week type (Weekday vs Weekend), round trips excluded.
wktype_speed = df[df['speed'] != 0].groupby('weektype').speed.mean()
wktype_speed_diff = np.diff(wktype_speed[::-1])
print 'Average speed by week type:', wktype_speed
print 'Difference in speed between weekday and weekend:', wktype_speed_diff[0]
In [54]:
# Aggregate the data by start location
# Count trips per start station; `bikeid` is only a column to count over.
starts = df.groupby('start station id', as_index=False).agg(len)[['start station id', 'bikeid']]
starts.columns = ['StartStationID', 'Count']
# index=False keeps the spurious integer index out of the CSV, matching
# how every other CSV in this notebook is written (the original omitted it
# here only, producing an extra unnamed column).
starts.to_csv('Data/StartStationCount.csv', index=False)
In [55]:
# Aggregate the data by end location
# Use a distinct name (`ends`) instead of reusing `starts` for end-station
# data, as the original did; no later cell reads either variable.
ends = df.groupby('end station id', as_index=False).agg(len)[['end station id', 'bikeid']]
ends.columns = ['EndStationID', 'Count']
# index=False for consistency with the other CSV writes in this notebook.
ends.to_csv('Data/EndStationCount.csv', index=False)
In [56]:
# Function to aggregate subset data into AM / midday / PM time windows.
def aggFunc(aggColList, col, name, data=None):
    """Split trips into AM (6-11), midday (12-16) and PM (17-20) hour
    windows and write one aggregated count CSV per window.

    Parameters:
        aggColList: list of column names to group by
            (e.g. ['start station id']).
        col: two-element list [timestamp column, column to count by].
        name: path prefix for the output files; 'am'/'mid'/'pm' + '.csv'
            is appended to it.
        data: optional DataFrame to aggregate; defaults to the
            notebook-global `df` for backward compatibility.
    """
    frame = df if data is None else data
    subset = frame[aggColList + col]
    # Hour of day of each trip's timestamp, used to bucket the windows.
    hour = subset[col[0]].apply(lambda x: x.hour)
    windows = [('am', (hour >= 6) & (hour <= 11)),
               ('mid', (hour > 11) & (hour < 17)),
               ('pm', (hour >= 17) & (hour <= 20))]
    for label, mask in windows:
        window = subset[mask]
        agg_df = window.groupby(aggColList, as_index=False).agg(len)[aggColList + [col[1]]]
        agg_df.columns = aggColList + ['count']
        agg_df.to_csv(name + label + '.csv', index=False)
    # NOTE: the original ended with a dangling `agg_df.to_csv` attribute
    # access that did nothing; it has been removed.
In [57]:
# Subsets of start locations by subscribers on weekdays at different timeframe
# (writes Data/subset_start_{am,mid,pm}.csv, grouped by station/weektype/usertype)
aggFunc(['start station id', 'weektype', 'usertype'], ['starttime', 'bikeid'], 'Data/subset_start_')
# Subsets of end locations by subscribers on weekdays at different timeframe
aggFunc(['end station id', 'weektype', 'usertype'], ['stoptime', 'bikeid'], 'Data/subset_end_')
In [58]:
# Start locations at different timeframe
# (writes Data/start_{am,mid,pm}.csv, grouped by start station only)
aggFunc(['start station id'], ['starttime', 'bikeid'], 'Data/start_')
# End locations at different timeframe
aggFunc(['end station id'], ['stoptime', 'bikeid'], 'Data/end_')