In [1]:
import pandas as pd # For working with tabular data
import numpy as np
import matplotlib.colors as colors # For nice colors and color scales
import matplotlib.cm as cm # More color scale stuff
from ggplot import * # For doing cool plots using grammar of graphics
#from fast_kde import fast_kde # For lightning fast 2d kernel density estimation
# `fast_kde.py` has to be in the same folder as this IPython notebook
%matplotlib inline

Add Auxiliary Information

Note: Run this section only once


In [2]:
# df = pd.read_csv('Data/orig_data.csv',parse_dates=["starttime", "stoptime"])

In [3]:
# # Add day of week fields
# df['dayofweek'] = df['starttime'].apply(lambda x: x.weekday())

# def day_type(x):
#     days = ['Mon','Tues','Wednes','Thurs','Fri','Satur','Sun']
#     return days[x.weekday()] + "day"
# df['dayofweek_name'] = df['starttime'].apply(day_type)

# # Add week type field
# def week_type(x):
#     if x < 5:
#         return 'Weekday'
#     else:
#         return 'Weekend'

# df['weektype'] = df['dayofweek'].apply(week_type)

In [4]:
# # Calculate geodesic distance
# from math import radians, cos, sin, asin, sqrt

# def haversine(lon1, lat1, lon2, lat2):
#     """
#     Calculate the great circle distance between two points 
#     on the earth (specified in decimal degrees)
#     """
#     # convert decimal degrees to radians 
#     lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

#     # haversine formula 
#     dlon = lon2 - lon1 
#     dlat = lat2 - lat1 
#     a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
#     c = 2 * asin(sqrt(a)) 

#     # 6367 km is the radius of the Earth
#     km = 6367 * c
#     return km 

# def distance(row):
#     return haversine(*row)

# df['dist'] = df[['start station longitude', 'start station latitude', 'end station longitude',\
#                  'end station latitude']].apply(distance, axis=1)

# # Convert distance in km to feet
# df['dist_ft'] = df['dist'].apply(lambda x: x * 3280.84)

In [5]:
# df.to_csv('Data/working_data.csv', index=False)

Data


In [2]:
# Load the working trip table, parsing both timestamp columns as datetimes
working_data_path = 'Data/working_data.csv'
df = pd.read_csv(working_data_path, parse_dates=['starttime', 'stoptime'])

In [5]:
# Add borough and neighborhood names for both the start and the end station
aux_info = pd.read_csv('Data/station_aux_info.csv')[['stationid', 'borough', 'neighborhood']]
for id_col in ['start station id', 'end station id']:
    prefix = id_col.split(' ', 1)[0]  # 'start' or 'end'
    # Give the lookup columns their final names up front so that no helper
    # 'stationid' column has to be deleted after the merge
    station_names = aux_info.rename(columns={'stationid': id_col,
                                             'borough': prefix + '_boro',
                                             'neighborhood': prefix + '_neigh'})
    # Left join: a trip whose station is missing from the lookup keeps its row
    # (with NaN names) instead of being silently dropped by an inner join
    df = pd.merge(df, station_names, on=id_col, how='left')

Descriptive Statistics


In [5]:
# Number of distinct stations that appear as a trip origin
station_count = len(df['start station id'].unique())
print('As of May 2014, there are %s bike stations.' % (station_count,))


As of May 2014, there are 332 bike stations.

In [6]:
# Number of distinct bike ids seen in the trip table
bike_count = len(df['bikeid'].unique())
print('As of May 2014, there are %s bikes.' % (bike_count,))


As of May 2014, there are 6943 bikes.

In [7]:
# tripduration is in seconds; keep a copy expressed in minutes
df['tripmins'] = df['tripduration'] / 60
total_trip = len(df)
avg_trip = df['tripmins'].mean()
print('As of May 2014, there were a total of %s trips, with an average of %s mins per trip.'
      % (total_trip, avg_trip))


As of May 2014, there were a total of 7538335 trips, with an average of 14.4207644792 mins per trip.

Usertype Analysis

Differences between one-time customers and annual subscribers are apparent, with one-time customers taking about 12.93 minutes longer per trip. This possibly reflects the difference between utilitarian and leisure usage.


In [8]:
# Number of trips and mean trip length (minutes) for each user type
type_count = df['usertype'].value_counts()
type_ave_trip = df.groupby('usertype')['tripmins'].mean()
# np.diff returns successive differences; reversing the series first makes the
# single result "first group minus last group" of the grouped means
type_trip_diff = np.diff(type_ave_trip[::-1])
print('User type count:\n%s' % (type_count,))
print('\nAverage trip by user type:\n%s' % (type_ave_trip,))
print('\nDifference in trip length: %s' % (type_trip_diff[0],))


User type count:
Subscriber    6663944
Customer       874391
dtype: int64

Average trip by user type:
usertype
Customer      25.854774
Subscriber    12.920483
Name: tripmins, dtype: float64

Difference in trip length: 12.9342915415

Figure 1 - Trip Duration By Usertype Density Plot


In [9]:
# There's a bug in ggplot for geom_density with log_scale, so this helper builds the tick labels by hand
def sci_note(num, decimals=0, precision=None, exponent=None):
    """Return a LaTeX power-of-ten tick label for ``num``, e.g. ``1000 -> '$10^{3}$'``.

    Only the order of magnitude is rendered; the coefficient is not part of
    the label.

    Parameters
    ----------
    num : number
        Value to label. Must be non-zero when ``exponent`` is omitted, because
        the exponent is then derived from ``log10(abs(num))``.
    decimals, precision : optional
        Accepted for backward compatibility; they do not affect the label.
    exponent : int, optional
        Exponent to display. When ``None`` it is ``floor(log10(abs(num)))``.
        An explicit ``0`` is honoured (the previous ``if not exponent`` check
        discarded it and recomputed the exponent from ``num``).

    Returns
    -------
    str
        A string of the form ``'$10^{k}$'``.
    """
    # Imported locally so the helper does not depend on names that only
    # happen to be in scope through a star import
    from math import floor, log10

    if exponent is None:
        exponent = int(floor(log10(abs(num))))
    return r"$10^{{{0:d}}}$".format(exponent)

## Work with the log of the trip length (minutes) to make things look a bit more 'normal'.
## The logged trip lengths are stored in a new column.
df['loglength'] = np.log(df['tripmins'])

## Trip lengths at which ticks are drawn along the x axis
vals = [1, 10, 100, 1000, 10000, 100000]

color_list = ['#33cbff', '#006bb6']

# Baseline plot with its axis labels
p = (ggplot(aes(x='loglength', color='usertype'), data=df)
     + xlab('Trip Duration (log scale)')
     + ylab('Density'))

# Ticks sit at the log of each value and carry the custom sci_note labels
tick_positions = list(np.log(vals))
tick_labels = list(map(sci_note, vals))

# Baseline plot + density + continuous x scale + manual colours
fig_1 = (p
         + geom_density(fill=True, alpha=0.3)
         + scale_x_continuous(breaks=tick_positions, labels=tick_labels)
         + scale_colour_manual(values=color_list))
fig_1


Out[9]:
<ggplot: (341574369)>

In [10]:
# Write Figure 1 to disk
fig_1_file = 'Figures_Exploratory/Figure 1.png'
ggsave(fig_1, fig_1_file)


Saving 11.0 x 8.0 in image.

Figure 2 - Day Of Week Bar Graph

Usage level does not vary much across the weekdays, with a very slight decrease during the weekend.


In [11]:
# Bar chart of trip counts per day of week
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
p = (ggplot(aes("dayofweek"), data=df)
     + xlab("Day of Week")
     + ylab("Counts"))
fig_2 = (p
         + geom_bar(fill='#006bb6')
         + scale_x_continuous(labels=weekday_labels))
fig_2

In [12]:
# Write Figure 2 to disk
fig_2_file = 'Figures_Exploratory/Figure_2.png'
ggsave(fig_2, fig_2_file)


Saving 11.0 x 8.0 in image.

Figure 3 - Day Of Week By Usertype Density Plot

While usage level does not vary much throughout the week, usertype varies between weekday and weekend as shown in the density plot. This reflects the utilitarian usage (i.e. commuting) during the weekday and the leisure usage during the weekend.


In [ ]:
# Density of trips over the day of week, one curve per user type
color_list = ['#33cbff', '#006bb6']
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

p = (ggplot(aes("dayofweek", color='usertype'), data=df)
     + xlab("Day of Week")
     + ylab("Density"))
fig_3 = (p
         + geom_density(size=2)
         + scale_x_continuous(labels=weekday_labels)
         + scale_colour_manual(values=color_list))
fig_3

In [14]:
# Write Figure 3 to disk
fig_3_file = 'Figures_Exploratory//Figure_3.png'
ggsave(fig_3, fig_3_file)


Saving 11.0 x 8.0 in image.

In [15]:
# Trips per (day of week, user type): after agg(len) every remaining column
# holds the group size, so the bikeid column is taken as the count
usertypeAgg = (df.groupby(['dayofweek', 'usertype'], as_index=False)
                 .agg(len)
                 [['dayofweek', 'usertype', 'bikeid']]
                 .rename(columns={'bikeid': 'count'}))

In [22]:
# The day-of-week codes are handed to ggplot as strings for this chart
usertypeAgg['dayofweek'] = usertypeAgg['dayofweek'].astype(str)
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

p = (ggplot(aes('dayofweek', y='count', color='usertype', fill='usertype'), data=usertypeAgg)
     + xlab("Day of Week")
     + ylab("Counts"))
stackedBar = (p
              + geom_bar(stat="bar", color='steelblue')
              + scale_x_continuous(labels=weekday_labels)
              + scale_color_brewer())
stackedBar


Out[22]:
<ggplot: (275319825)>

In [13]:
# Write the stacked bar chart to disk
stackedbar_file = 'Figures_Exploratory//stackedbar.png'
ggsave(stackedBar, stackedbar_file)


Saving 11.0 x 8.0 in image.

Figure 4 - Distances By Usertype Density Plot


In [16]:
# Density of trip distance (feet), one filled curve per user type
color_list = ['#006bb6', '#33cbff']

p = (ggplot(aes(x='dist_ft', color='usertype'), data=df)
     + xlab('Distance (feet)')
     + ylab('Density'))
fig_4 = (p
         + geom_density(fill=True, alpha=0.3)
         + scale_colour_manual(values=color_list))
fig_4


Out[16]:
<ggplot: (280035681)>

In [17]:
# Write Figure 4 to disk
fig_4_file = 'Figures_Exploratory//Figure_4.png'
ggsave(fig_4, fig_4_file)


Saving 11.0 x 8.0 in image.

Gender Analysis


In [18]:
# Recode the numeric gender codes using more semantically appropriate names.
# Series.replace assigns a whole new column, which avoids the chained-indexing
# assignment (df.gender[mask] = ...) that may write to a temporary copy instead
# of df. Values that are not 0/1/2 are left untouched, so re-running the cell
# after the recode is harmless.
gender_names = {1: 'Male', 2: 'Female', 0: 'Unknown'}
df['gender'] = df['gender'].replace(gender_names)

# Gender counts
gender_counts = df['gender'].value_counts()

# Gender proportions: each count divided by the total number of trips
gender_prop = gender_counts / float(gender_counts.sum())

print('Gender counts:\n%s' % (gender_counts,))
print('\nProportions:\n%s' % (gender_prop,))


Gender counts:
Male       5135682
Female     1527842
Unknown     874811
dtype: int64

Proportions:
Male       0.681275
Female     0.202676
Unknown    0.116048
dtype: float64

Figure 5 - Gender Bar Graph


In [19]:
# Bar chart of trip counts per gender
p = (ggplot(aes("gender"), data=df)
     + xlab("Gender")
     + ylab("Counts"))
fig_5 = p + geom_bar(fill='#006bb6')
fig_5


Out[19]:
<ggplot: (280033777)>

In [20]:
# Write Figure 5 to disk
fig_5_file = 'Figures_Exploratory//Figure_5.png'
ggsave(fig_5, fig_5_file)


Saving 11.0 x 8.0 in image.

Figure 6 - Distance by Gender Density Plot


In [21]:
# Compare the distribution of trip distance (feet) across genders
color_list = ['#FFAAAA', '#33cbff', '#006bb6']

p = (ggplot(aes(x='dist_ft', color='gender'), data=df)
     + xlab('Distance (feet)')
     + ylab('Density'))
fig_6 = (p
         + geom_density(size=2)
         + scale_colour_manual(values=color_list))
fig_6


Out[21]:
<ggplot: (381245505)>

In [22]:
# Write Figure 6 to disk
fig_6_file = 'Figures_Exploratory//Figure_6.png'
ggsave(fig_6, fig_6_file)


Saving 11.0 x 8.0 in image.

Speed


In [23]:
# New field holding speed in miles/hour: the distance is converted from km to
# miles and the trip length from minutes to hours before dividing
KM_PER_MILE = 1.60934
MINUTES_PER_HOUR = 60
df['speed'] = (df['dist'] / KM_PER_MILE) / (df['tripmins'] / MINUTES_PER_HOUR)

In [24]:
# Mean speed over every trip, zero-speed round trips included
avg_speed = df['speed'].mean()
print('Average speed per trip with round trips: %s miles/hour' % (avg_speed,))


Average speed per trip with round trips: 5.79460836111 miles/hour

Average daily Manhattan travel speed in 2012 was 10.2 mph. Weekday Manhattan central business district (CBD) from 8am-6pm in July-Aug 2012 average taxi speed was about 9.3 mph. Source: http://www.nyc.gov/html/dot/downloads/pdf/sustainable_streets_index_12.pdf

Average walking speed is about 3.1 mph.


In [25]:
# Trips whose computed speed is exactly zero are counted as round trips
round_trip = df[df['speed'] == 0]
round_trip_count = round_trip.shape[0]
total_trip = len(df)
round_trip_percent = round(float(round_trip_count) / total_trip * 100, 2)

# Mean speed once the zero-speed trips are left out
avg_speed = df[df['speed'] != 0]['speed'].mean()

print('Total Round Trips: %s' % (round_trip_count,))
print('Percent of Round Trips: %s' % (round_trip_percent,))
print('Average speed per trip sans round trips: %s miles/hour' % (avg_speed,))


Total Round Trips: 187035
Percent of Round Trips: 2.48
Average speed per trip sans round trips: 5.94203732943 miles/hour

In [26]:
# Mean speed per user type, zero-speed trips excluded
nonzero_speed_trips = df[df['speed'] != 0]
usertype_speed = nonzero_speed_trips.groupby('usertype')['speed'].mean()
# Reversed before np.diff, so the result is first group minus last group
usertype_speed_diff = np.diff(usertype_speed[::-1])

print(usertype_speed.sort_index(ascending=False))
print('Difference in speed by usertype: %s' % (usertype_speed_diff[0],))


usertype
Subscriber    6.154158
Customer      4.222590
Name: speed, dtype: float64
Difference in speed by usertype: -1.93156744269

In [27]:
# Mean speed per gender, zero-speed trips excluded
nonzero_speed_trips = df[df['speed'] != 0]
gender_speed = nonzero_speed_trips.groupby('gender')['speed'].mean()
# Only the two known genders are compared; the pair is reversed to
# (Male, Female) before np.diff, so the result is Female minus Male
known_gender_speed = gender_speed[['Female', 'Male']]
gender_speed_diff = np.diff(known_gender_speed[::-1])

print(gender_speed.sort_index(ascending=False))
print('Difference in speed by known gender: %s' % (gender_speed_diff[0],))


gender
Unknown    4.224056
Male       6.328280
Female     5.566742
Name: speed, dtype: float64
Difference in speed by known gender: -0.761537827534

In [28]:
# Mean speed per day of week (zero-speed trips excluded) and the spread
# between the fastest and the slowest day
nonzero_speed_trips = df[df['speed'] != 0]
dayofwk_speed = nonzero_speed_trips.groupby('dayofweek')['speed'].mean()
dayofwk_speed_diff = dayofwk_speed.max() - dayofwk_speed.min()

print('Average speed for each day of week: %s' % (dayofwk_speed,))
print('Difference in maximum and minimum day of week speed: %s' % (dayofwk_speed_diff,))


Average speed for each day of week: dayofweek
0            6.087622
1            6.131699
2            6.144333
3            6.101675
4            5.998251
5            5.427059
6            5.566592
Name: speed, dtype: float64
Difference in maximum and minimum day of week speed: 0.717274187289

In [29]:
# Mean speed per week type, zero-speed trips excluded
nonzero_speed_trips = df[df['speed'] != 0]
wktype_speed = nonzero_speed_trips.groupby('weektype')['speed'].mean()
# Reversed before np.diff, so the result is first group minus last group
wktype_speed_diff = np.diff(wktype_speed[::-1])

print('Average speed by week type: %s' % (wktype_speed,))
print('Difference in speed between weekday and weekend: %s' % (wktype_speed_diff[0],))


Average speed by week type: weektype
Weekday     6.093168
Weekend     5.493215
Name: speed, dtype: float64
Difference in speed between weekday and weekend: 0.599953330966

Data aggregation for GIS


In [54]:
# Trips per start station: after agg(len) every remaining column holds the
# group size, so the bikeid column is taken as the count
starts = (df.groupby('start station id', as_index=False)
            .agg(len)
            [['start station id', 'bikeid']]
            .rename(columns={'start station id': 'StartStationID', 'bikeid': 'Count'}))

start_count_file = 'Data/StartStationCount.csv'
starts.to_csv(start_count_file)

In [55]:
# Aggregate the data by end location.
# The result is kept in its own variable `ends`; previously it was assigned to
# `starts`, which overwrote the start-station table with end-station counts.
# After agg(len) every remaining column holds the group size, so the bikeid
# column is taken as the count.
ends = df.groupby('end station id', as_index=False).agg(len)[['end station id', 'bikeid']]
ends.columns = ['EndStationID', 'Count']

ends.to_csv('Data/EndStationCount.csv')

In [56]:
# Function to aggregate subset data
def aggFunc(aggColList, col, name, data=None):
    """Count trips per group in three time-of-day windows and write one CSV per window.

    The windows are selected on the hour of ``col[0]``:
    ``am`` = 06-11, ``mid`` = 12-16, ``pm`` = 17-20 (all inclusive). Rows whose
    hour falls outside these windows are not written to any file.

    Parameters
    ----------
    aggColList : list of str
        Columns to group by; they become the leading columns of each CSV.
    col : list of str
        ``col[0]`` is the timestamp column used to pick the window. ``col[1]``
        must be an existing column (it is part of the selected subset, as
        before), but the count is now the group size and no longer read
        through it.
    name : str
        Path prefix; the files written are ``name + 'am.csv'``,
        ``name + 'mid.csv'`` and ``name + 'pm.csv'``.
    data : pandas.DataFrame, optional
        Table to aggregate. Defaults to the notebook-level ``df`` so existing
        calls keep working.

    Returns
    -------
    None
        The results are written to disk (without the index column).
    """
    if data is None:
        data = df  # fall back to the notebook-level trip table

    subset = data[aggColList + col]
    hour = subset[col[0]].apply(lambda ts: ts.hour)

    windows = [
        ('am', (hour >= 6) & (hour <= 11)),
        ('mid', (hour > 11) & (hour < 17)),
        ('pm', (hour >= 17) & (hour <= 20)),
    ]

    for label, in_window in windows:
        # size() counts the rows of each group directly
        counts = subset[in_window].groupby(aggColList).size().reset_index(name='count')
        counts.to_csv(name + label + '.csv', index=False)

In [57]:
# Counts per station broken down by week type and user type, for each
# time-of-day window: start stations first, then end stations
for station_col, time_col, out_prefix in [
        ('start station id', 'starttime', 'Data/subset_start_'),
        ('end station id', 'stoptime', 'Data/subset_end_')]:
    aggFunc([station_col, 'weektype', 'usertype'], [time_col, 'bikeid'], out_prefix)

In [58]:
# Counts per station for each time-of-day window:
# start stations first, then end stations
for station_col, time_col, out_prefix in [
        ('start station id', 'starttime', 'Data/start_'),
        ('end station id', 'stoptime', 'Data/end_')]:
    aggFunc([station_col], [time_col, 'bikeid'], out_prefix)