In [1]:
import pandas as pd
import numpy as np
import locale
import datetime
import math
from pylab import *
In [2]:
# Load data
# Raw 2013 Divvy bikeshare dump (trips + station metadata); paths are
# relative, so run from the directory containing Divvy_Stations_Trips_2013/.
trips = pd.read_csv('Divvy_Stations_Trips_2013/Divvy_Trips_2013.csv')
stations = pd.read_csv('Divvy_Stations_Trips_2013/Divvy_Stations_2013.csv')
In [3]:
trips.tail()
Out[3]:
In [4]:
# Parse the start/stop timestamp strings into pandas datetime64 columns
# (required for the .date()/.time() extraction later in the notebook).
for ts_col in ['starttime', 'stoptime']:
    trips[ts_col] = pd.to_datetime(trips[ts_col])
In [5]:
# Convert trip duration to numeric
# tripduration presumably contains locale-formatted numbers (e.g. thousands
# separators), which is why plain astype(float) isn't used here.
# NOTE(review): setlocale(LC_NUMERIC, '') picks up the *environment's*
# locale, so this cell's behavior depends on where it runs — confirm the
# runtime locale's separators match the CSV's formatting.
locale.setlocale(locale.LC_NUMERIC, '')
trips.tripduration = trips.tripduration.apply(locale.atof)
In [6]:
# Create an age field
# 2013 is the dataset year (Divvy_Trips_2013), so this approximates the
# rider's age in the trip year; rows with missing birthyear propagate NaN.
trips['age'] = 2013 - trips.birthyear
In [10]:
# Join the lat/lon coordinates to the trips dataframe
# Two inner merges against the stations table: the first attaches the
# origin station's row, the second the destination's. Overlapping station
# columns get the _x (origin) and _y (destination) suffixes, which the
# distance lambdas below rely on.
# NOTE(review): assumes station names are unique in `stations` (duplicates
# would multiply rows), and inner joins silently drop trips whose station
# name has no match — confirm acceptable.
trips_lat_lon = pd.merge(trips,stations,left_on='from_station_name',right_on='name')
trips_geo = pd.merge(trips_lat_lon,stations,left_on='to_station_name',right_on='name')
In [25]:
# Haversine distance formula
def gc_dist(lat1, lat2, lon1, lon2):
    """Great-circle (haversine) distance in miles between two lat/lon points.

    Uses an Earth radius of 6371 km, divided by 1.6 to convert km -> miles.
    BUG FIX: the original used `^` (bitwise XOR, a TypeError on floats)
    where exponentiation `**` was intended, so the function never ran.
    """
    lat1_r = math.radians(lat1)
    lat2_r = math.radians(lat2)
    dlat = lat1_r - lat2_r
    dlon = math.radians(lon1) - math.radians(lon2)
    a = (math.sin(dlat / 2) ** 2 +
         math.cos(lat1_r) * math.cos(lat2_r) * math.sin(dlon / 2) ** 2)
    return 2 * math.asin(math.sqrt(a)) * 6371 / 1.6
# Manhattan distance formula
def manhattan_dist(lat1, lat2, lon1, lon2):
    """Approximate Manhattan ("taxicab") distance in miles between two
    lat/lon points, treating one degree as ~111 km and dividing by 1.6
    to convert km -> miles.

    NOTE(review): longitude degrees are not scaled by cos(latitude), so
    east-west spans are overstated away from the equator — acceptable as
    a rough within-city estimate, but confirm if precision matters.
    """
    dlat = abs(lat2 - lat1)
    dlon = abs(lon2 - lon1)
    return (dlat + dlon) * 111 / 1.6
In [27]:
# Per-row distance helpers over the merged frame: *_x columns are the
# origin station's coordinates, *_y the destination's.
dist_func = lambda x: manhattan_dist(x['latitude_x'], x['latitude_y'],
x['longitude_x'], x['longitude_y'])
gc_dist_func = lambda x: gc_dist(x['latitude_x'], x['latitude_y'],
x['longitude_x'], x['longitude_y'])
# Row-wise apply is slow but simple: one Manhattan-distance mile figure per trip.
trips_geo['dist'] = trips_geo.apply(dist_func, axis=1)
# Great-circle variant left disabled in the original notebook.
#trips_geo['gcdist'] = trips_geo.apply(gc_dist_func, axis=1)
In [29]:
# Summary statistics: longest trips by inter-station Manhattan distance
# DataFrame.sort(columns=...) was deprecated in pandas 0.17 and removed in
# 0.20; sort_values is the current API.
trips_geo.sort_values(by='dist',ascending=False)[['from_station_name','to_station_name','dist','tripduration']].head()
Out[29]:
In [30]:
# NOTE(review): the original comment here said "Find date range", but the
# expression looks like a quick duration conversion (presumably the 2491
# figure from the table above, divided by 60) — confirm intent.
# NOTE(review): under Python 2 this is integer division (41); under
# Python 3 it yields 41.516...
2491 / 60
Out[30]:
Quick first impressions of the data
In [31]:
# Most popular station by trip origin
# Count trips per origin station. The count column keeps the label
# 'from_station_name' because later cells sort this frame by that column;
# the index name is cleared so sort_values isn't ambiguous.
# (Rewritten: groupby(...).count() over only the grouping column returns
# no data columns in modern pandas, and DataFrame.sort() was removed.)
trip_origins = trips.groupby('from_station_name').size().rename_axis(None).to_frame('from_station_name')
trip_origins.sort_values(by='from_station_name',ascending=False)[0:10]
Out[31]:
The 10 most popular stations are centered around the lakefront and major rail stations. Canal St and Jackson is the large Divvy station in front of Union Station, and Clinton St and Washington Blvd is next to Ogilvie Transportation Center. Franklin St and Jackson is the station around the corner from Willis Tower.
In [32]:
# Repeat with an ascending sort to find the least popular stations
# DataFrame.sort(columns=...) was removed in pandas 0.20; use sort_values.
trip_origins.sort_values(by='from_station_name')[0:10]
Out[32]:
The 10 least popular stations are all located on the South Side. A quick glance at the Divvy station map (as of February 2014) shows that many of these stations are located at the periphery of the Divvy service area. Overall station density on the South Side is much lower as well, which reduces the utility of bikeshare.
In [33]:
# Find most popular station pairs
# Count trip_id per (origin, destination) pair; result is a two-level-index
# frame with a single 'trip_id' count column.
group = trips.groupby(['from_station_name', 'to_station_name'])[['trip_id']].count()
In [34]:
group.sort(columns=['trip_id'],ascending=False).head()
Out[34]:
In [35]:
# Trip counts by user type (per the narrative below, Customers hold day
# passes and Subscribers are annual members).
# Rewritten: groupby(...).count() over only the grouping column returns no
# data columns in modern pandas; size() gives the per-group row counts, and
# the column keeps the original 'usertype' label.
trip_cust = trips.groupby('usertype').size().rename_axis(None).to_frame('usertype')
trip_cust
Out[35]:
Subscribers took about 50,000 more trips than those who purchased day passes.
In [36]:
# Create date and time columns for time series analysis
# The .dt accessor is the vectorized equivalent of the original
# .apply(lambda x: x.date()) / .time() calls, and is much faster.
# Requires starttime to already be datetime64 (converted earlier).
trips['startdate'] = trips.starttime.dt.date
trips['starttimetime'] = trips.starttime.dt.time
In [37]:
# Daily trip counts and a 7-day moving average.
# Rewritten: pd.rolling_mean was removed in pandas 0.18+ in favor of the
# .rolling(...).mean() method, and groupby(...).count() over only the
# grouping column returns no data columns in modern pandas — size() gives
# the per-day row counts directly, named 'trips' as before.
trips_ts = trips.groupby('startdate').size().to_frame('trips')
trips_ma = trips_ts.trips.rolling(7).mean()
In [38]:
# Plot rides time series
ticks = np.arange(0,len(trips_ma),14)  # one x tick every 14 days
fig = plt.figure(figsize=(7,5), dpi=80)
ax = fig.add_subplot(111)
lns = ax.plot(trips_ma,linewidth=2.5)
ax.set_xticks(ticks)
ax.set_xticklabels(trips_ma.index[ticks])  # label ticks with their dates
for label in ax.xaxis.get_ticklabels():
    # label is a Text instance; slant the dates so they don't overlap
    label.set_rotation(45)
ax.set_title('Total Daily Trips, 7D Moving Avg')
show()  # pylab's show(), provided by the star import at the top
Trips really ramped up toward the end of July, about a month after Divvy's introduction, and continued at high rates until mid-October, when they fell off sharply and have declined since then.
In [39]:
# Daily trip counts broken out by user type.
# Rewritten: with both columns used as group keys, the original
# groupby(...).count() leaves no data column to count in modern pandas.
# size().reset_index(name='trips') yields the tidy
# (startdate, usertype, trips) frame the pivot below expects.
trips_user_ts = trips.groupby(['startdate', 'usertype']).size().reset_index(name='trips')
In [40]:
trips_user_pivot = trips_user_ts.pivot(index='startdate',columns='usertype',values='trips')
In [41]:
# Plot customers vs subscriber trips
# NOTE(review): tick positions are sized from trips_ma while labels come
# from trips_user_pivot — both cover the same set of dates, but confirm.
ticks = np.arange(0,len(trips_ma),14)
fig = plt.figure(figsize=(7,5))
ax = fig.add_subplot(111)
# pd.rolling_mean was removed in pandas 0.18+; use .rolling(7).mean()
custs = ax.plot(trips_user_pivot.Customer.rolling(7).mean(), linewidth=2.5)
subs = ax.plot(trips_user_pivot.Subscriber.rolling(7).mean(), linewidth=2.5)
ax.set_xticks(ticks)
ax.set_xticklabels(trips_user_pivot.index[ticks])
for label in ax.xaxis.get_ticklabels():
    # slant the date labels so they don't overlap
    label.set_rotation(45)
# Legend added so the two lines are distinguishable when skimmed
ax.legend(['Customer', 'Subscriber'])
ax.set_title("Customer and Subscriber Daily Trips, 7D Moving Avg")
show()
Day pass holder trips fell off sharply after the end of tourist season, while subscriber trips have declined at a much slower rate with the onset of cold weather. During the holiday period in December, subscribers were still averaging several hundred trips per day in adverse conditions.
In [22]: