In [119]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di

# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
#di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)



In [2]:
import pandas as pd
import pandas
import numpy as np
import datetime
import matplotlib
import matplotlib.pyplot as plt
from geojson import dumps
import geojson as g
%matplotlib inline
plt.style.use('ggplot')
import csv
import collections
import os
from __future__ import division
import scipy as sp
import plotly.plotly as py
import plotly.figure_factory as ff
from plotly.graph_objs import *
import plotly
import plotly.graph_objs as go
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import zipcode  #To find city, latitude and longitude corresponding to a zipcode
import folium   #For map
from matplotlib.dates import MO, TU, WE, TH, FR, SA, SU
import parser
import math
import warnings; warnings.simplefilter('ignore')

Insight into Bike Sharing in the Bay Area

Authors: Ashkan Saboori, Pamela Patterson, and Zamirbek Akimbekov


In [3]:
### Read the data

In [4]:
#plotly.tools.set_credentials_file(username='zamirg13', api_key='3dMc8cJ9Gxf4At0fdV49')
#mapbox_token = 'pk.eyJ1IjoiYXNoa2Fuc2Fib29yaSIsImEiOiJjaXp5djVldTMwM2FtMzNveHRpaTFqdHhyIn0.uVPRMwfIuYE_3j3jRajBFA'
## USE THE BELOW CREDENTIALS, PLEASE ### 
plotly.tools.set_credentials_file(username = 'ppatterson', api_key='jK180Tj3zyqIgqswVizx')
mapbox_token = 'pk.eyJ1IjoicGFtZWxvdDMxNyIsImEiOiJjajBlMnIzbGwwMTE0MzNwNHB3Mmt2MHI1In0.jvc8C_8qJonSMXVL-tFOfg'

In [5]:
path = '/Users/admin/project_data_141' #Zamir's path
#path = '/Users/ashkansaboori/Documents/OneDrive/MyPhD/Courses/5Winter-2017/STA141B/Project/Project_Datasets' #Ashkan's
#path = '~/Dropbox/141B/Project/data'

In [6]:
def read_data(path, year):
    """
    reads csv data and returns 4 data frames for the given year
    input: path
    output: 4 data frames (station, status, trip, and weather)
    """
    path = path + '/' + str(year) + '/'
    if year == 2014:
        station1 = pd.read_csv(path + 'station1.csv')
        station2 = pd.read_csv(path + 'station2.csv')
        status1 = pd.read_csv(path + 'status1.csv')
        status2 = pd.read_csv(path + 'status2.csv')
        trip1 = pd.read_csv(path + 'trip1.csv')
        trip2 = pd.read_csv(path + 'trip2.csv')
        weather1 = pd.read_csv(path + 'weather1.csv')
        weather2 = pd.read_csv(path + 'weather2.csv')
        station = pd.concat([station1, station2]).drop_duplicates()   
        status = pd.concat([status1, status2])
        trip = pd.concat([trip1, trip2])
        weather = pd.concat([weather1, weather2])
    
    else: 
        station = pd.read_csv(path + 'station.csv')
        status = pd.read_csv(path + 'status.csv')
        trip = pd.read_csv(path + 'trip.csv')
        weather = pd.read_csv(path + 'weather.csv')
    
    
    return station, status , trip, weather

In [7]:
station16, status16, trip16, weather16 = read_data(path, 2016)

In [8]:
station15, status15, trip15, weather15 = read_data(path, 2015)

In [9]:
station14, status14, trip14, weather14 = read_data(path, 2014)

In [10]:
crime = pd.read_csv(path + '/Crime_Incidents.csv')
income = pd.read_csv(path + '/Income.csv', header = 0)
calstations = pd.read_excel(path + '/caltrain_coordinates.xlsx')
#Source of crime data: https://data.sfgov.org/Public-Safety/Map-Crime-Incidents-from-1-Jan-2003/gxxq-x39z
#Source of income data: http://www.psc.isr.umich.edu/dis/census/Features/tract2zip/
#Also this can be useful: http://www.psc.isr.umich.edu/dis/data/kb/answer/1123

In [11]:
### Format the data for analysis

In [12]:
def format_df(status, trip, weather, year):
    """
    Takes trip and weather data frames and formats them for analysis. Also creates new
    dataframes for analysis. 
    Input: trip and weather pandas dataframes; year as a string
    Output: dataframes trip, weather, total_rides, weather_ave, weekend_rides, hol_rides, weekday_rides, 
    total_weekday_rides, min_dur_rides, hr_dur_rides, total_min_rides, total_hr_rides, rides_by_hour, 
    total_rides_by_hour 
    """
    status['time'] = pd.to_datetime(status.time, infer_datetime_format = True)
    trip['Count'] = 1
    trip['Start Date'] = pd.to_datetime(trip['Start Date'], format='%m/%d/%Y %H:%M')
    trip['End Date'] = pd.to_datetime(trip['End Date'], format='%m/%d/%Y %H:%M')
    trip['Day of Week'] = trip.apply(lambda x:  x.loc[('Start Date')].strftime('%A'), axis=1)
    trip['Date'] = trip.apply(lambda x:  x.loc[('Start Date')].strftime('%m/%d/%Y'), axis=1)
    trip['Date'] = pd.to_datetime(trip['Date'], format = '%m/%d/%Y')
    diff = (trip['End Date']-trip['Start Date'])
    trip['Duration'] = diff.apply(lambda x:  x.seconds)
    trip = trip.rename(columns={'Start Terminal' : 'station_id'})
    cal = calendar()
    holidays = cal.holidays(start=trip['Date'].min(), end=trip['Date'].max())
    trip['Holiday']=trip['Date'].apply(lambda date: date in holidays)
    total_rides = trip[['Count', 'Date']]
    total_rides = total_rides.groupby(['Date'], as_index = False).sum()
    weather = weather.rename(index=str, columns={'PDT': 'Date',})
    weather['Date'] = pd.to_datetime(weather['Date'], format='%m/%d/%Y')
    weather = weather.set_index('Date')
    if year == '2015':
        weather = weather.set_index('Zip', append = True)
    elif year == '2014':
        weather = weather.set_index('Zip', append = True)
    elif year == '2016':
        weather = weather.set_index('ZIP', append = True)       
    weather['PrecipitationIn'] = pd.to_numeric(weather['PrecipitationIn'].replace('T', 0))
    weather_ave = weather.groupby(level = 'Date').mean()
    weekend_rides = trip.loc[trip['Day of Week'].isin(['Saturday','Sunday'])]
    weekend_rides = weekend_rides[['Count', 'Date']]
    weekend_rides = weekend_rides.groupby(['Date'], as_index = False).sum()
    hol_rides = trip.loc[trip['Holiday'] == True]
    hol_rides = hol_rides[['Count', 'Date']]
    hol_rides = hol_rides.groupby(['Date'], as_index = False).sum()
    weekday_rides = trip[['Count', 'Subscriber Type', 'Day of Week']]
    weekday_rides = weekday_rides.groupby(['Day of Week', 'Subscriber Type'], as_index = False).sum()
    sorter = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
    weekday_rides['Day of Week'] = weekday_rides['Day of Week'].astype("category")
    weekday_rides['Day of Week'].cat.set_categories(sorter, inplace=True)
    weekday_rides = weekday_rides.sort_values(['Day of Week'])
    weekday_rides.loc[weekday_rides['Subscriber Type']=='Customer']
    total_weekday_rides = weekday_rides.groupby(['Day of Week'], as_index = False).sum()
    min_dur_rides = trip[['Count', 'Subscriber Type', 'Duration']]
    min_dur_rides.loc[:,'Duration'] = min_dur_rides['Duration']//60
    min_dur_rides = min_dur_rides.groupby(['Duration', 'Subscriber Type'], as_index = False).sum()
    min_dur_rides = min_dur_rides.groupby([pd.cut(min_dur_rides['Duration'], np.arange(0,61,5)),'Subscriber Type']).sum()
    hr_dur_rides = trip[['Count', 'Subscriber Type', 'Duration']]
    hr_dur_rides.loc[:,'Duration'] = hr_dur_rides['Duration']//3600
    hr_dur_rides = hr_dur_rides.groupby(['Duration', 'Subscriber Type'], as_index = False).sum()
    hr_dur_rides = hr_dur_rides.groupby([pd.cut(hr_dur_rides['Duration'], np.arange(0,25,2)),'Subscriber Type']).sum()
    total_min_rides = trip[['Count', 'Subscriber Type', 'Duration']]
    total_min_rides.loc[:,'Duration'] = total_min_rides['Duration']//60
    total_min_rides = total_min_rides.groupby(['Duration'], as_index = False).sum()
    total_min_rides = total_min_rides.groupby(pd.cut(total_min_rides['Duration'], np.arange(0,61,5))).sum()
    total_hr_rides = trip[['Count', 'Subscriber Type', 'Duration']]
    total_hr_rides.loc[:,'Duration'] = total_hr_rides['Duration']//3600
    total_hr_rides = total_hr_rides.groupby(['Duration'], as_index = False).sum()
    total_hr_rides = total_hr_rides.groupby(pd.cut(total_hr_rides['Duration'], np.arange(0,25,2))).sum()
    trip['Hour'] = trip['Start Date'].apply(lambda x:  x.hour)
    rides_by_hour = trip[['Count', 'Subscriber Type', 'Hour']]
    rides_by_hour = rides_by_hour.groupby(['Hour', 'Subscriber Type'], as_index = False).sum()
    total_rides_by_hour = rides_by_hour.groupby(['Hour'], as_index = False).sum()
    return status, trip, weather, total_rides, weather_ave, weekend_rides, hol_rides, weekday_rides, total_weekday_rides, min_dur_rides, hr_dur_rides, total_min_rides, total_hr_rides, rides_by_hour, total_rides_by_hour

In [13]:
status14, trip14, weather14, total_rides14, weather_ave14, weekend_rides14, hol_rides14, weekday_rides14, total_weekday_rides14, min_dur_rides14, hr_dur_rides14, total_min_rides14, total_hr_rides14, rides_by_hour14, total_rides_by_hour14 = format_df(status14, trip14, weather14, '2014')

In [14]:
status15, trip15, weather15, total_rides15, weather_ave15, weekend_rides15, hol_rides15, weekday_rides15, total_weekday_rides15, min_dur_rides15, hr_dur_rides15, total_min_rides15, total_hr_rides15, rides_by_hour15, total_rides_by_hour15 = format_df(status15, trip15, weather15, '2015')

In [15]:
status16, trip16, weather16, total_rides16, weather_ave16, weekend_rides16, hol_rides16, weekday_rides16, total_weekday_rides16, min_dur_rides16, hr_dur_rides16, total_min_rides16, total_hr_rides16, rides_by_hour16, total_rides_by_hour16 = format_df(status16, trip16, weather16, '2016')

The Bay Area Bike Share is the region’s bike sharing system with 700 bikes and 70 stations across the region, with locations in San Francisco, San Jose, Mountain View, and Palo Alto. A bike sharing system consists of a fleet of specially designed bikes that are locked into a network of docking stations located throughout a region. Bay Area bikes can be rented from and returned to any station in the system, creating an efficient network with many possible combinations of start and end points. With hundreds of bikes at stations, the system is available for use 24 hours a day, 365 days a year. The station network provides twice as many docking points as bicycles, assuring that an available dock to return your bike is always nearby.

On the interactive map, you can see the bike stations, shown with dark blue (installed in 2014 or later) and light blue (installed in 2013) markers. The two busiest stations, near the San Francisco Caltrain station, are shown with light green markers. Clicking on a marker brings up a pop-up with the station name, the number of 2016 trips that started from that station, and the total dock count.

On the map, one can also see the Caltrain stations, shown with red markers. Their pop-ups give the station's name and the weekly average number of passengers.

This interactive map provides an overall picture of the bike share stations and their locations. Later in the analysis we will examine how the Caltrain stations are associated with bike share rides.

Bay Area Bike Share stations and CalTrain station map
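
The map above is produced from a geoJSON file generated later in the notebook (section 14). As a point of comparison only, here is a minimal, hedged sketch of how similar station markers with pop-ups could be drawn with folium, which is imported above; it assumes the 2016 station frame has lat, long, name, and dockcount columns and skips rows with missing coordinates.

# Hedged sketch only; not the code that produced the map shown above.
sketch_map = folium.Map(location=[37.77, -122.40], zoom_start=13)  # roughly centered on San Francisco
for _, row in station16.dropna(subset=['lat', 'long']).iterrows():
    folium.Marker([row['lat'], row['long']],
                  popup='{} ({} docks)'.format(row['name'], row['dockcount'])).add_to(sketch_map)
sketch_map  # renders the interactive map inline in the notebook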

1. Overview of data for 2014, 2015, and 2016

Data Set (rows, columns)    2014            2015            2016
Station                     70, 7           70, 7           67, 7
Status                      35336812, 4     36647622, 4     35517185, 4
Trip                        315807, 17      354152, 16      313689, 16
Weather                     1840, 22        1825, 22        1830, 22

For each year, we have four data sets. The table above shows the size of each data set (rows, columns) for the corresponding year. The Status data set was the most difficult one to handle during the analysis, since it contains more than 35 million rows. The 2014 and 2015 data sets cover bike rides in five cities: San Francisco, San Jose, Mountain View, Palo Alto, and Redwood City. The bike stations in Redwood City were removed before 2016, so the 2016 data set covers only the remaining four cities.
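
The files were read whole in the cells above. As a hedged side note, if the 35+ million-row status files become too large for memory, one alternative is to stream them in chunks and aggregate as you go; a minimal sketch, assuming the same station_id and bikes_available columns used later in the notebook:

# Sketch: read the 2016 status file in 1-million-row chunks and build an exact
# per-station mean of available bikes without loading every row at once.
chunks = pd.read_csv(path + '/2016/status.csv', chunksize=10**6)
parts = [c.groupby('station_id')['bikes_available'].agg(['sum', 'count']) for c in chunks]
totals = pd.concat(parts).groupby(level=0).sum()
mean_bikes_per_station = totals['sum'] / totals['count']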


In [16]:
# 2014
#print 'Size of station DF {}'.format(station14.shape)
#print 'Size of status DF {}'.format(status14.shape)
#print 'Size of trip DF {}'.format(trip14.shape)
#print 'Size of weather DF {}'.format(weather14.shape)

In [17]:
# 2015
#print 'Size of station DF {}'.format(station15.shape)
#print 'Size of status DF {}'.format(status15.shape)
#print 'Size of trip DF {}'.format(trip15.shape)
#print 'Size of weather DF {}'.format(weather15.shape)

In [18]:
#station16 = station16.dropna()

In [19]:
# 2016
# print 'Size of station DF {}'.format(station16.shape)
#print 'Size of status DF {}'.format(status16.shape)
#print 'Size of trip DF {}'.format(trip16.shape)
#print 'Size of weather DF {}'.format(weather16.shape)

2. Statistics about bike rides for 2014, 2015, and 2016

Statistic                         2014        2015        2016
Number of bike rides              315807      354152      312126
Average riding time in minutes    18.31       15.25       13.79
Total riding time in hours        96376.63    90065.83    71738.7
Total riding time in days         4015.69     3752.74     2989.11
Total riding time in years        11.00       10.28       8.18

The table above gives the overall statistics about bike rides for each of the three years. In all cases, a "year" runs from September 1st to the following August 31st; the 2014 statistics, for example, cover bike rides from September 1st, 2013 to August 31st, 2014. The drop in the number of rides and in total riding time in 2016 reflects the company's removal of less frequently used stations and its pull-out from the Redwood City market at the end of 2015; hence, 2016 contains ride statistics only for San Francisco, San Jose, Mountain View, and Palo Alto. Moreover, note that the average riding time is around 15 minutes. This is the intended ride length, since the bike sharing business model is designed for short trips (under 30 minutes) that serve as a "last mile" connection.
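
As a quick sanity check on the table (a sketch using only the 2014 figures quoted above), the total riding hours divided by the number of rides should recover the average minutes per ride, and dividing the hours by 24 and 365 should recover the day and year totals:

# Consistency check of the 2014 row: 96376.63 hours over 315807 rides.
rides_2014, hours_2014 = 315807, 96376.63
print('average minutes per ride: {:.2f}'.format(hours_2014 * 60 / rides_2014))  # ~18.31
print('total days: {:.2f}'.format(hours_2014 / 24))                             # ~4015.69
print('total years: {:.2f}'.format(hours_2014 / 24 / 365))                      # ~11.00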


In [20]:
##2014
merged_st14 = pd.merge(station14, trip14, on = 'station_id')

In [21]:
##2014
#total_duration14 = sum(merged_st14.Duration)
#print 'number of bike rides between 09/01/2013 and 08/31/2014 : {}'.format(len(merged_st14['Trip ID']))
#print 'averaging about {} minutes per ride'.format(total_duration14/(len(merged_st14['Trip ID']) *60))
#print 'total riding time is {} hours'.format(total_duration14/3600)
#print 'or {} days'.format(total_duration14/(3600 * 24))
#print 'or {} years'.format(total_duration14 / (3600 * 24 * 365))

In [22]:
merged_st15 = pd.merge(station15, trip15, on = 'station_id')

In [23]:
total_duration15 = sum(merged_st15.Duration)
#print 'number of bike rides between 09/01/2013 and 08/31/2014 : {}'.format(len(merged_st15['Trip ID']))
#print 'averaging about {} minutes per ride'.format(total_duration15/(len(merged_st15['Trip ID']) *60))
#print 'total riding time is {} hours'.format(total_duration15/3600)
#print 'or {} days'.format(total_duration15/(3600 * 24))
#print 'or {} years'.format(total_duration15/ (3600 * 24 * 365))

In [24]:
#2016
merged_st = pd.merge(station16, trip16, on = 'station_id')

In [25]:
total_duration = sum(merged_st.Duration)
#print 'number of bike rides between 09/01/2015 and 08/31/2016 : {}'.format(len(merged_st['Trip ID']))
#print 'averaging about {} minutes per ride'.format(total_duration/(len(merged_st['Trip ID']) *60))
#print 'total riding time is {} hours'.format(total_duration/3600)
#print 'or {} days'.format(total_duration/(3600 * 24))
#print 'or {} years'.format(total_duration / (3600 * 24 * 365))

In [112]:
def weather_plot(x,y1,y2, y2_label, file_name, title):
    """
    Returns a dual-axis plot with the x-axis being the date and the left and right y-axes supplied by the user
    Input: y1 is the data for the left-axis; should have 365 entries, sorted by date;
    y2 is the data for the right y-axis; should have 365 entries, sorted by date; 
    y2_label is a string
    file_name: string of what to name file in plotly account
    Output: a graph
    """

    trace1 = go.Scatter(x=x,y=y1,name='Total Rides')
    trace2 = go.Scatter(x=x,y=y2,name=y2_label,yaxis='y2')
    data = [trace1, trace2]
    layout = go.Layout(title=title,yaxis=dict(title='Total Rides'),
                       yaxis2=dict(title=y2_label,titlefont=dict(color='rgb(148, 103, 189)'),
                                   tickfont=dict(color='rgb(148, 103, 189)'),overlaying='y',side='right'))
    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig, filename= file_name)

3. Total rides and mean wind speed by date for 2014, 2015, 2016

Investigating these three plots, we can obtain the dates and total rides for some of the windiest days in 2014, 2015, and 2016:

April 22, 2014: 14 mph, 1066 rides
Feb 8, 2014: 11 mph, 94 rides
Feb 6, 2015: 18 mph, 608 rides
Dec 30th, 2015: 13.6 mph, 486 rides
Feb 17th, 2016: 15 mph, 973 rides
March 5th, 2016: 14 mph, 68 rides
May 20th, 2016: 14.2 mph, 1103 rides

There doesn't seem to be a correlation between wind speed and total rides; some of the windiest days saw close to 1000 rides. This could be because San Francisco doesn't get very strong winds, and the tall city buildings may act as a windbreak.
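
To back up this visual impression with a number, the hedged sketch below joins the 2016 daily ride totals to the 2016 daily weather averages and prints Pearson correlations for wind, temperature, and precipitation; the same check applies to sections 4 and 5. Column names follow the plotting calls below, including the leading space in ' Mean Wind SpeedMPH'.

# Sketch: correlate 2016 daily ride totals with daily weather averages.
daily16 = total_rides16.set_index('Date').join(weather_ave16)
for col in [' Mean Wind SpeedMPH', 'Mean TemperatureF', 'PrecipitationIn']:
    print('{}: {:.2f}'.format(col.strip(), daily16['Count'].corr(daily16[col])))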


In [123]:
weather_plot(total_rides14['Date'], total_rides14['Count'], weather_ave14[' Mean Wind SpeedMPH'], 
             'Mean Wind Speed MPH', 'Wind and Trips 2014', 'Total Rides and Mean Wind Speed by Date 2014')


Out[123]:

In [124]:
weather_plot(total_rides15['Date'], total_rides15['Count'], weather_ave15[' Mean Wind SpeedMPH'], 
             'Mean Wind Speed MPH', 'Wind and Trips 2015', 'Total Rides and Mean Wind Speed by Date 2015')


Out[124]:

In [125]:
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16[' Mean Wind SpeedMPH'], 
             'Mean Wind Speed MPH', 'Wind and Trips 2016', 'Total Rides and Mean Wind Speed by Date 2016')


Out[125]:

4. Total rides and mean temperature by date for 2014, 2015, 2016

There appears to be a correlation between temperature and total rides among these three years. We can see that there is a dip in total rides around the fall/winter months (late November to mid January). However, it is unclear if this is due to temperatures or other factors, such as the holiday season. We will also investigate the trends between total rides and holidays.


In [127]:
weather_plot(total_rides14['Date'],total_rides14['Count'], weather_ave14['Mean TemperatureF'], 
             'Mean Temperature F', 'Temp and Trips 2014', 'Total Rides and Mean Temperature by Date 2014')


Out[127]:

In [128]:
weather_plot(total_rides15['Date'],total_rides15['Count'], weather_ave15['Mean TemperatureF'], 
             'Mean Temperature F', 'Temp and Trips 2015', 'Total Rides and Mean Temperature by Date 2015')


Out[128]:

In [130]:
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16['Mean TemperatureF'], 
             'Mean Temperature F', 'Temp and Trips 2016', 'Total Rides and Mean Temperature by Date 2016')


Out[130]:

5. Total rides and precipitation by date for 2014, 2015, and 2016

There wasn't much rain in 2014, so we will primarily focus on 2015 and 2016. Some dates that stand out are:

Dec 11, 2015: 1.92 inches rain, 107 rides
Dec 2nd, 2014: 0.94 inches rain, 491 rides
March 5th, 2016: 0.652 inches rain, 68 rides
Nov 2, 2015: 0.616 inches rain, 863 rides

There seems to be a correlation between rides and rainfall: overall, the number of rides is lower when there is more rain. We also know that March 5th, 2016 was a windy day, so when there is a lot of rain, or when it is both rainy and windy, the number of rides tends to be low.


In [131]:
weather_plot(total_rides14['Date'],total_rides14['Count'], weather_ave14['PrecipitationIn'], 'Precipitation In', 
             'Precip and Trips 2014', 'Total Rides and Precipitation by Date 2014')


Out[131]:

In [132]:
weather_plot(total_rides15['Date'],total_rides15['Count'], weather_ave15['PrecipitationIn'], 'Precipitation In', 
             'Precip and Trips 2015', 'Total Rides and Precipitation by Date 2015')


Out[132]:

In [133]:
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16['PrecipitationIn'], 'Precipitation In', 
             'Precip and Trips 2016', 'Total Rides and Precipitation by Date 2016')


Out[133]:

In [36]:
def weekend_hol_plot(x1,x2,y1,y2, y2_label, file_name, title):
    """
    Returns a dual-axis plot with the x-axis being the date and the left and right y-axes supplied by the user
    input: y1 is the data for the left-axis; should have 365 entries, sorted by date;
    y2 is the data for the right y-axis; should have 365 entries, sorted by date; 
    y2_label is a string
    file_name: string of what to name file in plotly account
    output: a graph
    """

    trace1 = go.Scatter(x=x1,y=y1,name='Total Rides')
    trace2 = go.Bar(x=x2,y=y2,name=y2_label)
    data = [trace1, trace2]
    layout = go.Layout(title=title,yaxis=dict(title='Total Rides'),
                       yaxis2=dict(title=y2_label,titlefont=dict(color='rgb(148, 103, 189)'),
                                   tickfont=dict(color='rgb(148, 103, 189)'),overlaying='y',side='right'))
    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig, filename= file_name)

6. Total rides with holidays highlighted for 2014, 2015, 2016

We can see that overall, there are fewer rides on holidays, with Thanksgiving, Christmas, and New Year's having the fewest rides of all the holidays. However, two holidays each year don't share this trend. In 2015, the dates were October 12th and November 11th, which correspond to Columbus Day and Veterans Day. Not many people get both of those days off, so it is starting to look like many of the people using the bike share are using it during normal business and commuting hours. This suggests that the primary customers are residents and not tourists. We will learn more with further analysis.
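
A hedged numeric check of this reading is to compare mean daily rides on federal holidays with all other days, reusing the USFederalHolidayCalendar imported above (sketch for 2016 only):

# Sketch: mean daily rides on federal holidays vs. all other days in 2016.
cal = calendar()
hols = cal.holidays(start=total_rides16['Date'].min(), end=total_rides16['Date'].max())
is_holiday = total_rides16['Date'].isin(hols)
print('holiday mean rides:     {:.0f}'.format(total_rides16.loc[is_holiday, 'Count'].mean()))
print('non-holiday mean rides: {:.0f}'.format(total_rides16.loc[~is_holiday, 'Count'].mean()))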


In [134]:
weekend_hol_plot(total_rides14['Date'],hol_rides14['Date'],total_rides14['Count'],hol_rides14['Count'],'Holidays',
                 'Holidays and Trips 2014', 'Holidays and Trips 2014')


Out[134]:

In [135]:
weekend_hol_plot(total_rides15['Date'],hol_rides15['Date'],total_rides15['Count'],hol_rides15['Count'],'Holidays',
                 'Holidays and Trips 2015', 'Holidays and Trips 2015')


Out[135]:

In [136]:
weekend_hol_plot(total_rides16['Date'],hol_rides16['Date'],total_rides16['Count'],hol_rides16['Count'],'Holidays',
                 'Holidays and Trips 2016', 'Holidays and Trips 2016')


Out[136]:

7. Total rides with weekends highlighted for 2014, 2015, and 2016

There is a clear correlation between total rides and weekends. We can see that the low points in the total rides correspond to the weekends for all three years. This supports our guess that the majority of users are using the bikes during working days, perhaps to commute. We can look at this further by looking at day of the week, and time of the day.
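
As a hedged check of the weekend dip, the sketch below splits the 2016 daily totals into Saturday/Sunday versus Monday-Friday and compares the means:

# Sketch: mean daily rides on weekends vs. weekdays in 2016 (dayofweek: Saturday=5, Sunday=6).
is_weekend = total_rides16['Date'].dt.dayofweek >= 5
print('weekend mean rides: {:.0f}'.format(total_rides16.loc[is_weekend, 'Count'].mean()))
print('weekday mean rides: {:.0f}'.format(total_rides16.loc[~is_weekend, 'Count'].mean()))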


In [137]:
weekend_hol_plot(total_rides14['Date'],weekend_rides14['Date'],total_rides14['Count'],weekend_rides14['Count'],
                 'Weekends','Weekends and Trips 2014', 'Weekends and Trips 2014')


Out[137]:

In [138]:
weekend_hol_plot(total_rides15['Date'],weekend_rides15['Date'],total_rides15['Count'],weekend_rides15['Count'],
                 'Weekends','Weekends and Trips 2015', 'Weekends and Trips 2015')


Out[138]:

In [139]:
weekend_hol_plot(total_rides16['Date'],weekend_rides16['Date'],total_rides16['Count'],weekend_rides16['Count'],
                 'Weekends','Weekends and Trips 2016', 'Weekends and Trips 2016')


Out[139]:

In [43]:
def grouped_bar(x1, y1, x2, y2, x3, y3, file_name, title, x_title):
    """
    Takes axis data for a grouped bar chart with three bar series and outputs the chart.
    Input: x1, y1, x2, y2, x3, y3 series data for the different bars; the x data should align;
    file_name: what to name the plotly file, string; title: title of the graph; x_title: label for the x-axis
    Output: a grouped bar chart
    """
    trace1 = go.Bar(x=x1,y=y1,name='Total')
    trace2 = go.Bar(x=x2,y=y2,name='Customer')
    trace3 = go.Bar(x=x3,y=y3,name='Subscriber')
    data = [trace1, trace2, trace3]
    layout = go.Layout(title = title, xaxis = dict(title = x_title),yaxis = dict(title = 'Total Rides'),barmode='group')
    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig, filename=file_name)

8. Total rides by day of the week, separated into subscriber type for 2014, 2015, and 2016

It appears that the majority of the rides are coming from subscribers. We will look at the exact proportion a bit later. For now, let’s look at the rides by day of the week. In all three years, the subscribers are using the bike share more during the week (Monday-Friday) while customers are using the bike share slightly more during the weekends. This provides further support to our idea that most of the users are using the bikes to commute to and from work.


In [140]:
grouped_bar(total_weekday_rides14['Day of Week'],total_weekday_rides14['Count'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Customer']['Day of Week'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Customer']['Count'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Subscriber']['Day of Week'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2014', 
            'Total Rides by Day of Week 2014', 'Day of Week')


Out[140]:

In [141]:
grouped_bar(total_weekday_rides15['Day of Week'],total_weekday_rides15['Count'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Customer']['Day of Week'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Customer']['Count'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Subscriber']['Day of Week'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2015', 
            'Total Rides by Day of Week 2015', 'Day of Week')


Out[141]:

In [142]:
grouped_bar(total_weekday_rides16['Day of Week'],total_weekday_rides16['Count'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Customer']['Day of Week'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Customer']['Count'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Subscriber']['Day of Week'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2016', 
            'Total Rides by Day of Week 2016', 'Day of Week')


Out[142]:

9. Total rides by duration in minutes, grouped by subscriber type, for 2014, 2015, and 2016

Now let's look at how long the rides last in minutes. We can see that both customers and subscribers follow a similar trend, with most rides lasting between 5 and 10 minutes, and the next most common durations being between 0 and 5 minutes and between 10 and 15 minutes. It seems reasonable that a work commute wouldn't take more than 15 minutes in San Francisco, especially with the prevalence of BART and Caltrain stations throughout the city. We will look at the locations of the most common stations later and compare them to landmarks, such as train stations, to further investigate our ideas about how the bike share is being used.
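
Since the Duration column in the formatted trip frames is stored in seconds, a hedged one-line check of how many 2016 trips stay within the 15- and 30-minute marks (the latter being the free-ride window mentioned later) could look like this:

# Sketch: share of 2016 trips at or under 15 and 30 minutes (Duration is in seconds).
print('<= 15 min: {:.1f}%'.format(100 * (trip16['Duration'] <= 15 * 60).mean()))
print('<= 30 min: {:.1f}%'.format(100 * (trip16['Duration'] <= 30 * 60).mean()))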


In [143]:
grouped_bar(total_min_rides14.index.get_level_values('Duration'),total_min_rides14['Count'],
            total_min_rides14.index.get_level_values('Duration'),
            min_dur_rides14.loc[(min_dur_rides14.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_min_rides14.index.get_level_values('Duration'),
            min_dur_rides14.loc[(min_dur_rides14.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Minute Plot 2014', 'Ride Duration by Minute 2014', 'Minutes')


Out[143]:

In [144]:
grouped_bar(total_min_rides15.index.get_level_values('Duration'),total_min_rides15['Count'],
            total_min_rides15.index.get_level_values('Duration'),
            min_dur_rides15.loc[(min_dur_rides15.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_min_rides15.index.get_level_values('Duration'),
            min_dur_rides15.loc[(min_dur_rides15.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Minute Plot 2015', 'Ride Duration by Minute 2015', 'Minutes')


Out[144]:

In [145]:
grouped_bar(total_min_rides16.index.get_level_values('Duration'),total_min_rides16['Count'],
            total_min_rides16.index.get_level_values('Duration'),
            min_dur_rides16.loc[(min_dur_rides16.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_min_rides16.index.get_level_values('Duration'),
            min_dur_rides16.loc[(min_dur_rides16.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Minute Plot 2016', 'Ride Duration by Minute 2016', 'Minutes')


Out[145]:

10. Total rides by duration in hours, grouped by subscriber type, for 2014, 2015, and 2016

We see a similar trend as above, with most of the rides lasting less than 2 hours. The rides lasting more than 2 hours are most commonly taken by customers rather than subscribers, which suggests that customers may be using the bike share for touring purposes.


In [146]:
grouped_bar(total_hr_rides14.index.get_level_values('Duration'),total_hr_rides14['Count'],
            total_hr_rides14.index.get_level_values('Duration'),
            hr_dur_rides14.loc[(hr_dur_rides14.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_hr_rides14.index.get_level_values('Duration'),
            hr_dur_rides14.loc[(hr_dur_rides14.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Hour Plot 2014', 'Ride Duration by Hour 2014', 'Hours')


Out[146]:

In [147]:
grouped_bar(total_hr_rides15.index.get_level_values('Duration'),total_hr_rides15['Count'],
            total_hr_rides15.index.get_level_values('Duration'),
            hr_dur_rides15.loc[(hr_dur_rides15.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_hr_rides15.index.get_level_values('Duration'),
            hr_dur_rides15.loc[(hr_dur_rides15.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Hour Plot 2015', 'Ride Duration by Hour 2015', 'Hours')


Out[147]:

In [148]:
grouped_bar(total_hr_rides16.index.get_level_values('Duration'),total_hr_rides16['Count'],
            total_hr_rides16.index.get_level_values('Duration'),
            hr_dur_rides16.loc[(hr_dur_rides16.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_hr_rides16.index.get_level_values('Duration'),
            hr_dur_rides16.loc[(hr_dur_rides16.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Hour Plot 2016', 'Ride Duration by Hour 2016', 'Hours')


Out[148]:

11. Total rides by hour of the day, grouped by subscriber type, for 2014, 2015, and 2016

Now if we look at the total rides by the hour of the day (using military time), we see a similar trend among all three years. It appears that subscribers use the bikes mostly in the morning and evening hours, which would correspond to commuting times for normal business hours. However, customers follow a different trend, with most of their rides happening in the afternoon and evening hours. Again, this supports the idea that subscribers are using the bikes to commute to and from work, while customers are most likely using them for touring purposes.
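
A hedged way to pin down these peaks numerically is to take, for each subscriber type, the hour with the largest 2016 ride count from the rides_by_hour16 frame built in format_df:

# Sketch: busiest hour of the day per subscriber type in 2016.
peaks = rides_by_hour16.sort_values('Count', ascending=False).groupby('Subscriber Type').head(1)
print(peaks[['Subscriber Type', 'Hour', 'Count']])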


In [149]:
grouped_bar(total_rides_by_hour14['Hour'],total_rides_by_hour14['Count'],
            total_rides_by_hour14['Hour'],
            rides_by_hour14.loc[rides_by_hour14['Subscriber Type'] == 'Customer']['Count'],
            total_rides_by_hour14['Hour'],
            rides_by_hour14.loc[rides_by_hour14['Subscriber Type'] == 'Subscriber']['Count'], 
            'Rides by Hour Plot 2014', 'Total Rides Per Hour 2014', 'Hour of Day')


Out[149]:

In [150]:
grouped_bar(total_rides_by_hour15['Hour'],total_rides_by_hour15['Count'],
            total_rides_by_hour15['Hour'],
            rides_by_hour15.loc[rides_by_hour15['Subscriber Type'] == 'Customer']['Count'],
            total_rides_by_hour15['Hour'],
            rides_by_hour15.loc[rides_by_hour15['Subscriber Type'] == 'Subscriber']['Count'], 
            'Rides by Hour Plot 2015', 'Total Rides Per Hour 2015', 'Hour of Day')


Out[150]:

In [151]:
grouped_bar(total_rides_by_hour16['Hour'],total_rides_by_hour16['Count'],
            total_rides_by_hour16['Hour'],
            rides_by_hour16.loc[rides_by_hour16['Subscriber Type'] == 'Customer']['Count'],
            total_rides_by_hour16['Hour'],
            rides_by_hour16.loc[rides_by_hour16['Subscriber Type'] == 'Subscriber']['Count'], 
            'Rides by Hour Plot 2016', 'Total Rides Per Hour 2016', 'Hour of Day')


Out[151]:

12. Trip counts by subscription types

As the pie charts indicate, the share of subscribers has increased since 2014 and now makes up a whopping 89.3% of total bike share trips. This makes sense, since a one-year subscription is the cheapest way to use bike sharing daily. Moreover, the company encourages this business model by providing free 30-minute rides for subscribers. Hence, the Bay Area Bike Share company is moving in the right direction towards its goal.


In [56]:
totals14 = merged_st14['Subscriber Type'].value_counts()
Subscribers14, Customers14 = totals14[0], totals14[1]

fig = {
    'data': [{'labels': ['Subscribers', 'Customers'],
              'values': [Subscribers14, Customers14],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by customer types in 2014'}
     }

py.iplot(fig)


Out[56]:

In [57]:
totals15 = merged_st15['Subscriber Type'].value_counts()
Subscribers15, Customers15 = totals15[0], totals15[1]

fig = {
    'data': [{'labels': ['Subscribers', 'Customers'],
              'values': [Subscribers15, Customers15],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by customer types in 2015'}
     }

py.iplot(fig)


Out[57]:

In [58]:
totals = merged_st['Subscriber Type'].value_counts()
Subscribers, Customers = totals[0], totals[1]

fig = {
    'data': [{'labels': ['Subscribers', 'Customers'],
              'values': [Subscribers, Customers],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by customer types in 2016'}
     }

py.iplot(fig)


Out[58]:

13. Trip counts by cities for 2016

Surprisingly, San Francisco dominates the trip counts by city, with about 92% of total trips in 2016. Also notice that Redwood City had the smallest share of trips in both 2014 and 2015, which led the company to remove all bike stations in that city. Moreover, Palo Alto and Mountain View comprise only about 3-4% of total rides. The company may have to change its business model in those cities or remove stations in the future, and concentrate on the San Francisco and San Jose markets. Palo Alto and Mountain View are home to technology companies that provide their workers with free bikes and commuter shuttles, so it does not make much sense to target those two markets with a bike sharing business. Instead, as we will see, there are some stations in San Francisco that desperately need more bikes and docks, and the company could focus on those parts of the city. Rather than trying to cover the entire Bay Area, the company could focus on growing in San Francisco, San Jose, and the East Bay; with the proposed expansion plan, that is exactly what it aims to do by the end of 2018.
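
A hedged check of the shares quoted above, using the 2016 merged frame built earlier (merged_st); value_counts with normalize=True gives each city's percentage of trips directly:

# Sketch: percentage of 2016 trips starting in each city.
print((merged_st.landmark.value_counts(normalize=True) * 100).round(1))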


In [59]:
by_cities14 = merged_st14.landmark.value_counts()
sf14, sj14, mv14, pa14, rc14 = by_cities14[0], by_cities14[1], by_cities14[2], by_cities14[3], by_cities14[4]
fig_cities = {
    'data': [{'labels': ['San Francisco', 'San Jose', 'Mountain View', 'Palo Alto', 'Redwood City'],
              'values': [sf14, sj14, mv14, pa14, rc14],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by cities in 2014'}
     }

py.iplot(fig_cities)


Out[59]:

In [60]:
by_cities15 = merged_st15.landmark.value_counts()
sf15, sj15, mv15, pa15, rc15 = by_cities15[0], by_cities15[1], by_cities15[2], by_cities15[3], by_cities15[4]
fig_cities = {
    'data': [{'labels': ['San Francisco', 'San Jose', 'Mountain View', 'Palo Alto', 'Redwood City'],
              'values': [sf15, sj15, mv15, pa15, rc15],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by cities in 2015'}
     }

py.iplot(fig_cities)


Out[60]:

In [61]:
by_cities = merged_st.landmark.value_counts()
sf, sj, mv, pa = by_cities[0], by_cities[1], by_cities[2], by_cities[3]
fig_cities = {
    'data': [{'labels': ['San Francisco', 'San Jose', 'Mountain View', 'Palo Alto'],
              'values': [sf, sj, mv, pa],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by cities'}
     }

py.iplot(fig_cities)


Out[61]:

In [62]:
### 14. Code to generate maps

In [63]:
trip_counts = pd.DataFrame(merged_st.sort_values(by = 'station_id').station_id.value_counts())
trip_counts = trip_counts.reset_index().rename(columns = {'station_id' : 'trip_counts',
                                                          'index' : 'station_id'})
station_merged = pd.merge(station16, trip_counts, on = 'station_id')

In [64]:
start_stations = trip16[['Start Station', 'Duration']]
grouped = (start_stations.groupby('Start Station').
                sum().
                sort_values(by = 'Duration', ascending = False))

In [65]:
#merged_st.name.value_counts()[:5] #most popular five starting stations

In [66]:
calstations['station_type'] = 'caltrain'
calstations['dockcount'] = 0
calstations['trip_counts'] = 0
calstations['station_id'] = 0
calstations['installation'] = 0 
calstations = calstations.rename(columns={'caltrain_name' : 'name'})
calstations = calstations[['installation','station_id','name', 'lat', 'long', 'dockcount', 'trip_counts', 'station_type', 'passengers']]

In [67]:
df_for_map = station_merged[['installation','station_id','name', 'lat', 'long', 'dockcount', 'trip_counts']].copy()  # copy so the new columns below are not set on a slice view
df_for_map['station_type'] = 'bike_station'
df_for_map['passengers'] = 0
df_for_map = df_for_map.append(calstations)
df_for_map.reset_index(drop='index', inplace = True)

In [68]:
start_lat, start_long, start_name = calstations.iloc[1]['lat'], calstations.iloc[1]['long'], calstations.iloc[1]['name']

In [69]:
def parser(df):
    """
    Parses a pandas DF into JSON-line object
    input: a dataframe
    output: JSON-line object
    """
    fields = df.columns
    parsed_data = (dict(zip(fields, df.iloc[i])) for i in xrange(len(df)))
    return parsed_data


def create_map(df):
    """
    Creates a map for the given data frame
    input: a data frame
    output: a message: "geoJSON file has been created, check out your cd"
    """
    
    geo_map = {'type': 'FeatureCollection'}
    items = list()
    for index, line in enumerate(parser(df)):
        #Skip any zero coordinates as this will throw off our map
        if line['long'] == str(0) or line['lat'] == str(0):
            continue
        
        data = {}
        data['type'] = 'Feature'
        data['id'] = index
        if line['station_type'] == 'bike_station':
            year = line['installation'].split('/')[2]
            if line['station_id'] == 70 or line['station_id'] == 69:
                data['properties'] = {'name': line['name'],
                                 'dockcount': line['dockcount'],
                              'trip_counts' : line['trip_counts'],
                              'marker-color':'#00cc00',
                              'marker-size': 'large',
                              'marker-symbol': 'bicycle'
                                     }
            elif year == '2013':
                data['properties'] = {'name': line['name'],
                                 'dockcount': line['dockcount'],
                              'trip_counts' : line['trip_counts'],
                              'marker-color':'#0e2f44',
                                'marker-size':'large',
                                'marker-symbol': 'bicycle',
                                "fill-opacity": 0.3
                                     }
            else:
                data['properties'] = {'name': line['name'],
                                 'dockcount': line['dockcount'],
                              'trip_counts' : line['trip_counts'],
                                'marker-size': 'large',
                                'marker-symbol': 'bicycle'
                                     }
            data['geometry'] = {'type': 'Point',
                         'coordinates': (line['long'], line['lat'])
                              }
        else:
            data['properties'] = {'name': line['name'],
                                 'passengers': line['passengers'],
                              'marker-color':'#a11f27',
                                  'marker-size': 'large',
                                  'marker-symbol': 'rail'
                                     }
            data['geometry'] = {'type': 'Point',
                         'coordinates': (line['long'], line['lat'])
                              }
        items.append(data)
    #for each point in our items, we add the point to our dictionary
    for point in items:
        geo_map.setdefault('features', []).append(point)

    #Now that we've built up our geo_map dictionary, let's save it as a geojson file
    with open('bikecaltrain.geojson', 'w') as f:
            f.write(dumps(geo_map))
    return "'bikecaltrain.geojson' file has been created, check out your cd"

def create_linestring(df):
    """
    Creates a map for the given data frame
    input: a data frame
    output: a message: "geoJSON file has been created, check out your cd"
    """
    
    geo_map = {'type': 'FeatureCollection'}
    items = list()
    for index, line in enumerate(parser(df)):
        #Skip any zero coordinates as this will throw off our map
        if line['long'] == str(0) or line['lat'] == str(0):
            continue
        
        data = {}
        data['type'] = 'Feature'
        data['id'] = index
        
        data['properties'] = {'name': line['name'],
                         'dockcount': line['dockcount'],
                              'marker-color':'#a11f27',
                              'marker-size': 'large'
                                     }
        data['geometry'] = {'type': 'LineString',
                         'coordinates': ([line['long'], line['lat']], [start_long, start_lat])
                              }
        items.append(data)
    #for each point in our items, we add the point to our dictionary
    for point in items:
        geo_map.setdefault('features', []).append(point)

    #Now that we've built up our geo_map dictionary, let's save it as a geojson file
    with open('lines.geojson', 'w') as f:
            f.write(dumps(geo_map))
    return "'lines.geojson' file has been created, check out your cd"

In [70]:
#create_map(df_for_map)

In [71]:
#create_linestring(station16)

In [152]:
def status_for_city(station, status, city = 'San Francisco'):
    """
    returns the status rows for the stations in the specified city
    
    input: station and status dataframes, city (default = 'San Francisco')
    output: a status dataframe restricted to the specified city
    
    """
    
    city_stations = station[station.landmark == city]['station_id']
    city_status = status[status.station_id.isin(city_stations)]
    
    return city_status

def groupby_term_mean(status, term):
    """
    returns dataframe grouped by term and average values
    
    input: status data frame, term (month or hour)
    output: dataframe grouped by term and average values of
            other variables
    """
    assert term in ['month', 'hour', 'weekday'], 'term should be either "month", "hour" or "weekday".'
    
    if term == 'hour':
        status['hour'] = status.time.dt.strftime('%H')
        grouped = status.groupby('hour').mean()
        grouped.reset_index(inplace = True)
    elif term == 'month':
        status['month'] = status.time.dt.strftime('%m')
        grouped = status.groupby('month').mean()
        grouped.reset_index(inplace = True)
    else:
        status['weekday'] = status.time.dt.weekday_name
        grouped = status.groupby('weekday').mean()
        grouped.reset_index(inplace = True)
        grouped['Sorter'] = [5, 1, 6, 7, 4, 2, 3]
        grouped.sort_values(by = 'Sorter', inplace = True)
    
    return grouped

def plot_status(grouped, time_format, city = None, station_name = None):
    """
    returns a plot of average bike and dock availability
    input: grouped dataframe by hour or month, time_format 
          (month or hour), city (default is None)
    output: a plot of average bike and dock availability
    """
    import plotly.plotly as py
    import plotly.graph_objs as go
    
    assert time_format in ['month', 'hour', 'weekday'], 'time format should be either "month", "hour", or "weekday".'

    # Add data
    
    if time_format == 'hour':
        time = grouped.hour
        title_time = 'Daily'
        x_axis_name = 'Hour'
    elif time_format == 'month':
        time = grouped.month
        title_time = 'Yearly'
        x_axis_name = 'Month'
    else:
        time = grouped.weekday
        title_time = 'Weekly'
        x_axis_name = 'Week Days'
        
    bikes_available = grouped.bikes_available
    docks_available = grouped.docks_available
    # Create and style traces
    trace0 = go.Scatter(
        x = time,
        y = bikes_available,
        name = 'Bikes Available',
        line = dict(
            color = ('rgb(205, 12, 24)'),
            width = 4,
            dash = 'dash')
    )
    trace1 = go.Scatter(
        x = time,
        y = docks_available,
        name = 'Docks Available',
        line = dict(
            color = ('rgb(22, 96, 167)'),
            width = 4,
            dash = 'dash')
    )

    data = [trace0, trace1]
    
    if station_name:
        title_name = '{} Average Counts of Available Bikes and Docks in {}'.format(title_time, station_name)
    elif city:
        title_name = '{} Average Counts of Available Bikes and Docks in {}'.format(title_time, city)
    else:
        title_name = '{} Average Counts of Available Bikes and Docks across all 4 cities'.format(title_time)

    # Edit the layout
    layout = dict(title = title_name,
                  xaxis = dict(title = x_axis_name),
                  yaxis = dict(title = 'Counts'),
                  )

    fig = dict(data=data, layout=layout)
    file_name = 'plot for {} by {}'.format(city, time_format)
    return py.iplot(fig, filename=file_name)

def plot_summary_for_city(city, term, station_id = None):
    """
    returns plot summary for the given city and term
    input: city name, term ('hour', 'month', 'weekday')
    output: a figure
    
    NOTE: station and status data should be available in global env.
    """
    
    assert city in ['San Francisco', 'Mountain View', 'Palo Alto', 'San Jose'], 'unavailable city'
    assert term in ['hour', 'month', 'weekday'], 'term should be either "hour", "month", or "weekday".'
    assert type(status16['time'][0]) is pandas.tslib.Timestamp, 'status["time"] type error. convert it time series.'
    
    city_status = status_for_city(station16, status16, city)
    station_name = None
    if station_id:
        assert station_id in station16.station_id.values, 'invalid station id'
        city_status = city_status[city_status.station_id == station_id]
        station_name = station16.name[station16.station_id == station_id].values[0]
    grouped = groupby_term_mean(city_status, term)
    
    return plot_status(grouped, term, city, station_name)

15. Plots for average number of available bikes and docks in San Francisco and San Jose.

Having analyzed the trips and subscribers, we wanted to analyze the status of bikes and docks across all cities to gain more confidence that bike sharing is mostly used to commute to work. Since Palo Alto and Mountain View comprise only about 3-4% of total bike trips, we decided it would be better to focus only on San Francisco and San Jose.

Below, you can see the average availability of bikes in San Francisco and San Jose during a day, week, and month.

The daily average counts of available bikes and docks in San Francisco clearly show that from night until around 5:00 - 6:00 am the counts of bikes and docks are almost constant, probably because very few people use bike sharing at night. Starting from 5:00 - 6:00 am, fewer bikes and more docks become available. At 8:30 am the fewest bikes are available and the most docks are free. This clearly shows that in San Francisco, most people use bike sharing as a way to commute to work. After 10:00 am, more bikes become available until around 3:00 pm. As expected, the number of available bikes starts to fall again from 3:00 pm, and by 5:30 pm one might not be able to find a bike at a nearby bike share station. This again suggests that most customers use the bikes to commute back home or to nearby Caltrain, BART, or bus stations.

For San Jose, this pattern is less clear, although a closer look might tell a similar story. However, since San Jose comprises only about 5% of total bike rides in 2016, the data might not be sufficient to reveal clear patterns yet. Clearly, the Bay Area Bike Share company should promote its business in San Jose.
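
To check the timing of the morning minimum described above numerically, here is a hedged sketch that reuses the status_for_city and groupby_term_mean helpers defined earlier (note the grouping only resolves to whole hours):

# Sketch: hour of day with the fewest bikes available on average in San Francisco.
sf_status = status_for_city(station16, status16, 'San Francisco')
sf_hourly = groupby_term_mean(sf_status, 'hour')
print(sf_hourly.loc[sf_hourly['bikes_available'].idxmin(), ['hour', 'bikes_available']])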


In [153]:
plot_summary_for_city('San Francisco', 'hour')


Out[153]:

In [155]:
plot_summary_for_city('San Jose', 'hour')


Out[155]:

Weekly average counts of available bikes and docks in both cities show that from Monday to Friday the pattern of bike sharing usage is almost constant. On the other hand, more bikes and fewer docks are available during weekends, probably because most customers do not work over the weekend and the system is used less.


In [156]:
plot_summary_for_city('San Francisco', 'weekday')


Out[156]:

In [157]:
plot_summary_for_city('San Jose', 'weekday')


Out[157]:

Yearly average counts of available bikes and docks in both cities show that bike sharing is used mostly from February to October, that is, during the months with weather suitable for bike riding. From October to February, more bikes and fewer docks are available. Also, from October to January there are holidays such as Christmas and New Year, and winter vacations, which probably also contribute to lower usage of the bikes.


In [160]:
plot_summary_for_city('San Francisco', 'month')


Out[160]:

In [161]:
plot_summary_for_city('San Jose', 'month')


Out[161]:

In [162]:
#plot_summary_for_city('Mountain View', 'hour')
#plot_summary_for_city('Mountain View', 'weekday')
#plot_summary_for_city('Mountain View', 'month')

In [163]:
#plot_summary_for_city('Palo Alto', 'hour')
#plot_summary_for_city('Palo Alto', 'weekday')
#plot_summary_for_city('Palo Alto', 'month')

In [164]:
def convert_df(station_in, status_in, term, station_id = None):
    """
    creates a DF which is grouped by term and has the average
    bikes and docks available for a station if specified; otherwise
    it returns the average values for San Francisco
    
    input: station_in, status_in, term, station_id (default = None)
    output: a grouped DF, station_name (default = None)
    """
    
    city_status = status_for_city(station_in, status_in)
    station_name = None
    if station_id:
        assert station_id in station_in.station_id.values, 'invalid station id'
        city_status = city_status[city_status.station_id == station_id]
        station_name = station_in.name[station_in.station_id == station_id].values[0]
    grouped = groupby_term_mean(city_status, term)
    
    return grouped, station_name

In [165]:
def compares_years(term, station_id = None):
    """
    compares average available bike and dock counts
    across the three years per term for San Francisco;
    if station_id is specified, compares for that station.
    
    input: term ('hour', 'month', or 'weekday')
    output: an interactive figure
    
    """
    
    sf14, station_name = convert_df(station14, status14, term, station_id)
    sf15, station_name = convert_df(station15, status15, term, station_id)
    sf16, station_name = convert_df(station16, status16, term, station_id)
    
    bikes14 = sf14.bikes_available.values
    docks14 = sf14.docks_available.values
    bikes15 = sf15.bikes_available.values
    docks15 = sf15.docks_available.values
    bikes16 = sf16.bikes_available.values
    docks16 = sf16.docks_available.values
    xcoord = sf15[term]
    
    if station_name:
        title = 'Average Available Bike and Dock Counts for {}'.format(station_name)
    else:
        title = 'Average Available Bike and Dock Counts for San Francisco'

    trace1 = Scatter(
        x=xcoord, y=bikes15,
        line=Line(
            color='#FFD700',
            width=3
        ),
        name='2015: Bikes'
    )

    trace2 = Scatter(
        x=xcoord, y=docks15,
        line=Line(
            color='#C0C0C0',
            width=3
        ),
        name='2015: Docks'
    )

    trace3 = Scatter(
        x=xcoord, y=bikes16,
        line=Line(
            color='#BA8651',
            width=3
        ),
        name='2016: Bikes'
    )

    trace4 = Scatter(
        x=xcoord, y=docks16,
        line=Line(
            color='#000000',
            width=3
        ),
        name='2016: Docks'
    )
    
    trace5 = Scatter(
        x=xcoord, y=bikes14,
        line=Line(
            color='#b9f442',
            width=3
        ),
        name='2014: Bikes'
    )
    
    trace6 = Scatter(
        x=xcoord, y=docks14,
        line=Line(
            color='#416bf4',
            width=3
        ),
        name='2014: Docks'
    )

    data = Data([trace1, trace2, trace3, trace4, trace5, trace6])
    layout = Layout(
        title=title,
        updatemenus=list([
            dict(
                x=-0.05,
                y=1,
                yanchor='top',
                buttons=list([
                    dict(
                        args=['visible', [True, True, True, True, True, True]],
                        label='All',
                        method='restyle'
                    ),
                    dict(
                        args=['visible', [True, True, False, False, False, False]],
                        label='2015',
                        method='restyle'
                    ),
                    dict(
                        args=['visible', [False, False, True, True, False, False]],
                        label='2016',
                        method='restyle'
                    ),
                    dict(
                        args=['visible', [False, False, False, False, True, True]],
                        label='2014',
                        method='restyle'
                    ),
                    dict(
                        args=['visible', [True, False, True, False, True, False]],
                        label='Bikes',
                        method='restyle'
                    ),
                    dict(
                        args=['visible', [False, True, False, True, False, True]],
                        label='Docks',
                        method='restyle'
                    )
                ]),
            )
        ]),
    )
    fig = Figure(data=data, layout=layout)
    return py.iplot(fig)

In [166]:
def route_countFunc(trip_df, station_df):
    """
    This function gets the 'trip' and 'station' dataframes and returns a dataframe containing routes and their counts
    @args:
        trip_df: trip dataframe
        station_df: station dataframe
    @returns:
        routes_count: contains routes and their counts, duration, and landmark
    """
    #Add start landmark
    trip = pd.merge(left = trip_df[['Trip ID', 'Duration', 'station_id', 'End Terminal']], right = station_df[['name', 'station_id', 'landmark']], 
             how='left', left_on='station_id', right_on= 'station_id')
    trip.rename(columns={'landmark':'start_landmark', 'name':'Start Station'}, inplace=True)
    trip.drop('station_id', inplace=True, axis=1)
    #Add end landmark
    trip = pd.merge(left = trip, right = station_df[['name','station_id', 'landmark']], 
             how='left', left_on='End Terminal', right_on= 'station_id')
    trip.rename(columns={'landmark':'end_landmark', 'name':'End Station'}, inplace=True)
    trip.drop('station_id', inplace=True, axis=1)
    #Pick only trips within one landmark
    trip = trip[trip.start_landmark == trip.end_landmark]
    #Count the number of trips between stations
    routes_count = trip.groupby(['Start Station', 'End Station', 'start_landmark']).agg({'Trip ID': 'count', 'Duration': 'mean'}).reset_index()
    routes_count = routes_count.rename(columns={'Trip ID': 'Counts'})
    routes_count.Duration = routes_count['Duration'].apply(lambda x: round((x/60),2))
    #Change the column names
    routes_count.rename(columns={'start_landmark':'Landmark', 'Duration': 'Duration (min)'}, inplace=True)
    return routes_count

In [167]:
def route_heatmap(trip_df, station_df, landmark):
    """
    This function receives the trip and station dataframes for a specific year
    and plots the trip heatmap for a given landmark
    @args:
        trip_df: trip dataframe
        station_df: station dataframe
        landmark: could be "San Francisco", "San Jose", "Palo Alto", or "Mountain View"
    @returns:
        Heatmap graph
    """
    routes = route_countFunc(trip_df, station_df)
    df_routes = routes[routes.Landmark == landmark]
    start_station = df_routes['Start Station'].sort_values().unique()
    end_station = df_routes['End Station'].sort_values().unique()

    #create z values for heatmap and text for hover text in the map
    z = []
    trip_text = []
    for start in start_station:
        new_row = []
        text_row = []
        for end in end_station:
            try:
                trip_count = df_routes[(df_routes['Start Station'] == start) & (df_routes['End Station'] == end)].Counts.values[0]
                average_duration = df_routes[(df_routes['Start Station'] == start) & (df_routes['End Station'] == end)]['Duration (min)'].values[0]
            except IndexError:
                trip_count = 0
                average_duration = 0
            text = 'Start Station: {}<br>End Station: {}<br>Trip Counts: {}<br>Average Duration: {} Mins'.format(start, end, trip_count, average_duration)
            text_row.append(text)    
            new_row.append(trip_count)
        z.append(list(new_row))
        trip_text.append(list(text_row))

    #Plot Heatmap
    heatmap = go.Heatmap(x = end_station,
               y = start_station,
               z = z,
                hoverinfo = 'text',
                text = trip_text,
               colorscale='YIGnBu',
               reversescale=True)

    data = go.Data([heatmap])
    layout = go.Layout(
        title=landmark.upper() + ' TRIP COUNTS',
        titlefont = dict(color='black', size=35, family='monospace'),
        xaxis=dict(
            title='END STATION',
            tickangle = -90,
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        yaxis=dict(
            title='START STATION',
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        width=1000,
        height=1000,
        margin=go.Margin(
        l=300,
        r=50,
        b=300,
        t=100,
        pad=4
    )
    )
    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig)

In [168]:
def crimeFunc(df):
    """
    This function gets the crime data frame and selects the crimes potentially dangerous to bikers and bikes, such as:
    'BURGLARY', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'STOLEN PROPERTY', 'VANDALISM', 'VEHICLE THEFT'
    in the year 2016
    @args:
        df: crime dataframe
    @returns:
        a dataframe containing crimes in 2016 related to bikes
    """
    #List of related crimes
    relatedCrimes = ['BURGLARY', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'STOLEN PROPERTY', 'VANDALISM', 'VEHICLE THEFT']
    bikeCrime = df[df['Category'].isin(relatedCrimes)].reset_index(drop = True)
    #Extract year from "Date" column
    bikeCrime.Date = bikeCrime.Date.str.slice(6,10)
    bikeCrime = bikeCrime.rename(columns = {'X': 'lon', 'Y': 'lat', 'Date': 'year'})
    #Crimes in year 2016
    bikeCrime = bikeCrime[bikeCrime['year'] == '2016'][['Category', 'lat', 'lon']].reset_index(drop=True)
    return bikeCrime

In [169]:
def crime_objectFunc(df):
    """
    This function gets the crime data frame and creates a scatter map plot object
    @args:
        df: crime data frame
    @returns:
        a scatter map plot object
    """
    bikeCrime = crimeFunc(df)
    crime_graph = go.Scattermapbox(lon = bikeCrime['lon'],
                        lat = bikeCrime['lat'],
                        hoverinfo = 'text',
                        text = bikeCrime['Category'],
                        mode = 'markers',
                        marker = dict(size = 3,
                                     color = 'red'
                                      ),
                        opacity = 0.1,
                        name = 'Crime')
    return crime_graph

In [170]:
def extract_info(zipNum, info_type):
    '''
    This function gets a zipcode and returns the corresponding state, city name, latitude, or longitude
    @args:
        zipNum: zipcode
        info_type: could be 'state', 'city', 'lat', or 'lon'
    @returns:
        state, city name, latitude, or longitude ('Not valid' if no information exists for the zipcode)
    '''
    
    try:
        info = zipcode.isequal(str(zipNum))
        info.state  #Raises AttributeError if no info exists for the zipcode
    except AttributeError:
        return 'Not valid'
    if info_type == 'state':
        return info.state
    if info_type == 'city':
        return info.city
    elif info_type == 'lat':
        return info.lat
    else:
        return info.lon

In [171]:
def incomeFunc(df):
    """
    This function gets the income dataframe, looks up the state, city, latitude, and longitude corresponding to each zipcode,
    and returns a dataframe containing the median and mean income for each zipcode in San Francisco
    @args:
        df: income dataframe
    @returns:
        a dataframe containing median and mean income corresponding to each zipcode in San Francisco
    """
    #Add state, city, latitude, and longitude columns to income dataset
    info_list = ['state', 'city', 'lat', 'lon']
    for i in info_list:
        df[i] = [extract_info(str(x), i) for x in df.iloc[:,0]]

    #Extract San Francisco data
    SF_income = df[(df.state == 'CA') & (df.city == 'SAN FRANCISCO')].reset_index(drop=True)
    return SF_income

In [172]:
def hover_incomeTextFunc(df):
    """
    This function gets the income dataframe and creates a list of texts for hover text in plot
    @args:
        df: income dataframe
    @returns:
        a list of texts for hover text in the plot
    """
    #For hover in the map graph
    text_income = df.apply(lambda x: 'Zipcode: {}<br>Population: {}<br>Median Income: ${}<br>Mean Income: ${}'.format(
            x[0], x[3], x[1], x[2]),axis=1)
    return text_income

In [173]:
def point_incomeSizeFunc(df):
    """
    This function gets the income dataframe and creates a list of point sizes for the plot
    @args:
        df: income dataframe
    @returns:
        a list of point sizes for the plot
    """
    #Scale the point size for map
    point_size = df['Median'].str.replace(',', '').apply(int)/df['Median'].str.replace(',', '').apply(int).max()
    return point_size

In [174]:
def point_incomeTransFunc(df):
    """
    This function gets the income dataframe and creates a list of point transparencies for the plot
    @args:
        df: income dataframe
    @returns:
        a list of point transparencies for the plot
    """
    #Scale the point transparency to population
    point_transparency = df['Population'].str.replace(',', '').apply(int)/df['Population'].str.replace(',', '').apply(int).max()
    return point_transparency

In [175]:
def income_objectFunc(df):
    """
    This function gets the income dataframe and creates a scatter map plot object
    @args:
        df: income dataframe
    @returns:
        a scatter map plot object
    """
    SF_income = incomeFunc(df)
    income_graph = go.Scattermapbox(lon = SF_income['lon'],
                        lat = SF_income['lat'],
                        hoverinfo = 'text',
                        text = hover_incomeTextFunc(SF_income),
                        mode = 'markers',
                        marker = dict(size = point_incomeSizeFunc(SF_income)*20,
                                     color = 'lightblue'
                                      ),
                        opacity = point_incomeTransFunc(SF_income)/10,
                        name = 'Median Income')
    return income_graph

In [176]:
def graph_CrimeIncome(income_df, crime_df):
    """
    This function gets two data frames income and crime and plots the scatter map plot for both in San Francisco
    @args:
        income_df: income dataframe
        crime_df: crime dataframe
    @returns:
        A scatter map plot in San Francisco
    """
    income_object = income_objectFunc(income_df)
    crime_object = crime_objectFunc(crime_df)
    data = go.Data([income_object, crime_object])
    layout = dict(
    title='MEDIAN INCOME AND CRIME MAP IN SAN FRANCISCO',
    titlefont = dict(color='black', size=35, family='monospace'),
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_token,
        bearing=0,
        center=dict(
            lat=37.773972,
            lon=-122.431297
        ),
        pitch=0,
        zoom=10
    ),
    )
    layout = go.Layout(layout)
    fig = go.Figure(data = data, layout = layout)
    return py.iplot(fig)

In [177]:
def routeDock_countFunc(tripDF, stationDF, landmark):
    """
    This function gets the trip and station dataframes and a landmark of interest and returns a dataframe
    containing all stations in that landmark with the total number of trips started at each station and its dock count
    @args:
        tripDF: trip dataframe
        stationDF: station dataframe
        landmark: landmark of interest; could be "San Francisco", "San Jose", "Palo Alto", or "Mountain View"
    @returns:
        A dataframe containing all stations with the total number of trips started at each station and its
        dock count
    """
    routeCount = route_countFunc(tripDF, stationDF)
    routeCount = routeCount[routeCount.Landmark == landmark].groupby('Start Station').agg({'Counts': 'sum'}).reset_index()
    routeCount = pd.merge(left=routeCount, right=stationDF[['name', 'dockcount']], how='inner', left_on='Start Station', right_on='name')
    routeCount.drop('name', axis=1, inplace=True)
    return routeCount

In [178]:
def hover_text(df):
    """
    This function gets a dataframe made by function "routeDock_countFunc" and returns a list containing the 
    text information used in the barplot
    @args:
        df: dataframe made with function "routeDock_countFunc"
    @returns:
        a list containing the text information used in hover text for the barplot
    """
    text = df.apply(lambda x: 'Station: {}<br>Trip Counts: {}<br>Dock Counts: {}<br>Dock/Trip: {}%'.format(
        x[0], x[1], int(x[2]), round(float(x[2])*100/x[1], 3)), axis=1)
    return text

In [179]:
def bar_object(tripDF, stationDF, landmark, year):
    """
    This function gets the trip and station dataframes, a landmark of interest, and a specific year and returns
    a bar object used for plotting a barplot
    @args:
        tripDF: trip dataframe
        stationDF: station dataframe
        landmark: landmark of interest; could be "San Francisco", "San Jose", "Palo Alto", or "Mountain View"
        year: year of interest, either 2014, 2015, or 2016
    @returns:
        Bar object used to plot a barplot
    """
    df = routeDock_countFunc(tripDF, stationDF, landmark)
    bar = go.Bar(x=df['Start Station'], y=df['Counts'],
                name = year, hoverinfo='text', text=hover_text(df))
    return bar

In [180]:
def barPlot(landmark):
    """
    This function gets a landmark and plots the number of trips from stations in that landmark. The barplot
    contains the information for the three years 2014, 2015, and 2016.
    @args:
        landmark: landmark of interest; could be "San Francisco", "San Jose", "Palo Alto", or "Mountain View"
    @returns:
        A barplot
    """
    
    #Bar object
    bar14 = bar_object(trip14, station14, landmark, 2014)
    bar15 = bar_object(trip15, station15, landmark, 2015)
    bar16 = bar_object(trip16, station16, landmark, 2016)
    data = go.Data([bar14, bar15, bar16])
    
    #Layout
    layout = go.Layout(
        width=1000,
        height=800,
        hovermode='closest',
        title='TOTAL TRIPS FROM STATIONS IN ' + landmark.upper(),
        titlefont=dict(color='black', size=35, family='monospace'),
        xaxis=dict(
            title='STATION',
            tickangle=-90,
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        yaxis=dict(
            title='TRIP COUNTS',
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        margin=go.Margin(
            l=100,
            r=50,
            b=300,
            t=100,
            pad=4),
        showlegend=True,
        legend=dict(x=0, y=1, orientation='h'))
    
    #Make the plot
    fig = go.Figure(data = data, layout = layout)
    return py.iplot(fig)

17. Trips from stations in San Francisco

The following bar plot shows the number of trips started from each station in San Francisco in each of the three years. Hovering the mouse cursor over a bar shows the station name, the exact number of rides, the number of docks, and the dock-to-ride percentage.


In [181]:
barPlot('San Francisco')


Out[181]:

Looking at 2016 data, the top 5 most popular starting stations are as follows:

| Station                                  | Rides |
|------------------------------------------|-------|
| San Francisco Caltrain (Townsend at 4th) | 23591 |
| San Francisco Caltrain 2 (330 Townsend)  | 22358 |
| Harry Bridges Plaza (Ferry Building)     | 16127 |
| 2nd at Townsend                          | 14099 |
| Steuart at Market                        | 13690 |
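
As a minimal sketch (not one of the original notebook cells), this ranking could be reproduced from the routeDock_countFunc helper defined above, assuming the 2016 dataframes trip16 and station16 are loaded as earlier in the notebook:

#Sketch: rank San Francisco starting stations by 2016 trip counts
sf16 = routeDock_countFunc(trip16, station16, 'San Francisco')
top5 = sf16.nlargest(5, 'Counts')[['Start Station', 'Counts', 'dockcount']]
print(top5)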

The San Francisco Caltrain and San Francisco Caltrain 2 stations, both next to the Caltrain terminal, are at the top of the list, which shows that these two stations serve people who use the Caltrain line to commute. Check out the first map to see their locations and the closest bike stations. Harry Bridges Plaza and Steuart at Market are ranked third and fifth; both stations are close to the Ferry Building, which is a tourist destination and a transit hub connecting San Francisco's neighborhoods with the surrounding Bay Area communities. Also, 2nd at Townsend, ranked fourth, is in the proximity of a MUNI station. In short, the top five bike stations are all adjacent to the public transit stops that people normally use to commute to work or to the places that tourists visit.

Rides initiated from these five stations constitute about 31 percent of all rides started in San Francisco; however, the number of bikes allocated to these 5 stations varies only between 15 and 23. On the other hand, the San Francisco City Hall station, which has the fewest rides (2195) of all stations, has 19 bikes. This shows that the number of bikes available at high-demand stations is not proportional to the number of trips, which results in empty bike docks and longer waiting times and eventually leads to user dissatisfaction. It is highly recommended to increase the number of bikes at these stations, or to install new stations near the existing ones, to improve serviceability and waiting times.
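
To quantify this imbalance, a rough sketch (reusing the sf16 dataframe from the snippet above) could compare trips per dock across stations; a large spread confirms that dock allocation does not track demand:

#Sketch: trips per dock at each San Francisco starting station in 2016
sf16['trips_per_dock'] = sf16['Counts'] / sf16['dockcount']
print(sf16.sort_values('trips_per_dock', ascending=False)[['Start Station', 'Counts', 'dockcount', 'trips_per_dock']])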

Two new stations were installed in 2016, at 5th St at Folsom St and Cyril Magnin St at Ellis St. They operated for less than a week in 2016, and hence their ride counts are extremely low. The Broadway St at Battery St and San Francisco Caltrain 2 (330 Townsend) stations show a huge jump in ride counts between 2014 and 2015. Broadway St at Battery St was installed in late 2014, which explains its low 2014 ride count. However, San Francisco Caltrain 2 (330 Townsend) has the same installation date as San Francisco Caltrain (Townsend at 4th), yet more docks and far fewer rides in 2014. We found no clear explanation for this station's low 2014 ride count and its significant jump in 2015.

18. Route count heatmap by station in SF for 2016

In the previous section, we explored the popularity of starting stations. In this section, we look at the popularity of routes in the San Francisco bike share system. The trip table contains each trip's starting station, ending station, and duration, and this information is plotted in the heatmap below. The x-axis and y-axis represent the end station and start station, respectively. The color of each square indicates the total number of trips from the start station to the end station, with lighter colors representing lower trip counts and darker colors representing higher trip counts. The map is interactive: hovering the mouse pointer over a square shows the exact number of trips as well as the average trip duration. Note that the heatmap shows only trips within the city boundaries and does not account for trips between different landmarks.


In [182]:
route_heatmap(trip16, station16, 'San Francisco')


Out[182]:

Looking at the heatmap, the top 5 most popular routes are as follows:

| Start Station                        | End Station                             | Counts | Avg. Duration (min) |
|--------------------------------------|-----------------------------------------|--------|---------------------|
| Townsend at 7th                      | San Francisco Caltrain 2 (330 Townsend) | 3341   | 4.26                |
| Market at 10th                       | San Francisco Caltrain 2 (330 Townsend) | 3035   | 9.89                |
| Harry Bridges Plaza (Ferry Building) | Embarcadero at Sansome                  | 2986   | 17.17               |
| 2nd at Townsend                      | Harry Bridges Plaza (Ferry Building)    | 2781   | 9.66                |
| Embarcadero at Sansome               | Steuart at Market                       | 2605   | 7.66                |
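
For reference, a minimal sketch (again assuming trip16 and station16 are loaded) of how this ranking can be pulled out of the route_countFunc output defined above:

#Sketch: five most popular routes within San Francisco in 2016
routes16 = route_countFunc(trip16, station16)
sf_routes = routes16[routes16.Landmark == 'San Francisco']
print(sf_routes.nlargest(5, 'Counts')[['Start Station', 'End Station', 'Counts', 'Duration (min)']])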

Townsend at 7th to San Francisco Caltrain 2 ranks first on the list. Townsend at 7th is in a business district with many companies, which could explain the popularity of this route: company employees use this bike station to get to Caltrain. The opposite route, from San Francisco Caltrain 2 to Townsend at 7th, also has a high trip count, which reinforces the idea that this route is used by employees commuting between home and work. Harry Bridges Plaza to Embarcadero at Sansome and Embarcadero at Sansome to Steuart at Market are two other routes with heavy trip counts. Unlike the routes mentioned earlier, their opposite directions are not nearly as heavily trafficked. These stations are located close to the Ferry Building and Pier 39, which are popular tourist spots, so the bikes at these stations are mostly used by tourists exploring the area and visiting attractions along the coast.

19. Compare availability of bikes and docks for the two bike stations near the San Francisco Caltrain station

The plots below show the availability of bikes and docks throughout the day for the two busiest stations, Townsend at 4th and 330 Townsend. The number of available bikes peaks around midnight and starts to decline at around 6:00am - 7:00am. Meanwhile, the number of available docks is lowest during the night, since the bikes are docked; starting from 6:00am - 7:00am, more docks become available. In other words, bike and dock availability swap between 6:00am and 7:00am. This is most likely because the adjacent Caltrain station is the end of the line for trains coming into San Francisco's downtown business area and, as stated above, most users are probably subscribers who commute to work by bike. Around 4:00-6:00pm they return to Caltrain to go home, and at that time the number of available bikes starts to increase while docks become less available. Notably, both bike share stations show the same pattern, and the pattern holds for all three years. Hence, it might be a good idea to invest more in locations with Caltrain or BART stations and promote bike sharing as a "last mile connection".


In [183]:
compares_years(term='hour', station_id= 70)


Out[183]:

In [184]:
compares_years(term = 'hour', station_id = 69)


Out[184]:
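
The hourly plots above come from the compares_years helper defined earlier in the notebook. As a rough, self-contained sketch of the same idea, the hourly averages for one station could be computed directly from the status table, assuming it has station_id, time, bikes_available, and docks_available columns (an assumption about the raw schema, not a restatement of compares_years):

#Sketch: average bike/dock availability by hour of day for station 70 in 2016
st70 = status16[status16['station_id'] == 70].copy()
st70['hour'] = pd.to_datetime(st70['time']).dt.hour
hourly = st70.groupby('hour')[['bikes_available', 'docks_available']].mean()
hourly.plot(title='Station 70: average availability by hour (2016)')

Swapping .dt.hour for .dt.dayofweek or .dt.month would give the weekday and monthly views discussed next.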

The bike riding pattern for the two stations by weekday can be seen below. From Monday to Friday, the numbers of available bikes and docks are almost constant, with a slight slope showing that fewer bikes are used as the weekend approaches. On Saturday and Sunday even fewer bikes are used, and hence fewer docks become available. This pattern is similar for all three years and for both stations, which suggests that the majority of bike share users are people who work in San Francisco and commute with Caltrain. It might also explain why almost 92% of users are subscribers: people tend to buy a yearly subscription and use the bikes to fill the last gap in their commute.


In [185]:
compares_years(term = 'weekday', station_id = 70)


Out[185]:

In [186]:
compares_years('weekday', 69)


Out[186]:

The bike riding pattern plots by month for Townsend at 4th and 330 Townsend show that the average numbers of bikes and docks are almost constant between February and October. This is expected, since those months are usually warm, with weather suitable for bike riding. During the winter months people tend to ride less, so more bikes and fewer docks are available. Again, the pattern is similar for all three years and for both stations. In addition, the Thanksgiving, Christmas, and New Year holidays fall between late November and January; fewer people work during that period, which may also explain the lower bike usage in those months. That period could be used for bike condition check-ups, repairs, etc.


In [187]:
compares_years('month', 70)


Out[187]:

In [188]:
compares_years('month', 69)


Out[188]:

20. Median income and crime map in SF for 2016

In this section, we explore whether crime and income distribution in San Francisco were influencing factors in choosing the current bike station locations. The crime data is provided by the City of San Francisco and is published on SFGOV. The dataset contains various crimes from 2003 to 2016. Among the crime types available in the dataset, we picked those most relevant to bikes or most likely to endanger bikers: burglary, driving under the influence, robbery, stolen property, vandalism, and vehicle theft. The analysis is focused only on crimes that took place in 2016. The following plot shows the distribution of these crimes as red dots on a map of San Francisco. The map is interactive: hovering the mouse over a point shows the type of crime. The crime points are spread all over the map, with a much higher density downtown, where the bike stations are located. Therefore, there is no significant relationship between the station locations and the crime distribution in the city.


In [189]:
graph_CrimeIncome(income, crime)


Out[189]:

The income data for each zip code is obtained from the Census Bureau. Hovering the mouse pointer over the blue dots shows the mean and median income and the resident population for each zip code in San Francisco. The size of each dot is proportional to the median income. There is no obvious relationship between either the resident population or the median income of a zip code and the station locations. Based on the analysis in the previous sections, it can be concluded that the company has targeted locations with more businesses, tourist attractions, and transit hubs when installing bike stations. Employees and tourists, rather than residents, constitute most of the customers, and stations closer to business areas and tourist attractions have the highest trip rates. The locations that meet these criteria are mostly in downtown and its vicinity, where all the bike stations are currently located.

Suggestions for future stations

A yearly subscriber who commutes to work in San Francisco via the Caltrain station can bike to the points shown in the map below. Clicking on a line shows the corresponding bike station and its dock count.

As can be seen from the map, someone who works outside the region where bike stations are available has a hard time using bike share, even if they want to.

Of course, it has been only three full years since the Bay Area Bike Share company started operating. We think the service has proven to be in high demand, especially in San Francisco, though we have no information regarding the company's financial statements.

Also, given that Ford has invested in the company to expand the bike share system from 700 to 7,000 bikes, the company is clearly working to install new stations in the right regions. In particular, based on the data we have analyzed, it is possible to suggest new station locations, at least in San Francisco.

First, we suggest that the two busiest stations, near the Caltrain station in downtown San Francisco, be expanded with more bikes and docks. As we have seen, the majority of customers use those two locations for their everyday commute to work.

Second, the Caltrain stop on 22nd Street has no nearby bike station. The closest bike station, Townsend at 7th, had about 12000 trips as a starting point, as can be seen from the first map we provided. Moreover, there are business areas close to 22nd Street, such as Mission Bay. Hence, it might be a good idea to invest in that location as well.

Third, San Francisco locals and people who commute from the East Bay tend to use BART, so it would be a good idea to invest in the parts of San Francisco that have BART stations. In particular, Market St. and Dolores St., which have BART lines connecting the west and south parts of San Francisco to downtown, could be targeted for bike share stations. As with the Caltrain stations, subscribers would commute to work every day via bike sharing, which would help them avoid cars, taxis, or Uber for the "last mile" of their commute.

Furthermore, since we could not find any relationship between the income and crime maps and the locations of the bike stations, we would argue that parts of San Francisco that have many offices but are considered dangerous could also be targeted (Turk St., for example). The western parts of San Francisco, which attract large numbers of tourists thanks to parks (e.g., Golden Gate) and beaches, could also be a target for expanding the business, although they should not be a priority during the expansion.

On the other hand, as the data show, people use bike sharing less during winter and the holidays. Those periods could therefore be suitable for repairs, installing new docks, and so on.

All in all, the data reveal a clear pattern of bike share usage and can definitely help the expansion project find the right locations for new stations.