Authors: Ashkan Saboori , Pamela Patterson , and Zamirbek Akimbekov



In [2]:

    
import pandas as pd
import pandas
import numpy as np
import datetime
import matplotlib
import matplotlib.pyplot as plt
from geojson import dumps
import geojson as g
%matplotlib inline
plt.style.use('ggplot')
import csv
import collections
import os
from __future__ import division
import scipy as sp
import plotly.plotly as py
import plotly.figure_factory as ff
from plotly.graph_objs import *
import plotly
import plotly.graph_objs as go
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import zipcode  #To find city, latitude and longitude corresponding to a zipcode
import folium   #For map
from matplotlib.dates import MO, TU, WE, TH, FR, SA, SU
import parser
import math
import warnings; warnings.simplefilter('ignore')

Read the data



In [3]:

    
#plotly.tools.set_credentials_file(username='zamirg13', api_key='3dMc8cJ9Gxf4At0fdV49')
#mapbox_token = 'pk.eyJ1IjoiYXNoa2Fuc2Fib29yaSIsImEiOiJjaXp5djVldTMwM2FtMzNveHRpaTFqdHhyIn0.uVPRMwfIuYE_3j3jRajBFA'
## USE THE BELOW CREDENTIALS, PLEASE ### 
plotly.tools.set_credentials_file(username = 'ppatterson', api_key='jK180Tj3zyqIgqswVizx')
mapbox_token = 'pk.eyJ1IjoicGFtZWxvdDMxNyIsImEiOiJjajBlMnIzbGwwMTE0MzNwNHB3Mmt2MHI1In0.jvc8C_8qJonSMXVL-tFOfg'



In [4]:

    
path = '/Users/admin/project_data_141' #Zamir's path
#path = '/Users/ashkansaboori/Documents/OneDrive/MyPhD/Courses/5Winter-2017/STA141B/Project/Project_Datasets' #Ashkan's
#path = '~/Dropbox/141B/Project/data'



In [5]:

    
def read_data(path, year):
    """
    reads csv data and returns 4 data frames for the given year
    input: path
    output: 4 data frames (station, status, trip, and weather)
    """
    path = path + '/' + str(year) + '/'
    if year == 2014:
        station1 = pd.read_csv(path + 'station1.csv')
        station2 = pd.read_csv(path + 'station2.csv')
        status1 = pd.read_csv(path + 'status1.csv')
        status2 = pd.read_csv(path + 'status2.csv')
        trip1 = pd.read_csv(path + 'trip1.csv')
        trip2 = pd.read_csv(path + 'trip2.csv')
        weather1 = pd.read_csv(path + 'weather1.csv')
        weather2 = pd.read_csv(path + 'weather2.csv')
        station = pd.concat([station1, station2]).drop_duplicates()   
        status = pd.concat([status1, status2])
        trip = pd.concat([trip1, trip2])
        weather = pd.concat([weather1, weather2])
    
    else: 
        station = pd.read_csv(path + 'station.csv')
        status = pd.read_csv(path + 'status.csv')
        trip = pd.read_csv(path + 'trip.csv')
        weather = pd.read_csv(path + 'weather.csv')
    
    
    return station, status , trip, weather



In [6]:

    
station16, status16, trip16, weather16 = read_data(path, 2016)



In [7]:

    
station15, status15, trip15, weather15 = read_data(path, 2015)



In [8]:

    
station14, status14, trip14, weather14 = read_data(path, 2014)



In [9]:

    
crime = pd.read_csv(path + '/Crime_Incidents.csv')
income = pd.read_csv(path + '/Income.csv', header = 0)
calstations = pd.read_excel(path + '/caltrain_coordinates.xlsx')
#Source of crime data: https://data.sfgov.org/Public-Safety/Map-Crime-Incidents-from-1-Jan-2003/gxxq-x39z
#Source of income data: http://www.psc.isr.umich.edu/dis/census/Features/tract2zip/
#Also this can be useful: http://www.psc.isr.umich.edu/dis/data/kb/answer/1123

Format the data for analysis



In [10]:

    
def format_df(status, trip, weather, year):
    """
    Takes trip and weather data frames and formats them for analysis. Also creates new
    dataframes for analysis. 
    Input: trip and weather pandas dataframes; year as a string
    Output: dataframes trip, weather, total_rides, weather_ave, weekend_rides, hol_rides, weekday_rides, 
    total_weekday_rides, min_dur_rides, hr_dur_rides, total_min_rides, total_hr_rides, rides_by_hour, 
    total_rides_by_hour 
    """
    status['time'] = pd.to_datetime(status.time, infer_datetime_format = True)
    trip['Count'] = 1
    trip['Start Date'] = pd.to_datetime(trip['Start Date'], format='%m/%d/%Y %H:%M')
    trip['End Date'] = pd.to_datetime(trip['End Date'], format='%m/%d/%Y %H:%M')
    trip['Day of Week'] = trip.apply(lambda x:  x.loc[('Start Date')].strftime('%A'), axis=1)
    trip['Date'] = trip.apply(lambda x:  x.loc[('Start Date')].strftime('%m/%d/%Y'), axis=1)
    trip['Date'] = pd.to_datetime(trip['Date'], format = '%m/%d/%Y')
    diff = (trip['End Date']-trip['Start Date'])
    trip['Duration'] = diff.apply(lambda x:  x.seconds)
    trip = trip.rename(columns={'Start Terminal' : 'station_id'})
    cal = calendar()
    holidays = cal.holidays(start=trip['Date'].min(), end=trip['Date'].max())
    trip['Holiday']=trip['Date'].apply(lambda date: date in holidays)
    total_rides = trip[['Count', 'Date']]
    total_rides = total_rides.groupby(['Date'], as_index = False).sum()
    weather = weather.rename(index=str, columns={'PDT': 'Date',})
    weather['Date'] = pd.to_datetime(weather['Date'], format='%m/%d/%Y')
    weather = weather.set_index('Date')
    if year == '2015':
        weather = weather.set_index('Zip', append = True)
    elif year == '2014':
        weather = weather.set_index('Zip', append = True)
    elif year == '2016':
        weather = weather.set_index('ZIP', append = True)       
    weather['PrecipitationIn'] = pd.to_numeric(weather['PrecipitationIn'].replace('T', 0))
    weather_ave = weather.groupby(level = 'Date').mean()
    weekend_rides = trip.loc[trip['Day of Week'].isin(['Saturday','Sunday'])]
    weekend_rides = weekend_rides[['Count', 'Date']]
    weekend_rides = weekend_rides.groupby(['Date'], as_index = False).sum()
    hol_rides = trip.loc[trip['Holiday'] == True]
    hol_rides = hol_rides[['Count', 'Date']]
    hol_rides = hol_rides.groupby(['Date'], as_index = False).sum()
    weekday_rides = trip[['Count', 'Subscriber Type', 'Day of Week']]
    weekday_rides = weekday_rides.groupby(['Day of Week', 'Subscriber Type'], as_index = False).sum()
    sorter = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
    weekday_rides['Day of Week'] = weekday_rides['Day of Week'].astype("category")
    weekday_rides['Day of Week'].cat.set_categories(sorter, inplace=True)
    weekday_rides = weekday_rides.sort_values(['Day of Week'])
    weekday_rides.loc[weekday_rides['Subscriber Type']=='Customer']
    total_weekday_rides = weekday_rides.groupby(['Day of Week'], as_index = False).sum()
    min_dur_rides = trip[['Count', 'Subscriber Type', 'Duration']]
    min_dur_rides.loc[:,'Duration'] = min_dur_rides['Duration']//60
    min_dur_rides = min_dur_rides.groupby(['Duration', 'Subscriber Type'], as_index = False).sum()
    min_dur_rides = min_dur_rides.groupby([pd.cut(min_dur_rides['Duration'], np.arange(0,61,5)),'Subscriber Type']).sum()
    hr_dur_rides = trip[['Count', 'Subscriber Type', 'Duration']]
    hr_dur_rides.loc[:,'Duration'] = hr_dur_rides['Duration']//3600
    hr_dur_rides = hr_dur_rides.groupby(['Duration', 'Subscriber Type'], as_index = False).sum()
    hr_dur_rides = hr_dur_rides.groupby([pd.cut(hr_dur_rides['Duration'], np.arange(0,25,2)),'Subscriber Type']).sum()
    total_min_rides = trip[['Count', 'Subscriber Type', 'Duration']]
    total_min_rides.loc[:,'Duration'] = total_min_rides['Duration']//60
    total_min_rides = total_min_rides.groupby(['Duration'], as_index = False).sum()
    total_min_rides = total_min_rides.groupby(pd.cut(total_min_rides['Duration'], np.arange(0,61,5))).sum()
    total_hr_rides = trip[['Count', 'Subscriber Type', 'Duration']]
    total_hr_rides.loc[:,'Duration'] = total_hr_rides['Duration']//3600
    total_hr_rides = total_hr_rides.groupby(['Duration'], as_index = False).sum()
    total_hr_rides = total_hr_rides.groupby(pd.cut(total_hr_rides['Duration'], np.arange(0,25,2))).sum()
    trip['Hour'] = trip['Start Date'].apply(lambda x:  x.hour)
    rides_by_hour = trip[['Count', 'Subscriber Type', 'Hour']]
    rides_by_hour = rides_by_hour.groupby(['Hour', 'Subscriber Type'], as_index = False).sum()
    total_rides_by_hour = rides_by_hour.groupby(['Hour'], as_index = False).sum()
    return status, trip, weather, total_rides, weather_ave, weekend_rides, hol_rides, weekday_rides, total_weekday_rides, min_dur_rides, hr_dur_rides, total_min_rides, total_hr_rides, rides_by_hour, total_rides_by_hour



In [11]:

    
status14, trip14, weather14, total_rides14, weather_ave14, weekend_rides14, hol_rides14, weekday_rides14, total_weekday_rides14, min_dur_rides14, hr_dur_rides14, total_min_rides14, total_hr_rides14, rides_by_hour14, total_rides_by_hour14 = format_df(status14, trip14, weather14, '2014')



In [12]:

    
status15, trip15, weather15, total_rides15, weather_ave15, weekend_rides15, hol_rides15, weekday_rides15, total_weekday_rides15, min_dur_rides15, hr_dur_rides15, total_min_rides15, total_hr_rides15, rides_by_hour15, total_rides_by_hour15 = format_df(status15, trip15, weather15, '2015')



In [13]:

    
status16, trip16, weather16, total_rides16, weather_ave16, weekend_rides16, hol_rides16, weekday_rides16, total_weekday_rides16, min_dur_rides16, hr_dur_rides16, total_min_rides16, total_hr_rides16, rides_by_hour16, total_rides_by_hour16 = format_df(status16, trip16, weather16, '2016')

Begin Analysis

1. Overview of data for 2016



In [14]:

    
station16 = station16.dropna()



In [15]:

    
print 'Size of station DF {}'.format(station16.shape)
print 'Size of status DF {}'.format(status16.shape)
print 'Size of trip DF {}'.format(trip16.shape)
print 'Size of weather DF {}'.format(weather16.shape)









    



Size of station DF (67, 7)
Size of status DF (35517185, 4)
Size of trip DF (313689, 16)
Size of weather DF (1830, 22)

2. Statistics about bike rides for 2016



In [16]:

    
merged_st = pd.merge(station16, trip16, on = 'station_id')



In [17]:

    
total_duration = sum(merged_st.Duration)
print 'number of bike rides between 09/01/2015 and 08/31/2016 : {}'.format(len(merged_st['Trip ID']))
print 'averaging about {} minutes per ride'.format(total_duration/(len(merged_st['Trip ID']) *60))
print 'total riding time is {} hours'.format(total_duration/3600)
print 'or {} days'.format(total_duration/(3600 * 24))
print 'or {} years'.format(total_duration / (3600 * 24 * 365))









    



number of bike rides between 09/01/2015 and 08/31/2016 : 312126
averaging about 13.7903346725 minutes per ride
total riding time is 71738.7 hours
or 2989.1125 days
or 8.18934931507 years



In [18]:

    
def weather_plot(x,y1,y2, y2_label, file_name, title):
    """
    Returns dual axis plot with the x axis being the date, the left and right y-axis being inputed by user
    Input: y1 is the data for the left-axis; should have 365 entries, sorted by date;
    y2 is the data for the right y-axis; should have 365 entries, sorted by date; 
    y2_label is a string
    file_name: string of what to name file in plotly account
    Output: a graph
    """

    trace1 = go.Scatter(x=x,y=y1,name='Total Rides')
    trace2 = go.Scatter(x=x,y=y2,name=y2_label,yaxis='y2')
    data = [trace1, trace2]
    layout = go.Layout(title=title,yaxis=dict(title='Total Rides'),
                       yaxis2=dict(title=y2_label,titlefont=dict(color='rgb(148, 103, 189)'),
                                   tickfont=dict(color='rgb(148, 103, 189)'),overlaying='y',side='right'))
    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig, filename= file_name)

3. Total rides and mean wind speed by date for 2014, 2015, 2016



In [19]:

    
weather_plot(total_rides14['Date'], total_rides14['Count'], weather_ave14[' Mean Wind SpeedMPH'], 
             'Mean Wind Speed MPH', 'Wind and Trips 2014', 'Total Rides and Mean Wind Speed by Date 2014')









    Out[19]:



In [20]:

    
weather_plot(total_rides15['Date'], total_rides15['Count'], weather_ave15[' Mean Wind SpeedMPH'], 
             'Mean Wind Speed MPH', 'Wind and Trips 2015', 'Total Rides and Mean Wind Speed by Date 2015')









    Out[20]:



In [21]:

    
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16[' Mean Wind SpeedMPH'], 
             'Mean Wind Speed MPH', 'Wind and Trips 2016', 'Total Rides and Mean Wind Speed by Date 2016')









    Out[21]:

4. Total rides and mean temperature by date for 2014, 2015, 2016



In [22]:

    
weather_plot(total_rides14['Date'],total_rides14['Count'], weather_ave14['Mean TemperatureF'], 
             'Mean Temperature F', 'Temp and Trips 2014', 'Total Rides and Mean Temperature by Date 2014')









    Out[22]:



In [23]:

    
weather_plot(total_rides15['Date'],total_rides15['Count'], weather_ave15['Mean TemperatureF'], 
             'Mean Temperature F', 'Temp and Trips 2015', 'Total Rides and Mean Temperature by Date 2015')









    Out[23]:



In [24]:

    
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16['Mean TemperatureF'], 
             'Mean Temperature F', 'Temp and Trips 2016', 'Total Rides and Mean Temperature by Date 2016')









    Out[24]:

5. Total rides and precipitation by date for 2014, 2015, and 2016



In [25]:

    
weather_plot(total_rides14['Date'],total_rides14['Count'], weather_ave14['PrecipitationIn'], 'Precipitation In', 
             'Precip and Trips 2014', 'Total Rides and Precipitation by Date 2014')









    Out[25]:



In [26]:

    
weather_plot(total_rides15['Date'],total_rides15['Count'], weather_ave15['PrecipitationIn'], 'Precipitation In', 
             'Precip and Trips 2015', 
             'Total Rides and Precipitation by Date 2015')









    Out[26]:



In [27]:

    
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16['PrecipitationIn'], 'Precipitation In', 
             'Precip and Trips 2016', 
             'Total Rides and Precipitation by Date 2016')









    Out[27]:



In [28]:

    
def weekend_hol_plot(x1,x2,y1,y2, y2_label, file_name, title):
    """
    Returns dual axis plot with the x axis being the day, the left and right y-axis being inputed by user
    input: y1 is the data for the left-axis; should have 365 entries, sorted by date;
    y2 is the data for the right y-axis; should have 365 entries, sorted by date; 
    y2_label is a string
    file_name: string of what to name file in plotly account
    output: a graph
    """

    trace1 = go.Scatter(x=x1,y=y1,name='Total Rides')
    trace2 = go.Bar(x=x2,y=y2,name=y2_label)
    data = [trace1, trace2]
    layout = go.Layout(title=title,yaxis=dict(title='Total Rides'),
                       yaxis2=dict(title=y2_label,titlefont=dict(color='rgb(148, 103, 189)'),
                                   tickfont=dict(color='rgb(148, 103, 189)'),overlaying='y',side='right'))
    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig, filename= file_name)

6. Total rides with holidays highlighted for 2014, 2015, 2016



In [29]:

    
weekend_hol_plot(total_rides14['Date'],hol_rides14['Date'],total_rides14['Count'],hol_rides14['Count'],'Holidays',
                 'Holidays and Trips 2014', 'Holidays and Trips 2014')









    Out[29]:



In [30]:

    
weekend_hol_plot(total_rides15['Date'],hol_rides15['Date'],total_rides15['Count'],hol_rides15['Count'],'Holidays',
                 'Holidays and Trips 2015', 'Holidays and Trips 2015')









    Out[30]:



In [31]:

    
weekend_hol_plot(total_rides16['Date'],hol_rides16['Date'],total_rides16['Count'],hol_rides16['Count'],'Holidays',
                 'Holidays and Trips 2016', 'Holidays and Trips 2016')









    Out[31]:

7. Total rides with weekends highlighted for 2014, 2015, and 2016



In [32]:

    
weekend_hol_plot(total_rides14['Date'],weekend_rides14['Date'],total_rides14['Count'],weekend_rides15['Count'],
                 'Weekends','Weekends and Trips 2014', 'Weekends and Trips 2014')









    Out[32]:



In [33]:

    
weekend_hol_plot(total_rides15['Date'],weekend_rides15['Date'],total_rides15['Count'],weekend_rides15['Count'],
                 'Weekends','Weekends and Trips 2015', 'Weekends and Trips 2015')









    Out[33]:



In [34]:

    
weekend_hol_plot(total_rides16['Date'],weekend_rides16['Date'],total_rides16['Count'],weekend_rides16['Count'],
                 'Weekends','Weekends and Trips 2016', 'Weekends and Trips 2016')









    Out[34]:



In [35]:

    
def grouped_bar(x1, y1, x2, y2, x3, y3, file_name, title, x_title):
    """
    Takes axis data for a 3 grouped bar chart and outputs a grouped bar chart. 
    Input: x1, y1, x2, y2, x3, y3 series data for different bars, the x data should align;
    filename: what to name the plotly file, string; title: of the graph; x_title: lable for x-axis
    Output: a grouped bar chart
    """
    trace1 = go.Bar(x=x1,y=y1,name='Total')
    trace2 = go.Bar(x=x2,y=y2,name='Customer')
    trace3 = go.Bar(x=x3,y=y3,name='Subscriber')
    data = [trace1, trace2, trace3]
    layout = go.Layout(title = title, xaxis = dict(title = x_title),yaxis = dict(title = 'Total Rides'),barmode='group')
    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig, filename=file_name)

8. Total rides by day of the week, separated into subscriber type for 2014, 2015, and 2016



In [36]:

    
grouped_bar(total_weekday_rides14['Day of Week'],total_weekday_rides14['Count'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Customer']['Day of Week'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Customer']['Count'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Subscriber']['Day of Week'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2014', 
            'Total Rides by Day of Week 2014', 'Day of Week')









    Out[36]:



In [37]:

    
grouped_bar(total_weekday_rides15['Day of Week'],total_weekday_rides15['Count'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Customer']['Day of Week'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Customer']['Count'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Subscriber']['Day of Week'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2015', 
            'Total Rides by Day of Week 2015', 'Day of Week')









    Out[37]:



In [38]:

    
grouped_bar(total_weekday_rides16['Day of Week'],total_weekday_rides16['Count'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Customer']['Day of Week'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Customer']['Count'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Subscriber']['Day of Week'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2016', 
            'Total Rides by Day of Week 2016', 'Day of Week')









    Out[38]:

9. Total rides by duration in minutes, grouped by subscriber type, for 2014, 2015, and 2016



In [39]:

    
grouped_bar(total_min_rides14.index.get_level_values('Duration'),total_min_rides14['Count'],
            total_min_rides14.index.get_level_values('Duration'),
            min_dur_rides14.loc[(min_dur_rides14.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_min_rides14.index.get_level_values('Duration'),
            min_dur_rides14.loc[(min_dur_rides14.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Minute Plot 2014', 'Ride Duration by Minute 2014', 'Minutes')









    Out[39]:



In [40]:

    
grouped_bar(total_min_rides15.index.get_level_values('Duration'),total_min_rides15['Count'],
            total_min_rides15.index.get_level_values('Duration'),
            min_dur_rides15.loc[(min_dur_rides15.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_min_rides15.index.get_level_values('Duration'),
            min_dur_rides15.loc[(min_dur_rides15.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Minute Plot 2015', 'Ride Duration by Minute 2015', 'Minutes')









    Out[40]:



In [41]:

    
grouped_bar(total_min_rides16.index.get_level_values('Duration'),total_min_rides16['Count'],
            total_min_rides16.index.get_level_values('Duration'),
            min_dur_rides16.loc[(min_dur_rides16.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_min_rides16.index.get_level_values('Duration'),
            min_dur_rides16.loc[(min_dur_rides16.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Minute Plot 2016', 'Ride Duration by Minute 2016', 'Minutes')









    Out[41]:

10. Total rides by duration in hours, grouped by subscriber type, for 2014, 2015, and 2016



In [42]:

    
grouped_bar(total_hr_rides14.index.get_level_values('Duration'),total_hr_rides14['Count'],
            total_hr_rides14.index.get_level_values('Duration'),
            hr_dur_rides14.loc[(hr_dur_rides14.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_hr_rides14.index.get_level_values('Duration'),
            hr_dur_rides14.loc[(hr_dur_rides14.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Hour Plot 2014', 'Ride Duration by Hour 2014', 'Hours')









    Out[42]:



In [43]:

    
grouped_bar(total_hr_rides15.index.get_level_values('Duration'),total_hr_rides15['Count'],
            total_hr_rides15.index.get_level_values('Duration'),
            hr_dur_rides15.loc[(hr_dur_rides15.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_hr_rides15.index.get_level_values('Duration'),
            hr_dur_rides15.loc[(hr_dur_rides15.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Hour Plot 2015', 'Ride Duration by Hour 2015', 'Hours')









    Out[43]:



In [44]:

    
grouped_bar(total_hr_rides16.index.get_level_values('Duration'),total_hr_rides16['Count'],
            total_hr_rides16.index.get_level_values('Duration'),
            hr_dur_rides16.loc[(hr_dur_rides16.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_hr_rides16.index.get_level_values('Duration'),
            hr_dur_rides16.loc[(hr_dur_rides16.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Hour Plot 2016', 'Ride Duration by Hour 2016', 'Hours')









    Out[44]:

11. Total rides by hour of the day, grouped by subscriber type, for 2014, 2015, and 2016



In [45]:

    
grouped_bar(total_rides_by_hour14['Hour'],total_rides_by_hour14['Count'],
            total_rides_by_hour14['Hour'],
            rides_by_hour14.loc[rides_by_hour14['Subscriber Type'] == 'Customer']['Count'],
            total_rides_by_hour14['Hour'],
            rides_by_hour14.loc[rides_by_hour14['Subscriber Type'] == 'Subscriber']['Count'], 
            'Rides by Hour Plot 2014', 'Total Rides Per Hour 2014', 'Hour of Day')









    Out[45]:



In [46]:

    
grouped_bar(total_rides_by_hour15['Hour'],total_rides_by_hour15['Count'],
            total_rides_by_hour15['Hour'],
            rides_by_hour15.loc[rides_by_hour15['Subscriber Type'] == 'Customer']['Count'],
            total_rides_by_hour15['Hour'],
            rides_by_hour15.loc[rides_by_hour15['Subscriber Type'] == 'Subscriber']['Count'], 
            'Rides by Hour Plot 2015', 'Total Rides Per Hour 2015', 'Hour of Day')









    Out[46]:



In [47]:

    
grouped_bar(total_rides_by_hour16['Hour'],total_rides_by_hour16['Count'],
            total_rides_by_hour16['Hour'],
            rides_by_hour16.loc[rides_by_hour16['Subscriber Type'] == 'Customer']['Count'],
            total_rides_by_hour16['Hour'],
            rides_by_hour16.loc[rides_by_hour16['Subscriber Type'] == 'Subscriber']['Count'], 
            'Rides by Hour Plot 2016', 'Total Rides Per Hour 2016', 'Hour of Day')









    Out[47]:

12. Trip counts by subscription types for 2016



In [48]:

    
totals = merged_st['Subscriber Type'].value_counts()
Subscribers, Customers = totals[0], totals[1]

fig = {
    'data': [{'labels': ['Subscribers', 'Customers'],
              'values': [Subscribers, Customers],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by customer types'}
     }

py.iplot(fig)









    Out[48]:

13. Trip counts by cities for 2016



In [49]:

    
by_cities = merged_st.landmark.value_counts()
sf, sj, mv, pa = by_cities[0], by_cities[1], by_cities[2], by_cities[3]
fig_cities = {
    'data': [{'labels': ['San Francisco', 'San Jose', 'Mountain View', 'Palo Alto'],
              'values': [sf, sj, mv, pa],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by cities'}
     }

py.iplot(fig_cities)









    Out[49]:

14. Most popular five starting stations for 2016



In [50]:

    
trip_counts = pd.DataFrame(merged_st.sort_values(by = 'station_id').station_id.value_counts())
trip_counts = trip_counts.reset_index().rename(columns = {'station_id' : 'trip_counts',
                                                          'index' : 'station_id'})
station_merged = pd.merge(station16, trip_counts, on = 'station_id')



In [51]:

    
start_stations = trip16[['Start Station', 'Duration']]
grouped = (start_stations.groupby('Start Station').
                sum().
                sort_values(by = 'Duration', ascending = False))



In [52]:

    
merged_st.name.value_counts()[:5] #most popular five starting stations









    Out[52]:





San Francisco Caltrain (Townsend at 4th)    23591
San Francisco Caltrain 2 (330 Townsend)     22358
Harry Bridges Plaza (Ferry Building)        16128
2nd at Townsend                             14099
Steuart at Market                           13693
Name: name, dtype: int64



In [53]:

    
calstations['station_type'] = 'caltrain'
calstations['dockcount'] = 0
calstations['trip_counts'] = 0
calstations['station_id'] = 0
calstations['installation'] = 0 
calstations = calstations.rename(columns={'caltrain_name' : 'name'})
calstations = calstations[['installation','station_id','name', 'lat', 'long', 'dockcount', 'trip_counts', 'station_type', 'passengers']]



In [54]:

    
df_for_map = station_merged[['installation','station_id','name', 'lat', 'long', 'dockcount', 'trip_counts']]
df_for_map['station_type'] = 'bike_station'
df_for_map['passengers'] = 0
df_for_map = df_for_map.append(calstations)
df_for_map.reset_index(drop='index', inplace = True)



In [55]:

    
start_lat, start_long, start_name = calstations.iloc[1]['lat'], calstations.iloc[1]['long'], calstations.iloc[1]['name']



In [56]:

    
def parser(df):
    """
    Parses a pandas DF into JSON-line object
    input: a dataframe
    output: JSON-line object
    """
    fields = df.columns
    parsed_data = (dict(zip(fields, df.iloc[i])) for i in xrange(len(df)))
    return parsed_data


def create_map(df):
    """
    Creates a map for the given data frame
    input: a data frame
    output: a message: "geoJSON file has been created, check out your cd"
    """
    
    geo_map = {'type': 'FeatureCollection'}
    items = list()
    for index, line in enumerate(parser(df)):
        #Skip any zero coordinates as this will throw off our map
        if line['long'] == str(0) or line['lat'] == str(0):
            continue
        
        data = {}
        data['type'] = 'Feature'
        data['id'] = index
        if line['station_type'] == 'bike_station':
            year = line['installation'].split('/')[2]
            if line['station_id'] == 70 or line['station_id'] == 69:
                data['properties'] = {'name': line['name'],
                                 'dockcount': line['dockcount'],
                              'trip_counts' : line['trip_counts'],
                              'marker-color':'#00cc00',
                              'marker-size': 'large',
                              'marker-symbol': 'bicycle'
                                     }
            elif year == '2013':
                data['properties'] = {'name': line['name'],
                                 'dockcount': line['dockcount'],
                              'trip_counts' : line['trip_counts'],
                              'marker-color':'#0e2f44',
                                'marker-size':'large',
                                'marker-symbol': 'bicycle',
                                "fill-opacity": 0.3
                                     }
            else:
                data['properties'] = {'name': line['name'],
                                 'dockcount': line['dockcount'],
                              'trip_counts' : line['trip_counts'],
                                'marker-size': 'large',
                                'marker-symbol': 'bicycle'
                                     }
            data['geometry'] = {'type': 'Point',
                         'coordinates': (line['long'], line['lat'])
                              }
        else:
            data['properties'] = {'name': line['name'],
                                 'passengers': line['passengers'],
                              'marker-color':'#a11f27',
                                  'marker-size': 'large',
                                  'marker-symbol': 'rail'
                                     }
            data['geometry'] = {'type': 'Point',
                         'coordinates': (line['long'], line['lat'])
                              }
        items.append(data)
    #for each point in our items, we add the point to our dictionary
    for point in items:
        geo_map.setdefault('features', []).append(point)

    #Now that we've build up our geo_map dictionary, let's save it as geojson file
    with open('bikecaltrain.geojson', 'w') as f:
            f.write(dumps(geo_map))
    return "'bikecaltrain.geojson' file has been created, check out your cd"

def create_linestring(df):
    """
    Creates a map for the given data frame
    input: a data frame
    output: a message: "geoJSON file has been created, check out your cd"
    """
    
    geo_map = {'type': 'FeatureCollection'}
    items = list()
    for index, line in enumerate(parser(df)):
        #Skip any zero coordinates as this will throw off our map
        if line['long'] == str(0) or line['lat'] == str(0):
            continue
        
        data = {}
        data['type'] = 'Feature'
        data['id'] = index
        
        data['properties'] = {'name': line['name'],
                         'dockcount': line['dockcount'],
                              'marker-color':'#a11f27',
                              'marker-size': 'large'
                                     }
        data['geometry'] = {'type': 'LineString',
                         'coordinates': ([line['long'], line['lat']], [start_long, start_lat])
                              }
        items.append(data)
    #for each point in our items, we add the point to our dictionary
    for point in items:
        geo_map.setdefault('features', []).append(point)

    #Now that we've build up our geo_map dictionary, let's save it as geojson file
    with open('lines.geojson', 'w') as f:
            f.write(dumps(geo_map))
    return "'lines.geojson' file has been created, check out your cd"



In [57]:

    
create_map(df_for_map)









    Out[57]:





"'bikecaltrain.geojson' file has been created, check out your cd"



In [58]:

    
create_linestring(station16)









    Out[58]:





"'lines.geojson' file has been created, check out your cd"



In [59]:

    
def status_for_city(station, status, city = 'San Francisco'):
    """
    returns a dataframe station ids for the specified city
    
    input: status and station dataframes, city (default = 'San Francisco')
    output: a stations dataframe with for the specified city
    
    """
    
    city_stations = station[station.landmark == city]['station_id']
    city_status = status[status.station_id.isin(city_stations)]
    
    return city_status

def groupby_term_mean(status, term):
    """
    returns dataframe grouped by term and average values
    
    input: status data frame, term (month or hour)
    output: dataframe grouped by term and average values of
            other variables
    """
    assert term in ['month', 'hour', 'weekday'], 'term should be either "month", "hour" or "weekday".'
    
    if term == 'hour':
        status['hour'] = status.time.dt.strftime('%H')
        grouped = status.groupby('hour').mean()
        grouped.reset_index(inplace = True)
    elif term == 'month':
        status['month'] = status.time.dt.strftime('%m')
        grouped = status.groupby('month').mean()
        grouped.reset_index(inplace = True)
    else:
        status['weekday'] = status.time.dt.weekday_name
        grouped = status.groupby('weekday').mean()
        grouped.reset_index(inplace = True)
        grouped['Sorter'] = [5, 1, 6, 7, 4, 2, 3]
        grouped.sort_values(by = 'Sorter', inplace = True)
    
    return grouped

def plot_status(grouped, time_format, city = None, station_name = None):
    """
    returns a plot of average bike and dock availability
    input: grouped dataframe by hour or month, time_format 
          (month or hour), city (default is None)
    output: a plot of average bike and dock availability
    """
    import plotly.plotly as py
    import plotly.graph_objs as go
    
    assert time_format in ['month', 'hour', 'weekday'], 'time format should be either "month", "hour", or "weekday".'

    # Add data
    
    if time_format is 'hour':
        time = grouped.hour
        title_time = 'Daily'
        x_axis_name = 'Hour'
    elif time_format is 'month':
        time = grouped.month
        title_time = 'Yearly'
        x_axis_name = 'Month'
    else:
        time = grouped.weekday
        title_time = 'Weekly'
        x_axis_name = 'Week Days'
        
    bikes_available = grouped.bikes_available
    docks_available = grouped.docks_available
    # Create and style traces
    trace0 = go.Scatter(
        x = time,
        y = bikes_available,
        name = 'Bikes Available',
        line = dict(
            color = ('rgb(205, 12, 24)'),
            width = 4,
            dash = 'dash')
    )
    trace1 = go.Scatter(
        x = time,
        y = docks_available,
        name = 'Docks Available',
        line = dict(
            color = ('rgb(22, 96, 167)'),
            width = 4,
            dash = 'dash')
    )

    data = [trace0, trace1]
    
    if station_name:
        title_name = '{} Average Counts of Available Bikes and Docks in {}'.format(title_time, station_name)
    elif city:
        title_name = '{} Average Counts of Available Bikes and Docks in {}'.format(title_time, city)
    else:
        title_name = '{} Average Counts of Available Bikes and Docks across all 4 cities'.format(title_time)

    # Edit the layout
    layout = dict(title = title_name,
                  xaxis = dict(title = x_axis_name),
                  yaxis = dict(title = 'Counts'),
                  )

    fig = dict(data=data, layout=layout)
    file_name = 'plot for {} by {}'.format(city, time_format)
    return py.iplot(fig, filename=file_name)

def plot_summary_for_city(city, term, station_id = None):
    """
    returns plot summary for the given city and term
    input: city name, term ('hour', 'month', 'weekday')
    output: a figure
    
    NOTE: station and status data should be available in global env.
    """
    
    assert city in ['San Francisco', 'Mountain View', 'Palo Alto', 'San Jose'], 'unavailable city'
    assert term in ['hour', 'month', 'weekday'], 'term should be either "hour", "month", or "weekday".'
    assert type(status16['time'][0]) is pandas.tslib.Timestamp, 'status["time"] type error. convert it time series.'
    
    city_status = status_for_city(station16, status16, city)
    station_name = None
    if station_id:
        assert station_id in station16.station_id.values, 'invalid station id'
        city_status = city_status[city_status.station_id == station_id]
        station_name = station.name[station16.station_id == station_id].values[0]
    grouped = groupby_term_mean(city_status, term)
    
    return plot_status(grouped, term, city, station_name)

15. Plots for average number of available bikes and docks in 4 different cities.



In [60]:

    
plot_summary_for_city('San Francisco', 'hour')









    Out[60]:



In [61]:

    
plot_summary_for_city('San Jose', 'hour')









    Out[61]:



In [62]:

    
plot_summary_for_city('San Francisco', 'weekday')









    Out[62]:



In [63]:

    
plot_summary_for_city('San Jose', 'weekday')









    Out[63]:



In [64]:

    
plot_summary_for_city('San Francisco', 'month')









    Out[64]:



In [65]:

    
plot_summary_for_city('San Jose', 'month')









    Out[65]:



In [66]:

    
#plot_summary_for_city('Mountain View', 'hour')
#plot_summary_for_city('Mountain View', 'weekday')
#plot_summary_for_city('Mountain View', 'month')



In [67]:

    
#plot_summary_for_city('Palo Alto', 'hour')
#plot_summary_for_city('Palo Alto', 'weekday')
#plot_summary_for_city('Palo Alto', 'month')



In [68]:

    
def convert_df(station_in, status_in, term, station_id = None):
    """
    createst a DF which is grouped by term and has average
    bike and docks available for a station if speficied, otherwise
    it returns average value for San Francisco
    
    input: station_in, status_in, term, station_id (default = None)
    output: a grouped DF, station_name (default = None)
    """
    
    city_status = status_for_city(station_in, status_in)
    station_name = None
    if station_id:
        assert station_id in station_in.station_id.values, 'invalid station id'
        city_status = city_status[city_status.station_id == station_id]
        station_name = station_in.name[station_in.station_id == station_id].values[0]
    grouped = groupby_term_mean(city_status, term)
    
    return grouped, station_name



In [69]:

    
def compares_years(term, station_id = None):
    """
    compares average avaialble bike and dock counts
    for three years per term for San Francisco,
    if station_id specified compares for that station.
    
    input: term ('hour', 'month', or 'weekday')
    output: an interactive figure
    
    """
    
    sf14, station_name = convert_df(station14, status14, term, station_id)
    sf15, station_name = convert_df(station15, status15, term, station_id)
    sf16, station_name = convert_df(station16, status16, term, station_id)
    
    bikes14 = sf14.bikes_available.values
    docks14 = sf14.docks_available.values
    bikes15 = sf15.bikes_available.values
    docks15 = sf15.docks_available.values
    bikes16 = sf16.bikes_available.values
    docks16 = sf16.docks_available
    xcoord = sf15[term]
    
    if station_name:
        title = 'Average Available Bike and Dock Counts for {}'.format(station_name)
    else:
        title = 'Average Available Bike and Dock Counts for San Francisco'

    trace1 = Scatter(
        x=xcoord, y=bikes15,
        line=Line(
            color='#FFD700',
            width=3
        ),
        name='2015: Bikes'
    )

    trace2 = Scatter(
        x=xcoord, y=docks15,
        line=Line(
            color='#C0C0C0',
            width=3
        ),
        name='2015: Docks'
    )

    trace3 = Scatter(
        x=xcoord, y=bikes16,
        line=Line(
            color='#BA8651',
            width=3
        ),
        name='2016: Bikes'
    )

    trace4 = Scatter(
        x=xcoord, y=docks16,
        line=Line(
            color='#000000',
            width=3
        ),
        name='2016: Docks'
    )
    
    trace5 = Scatter(
        x=xcoord, y=bikes14,
        line=Line(
            color='#b9f442',
            width=3
        ),
        name='2014: Bikes'
    )
    
    trace6 = Scatter(
        x=xcoord, y=docks14,
        line=Line(
            color='#416bf4',
            width=3
        ),
        name='2014: Docks'
    )

    data = Data([trace1, trace2, trace3, trace4, trace5, trace6])
    layout = Layout(
        title=title,
        updatemenus=list([
            dict(
                x=-0.05,
                y=1,
                yanchor='top',
                buttons=list([
                    dict(
                        args=['visible', [True, True, True, True, True, True]],
                        label='All',
                        method='restyle'
                    ),
                    dict(
                        args=['visible', [True, True, False, False, False, False]],
                        label='2015',
                        method='restyle'
                    ),
                    dict(
                        args=['visible', [False, False, True, True, False, False]],
                        label='2016',
                        method='restyle'
                    ),
                    dict(
                        args=['visible', [False, False, False, False, True, True]],
                        label='2014',
                        method='restyle'
                    ),
                    dict(
                        args=['visible', [True, False, True, False, True, False]],
                        label='Bikes',
                        method='restyle'
                    ),
                    dict(
                        args=['visible', [False, True, False, True, False, True]],
                        label='Docks',
                        method='restyle'
                    )
                ]),
            )
        ]),
    )
    fig = Figure(data=data, layout=layout)
    return py.iplot(fig)

16. Let's see how the average number of available bikes and docks has changed since 2014 for San Francisco Caltrain (Townsend at 4th)



In [70]:

    
compares_years('hour', 70)









    Out[70]:



In [71]:

    
compares_years('weekday', 70)









    Out[71]:

17. Route count heatmap by station in SF for 2016



In [72]:

    
def route_countFunc(trip_df, station_df):
    """
    This function gets 'trip' and 'station' dataframe and returns a dataframe contains routes and their counts
    @args:
        trip_df: trip dataframe
        station_df: station dataframe
    @returns:
        routes_count: contains routes and their counts, duration, and landmark
    """
    #Add start landmark
    trip = pd.merge(left = trip_df[['Trip ID', 'Duration', 'station_id', 'End Terminal']], right = station_df[['name', 'station_id', 'landmark']], 
             how='left', left_on='station_id', right_on= 'station_id')
    trip.rename(columns={'landmark':'start_landmark', 'name':'Start Station'}, inplace=True)
    trip.drop('station_id', inplace=True, axis=1)
    #Add end landmark
    trip = pd.merge(left = trip, right = station_df[['name','station_id', 'landmark']], 
             how='left', left_on='End Terminal', right_on= 'station_id')
    trip.rename(columns={'landmark':'end_landmark', 'name':'End Station'}, inplace=True)
    trip.drop('station_id', inplace=True, axis=1)
       #Pick only trips within one landmark
    trip = trip[trip.start_landmark == trip.end_landmark]
    #Count the number of trips between stations
    routes_count = trip.groupby(['Start Station', 'End Station', 'start_landmark']).agg({'Trip ID': 'count', 'Duration': 'mean'}).reset_index()
    routes_count = routes_count.rename(columns={'Trip ID': 'Counts'})
    routes_count.Duration = routes_count['Duration'].apply(lambda x: round((x/60),2))
    #Change the column names
    routes_count.rename(columns={'start_landmark':'Landmark', 'Duration': 'Duration (min)'}, inplace=True)
    return routes_count



In [73]:

    
def route_heatmap(trip_df, station_df, landmark):
    """
    This function recieve the trip and station dataframes for a specific year 
    and plots the trip heatmap for a given landmark
    @args:
        trip_df: trip dataframe
        station_df: station dataframe
        landmark: Could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
    @returns:
        Heatmap graph
    """
    routes = route_countFunc(trip_df, station_df)
    df_routes = routes[routes.Landmark == landmark]
    start_station = df_routes['Start Station'].sort_values().unique()
    end_station = df_routes['End Station'].sort_values().unique()

    #create z values for heatmap and text for hover text in the map
    z = []
    trip_text = []
    for start in start_station:
        new_row = []
        text_row = []
        for end in end_station:
            try:
                trip_count = df_routes[(df_routes['Start Station'] == start) & (df_routes['End Station'] == end)].Counts.values[0]
                average_duration = df_routes[(df_routes['Start Station'] == start) & (df_routes['End Station'] == end)]['Duration (min)'].values[0]
            except IndexError:
                trip_count = 0
                average_duration = 0
            text = 'Start Station: {}<br>End Station: {}<br>Trip Counts: {}<br>Average Duration: {} Mins'.format(start, end, trip_count, average_duration)
            text_row.append(text)    
            new_row.append(trip_count)
        z.append(list(new_row))
        trip_text.append(list(text_row))

    #Plot Heatmap
    heatmap = go.Heatmap(x = start_station,
               y = end_station,
               z = z,
                hoverinfo = 'text',
                text = trip_text,
               colorscale='YIGnBu',
               reversescale=True)

    data = go.Data([heatmap])
    layout = go.Layout(
        title=landmark.upper() + ' TRIP COUNTS',
        titlefont = dict(color='black', size=35, family='monospace'),
        xaxis=dict(
            title='END STATION',
            tickangle = -90,
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        yaxis=dict(
            title='START STATION',
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        width=1000,
        height=1000,
        margin=go.Margin(
        l=300,
        r=50,
        b=300,
        t=100,
        pad=4
    )
    )
    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig)



In [74]:

    
route_heatmap(trip16, station16, 'San Francisco')









    Out[74]:

18. Median income and crime map in SF for 2016



In [75]:

    
def crimeFunc(df):
    """
    This function gets the crime data frame and select the crimes potentially dangerous to bikers and bikes such as:
    'BURGLARY', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'STOLEN PROPERTY', 'VANDALISM', 'VEHICLE THEFT'
    in the year 2016
    @args:
        df: crime dataframe
    @returns:
        a dataframe containing crimes in 2015 related to bikes
    """
    #List of related crimes
    relatedCrimes = ['BURGLARY', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'STOLEN PROPERTY', 'VANDALISM', 'VEHICLE THEFT']
    bikeCrime = df[df['Category'].isin(relatedCrimes)].reset_index(drop = True)
    #Extract year from "Date" column
    bikeCrime.Date = bikeCrime.Date.str.slice(6,10)
    bikeCrime = bikeCrime.rename(columns = {'X': 'lon', 'Y': 'lat', 'Date': 'year'})
    #Crimes in year 2016
    bikeCrime = bikeCrime[bikeCrime['year'] == '2016'][['Category', 'lat', 'lon']].reset_index(drop=True)
    return bikeCrime



In [76]:

    
def crime_objectFunc(df):
    """
    This function gets the crime data frame and creates a scatter map plot object
    @args:
        df: crime data frame
    @returns:
        a scatter map plot object
    """
    bikeCrime = crimeFunc(df)
    crime_graph = go.Scattermapbox(lon = bikeCrime['lon'],
                        lat = bikeCrime['lat'],
                        hoverinfo = 'text',
                        text = bikeCrime['Category'],
                        mode = 'markers',
                        marker = dict(size = 3,
                                     color = 'red'
                                      ),
                        opacity = 0.1,
                        name = 'Crime')
    return crime_graph



In [77]:

    
def extract_info(zipNum, info_type):
    '''
    This function gets the zipcode and returns either city name, latitude, or longitude
    @args:
        zipNum: zipcode
        info_type: could be either: 'city', 'lat', or 'longitude'
    @returns:
        city name, latitude, or longitude
    '''
    
    try:
        zipcode.isequal(str(zipNum)).state #Check to see if info exists for the zipcode
        info = zipcode.isequal(str(zipNum))
    except AttributeError:
        return 'Not valid'
    if info_type == 'state':
        return info.state
    if info_type == 'city':
        return info.city
    elif info_type == 'lat':
        return info.lat
    else:
        return info.lon



In [78]:

    
def incomeFunc(df):
    """
    This function gets the income dataframe and finds the city and state corresponding to each lattidue and longitude
    and returns a dataframe containing median and mean income corresponding to each zipcode in San Francisco
    @args:
        df: income dataframe
    @returns:
        a dataframe containing median and mean income corresponding to each zipcode in San Francisco
    """
    #Add state, city, latitude, and longitude columns to income dataset
    info_list = ['state', 'city', 'lat', 'lon']
    for i in info_list:
        df[i] = [extract_info(str(x), i) for x in df.iloc[:,0]]

    #Extract Sanfrancicso data
    SF_income = df[(df.state == 'CA') & (df.city == 'SAN FRANCISCO')].reset_index(drop=True)
    return SF_income



In [79]:

    
def hover_incomeTextFunc(df):
    """
    This function gets the income dataframe and creates a list of texts for hover text in plot
    @args:
        df: income dataframe
    @returns:
        a list of texts for hover text in the plot
    """
    #For hover in the map graph
    text_income = df.apply(lambda x: 'Zipcode: {}<br>Population: {}<br>Median Income: ${}<br>Mean Income: ${}'.format(
            x[0], x[3], x[1], x[2]),axis=1)
    return text_income



In [80]:

    
def point_incomeSizeFunc(df):
    """
    This function gets the income dataframe and creates a list of point sizes for plot
    @args:
        df: income dataframe
    @returns:
        a list of point sizes for plot
    """
    #Scale the point size for map
    point_size = df['Median'].str.replace(',', '').apply(int)/df['Median'].str.replace(',', '').apply(int).max()
    return point_size



In [81]:

    
def point_incomeTransFunc(df):
    """
    This function gets the income dataframe and creates of a list of point transparency for plot
    @args:
        df: income dataframe
    @returns:
        a list of point transparency for plot
    """
    #Scale the point transparancy to population
    point_transparency = df['Population'].str.replace(',', '').apply(int)/df['Population'].str.replace(',', '').apply(int).max()
    return point_transparency



In [82]:

    
def income_objectFunc(df):
    """
    This function gets the income dataframe and creates a scatter map plot object
    @args:
        df: income dataframe
    @returns:
        a scatter map plot object
    """
    SF_income = incomeFunc(df)
    income_graph = go.Scattermapbox(lon = incomeFunc(SF_income)['lon'],
                        lat = incomeFunc(SF_income)['lat'],
                        hoverinfo = 'text',
                        text = hover_incomeTextFunc(SF_income),
                        mode = 'markers',
                        marker = dict(size = point_incomeSizeFunc(SF_income)*20,
                                     color = 'light blue'
                                      ),
                        opacity = point_incomeTransFunc(SF_income)/10,
                        name = 'Median Income')
    return income_graph



In [83]:

    
def graph_CrimeIncome(income_df, crime_df):
    """
    This function gets two data frames income and crime and plots the scatter map plot for both in San Francisco
    @args:
        income_df: income dataframe
        crime_df: crime dataframe
    @returns:
        A scatter map plot in San Francisco
    """
    income_object = income_objectFunc(income_df)
    crime_object = crime_objectFunc(crime_df)
    data = go.Data([income_object, crime_object])
    layout = dict(
    title='MEDIAN INCOME AND CRIME MAP IN SAN FRANCISCO',
    titlefont = dict(color='black', size=35, family='monospace'),
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_token,
        bearing=0,
        center=dict(
            lat=37.773972,
            lon=-122.431297
        ),
        pitch=0,
        zoom=10
    ),
    )
    layout = go.Layout(layout)
    fig = go.Figure(data = data, layout = layout)
    return py.iplot(fig)



In [84]:

    
graph_CrimeIncome(income, crime)









    Out[84]:

19. Trips from stations in each landmark for 2016



In [85]:

    
def routeDock_countFunc(tripDF, stationDF, landmark):
    """
    This function gets trip and station dataframes and the landmark of interests and returns a dataframe
    containing all stations with total number of trips started at the stations and their number of dockcounts
    @args:
        tripDF: trip dataframe
        stationDF: station dataframe
        landmark: Landmark of interest that could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
    @returns:
        A dataframe containing all stations with total number of trips started at the stations and their 
        number of dockcounts
    """
    routeCount = route_countFunc(tripDF, stationDF)
    routeCount = routeCount[routeCount.Landmark == landmark].groupby('Start Station').agg({'Counts': 'sum'}).reset_index()
    routeCount = pd.merge(left=routeCount, right=stationDF[['name', 'dockcount']], how='inner', left_on='Start Station', right_on='name')
    routeCount.drop('name', axis=1, inplace=True)
    return routeCount



In [86]:

    
def hover_text(df):
    """
    This function gets a dataframe made by function "routeDock_countFunc" and returns a list containing the 
    text information used in the barplot
    @args:
        df: dataframe made with function "routeDock_countFunc"
    @returns:
        a list containing the text information used in hover text for the barplot
    """
    text = df.apply(lambda x: 'Station: {}<br>Trip Counts: {}<br>Dock Counts: {}<br>Dock/Trip: {}%'.format(
        x[0], x[1], int(x[2]), round(float(x[2])*100/x[1], 3)), axis=1)
    return text



In [87]:

    
def bar_object(tripDF, stationDF, landmark, year):
    """
    This function gets trip and station dataframes and the landmark of interests and specific year and returns 
    a barobject used for plotting barplot
    @args:
        tripDF: trip dataframe
        stationDF: station dataframe
        landmark: Landmark of interest that could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
        year: year of interest, either 2014, 2015, 2016
    @returns:
      Barplot object used to plot a barplot
    """
    df = routeDock_countFunc(tripDF, stationDF, landmark)
    bar = go.Bar(x=df['Start Station'], y=df['Counts'],
                name = year, hoverinfo='text', text=hover_text(df))
    return bar



In [88]:

    
def barPlot(landmark):
    """
    This funciton gets a landmark and plots the number of trips from statins in a landmark. The barplot
    contains the information for three years 2014, 2015, and 2016.
    @args:
        landmark: Landmark of interest that could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
    @returns:
        A barplot
    """
    
    #Bar object
    bar14 = bar_object(trip14, station14, landmark, 2014)
    bar15 = bar_object(trip15, station15, landmark, 2015)
    bar16 = bar_object(trip16, station16, landmark, 2016)
    data = go.Data([bar14, bar15, bar16])
    
    #Layout
    layout = go.Layout(
     width=1000,
    height=800,
    hovermode = 'closest',
    title='TOTAL TRIPS FROM STATIONS IN ' + landmark.upper(),
    titlefont = dict(color='black', size=35, family='monospace'),
    xaxis=dict(
        title='STATION',
        tickangle = -90,
        titlefont=dict(
            family='monospace',
            size=18,
            color='red')),
    yaxis=dict(
        title='TRIP COUNTS',
        titlefont=dict(
            family='monospace',
            size=18,
            color='red')),
        margin=go.Margin(
        l=100,
        r=50,
        b=300,
        t=100,
        pad=4), 
        showlegend = 'True', 
        legend = dict(x=0, y=1, orientation='h'))
    
    #Make the plot
    fig = go.Figure(data = data, layout = layout)
    return py.iplot(fig)



In [89]:

    
barPlot('San Francisco')









    Out[89]:



In [ ]:

Insight into Bike Sharing in the Bay Area

Read the data

Format the data for analysis

Begin Analysis

1. Overview of data for 2016

2. Statistics about bike rides for 2016

3. Total rides and mean wind speed by date for 2014, 2015, 2016

4. Total rides and mean temperature by date for 2014, 2015, 2016

5. Total rides and precipitation by date for 2014, 2015, and 2016

6. Total rides with holidays highlighted for 2014, 2015, 2016

7. Total rides with weekends highlighted for 2014, 2015, and 2016

8. Total rides by day of the week, separated into subscriber type for 2014, 2015, and 2016

9. Total rides by duration in minutes, grouped by subscriber type, for 2014, 2015, and 2016

10. Total rides by duration in hours, grouped by subscriber type, for 2014, 2015, and 2016

11. Total rides by hour of the day, grouped by subscriber type, for 2014, 2015, and 2016

12. Trip counts by subscription types for 2016

13. Trip counts by cities for 2016

14. Most popular five starting stations for 2016

15. Plots for average number of available bikes and docks in 4 different cities.

16. Let's see how the average number of available bikes and docks has changed since 2014 for San Francisco Caltrain (Townsend at 4th)

17. Route count heatmap by station in SF for 2016

18. Median income and crime map in SF for 2016

19. Trips from stations in each landmark for 2016