Authors: Ashkan Saboori, Pamela Patterson, and Zamirbek Akimbekov
In [2]:
    
import pandas as pd
import pandas
import numpy as np
import datetime
import matplotlib
import matplotlib.pyplot as plt
from geojson import dumps
import geojson as g
%matplotlib inline
plt.style.use('ggplot')
import csv
import collections
import os
from __future__ import division
import scipy as sp
import plotly.plotly as py
import plotly.figure_factory as ff
from plotly.graph_objs import *
import plotly
import plotly.graph_objs as go
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import zipcode  #To find city, latitude and longitude corresponding to a zipcode
import folium   #For map
from matplotlib.dates import MO, TU, WE, TH, FR, SA, SU
import parser
import math
import warnings; warnings.simplefilter('ignore')
    
In [3]:
    
# The Plotly and Mapbox credentials are read from the environment so that no
# secret is stored in the notebook. Export PLOTLY_USERNAME, PLOTLY_API_KEY and
# MAPBOX_TOKEN before starting the kernel; a missing variable raises KeyError here.
# The keys that used to be written in this cell were shared together with the
# notebook and should be revoked.
plotly.tools.set_credentials_file(username = os.environ['PLOTLY_USERNAME'],
                                  api_key = os.environ['PLOTLY_API_KEY'])
mapbox_token = os.environ['MAPBOX_TOKEN']
    
In [4]:
    
# Root folder of the data: read_data looks for one sub-folder per year in it, and the
# crime, income and Caltrain files are read from the folder itself.
# Set the BIKESHARE_DATA_DIR environment variable to use another copy of the data
# instead of editing this cell; without it the previous default location is used.
path = os.environ.get('BIKESHARE_DATA_DIR', '/Users/admin/project_data_141')
    
In [5]:
    
def read_data(path, year):
    """
    Reads the csv files of one year and returns them as 4 data frames.

    The 2014 folder stores every table in two parts (for example 'trip1.csv' and
    'trip2.csv'); the parts are concatenated and duplicated station rows are dropped.
    Every other year is read from one file per table.

    input: path (root data folder), year (int or string, e.g. 2014 or '2014')
    output: 4 data frames (station, status, trip, and weather)
    """
    # A string year used to fall through to the single-file branch, even for 2014.
    year = int(year)
    folder = os.path.join(path, str(year))
    suffixes = ['1', '2'] if year == 2014 else ['']

    def load(table):
        # Read every part of one table; concat keeps each part's own row labels.
        parts = [pd.read_csv(os.path.join(folder, table + suffix + '.csv'))
                 for suffix in suffixes]
        return pd.concat(parts) if len(parts) > 1 else parts[0]

    station = load('station')
    status = load('status')
    trip = load('trip')
    weather = load('weather')

    if year == 2014:
        # The two station files overlap, keep each station row once.
        station = station.drop_duplicates()

    return station, status, trip, weather
    
In [6]:
    
# Load the station, status, trip and weather tables of each year (one cell per year).
station16, status16, trip16, weather16 = read_data(path, 2016)
    
In [7]:
    
station15, status15, trip15, weather15 = read_data(path, 2015)
    
In [8]:
    
station14, status14, trip14, weather14 = read_data(path, 2014)
    
In [9]:
    
# Supplementary tables stored directly in the data folder: crime incidents, income and
# the Caltrain station coordinates (an Excel sheet).
crime = pd.read_csv(path + '/Crime_Incidents.csv')
income = pd.read_csv(path + '/Income.csv', header = 0)
calstations = pd.read_excel(path + '/caltrain_coordinates.xlsx')
#Source of crime data: https://data.sfgov.org/Public-Safety/Map-Crime-Incidents-from-1-Jan-2003/gxxq-x39z
#Source of income data: http://www.psc.isr.umich.edu/dis/census/Features/tract2zip/
#Also this can be useful: http://www.psc.isr.umich.edu/dis/data/kb/answer/1123
    
In [10]:
    
def _binned_ride_counts(trip, seconds_per_unit, bin_edges, by_subscriber):
    """
    Counts rides per duration bin, optionally split by subscriber type.
    Duration (seconds) is floor-divided by seconds_per_unit before it is cut into bin_edges.
    """
    columns = ['Count', 'Subscriber Type', 'Duration'] if by_subscriber else ['Count', 'Duration']
    # .copy() so the Duration assignment below does not write into a slice of trip.
    rides = trip[columns].copy()
    rides['Duration'] = rides['Duration'] // seconds_per_unit
    if by_subscriber:
        rides = rides.groupby(['Duration', 'Subscriber Type'], as_index = False).sum()
        return rides.groupby([pd.cut(rides['Duration'], bin_edges), 'Subscriber Type']).sum()
    rides = rides.groupby(['Duration'], as_index = False).sum()
    return rides.groupby(pd.cut(rides['Duration'], bin_edges)).sum()


def format_df(status, trip, weather, year):
    """
    Takes the status, trip and weather data frames of one year and formats them for analysis.
    Also creates new dataframes for analysis.
    Input: status, trip and weather pandas dataframes; year as a string or an int
    ('2014' and '2015' index the weather by its 'Zip' column, '2016' by its 'ZIP' column)
    Output: dataframes status, trip, weather, total_rides, weather_ave, weekend_rides, hol_rides,
    weekday_rides, total_weekday_rides, min_dur_rides, hr_dur_rides, total_min_rides, total_hr_rides,
    rides_by_hour, total_rides_by_hour

    Notes:
    - status['time'] is converted on the frame that is passed in; trip and weather are returned as
      new frames.
    - Duration is the whole difference End Date - Start Date in seconds. It used to be read from
      timedelta.seconds, which leaves out whole days, so a ride of 25 hours counted as 1 hour.
    - pd.cut builds right-closed bins here, so a floored duration of 0 (rides under 5 minutes
      are not affected, rides under one minute or under one hour are) falls outside the first bin.
    """
    status['time'] = pd.to_datetime(status.time, infer_datetime_format = True)

    # rename() returns a new frame, so the columns added below stay out of the caller's frame.
    trip = trip.rename(columns={'Start Terminal' : 'station_id'})
    trip['Count'] = 1
    trip['Start Date'] = pd.to_datetime(trip['Start Date'], format='%m/%d/%Y %H:%M')
    trip['End Date'] = pd.to_datetime(trip['End Date'], format='%m/%d/%Y %H:%M')
    trip['Day of Week'] = trip['Start Date'].dt.strftime('%A')
    trip['Date'] = trip['Start Date'].dt.normalize()   # midnight of the start day
    trip['Duration'] = (trip['End Date'] - trip['Start Date']).dt.total_seconds().astype(int)
    holidays = calendar().holidays(start=trip['Date'].min(), end=trip['Date'].max())
    trip['Holiday'] = trip['Date'].isin(holidays)
    trip['Hour'] = trip['Start Date'].dt.hour

    # Weather: indexed by date and, for the known years, by zip code as well.
    weather = weather.rename(columns={'PDT': 'Date'})
    weather['Date'] = pd.to_datetime(weather['Date'], format='%m/%d/%Y')
    weather = weather.set_index('Date')
    zip_column = {'2014': 'Zip', '2015': 'Zip', '2016': 'ZIP'}.get(str(year))
    if zip_column is not None:
        weather = weather.set_index(zip_column, append = True)
    # 'T' in the precipitation column is replaced by 0 before the conversion to numbers.
    weather['PrecipitationIn'] = pd.to_numeric(weather['PrecipitationIn'].replace('T', 0))
    weather_ave = weather.groupby(level = 'Date').mean()

    # Rides per day: all days, weekend days and holidays.
    total_rides = trip[['Count', 'Date']].groupby(['Date'], as_index = False).sum()
    is_weekend = trip['Day of Week'].isin(['Saturday','Sunday'])
    weekend_rides = trip.loc[is_weekend, ['Count', 'Date']].groupby(['Date'], as_index = False).sum()
    hol_rides = trip.loc[trip['Holiday'], ['Count', 'Date']].groupby(['Date'], as_index = False).sum()

    # Rides per weekday and subscriber type, ordered Sunday to Saturday.
    weekday_rides = trip[['Count', 'Subscriber Type', 'Day of Week']]
    weekday_rides = weekday_rides.groupby(['Day of Week', 'Subscriber Type'], as_index = False).sum()
    sorter = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
    weekday_rides['Day of Week'] = weekday_rides['Day of Week'].astype('category').cat.set_categories(sorter)
    weekday_rides = weekday_rides.sort_values(['Day of Week'])
    total_weekday_rides = weekday_rides[['Day of Week', 'Count']].groupby(['Day of Week'], as_index = False).sum()

    # Rides per duration bin: 5-minute bins up to 60 minutes and 2-hour bins up to 24 hours.
    minute_bins = np.arange(0,61,5)
    hour_bins = np.arange(0,25,2)
    min_dur_rides = _binned_ride_counts(trip, 60, minute_bins, True)
    hr_dur_rides = _binned_ride_counts(trip, 3600, hour_bins, True)
    total_min_rides = _binned_ride_counts(trip, 60, minute_bins, False)
    total_hr_rides = _binned_ride_counts(trip, 3600, hour_bins, False)

    # Rides per start hour.
    rides_by_hour = trip[['Count', 'Subscriber Type', 'Hour']]
    rides_by_hour = rides_by_hour.groupby(['Hour', 'Subscriber Type'], as_index = False).sum()
    total_rides_by_hour = rides_by_hour[['Hour', 'Count']].groupby(['Hour'], as_index = False).sum()

    return status, trip, weather, total_rides, weather_ave, weekend_rides, hol_rides, weekday_rides, total_weekday_rides, min_dur_rides, hr_dur_rides, total_min_rides, total_hr_rides, rides_by_hour, total_rides_by_hour
    
In [11]:
    
# Format the tables of each year and build its aggregated frames; the year string selects the
# name of the weather zip-code column inside format_df.
status14, trip14, weather14, total_rides14, weather_ave14, weekend_rides14, hol_rides14, weekday_rides14, total_weekday_rides14, min_dur_rides14, hr_dur_rides14, total_min_rides14, total_hr_rides14, rides_by_hour14, total_rides_by_hour14 = format_df(status14, trip14, weather14, '2014')
    
In [12]:
    
status15, trip15, weather15, total_rides15, weather_ave15, weekend_rides15, hol_rides15, weekday_rides15, total_weekday_rides15, min_dur_rides15, hr_dur_rides15, total_min_rides15, total_hr_rides15, rides_by_hour15, total_rides_by_hour15 = format_df(status15, trip15, weather15, '2015')
    
In [13]:
    
status16, trip16, weather16, total_rides16, weather_ave16, weekend_rides16, hol_rides16, weekday_rides16, total_weekday_rides16, min_dur_rides16, hr_dur_rides16, total_min_rides16, total_hr_rides16, rides_by_hour16, total_rides_by_hour16 = format_df(status16, trip16, weather16, '2016')
    
In [14]:
    
# Drop every station row that has a missing value in any column.
station16 = station16.dropna()
    
In [15]:
    
# Report the shape (rows, columns) of each 2016 table.
print('Size of station DF {}'.format(station16.shape))
print('Size of status DF {}'.format(status16.shape))
print('Size of trip DF {}'.format(trip16.shape))
print('Size of weather DF {}'.format(weather16.shape))
    
    
In [16]:
    
# Join each 2016 trip with the row of its start station (format_df renamed the trip column
# 'Start Terminal' to station_id). pd.merge keeps only the station ids found in both frames.
merged_st = pd.merge(station16, trip16, on = 'station_id')
    
In [17]:
    
# Summary figures of the merged 2016 trips; Duration is in seconds.
# Series.sum() adds the column in one vectorised call instead of walking it in Python.
total_duration = merged_st.Duration.sum()
print('number of bike rides between 09/01/2015 and 08/31/2016 : {}'.format(len(merged_st['Trip ID'])))
print('averaging about {} minutes per ride'.format(total_duration/(len(merged_st['Trip ID']) *60)))
print('total riding time is {} hours'.format(total_duration/3600))
print('or {} days'.format(total_duration/(3600 * 24)))
print('or {} years'.format(total_duration / (3600 * 24 * 365)))
    
    
In [18]:
    
def weather_plot(x,y1,y2, y2_label, file_name, title):
    """
    Draws two lines against the same x values, each with its own y-axis.
    Input: x is the data for the x-axis, shared by both lines;
    y1 is the data for the left y-axis, always named 'Total Rides';
    y2 is the data for the right y-axis, named with the string y2_label;
    file_name: string of what to name file in plotly account;
    title: string shown as the title of the graph
    Output: the result of py.iplot for the figure
    """
    accent = 'rgb(148, 103, 189)'
    rides_trace = go.Scatter(x=x, y=y1, name='Total Rides')
    # yaxis='y2' binds the second line to the right-hand axis declared in the layout.
    second_trace = go.Scatter(x=x, y=y2, name=y2_label, yaxis='y2')
    right_axis = dict(title=y2_label,
                      titlefont=dict(color=accent),
                      tickfont=dict(color=accent),
                      overlaying='y',
                      side='right')
    plot_layout = go.Layout(title=title,
                            yaxis=dict(title='Total Rides'),
                            yaxis2=right_axis)
    figure = go.Figure(data=[rides_trace, second_trace], layout=plot_layout)
    return py.iplot(figure, filename=file_name)
    
In [19]:
    
# Daily rides against the mean wind speed, one plot per year. The weather column name starts
# with a space (' Mean Wind SpeedMPH'). The ride dates are used as x for both lines, so
# weather_ave is assumed to hold one row per ride date in the same order - TODO confirm.
weather_plot(total_rides14['Date'], total_rides14['Count'], weather_ave14[' Mean Wind SpeedMPH'], 
             'Mean Wind Speed MPH', 'Wind and Trips 2014', 'Total Rides and Mean Wind Speed by Date 2014')
    
    Out[19]:
In [20]:
    
weather_plot(total_rides15['Date'], total_rides15['Count'], weather_ave15[' Mean Wind SpeedMPH'], 
             'Mean Wind Speed MPH', 'Wind and Trips 2015', 'Total Rides and Mean Wind Speed by Date 2015')
    
    Out[20]:
In [21]:
    
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16[' Mean Wind SpeedMPH'], 
             'Mean Wind Speed MPH', 'Wind and Trips 2016', 'Total Rides and Mean Wind Speed by Date 2016')
    
    Out[21]:
In [22]:
    
# Daily rides against the mean temperature, one plot per year.
weather_plot(total_rides14['Date'],total_rides14['Count'], weather_ave14['Mean TemperatureF'], 
             'Mean Temperature F', 'Temp and Trips 2014', 'Total Rides and Mean Temperature by Date 2014')
    
    Out[22]:
In [23]:
    
weather_plot(total_rides15['Date'],total_rides15['Count'], weather_ave15['Mean TemperatureF'], 
             'Mean Temperature F', 'Temp and Trips 2015', 'Total Rides and Mean Temperature by Date 2015')
    
    Out[23]:
In [24]:
    
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16['Mean TemperatureF'], 
             'Mean Temperature F', 'Temp and Trips 2016', 'Total Rides and Mean Temperature by Date 2016')
    
    Out[24]:
In [25]:
    
# Daily rides against the precipitation, one plot per year. weather_ave is the per-date mean
# computed in format_df, so the value plotted is the mean over the weather rows of that date.
weather_plot(total_rides14['Date'],total_rides14['Count'], weather_ave14['PrecipitationIn'], 'Precipitation In', 
             'Precip and Trips 2014', 'Total Rides and Precipitation by Date 2014')
    
    Out[25]:
In [26]:
    
weather_plot(total_rides15['Date'],total_rides15['Count'], weather_ave15['PrecipitationIn'], 'Precipitation In', 
             'Precip and Trips 2015', 
             'Total Rides and Precipitation by Date 2015')
    
    Out[26]:
In [27]:
    
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16['PrecipitationIn'], 'Precipitation In', 
             'Precip and Trips 2016', 
             'Total Rides and Precipitation by Date 2016')
    
    Out[27]:
In [28]:
    
def weekend_hol_plot(x1,x2,y1,y2, y2_label, file_name, title):
    """
    Draws one line and one set of bars in the same graph.
    input: x1, y1 are the data of the line, always named 'Total Rides';
    x2, y2 are the data of the bars, named with the string y2_label;
    file_name: string of what to name file in plotly account;
    title: string shown as the title of the graph
    output: the result of py.iplot for the figure
    """
    accent = 'rgb(148, 103, 189)'
    line_trace = go.Scatter(x=x1, y=y1, name='Total Rides')
    bar_trace = go.Bar(x=x2, y=y2, name=y2_label)
    # The layout declares a second y-axis, but neither trace is assigned to it.
    second_axis = dict(title=y2_label,
                       titlefont=dict(color=accent),
                       tickfont=dict(color=accent),
                       overlaying='y',
                       side='right')
    plot_layout = go.Layout(title=title,
                            yaxis=dict(title='Total Rides'),
                            yaxis2=second_axis)
    figure = go.Figure(data=[line_trace, bar_trace], layout=plot_layout)
    return py.iplot(figure, filename=file_name)
    
In [29]:
    
# Daily rides of each year as a line, with the rides of its holidays drawn as bars.
weekend_hol_plot(total_rides14['Date'],hol_rides14['Date'],total_rides14['Count'],hol_rides14['Count'],'Holidays',
                 'Holidays and Trips 2014', 'Holidays and Trips 2014')
    
    Out[29]:
In [30]:
    
weekend_hol_plot(total_rides15['Date'],hol_rides15['Date'],total_rides15['Count'],hol_rides15['Count'],'Holidays',
                 'Holidays and Trips 2015', 'Holidays and Trips 2015')
    
    Out[30]:
In [31]:
    
weekend_hol_plot(total_rides16['Date'],hol_rides16['Date'],total_rides16['Count'],hol_rides16['Count'],'Holidays',
                 'Holidays and Trips 2016', 'Holidays and Trips 2016')
    
    Out[31]:
In [32]:
    
# Daily rides of 2014 with its weekend rides as bars. The bar heights come from
# weekend_rides14; they were read from weekend_rides15 before, next to 2014 dates.
weekend_hol_plot(total_rides14['Date'],weekend_rides14['Date'],total_rides14['Count'],weekend_rides14['Count'],
                 'Weekends','Weekends and Trips 2014', 'Weekends and Trips 2014')
    
    Out[32]:
In [33]:
    
# Daily rides of 2015 and 2016 as a line, with the rides of the weekend days drawn as bars.
weekend_hol_plot(total_rides15['Date'],weekend_rides15['Date'],total_rides15['Count'],weekend_rides15['Count'],
                 'Weekends','Weekends and Trips 2015', 'Weekends and Trips 2015')
    
    Out[33]:
In [34]:
    
weekend_hol_plot(total_rides16['Date'],weekend_rides16['Date'],total_rides16['Count'],weekend_rides16['Count'],
                 'Weekends','Weekends and Trips 2016', 'Weekends and Trips 2016')
    
    Out[34]:
In [35]:
    
def grouped_bar(x1, y1, x2, y2, x3, y3, file_name, title, x_title):
    """
    Draws a grouped bar chart with three groups named 'Total', 'Customer' and 'Subscriber'.
    Input: x1, y1 are the data of the 'Total' bars, x2, y2 of the 'Customer' bars and
    x3, y3 of the 'Subscriber' bars; the x data should align;
    file_name: what to name the plotly file, string; title: of the graph; x_title: label for x-axis
    Output: the result of py.iplot for the grouped bar chart
    """
    groups = [('Total', x1, y1), ('Customer', x2, y2), ('Subscriber', x3, y3)]
    bars = [go.Bar(x=xs, y=ys, name=label) for label, xs, ys in groups]
    plot_layout = go.Layout(title = title,
                            xaxis = dict(title = x_title),
                            yaxis = dict(title = 'Total Rides'),
                            barmode='group')
    figure = go.Figure(data=bars, layout=plot_layout)
    return py.iplot(figure, filename=file_name)
    
In [36]:
    
# Rides per day of the week, one plot per year: total, Customer and Subscriber bars.
# Each group takes its x values ('Day of Week') from the same rows as its counts.
grouped_bar(total_weekday_rides14['Day of Week'],total_weekday_rides14['Count'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Customer']['Day of Week'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Customer']['Count'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Subscriber']['Day of Week'],
            weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2014', 
            'Total Rides by Day of Week 2014', 'Day of Week')
    
    Out[36]:
In [37]:
    
grouped_bar(total_weekday_rides15['Day of Week'],total_weekday_rides15['Count'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Customer']['Day of Week'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Customer']['Count'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Subscriber']['Day of Week'],
            weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2015', 
            'Total Rides by Day of Week 2015', 'Day of Week')
    
    Out[37]:
In [38]:
    
grouped_bar(total_weekday_rides16['Day of Week'],total_weekday_rides16['Count'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Customer']['Day of Week'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Customer']['Count'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Subscriber']['Day of Week'],
            weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2016', 
            'Total Rides by Day of Week 2016', 'Day of Week')
    
    Out[38]:
In [39]:
    
# Rides per 5-minute duration bin, one plot per year. All three groups use the bins of the
# totals frame as x; the Customer and Subscriber counts are assumed to hold one row per bin
# in the same order - TODO confirm.
grouped_bar(total_min_rides14.index.get_level_values('Duration'),total_min_rides14['Count'],
            total_min_rides14.index.get_level_values('Duration'),
            min_dur_rides14.loc[(min_dur_rides14.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_min_rides14.index.get_level_values('Duration'),
            min_dur_rides14.loc[(min_dur_rides14.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Minute Plot 2014', 'Ride Duration by Minute 2014', 'Minutes')
    
    Out[39]:
In [40]:
    
grouped_bar(total_min_rides15.index.get_level_values('Duration'),total_min_rides15['Count'],
            total_min_rides15.index.get_level_values('Duration'),
            min_dur_rides15.loc[(min_dur_rides15.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_min_rides15.index.get_level_values('Duration'),
            min_dur_rides15.loc[(min_dur_rides15.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Minute Plot 2015', 'Ride Duration by Minute 2015', 'Minutes')
    
    Out[40]:
In [41]:
    
grouped_bar(total_min_rides16.index.get_level_values('Duration'),total_min_rides16['Count'],
            total_min_rides16.index.get_level_values('Duration'),
            min_dur_rides16.loc[(min_dur_rides16.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_min_rides16.index.get_level_values('Duration'),
            min_dur_rides16.loc[(min_dur_rides16.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Minute Plot 2016', 'Ride Duration by Minute 2016', 'Minutes')
    
    Out[41]:
In [42]:
    
# Rides per 2-hour duration bin, one plot per year. All three groups use the bins of the
# totals frame as x; the Customer and Subscriber counts are assumed to hold one row per bin
# in the same order - TODO confirm.
grouped_bar(total_hr_rides14.index.get_level_values('Duration'),total_hr_rides14['Count'],
            total_hr_rides14.index.get_level_values('Duration'),
            hr_dur_rides14.loc[(hr_dur_rides14.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_hr_rides14.index.get_level_values('Duration'),
            hr_dur_rides14.loc[(hr_dur_rides14.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Hour Plot 2014', 'Ride Duration by Hour 2014', 'Hours')
    
    Out[42]:
In [43]:
    
grouped_bar(total_hr_rides15.index.get_level_values('Duration'),total_hr_rides15['Count'],
            total_hr_rides15.index.get_level_values('Duration'),
            hr_dur_rides15.loc[(hr_dur_rides15.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_hr_rides15.index.get_level_values('Duration'),
            hr_dur_rides15.loc[(hr_dur_rides15.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Hour Plot 2015', 'Ride Duration by Hour 2015', 'Hours')
    
    Out[43]:
In [44]:
    
grouped_bar(total_hr_rides16.index.get_level_values('Duration'),total_hr_rides16['Count'],
            total_hr_rides16.index.get_level_values('Duration'),
            hr_dur_rides16.loc[(hr_dur_rides16.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
            total_hr_rides16.index.get_level_values('Duration'),
            hr_dur_rides16.loc[(hr_dur_rides16.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'], 
            'Duration by Hour Plot 2016', 'Ride Duration by Hour 2016', 'Hours')
    
    Out[44]:
In [45]:
    
grouped_bar(total_rides_by_hour14['Hour'],total_rides_by_hour14['Count'],
            total_rides_by_hour14['Hour'],
            rides_by_hour14.loc[rides_by_hour14['Subscriber Type'] == 'Customer']['Count'],
            total_rides_by_hour14['Hour'],
            rides_by_hour14.loc[rides_by_hour14['Subscriber Type'] == 'Subscriber']['Count'], 
            'Rides by Hour Plot 2014', 'Total Rides Per Hour 2014', 'Hour of Day')
    
    Out[45]:
In [46]:
    
grouped_bar(total_rides_by_hour15['Hour'],total_rides_by_hour15['Count'],
            total_rides_by_hour15['Hour'],
            rides_by_hour15.loc[rides_by_hour15['Subscriber Type'] == 'Customer']['Count'],
            total_rides_by_hour15['Hour'],
            rides_by_hour15.loc[rides_by_hour15['Subscriber Type'] == 'Subscriber']['Count'], 
            'Rides by Hour Plot 2015', 'Total Rides Per Hour 2015', 'Hour of Day')
    
    Out[46]:
In [47]:
    
grouped_bar(total_rides_by_hour16['Hour'],total_rides_by_hour16['Count'],
            total_rides_by_hour16['Hour'],
            rides_by_hour16.loc[rides_by_hour16['Subscriber Type'] == 'Customer']['Count'],
            total_rides_by_hour16['Hour'],
            rides_by_hour16.loc[rides_by_hour16['Subscriber Type'] == 'Subscriber']['Count'], 
            'Rides by Hour Plot 2016', 'Total Rides Per Hour 2016', 'Hour of Day')
    
    Out[47]:
In [48]:
    
# Share of trips per rider type. value_counts() orders its result by frequency, so the two
# counts are looked up by their label: taken by position, the slices would swap silently if
# customers ever outnumbered subscribers. A type without any trip counts as 0.
totals = merged_st['Subscriber Type'].value_counts()
Subscribers, Customers = totals.get('Subscriber', 0), totals.get('Customer', 0)
fig = {
    'data': [{'labels': ['Subscribers', 'Customers'],
              'values': [Subscribers, Customers],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by customer types'}
     }
py.iplot(fig)
    
    Out[48]:
In [49]:
    
# Share of trips per city. value_counts() orders its result by frequency, so every count is
# looked up by the name of its city instead of by its rank; a city without trips counts as 0.
by_cities = merged_st.landmark.value_counts()
city_labels = ['San Francisco', 'San Jose', 'Mountain View', 'Palo Alto']
sf, sj, mv, pa = by_cities.reindex(city_labels).fillna(0).astype(int).tolist()
fig_cities = {
    'data': [{'labels': city_labels,
              'values': [sf, sj, mv, pa],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by cities'}
     }
py.iplot(fig_cities)
    
    Out[49]:
In [50]:
    
# Number of trips per start station, added to the station table. The frame is built from the
# index and the values of value_counts() directly: the names that reset_index() gives to the
# columns of a value_counts() result differ between pandas versions. Sorting merged_st first
# is not needed, value_counts() orders its result by frequency anyway.
trip_counts = merged_st.station_id.value_counts()
trip_counts = pd.DataFrame({'station_id': trip_counts.index,
                            'trip_counts': trip_counts.values})
station_merged = pd.merge(station16, trip_counts, on = 'station_id')
    
In [51]:
    
# Summed ride duration per start station, largest sum first.
start_stations = trip16.loc[:, ['Start Station', 'Duration']]
grouped = start_stations.groupby('Start Station').sum()
grouped = grouped.sort_values(by = 'Duration', ascending = False)
    
In [52]:
    
merged_st['name'].value_counts().head(5) #most popular five starting stations
    
    Out[52]:
In [53]:
    
# Give the Caltrain table the same columns as the bike station table used for the map:
# the bike-only columns are filled with 0 and station_type marks the rows as 'caltrain'.
calstations['station_type'] = 'caltrain'
calstations['dockcount'] = 0
calstations['trip_counts'] = 0
calstations['station_id'] = 0
calstations['installation'] = 0 
calstations = calstations.rename(columns={'caltrain_name' : 'name'})
# Keep only the map columns, in the order shared with the bike stations.
calstations = calstations[['installation','station_id','name', 'lat', 'long', 'dockcount', 'trip_counts', 'station_type', 'passengers']]
    
In [54]:
    
# Bike stations and Caltrain stations in one frame for the map.
# .copy() gives the frame its own data, so the two assignments below do not write into a
# slice of station_merged (pandas reports that with a SettingWithCopyWarning).
df_for_map = station_merged[['installation','station_id','name', 'lat', 'long', 'dockcount', 'trip_counts']].copy()
df_for_map['station_type'] = 'bike_station'
df_for_map['passengers'] = 0
# pd.concat puts the Caltrain rows below the bike stations; drop=True discards the old row labels.
df_for_map = pd.concat([df_for_map, calstations]).reset_index(drop = True)
    
In [55]:
    
# Latitude, longitude and name of the second row of calstations (iloc[1]).
# create_linestring reads start_long and start_lat as the common end point of its lines.
start_lat, start_long, start_name = calstations.iloc[1]['lat'], calstations.iloc[1]['long'], calstations.iloc[1]['name']
    
In [56]:
    
def parser(df):
    """
    Turns a pandas DF into one dictionary per row
    input: a dataframe
    output: a generator that yields, for every row in order, a dictionary
            mapping each column name to the value of that row

    Note: this definition takes over the name of the parser module imported
    at the top of the notebook.
    """
    fields = df.columns
    # range exists on Python 2 and 3 (xrange only on Python 2); the rows are still
    # built lazily, one at a time, because this is a generator expression.
    parsed_data = (dict(zip(fields, df.iloc[i])) for i in range(len(df)))
    return parsed_data
def create_map(df):
    """
    Writes the rows of the given data frame to 'bikecaltrain.geojson' as a geoJSON
    FeatureCollection of points.

    Rows with station_type 'bike_station' get a bicycle marker: colour '#00cc00' for the
    station ids 69 and 70, colour '#0e2f44' (with fill-opacity 0.3) for stations whose
    installation date ends in 2013, no colour otherwise. All other rows get a rail marker
    with colour '#a11f27' and their passengers value.

    input: a data frame with the columns name, lat, long, station_type, station_id,
           installation, dockcount, trip_counts and passengers
    output: a message: "'bikecaltrain.geojson' file has been created, check out your cd"
    """
    features = []
    for index, line in enumerate(parser(df)):
        # Skip any zero coordinates as this will throw off our map.
        # The test is numeric: compared with the string '0', a number never matched.
        if float(line['long']) == 0 or float(line['lat']) == 0:
            continue

        if line['station_type'] == 'bike_station':
            # installation is a date string whose third '/'-separated part is the year
            year = line['installation'].split('/')[2]
            properties = {'name': line['name'],
                          'dockcount': line['dockcount'],
                          'trip_counts': line['trip_counts'],
                          'marker-size': 'large',
                          'marker-symbol': 'bicycle'}
            if line['station_id'] == 70 or line['station_id'] == 69:
                properties['marker-color'] = '#00cc00'
            elif year == '2013':
                properties['marker-color'] = '#0e2f44'
                properties['fill-opacity'] = 0.3
        else:
            properties = {'name': line['name'],
                          'passengers': line['passengers'],
                          'marker-color': '#a11f27',
                          'marker-size': 'large',
                          'marker-symbol': 'rail'}

        features.append({'type': 'Feature',
                         'id': index,   # position of the row in df, skipped rows included
                         'properties': properties,
                         'geometry': {'type': 'Point',
                                      'coordinates': (line['long'], line['lat'])}})

    # 'features' is always written, also when no row is left, so the file stays a
    # complete FeatureCollection.
    geo_map = {'type': 'FeatureCollection', 'features': features}
    #Now that we've build up our geo_map dictionary, let's save it as geojson file
    with open('bikecaltrain.geojson', 'w') as f:
        f.write(dumps(geo_map))
    return "'bikecaltrain.geojson' file has been created, check out your cd"
def create_linestring(df):
    """
    Writes one geoJSON LineString per row of the given data frame to 'lines.geojson'.
    Every line runs from the (long, lat) of the row to the point (start_long, start_lat);
    these two names are read from the global scope of the notebook and have to be defined
    before the function is called.

    input: a data frame with the columns name, dockcount, lat and long
    output: a message: "'lines.geojson' file has been created, check out your cd"
    """
    features = []
    for index, line in enumerate(parser(df)):
        # Skip any zero coordinates as this will throw off our map.
        # The test is numeric: compared with the string '0', a number never matched.
        if float(line['long']) == 0 or float(line['lat']) == 0:
            continue

        features.append({'type': 'Feature',
                         'id': index,   # position of the row in df, skipped rows included
                         'properties': {'name': line['name'],
                                        'dockcount': line['dockcount'],
                                        'marker-color': '#a11f27',
                                        'marker-size': 'large'},
                         'geometry': {'type': 'LineString',
                                      'coordinates': ([line['long'], line['lat']],
                                                      [start_long, start_lat])}})

    # 'features' is always written, also when no row is left, so the file stays a
    # complete FeatureCollection.
    geo_map = {'type': 'FeatureCollection', 'features': features}
    #Now that we've build up our geo_map dictionary, let's save it as geojson file
    with open('lines.geojson', 'w') as f:
        f.write(dumps(geo_map))
    return "'lines.geojson' file has been created, check out your cd"
    
In [57]:
    
# Write 'bikecaltrain.geojson' (bike and Caltrain stations as points) to the working directory.
create_map(df_for_map)
    
    Out[57]:
In [58]:
    
# Write 'lines.geojson': one line from every 2016 bike station to (start_long, start_lat).
create_linestring(station16)
    
    Out[58]:
In [59]:
    
def status_for_city(station, status, city = 'San Francisco'):
    """
    returns the status rows that belong to the stations of one city

    input: station and status dataframes, city (default = 'San Francisco');
           the city is compared with the landmark column of station
    output: the rows of status whose station_id is a station of that city
    """
    in_city = station['landmark'] == city
    station_ids = station.loc[in_city, 'station_id']
    belongs_to_city = status['station_id'].isin(station_ids)
    return status.loc[belongs_to_city]
def groupby_term_mean(status, term):
    """
    returns dataframe grouped by term and average values

    input: status data frame with a datetime column 'time', term ('month', 'hour' or 'weekday')
    output: dataframe with one row per value of term and the average values of the
            other variables; term is a regular column of the result. For 'weekday'
            the rows are ordered Monday to Sunday and a helper column 'Sorter'
            (1 = Monday ... 7 = Sunday) is part of the result.

    Note: a column named after term is added to the status frame that is passed in.
    """
    assert term in ['month', 'hour', 'weekday'], 'term should be either "month", "hour" or "weekday".'

    if term == 'weekday':
        # '%A' is the full weekday name, the same format code format_df uses.
        status['weekday'] = status.time.dt.strftime('%A')
        grouped = status.groupby('weekday').mean()
        grouped.reset_index(inplace = True)
        # The rank is looked up per name, so the order is right for any subset of
        # weekdays; a fixed list of seven ranks only fits when all seven are present.
        day_rank = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
                    'Friday': 5, 'Saturday': 6, 'Sunday': 7}
        grouped['Sorter'] = grouped['weekday'].map(day_rank)
        grouped.sort_values(by = 'Sorter', inplace = True)
    else:
        # zero-padded hour ('00'-'23') or month ('01'-'12') as a string
        time_code = {'hour': '%H', 'month': '%m'}[term]
        status[term] = status.time.dt.strftime(time_code)
        grouped = status.groupby(term).mean()
        grouped.reset_index(inplace = True)

    return grouped
def plot_status(grouped, time_format, city = None, station_name = None):
    """
    returns a plot of average bike and dock availability

    input: grouped dataframe holding a column named like time_format plus
           'bikes_available' and 'docks_available', time_format
           ('month', 'hour' or 'weekday'), city (default is None),
           station_name (default is None; when given it is used in the
           title instead of the city)
    output: the result of py.iplot for a figure with one dashed line for
            bikes and one for docks. The figure is stored on plotly under
            a file name built from city, station_name (if any) and
            time_format.
    """
    # redundant with the notebook's top-level imports, kept so the function
    # body stays self-contained
    import plotly.plotly as py
    import plotly.graph_objs as go

    assert time_format in ['month', 'hour', 'weekday'], 'time format should be either "month", "hour", or "weekday".'

    # title prefix and x axis label per time format; looked up by value
    # (==) rather than by object identity (is), which only works for
    # strings that happen to be interned
    axis_settings = {
        'hour': ('Daily', 'Hour'),
        'month': ('Yearly', 'Month'),
        'weekday': ('Weekly', 'Week Days'),
    }
    title_time, x_axis_name = axis_settings[time_format]
    x_values = grouped[time_format]

    def availability_trace(values, label, color):
        # both lines share the same dashed style, only data/label/colour differ
        return go.Scatter(
            x = x_values,
            y = values,
            name = label,
            line = dict(
                color = (color),
                width = 4,
                dash = 'dash')
        )

    data = [
        availability_trace(grouped.bikes_available, 'Bikes Available', 'rgb(205, 12, 24)'),
        availability_trace(grouped.docks_available, 'Docks Available', 'rgb(22, 96, 167)'),
    ]

    if station_name:
        title_name = '{} Average Counts of Available Bikes and Docks in {}'.format(title_time, station_name)
    elif city:
        title_name = '{} Average Counts of Available Bikes and Docks in {}'.format(title_time, city)
    else:
        title_name = '{} Average Counts of Available Bikes and Docks across all 4 cities'.format(title_time)

    layout = dict(title = title_name,
                  xaxis = dict(title = x_axis_name),
                  yaxis = dict(title = 'Counts'),
                  )
    fig = dict(data=data, layout=layout)

    if station_name:
        # give station plots their own file so they do not overwrite the
        # city-level plot that is stored under the plain city file name
        file_name = 'plot for {} ({}) by {}'.format(city, station_name, time_format)
    else:
        file_name = 'plot for {} by {}'.format(city, time_format)
    return py.iplot(fig, filename=file_name)
def plot_summary_for_city(city, term, station_id = None):
    """
    returns plot summary for the given city and term

    input: city name, term ('hour', 'month', 'weekday'),
           station_id (default = None; when given, only that station's
           records are averaged and its name is used in the plot title)
    output: a figure (whatever plot_status returns)

    NOTE: station16 and status16 should be available in global env.
    """

    assert city in ['San Francisco', 'Mountain View', 'Palo Alto', 'San Jose'], 'unavailable city'
    assert term in ['hour', 'month', 'weekday'], 'term should be either "hour", "month", or "weekday".'
    # pd.Timestamp is the public name of the type (pandas.tslib is a private,
    # since-removed module); .iloc[0] reads the first row whatever its label is
    assert isinstance(status16['time'].iloc[0], pd.Timestamp), 'status["time"] type error. convert it time series.'

    city_status = status_for_city(station16, status16, city)
    station_name = None
    # compare with None so that a station id of 0 is not silently ignored
    if station_id is not None:
        assert station_id in station16.station_id.values, 'invalid station id'
        city_status = city_status[city_status.station_id == station_id]
        # look the name up in station16, the same frame the id was validated against
        station_name = station16.loc[station16.station_id == station_id, 'name'].values[0]
    grouped = groupby_term_mean(city_status, term)

    return plot_status(grouped, term, city, station_name)
    
In [60]:
    
# Average bikes/docks available per hour of day over the San Francisco stations
# (plot_summary_for_city reads the station16/status16 frames).
plot_summary_for_city('San Francisco', 'hour')
    
    Out[60]:
In [61]:
    
# Average bikes/docks available per hour of day over the San Jose stations
# (plot_summary_for_city reads the station16/status16 frames).
plot_summary_for_city('San Jose', 'hour')
    
    Out[61]:
In [62]:
    
# Average bikes/docks available per weekday over the San Francisco stations
# (plot_summary_for_city reads the station16/status16 frames).
plot_summary_for_city('San Francisco', 'weekday')
    
    Out[62]:
In [63]:
    
# Average bikes/docks available per weekday over the San Jose stations
# (plot_summary_for_city reads the station16/status16 frames).
plot_summary_for_city('San Jose', 'weekday')
    
    Out[63]:
In [64]:
    
# Average bikes/docks available per calendar month over the San Francisco stations
# (plot_summary_for_city reads the station16/status16 frames).
plot_summary_for_city('San Francisco', 'month')
    
    Out[64]:
In [65]:
    
# Average bikes/docks available per calendar month over the San Jose stations
# (plot_summary_for_city reads the station16/status16 frames).
plot_summary_for_city('San Jose', 'month')
    
    Out[65]:
In [66]:
    
#plot_summary_for_city('Mountain View', 'hour')
#plot_summary_for_city('Mountain View', 'weekday')
#plot_summary_for_city('Mountain View', 'month')
    
In [67]:
    
#plot_summary_for_city('Palo Alto', 'hour')
#plot_summary_for_city('Palo Alto', 'weekday')
#plot_summary_for_city('Palo Alto', 'month')
    
In [68]:
    
def convert_df(station_in, status_in, term, station_id = None):
    """
    builds the per-term averages of available bikes and docks, either
    over all San Francisco stations or, when station_id is given,
    for that single station

    input: station_in, status_in, term, station_id (default = None)
    output: a tuple (grouped DF, station_name); station_name is None
            when no station_id is given
    """

    # status_for_city defaults to 'San Francisco'
    selected_status = status_for_city(station_in, status_in)
    selected_name = None
    if station_id:
        known_ids = station_in.station_id.values
        assert station_id in known_ids, 'invalid station id'
        is_requested = selected_status.station_id == station_id
        selected_status = selected_status[is_requested]
        selected_name = station_in.name[station_in.station_id == station_id].values[0]

    averages = groupby_term_mean(selected_status, term)
    return averages, selected_name
    
In [69]:
    
def compares_years(term, station_id = None):
    """
    compares average available bike and dock counts
    for three years per term for San Francisco,
    if station_id specified compares for that station.

    Each trace is drawn against the term values of its own year, so a
    year whose data does not cover every hour/month/weekday is not
    shifted onto another year's x values.

    input: term ('hour', 'month', or 'weekday'),
           station_id (default = None)
    output: an interactive figure with buttons to show all traces,
            one year only, bikes only or docks only

    NOTE: station14/15/16 and status14/15/16 should be available in global env.
    """

    yearly_frames = [('2014', station14, status14),
                     ('2015', station15, status15),
                     ('2016', station16, status16)]
    averages = {}
    station_name = None
    for year, station_df, status_df in yearly_frames:
        # the name returned for the last year processed (2016) ends up in the title
        averages[year], station_name = convert_df(station_df, status_df, term, station_id)

    if station_name:
        title = 'Average Available Bike and Dock Counts for {}'.format(station_name)
    else:
        title = 'Average Available Bike and Dock Counts for San Francisco'

    # (year, averaged column, legend label, line colour); the order of this
    # list fixes the trace indices the visibility masks below refer to
    trace_specs = [
        ('2015', 'bikes_available', 'Bikes', '#FFD700'),
        ('2015', 'docks_available', 'Docks', '#C0C0C0'),
        ('2016', 'bikes_available', 'Bikes', '#BA8651'),
        ('2016', 'docks_available', 'Docks', '#000000'),
        ('2014', 'bikes_available', 'Bikes', '#b9f442'),
        ('2014', 'docks_available', 'Docks', '#416bf4'),
    ]
    traces = []
    for year, column, label, color in trace_specs:
        yearly = averages[year]
        traces.append(Scatter(
            x=yearly[term], y=yearly[column].values,
            line=Line(
                color=color,
                width=3
            ),
            name='{}: {}'.format(year, label)
        ))

    def visibility(selected):
        # one boolean per trace, in trace order
        return [selected(year, label) for year, _, label, _ in trace_specs]

    # (button label, rule deciding which traces stay visible)
    button_specs = [
        ('All', lambda year, label: True),
        ('2015', lambda year, label: year == '2015'),
        ('2016', lambda year, label: year == '2016'),
        ('2014', lambda year, label: year == '2014'),
        ('Bikes', lambda year, label: label == 'Bikes'),
        ('Docks', lambda year, label: label == 'Docks'),
    ]
    buttons = [dict(args=['visible', visibility(selected)],
                    label=button_label,
                    method='restyle')
               for button_label, selected in button_specs]

    data = Data(traces)
    layout = Layout(
        title=title,
        updatemenus=list([
            dict(
                x=-0.05,
                y=1,
                yanchor='top',
                buttons=buttons,
            )
        ]),
    )
    fig = Figure(data=data, layout=layout)
    return py.iplot(fig)
    
In [70]:
    
# Compare station 70's average bike/dock availability per hour of day across the
# 2014, 2015 and 2016 frames.
compares_years('hour', 70)
    
    Out[70]:
In [71]:
    
# Compare station 70's average bike/dock availability per weekday across the
# 2014, 2015 and 2016 frames.
compares_years('weekday', 70)
    
    Out[71]:
In [72]:
    
def route_countFunc(trip_df, station_df):
    """
    This function summarises the trips of one 'trip' dataframe per route (start station -> end station),
    keeping only routes whose two stations share the same landmark
    @args:
        trip_df: trip dataframe with columns 'Trip ID', 'Duration', 'station_id' (start station id) and 'End Terminal'
        station_df: station dataframe with columns 'name', 'station_id' and 'landmark'
    @returns:
        routes_count: one row per route with columns 'Start Station', 'End Station', 'Landmark',
        'Counts' (number of trips) and 'Duration (min)' (mean Duration divided by 60, rounded to 2 decimals)
    """
    station_info = station_df[['name', 'station_id', 'landmark']]

    #Attach name and landmark of the start station
    trips = pd.merge(left = trip_df[['Trip ID', 'Duration', 'station_id', 'End Terminal']], right = station_info,
                     how='left', left_on='station_id', right_on='station_id')
    trips = trips.rename(columns={'landmark':'start_landmark', 'name':'Start Station'}).drop('station_id', axis=1)

    #Attach name and landmark of the end station
    trips = pd.merge(left = trips, right = station_info,
                     how='left', left_on='End Terminal', right_on='station_id')
    trips = trips.rename(columns={'landmark':'end_landmark', 'name':'End Station'}).drop('station_id', axis=1)

    #Keep the trips that start and end in the same landmark
    same_landmark = trips[trips.start_landmark == trips.end_landmark]

    #Trip count and mean duration for every route
    per_route = same_landmark.groupby(['Start Station', 'End Station', 'start_landmark'])
    routes_count = per_route.agg({'Trip ID': 'count', 'Duration': 'mean'}).reset_index()
    routes_count['Duration'] = routes_count['Duration'].apply(lambda value: round((value/60),2))

    #Final column names
    return routes_count.rename(columns={'Trip ID': 'Counts',
                                        'start_landmark': 'Landmark',
                                        'Duration': 'Duration (min)'})
    
In [73]:
    
def route_heatmap(trip_df, station_df, landmark):
    """
    This function recieves the trip and station dataframes for a specific year
    and plots the trip heatmap for a given landmark. Every row of the heatmap is a
    start station (y axis) and every column an end station (x axis); the cell value is
    the number of trips on that route and the hover text also shows the average duration.
    @args:
        trip_df: trip dataframe
        station_df: station dataframe
        landmark: Could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
    @returns:
        Heatmap graph
    """
    routes = route_countFunc(trip_df, station_df)
    df_routes = routes[routes.Landmark == landmark]
    start_station = df_routes['Start Station'].sort_values().unique()
    end_station = df_routes['End Station'].sort_values().unique()

    #One lookup table (start, end) -> (count, duration) instead of filtering the
    #dataframe twice for every cell of the heatmap
    route_stats = dict(
        ((start, end), (count, duration))
        for start, end, count, duration in zip(df_routes['Start Station'], df_routes['End Station'],
                                               df_routes['Counts'], df_routes['Duration (min)']))

    #create z values for heatmap and text for hover text in the map
    z = []
    trip_text = []
    for start in start_station:
        new_row = []
        text_row = []
        for end in end_station:
            #routes without any trip are shown with a count and duration of 0
            trip_count, average_duration = route_stats.get((start, end), (0, 0))
            text = 'Start Station: {}<br>End Station: {}<br>Trip Counts: {}<br>Average Duration: {} Mins'.format(start, end, trip_count, average_duration)
            text_row.append(text)
            new_row.append(trip_count)
        z.append(new_row)
        trip_text.append(text_row)

    #Plot Heatmap
    #rows of z belong to y and columns to x, so the start stations (outer loop)
    #go on y and the end stations on x, matching the axis titles below
    heatmap = go.Heatmap(x = end_station,
               y = start_station,
               z = z,
                hoverinfo = 'text',
                text = trip_text,
               colorscale='YlGnBu',
               reversescale=True)
    data = go.Data([heatmap])
    layout = go.Layout(
        title=landmark.upper() + ' TRIP COUNTS',
        titlefont = dict(color='black', size=35, family='monospace'),
        xaxis=dict(
            title='END STATION',
            tickangle = -90,
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        yaxis=dict(
            title='START STATION',
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        width=1000,
        height=1000,
        #wide left/bottom margins leave room for the station names on the axes
        margin=go.Margin(
        l=300,
        r=50,
        b=300,
        t=100,
        pad=4
    )
    )
    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig)
    
In [74]:
    
# Heatmap of trip counts between San Francisco stations, built from the
# trip16/station16 frames.
route_heatmap(trip16, station16, 'San Francisco')
    
    Out[74]:
In [75]:
    
def crimeFunc(df, year = 2016):
    """
    This function gets the crime data frame and selects the crimes potentially dangerous to bikers and bikes such as:
    'BURGLARY', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'STOLEN PROPERTY', 'VANDALISM', 'VEHICLE THEFT'
    that happened in the given year. The input dataframe is not modified.
    @args:
        df: crime dataframe with columns 'Category', 'Date', 'X' and 'Y'; characters 6-9 of 'Date'
            are read as the year
        year: year to keep, as int or str (default 2016)
    @returns:
        a dataframe with columns 'Category', 'lat' (from 'Y') and 'lon' (from 'X') containing the
        bike related crimes of that year
    """
    #List of related crimes
    relatedCrimes = ['BURGLARY', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'STOLEN PROPERTY', 'VANDALISM', 'VEHICLE THEFT']
    bikeCrime = df[df['Category'].isin(relatedCrimes)]
    #Extract year from "Date" column and keep the requested one
    crime_year = bikeCrime['Date'].str.slice(6,10)
    bikeCrime = bikeCrime[crime_year == str(year)]
    bikeCrime = bikeCrime.rename(columns = {'X': 'lon', 'Y': 'lat'})
    return bikeCrime[['Category', 'lat', 'lon']].reset_index(drop=True)
    
In [76]:
    
def crime_objectFunc(df):
    """
    This function turns the crime data frame into the scatter map trace used for the crime layer:
    one small red marker per crime returned by crimeFunc, with the crime category as hover text
    @args:
        df: crime data frame
    @returns:
        a scatter map plot object named 'Crime'
    """
    crimes = crimeFunc(df)
    marker_style = dict(size = 3,
                        color = 'red'
                        )
    return go.Scattermapbox(name = 'Crime',
                            mode = 'markers',
                            lat = crimes['lat'],
                            lon = crimes['lon'],
                            text = crimes['Category'],
                            hoverinfo = 'text',
                            marker = marker_style,
                            opacity = 0.1)
    
In [77]:
    
def extract_info(zipNum, info_type):
    '''
    This function gets the zipcode and returns either state, city name, latitude, or longitude
    @args:
        zipNum: zipcode (converted to str before the lookup)
        info_type: could be either: 'state', 'city', 'lat', or 'lon'
                   (any other value is treated like 'lon')
    @returns:
        state, city name, latitude, or longitude of the zipcode, or the string 'Not valid'
        when the lookup result has no information (no 'state' attribute)
    '''

    try:
        #Look the zipcode up once; the result is reused for every info_type
        info = zipcode.isequal(str(zipNum))
        info.state #Check to see if info exists for the zipcode
    except AttributeError:
        return 'Not valid'
    if info_type in ('state', 'city', 'lat'):
        return getattr(info, info_type)
    return info.lon
    
In [78]:
    
def incomeFunc(df):
    """
    This function gets the income dataframe, looks up state, city, latitude and longitude for the zipcode
    stored in the first column of every row and returns the rows that belong to San Francisco.
    The lookup columns are added to a copy, the dataframe passed in is left unchanged.
    @args:
        df: income dataframe whose first column holds the zipcode
    @returns:
        a new dataframe (index reset) with the original columns plus 'state', 'city', 'lat' and 'lon',
        restricted to rows with state 'CA' and city 'SAN FRANCISCO'
    """
    #Work on a copy so that the caller's dataframe does not silently gain columns
    income = df.copy()
    zipcodes = [str(x) for x in income.iloc[:,0]]
    #Add state, city, latitude, and longitude columns to income dataset
    info_list = ['state', 'city', 'lat', 'lon']
    for i in info_list:
        income[i] = [extract_info(x, i) for x in zipcodes]
    #Extract Sanfrancicso data
    SF_income = income[(income.state == 'CA') & (income.city == 'SAN FRANCISCO')].reset_index(drop=True)
    return SF_income
    
In [79]:
    
def hover_incomeTextFunc(df):
    """
    This function gets the income dataframe and creates the hover text shown for every row in the plot.
    Columns are read by position: 0 = zipcode, 1 = median income, 2 = mean income, 3 = population.
    @args:
        df: income dataframe
    @returns:
        a pandas Series with one hover text per row
    """
    template = 'Zipcode: {}<br>Population: {}<br>Median Income: ${}<br>Mean Income: ${}'
    #.iloc makes the positional access explicit; plain row[0] on a row indexed by
    #column names relies on a deprecated integer fallback
    text_income = df.apply(lambda row: template.format(
            row.iloc[0], row.iloc[3], row.iloc[1], row.iloc[2]),axis=1)
    return text_income
    
In [80]:
    
def point_incomeSizeFunc(df):
    """
    This function gets the income dataframe and computes a relative marker size for every row:
    the median income divided by the largest median income
    @args:
        df: income dataframe whose 'Median' column holds numbers as text with ',' thousands separators
    @returns:
        a pandas Series of point sizes for plot (largest value is 1)
    """
    #Strip the thousands separators and convert the text to integers
    median_income = df['Median'].str.replace(',', '').apply(int)
    #Scale relative to the largest median income
    return median_income/median_income.max()
    
In [81]:
    
def point_incomeTransFunc(df):
    """
    This function gets the income dataframe and computes a relative transparency value for every row:
    the population divided by the largest population
    @args:
        df: income dataframe whose 'Population' column holds numbers as text with ',' thousands separators
    @returns:
        a pandas Series of point transparency values for plot (largest value is 1)
    """
    #Text such as '1,234' becomes the integer 1234
    population = df['Population'].str.replace(',', '').apply(int)
    largest_population = population.max()
    point_transparency = population/largest_population
    return point_transparency
    
In [82]:
    
def income_objectFunc(df):
    """
    This function gets the income dataframe and creates the scatter map trace used for the income layer:
    one marker per San Francisco zipcode, sized by median income and faded by population
    @args:
        df: income dataframe
    @returns:
        a scatter map plot object named 'Median Income'
    """
    #incomeFunc already returns the San Francisco rows together with their 'lat'/'lon'
    #columns, so its result is reused instead of repeating the zipcode lookups
    SF_income = incomeFunc(df)
    income_graph = go.Scattermapbox(lon = SF_income['lon'],
                        lat = SF_income['lat'],
                        hoverinfo = 'text',
                        text = hover_incomeTextFunc(SF_income),
                        mode = 'markers',
                        marker = dict(size = point_incomeSizeFunc(SF_income)*20,
                                     #'lightblue' is the valid colour name ('light blue' is not)
                                     color = 'lightblue',
                                     #per-point values belong to marker.opacity; the trace level
                                     #opacity only takes a single number
                                     opacity = point_incomeTransFunc(SF_income)/10
                                      ),
                        name = 'Median Income')
    return income_graph
    
In [83]:
    
def graph_CrimeIncome(income_df, crime_df):
    """
    This function draws the income trace and the crime trace on one mapbox map centred on San Francisco
    @args:
        income_df: income dataframe
        crime_df: crime dataframe
    @returns:
        A scatter map plot in San Francisco (the result of py.iplot)
    """
    #Income layer first, crime layer second
    traces = go.Data([income_objectFunc(income_df), crime_objectFunc(crime_df)])

    #Map view: centre, zoom and the mapbox access token from the global env
    map_center = dict(
        lat=37.773972,
        lon=-122.431297
    )
    mapbox_settings = dict(
        accesstoken=mapbox_token,
        bearing=0,
        center=map_center,
        pitch=0,
        zoom=10
    )
    layout_settings = dict(
        title='MEDIAN INCOME AND CRIME MAP IN SAN FRANCISCO',
        titlefont = dict(color='black', size=35, family='monospace'),
        autosize=True,
        hovermode='closest',
        mapbox=mapbox_settings,
    )

    figure = go.Figure(data = traces, layout = go.Layout(layout_settings))
    return py.iplot(figure)
    
In [84]:
    
# Map of San Francisco showing median income per zipcode together with the
# bike-related crimes selected by crimeFunc.
graph_CrimeIncome(income, crime)
    
    Out[84]:
In [85]:
    
def routeDock_countFunc(tripDF, stationDF, landmark):
    """
    This function sums the route counts of route_countFunc per start station for one landmark and
    attaches the dock count of every station
    @args:
        tripDF: trip dataframe
        stationDF: station dataframe (needs the columns 'name' and 'dockcount')
        landmark: Landmark of interest that could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
    @returns:
        A dataframe with one row per start station: 'Start Station', 'Counts' (sum of the route counts
        starting at the station) and 'dockcount'
    """
    routes = route_countFunc(tripDF, stationDF)
    landmark_routes = routes[routes.Landmark == landmark]
    #Total trips per start station
    trips_per_station = landmark_routes.groupby('Start Station').agg({'Counts': 'sum'}).reset_index()
    #Bring in the dock count through the station name
    with_docks = pd.merge(left=trips_per_station, right=stationDF[['name', 'dockcount']],
                          how='inner', left_on='Start Station', right_on='name')
    return with_docks.drop('name', axis=1)
    
In [86]:
    
def hover_text(df):
    """
    This function gets a dataframe made by function "routeDock_countFunc" and returns the
    text information used in the barplot.
    Columns are read by position: 0 = station name, 1 = trip counts, 2 = dock counts.
    @args:
        df: dataframe made with function "routeDock_countFunc"
    @returns:
        a pandas Series containing the hover text for every bar; the last entry of each text is
        the dock count as a percentage of the trip count, rounded to 3 decimals
    """
    template = 'Station: {}<br>Trip Counts: {}<br>Dock Counts: {}<br>Dock/Trip: {}%'
    #.iloc makes the positional access explicit; plain row[0] on a row indexed by
    #column names relies on a deprecated integer fallback
    text = df.apply(lambda row: template.format(
        row.iloc[0], row.iloc[1], int(row.iloc[2]),
        round(float(row.iloc[2])*100/row.iloc[1], 3)), axis=1)
    return text
    
In [87]:
    
def bar_object(tripDF, stationDF, landmark, year):
    """
    This function builds the bar trace of one year: one bar per start station of the landmark,
    with the trip count as height and the text of hover_text as hover information
    @args:
        tripDF: trip dataframe
        stationDF: station dataframe
        landmark: Landmark of interest that could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
        year: year of interest, either 2014, 2015, 2016 (used as the trace name)
    @returns:
      Barplot object used to plot a barplot
    """
    station_counts = routeDock_countFunc(tripDF, stationDF, landmark)
    return go.Bar(
        name = year,
        x=station_counts['Start Station'],
        y=station_counts['Counts'],
        text=hover_text(station_counts),
        hoverinfo='text')
    
In [88]:
    
def barPlot(landmark):
    """
    This funciton gets a landmark and plots the number of trips from stations in a landmark. The barplot
    contains the information for three years 2014, 2015, and 2016.
    @args:
        landmark: Landmark of interest that could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
    @returns:
        A barplot (the result of py.iplot)

    NOTE: trip14/15/16 and station14/15/16 should be available in global env.
    """

    #One bar trace per year
    yearly_inputs = [(trip14, station14, 2014),
                     (trip15, station15, 2015),
                     (trip16, station16, 2016)]
    data = go.Data([bar_object(trip_df, station_df, landmark, year)
                    for trip_df, station_df, year in yearly_inputs])

    #Layout
    layout = go.Layout(
        width=1000,
        height=800,
        hovermode = 'closest',
        title='TOTAL TRIPS FROM STATIONS IN ' + landmark.upper(),
        titlefont = dict(color='black', size=35, family='monospace'),
        xaxis=dict(
            title='STATION',
            tickangle = -90,
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        yaxis=dict(
            title='TRIP COUNTS',
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        #large bottom margin leaves room for the rotated station names
        margin=go.Margin(
            l=100,
            r=50,
            b=300,
            t=100,
            pad=4),
        #showlegend is a boolean property; the string 'True' is not a valid value
        showlegend = True,
        legend = dict(x=0, y=1, orientation='h'))

    #Make the plot
    fig = go.Figure(data = data, layout = layout)
    return py.iplot(fig)
    
In [89]:
    
# Bar chart of trips per start station in San Francisco, one bar series each for
# 2014, 2015 and 2016.
barPlot('San Francisco')
    
    Out[89]:
In [ ]: