Authors: Ashkan Saboori, Pamela Patterson, and Zamirbek Akimbekov
In [2]:
import pandas as pd
import pandas
import numpy as np
import datetime
import matplotlib
import matplotlib.pyplot as plt
from geojson import dumps
import geojson as g
%matplotlib inline
plt.style.use('ggplot')
import csv
import collections
import os
from __future__ import division
import scipy as sp
import plotly.plotly as py
import plotly.figure_factory as ff
from plotly.graph_objs import *
import plotly
import plotly.graph_objs as go
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import zipcode #To find city, latitude and longitude corresponding to a zipcode
import folium #For map
from matplotlib.dates import MO, TU, WE, TH, FR, SA, SU
import parser
import math
import warnings; warnings.simplefilter('ignore')
In [3]:
# Credentials are read from the environment so that no secret is stored in the notebook.
# Set PLOTLY_USERNAME, PLOTLY_API_KEY and MAPBOX_TOKEN before starting the kernel;
# a missing variable raises KeyError here instead of failing later inside a plot call.
plotly.tools.set_credentials_file(username = os.environ['PLOTLY_USERNAME'],
                                  api_key = os.environ['PLOTLY_API_KEY'])
mapbox_token = os.environ['MAPBOX_TOKEN']
In [4]:
# Root folder of the project data. Override it per machine with the BIKESHARE_DATA_DIR
# environment variable instead of editing this cell; the default is Zamir's path.
path = os.environ.get('BIKESHARE_DATA_DIR', '/Users/admin/project_data_141')
#path = '/Users/ashkansaboori/Documents/OneDrive/MyPhD/Courses/5Winter-2017/STA141B/Project/Project_Datasets' #Ashkan's
#path = '~/Dropbox/141B/Project/data'
In [5]:
def read_data(path, year):
    """
    reads csv data and returns 4 data frames for the given year

    The files are looked up under <path>/<year>/. The 2014 data is split into two
    files per table (station1.csv / station2.csv, ...), which are concatenated;
    every other year has one file per table.

    input: path (root data folder), year (int or string, e.g. 2014 or '2014')
    output: 4 data frames (station, status, trip, and weather)
    """
    year_dir = path + '/' + str(year) + '/'

    def load(table):
        # Read <year_dir><table>.csv into a data frame.
        return pd.read_csv(year_dir + table + '.csv')

    # Compare as strings so that 2014 and '2014' both select the split-file layout
    # (the rest of the notebook passes years around as strings).
    if str(year) == '2014':
        # Stations appearing in both halves are kept once.
        station = pd.concat([load('station1'), load('station2')]).drop_duplicates()
        status = pd.concat([load('status1'), load('status2')])
        trip = pd.concat([load('trip1'), load('trip2')])
        weather = pd.concat([load('weather1'), load('weather2')])
    else:
        station = load('station')
        status = load('status')
        trip = load('trip')
        weather = load('weather')
    return station, status, trip, weather
In [6]:
# Load the 2016 tables (one CSV per table under <path>/2016/).
station16, status16, trip16, weather16 = read_data(path, 2016)
In [7]:
# Load the 2015 tables (one CSV per table under <path>/2015/).
station15, status15, trip15, weather15 = read_data(path, 2015)
In [8]:
# Load the 2014 tables; read_data concatenates the two files that exist per table for this year.
station14, status14, trip14, weather14 = read_data(path, 2014)
In [9]:
# Supplementary datasets stored directly under `path` (not inside a per-year folder):
# crime incidents, income, and Caltrain station coordinates.
crime = pd.read_csv(path + '/Crime_Incidents.csv')
income = pd.read_csv(path + '/Income.csv', header = 0)
calstations = pd.read_excel(path + '/caltrain_coordinates.xlsx')
#Source of crime data: https://data.sfgov.org/Public-Safety/Map-Crime-Incidents-from-1-Jan-2003/gxxq-x39z
#Source of income data: http://www.psc.isr.umich.edu/dis/census/Features/tract2zip/
#Also this can be useful: http://www.psc.isr.umich.edu/dis/data/kb/answer/1123
In [10]:
def format_df(status, trip, weather, year):
    """
    Takes status, trip and weather data frames and formats them for analysis. Also creates new
    dataframes for analysis.

    Input: status, trip and weather pandas dataframes; year as a string ('2014', '2015' or
           '2016'; an int is accepted too). The year only selects the name of the zip-code
           column in the weather data ('Zip' for 2014/2015, 'ZIP' for 2016).
    Output: a 15-tuple, in this order:
        status               - input frame with 'time' converted to datetime
        trip                 - formatted trips; adds 'Count', 'Day of Week', 'Date', 'Duration'
                               (seconds), 'Holiday', 'Hour'; 'Start Terminal' becomes 'station_id'
        weather              - weather indexed by Date (and zip code for the known years)
        total_rides          - rides per date
        weather_ave          - weather averaged per date
        weekend_rides        - rides per date, Saturdays and Sundays only
        hol_rides            - rides per date, US federal holidays only
        weekday_rides        - rides per day of week and subscriber type (Sunday first)
        total_weekday_rides  - rides per day of week
        min_dur_rides        - rides per 5-minute duration bin (up to 60) and subscriber type
        hr_dur_rides         - rides per 2-hour duration bin (up to 24) and subscriber type
        total_min_rides      - rides per 5-minute duration bin
        total_hr_rides       - rides per 2-hour duration bin
        rides_by_hour        - rides per start hour and subscriber type
        total_rides_by_hour  - rides per start hour
    NOTE: `status` and `trip` are modified in place before the returned copies are made.
    """
    year = str(year)

    status['time'] = pd.to_datetime(status.time, infer_datetime_format = True)

    # --- trips -----------------------------------------------------------------
    trip['Count'] = 1
    trip['Start Date'] = pd.to_datetime(trip['Start Date'], format='%m/%d/%Y %H:%M')
    trip['End Date'] = pd.to_datetime(trip['End Date'], format='%m/%d/%Y %H:%M')
    trip['Day of Week'] = trip['Start Date'].dt.strftime('%A')
    # Calendar date of the start (time of day set to midnight).
    trip['Date'] = trip['Start Date'].dt.normalize()
    # total_seconds() keeps whole days; Timedelta.seconds would drop them and report a
    # 25-hour trip as a 1-hour trip.
    trip['Duration'] = (trip['End Date'] - trip['Start Date']).dt.total_seconds().astype('int64')
    trip = trip.rename(columns={'Start Terminal' : 'station_id'})
    cal = calendar()
    holidays = cal.holidays(start=trip['Date'].min(), end=trip['Date'].max())
    trip['Holiday'] = trip['Date'].isin(holidays)
    trip['Hour'] = trip['Start Date'].dt.hour

    total_rides = trip[['Count', 'Date']].groupby(['Date'], as_index = False).sum()

    # --- weather ---------------------------------------------------------------
    weather = weather.rename(columns={'PDT': 'Date'})
    weather['Date'] = pd.to_datetime(weather['Date'], format='%m/%d/%Y')
    weather = weather.set_index('Date')
    zip_column = {'2014': 'Zip', '2015': 'Zip', '2016': 'ZIP'}.get(year)
    if zip_column is not None:
        weather = weather.set_index(zip_column, append = True)
    # 'T' entries in the precipitation column are replaced by 0 before the numeric conversion.
    weather['PrecipitationIn'] = pd.to_numeric(weather['PrecipitationIn'].replace('T', 0))
    weather_ave = weather.groupby(level = 'Date').mean()

    # --- rides per date: weekends and holidays -----------------------------------
    is_weekend = trip['Day of Week'].isin(['Saturday','Sunday'])
    weekend_rides = trip.loc[is_weekend, ['Count', 'Date']].groupby(['Date'], as_index = False).sum()
    hol_rides = trip.loc[trip['Holiday'], ['Count', 'Date']].groupby(['Date'], as_index = False).sum()

    # --- rides per day of week -------------------------------------------------
    weekday_rides = trip[['Count', 'Subscriber Type', 'Day of Week']]
    weekday_rides = weekday_rides.groupby(['Day of Week', 'Subscriber Type'], as_index = False).sum()
    sorter = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
    # A categorical with explicit categories makes sort_values follow the calendar order
    # in `sorter` instead of the alphabetical one.
    weekday_rides['Day of Week'] = pd.Categorical(weekday_rides['Day of Week'], categories=sorter)
    weekday_rides = weekday_rides.sort_values(['Day of Week'])
    # Only the counts are summed for the totals; the subscriber-type label is not an aggregate.
    total_weekday_rides = weekday_rides[['Day of Week', 'Count']].groupby(['Day of Week'], as_index = False).sum()

    # --- rides per duration bin ------------------------------------------------
    minute_bins = np.arange(0,61,5)   # bin edges 0, 5, ..., 60 minutes
    hour_bins = np.arange(0,25,2)     # bin edges 0, 2, ..., 24 hours
    by_minute = trip[['Count', 'Subscriber Type', 'Duration']].assign(Duration=trip['Duration']//60)
    by_hour = trip[['Count', 'Subscriber Type', 'Duration']].assign(Duration=trip['Duration']//3600)

    min_dur_rides = by_minute.groupby(['Duration', 'Subscriber Type'], as_index = False).sum()
    min_dur_rides = min_dur_rides.groupby([pd.cut(min_dur_rides['Duration'], minute_bins),'Subscriber Type']).sum()
    hr_dur_rides = by_hour.groupby(['Duration', 'Subscriber Type'], as_index = False).sum()
    hr_dur_rides = hr_dur_rides.groupby([pd.cut(hr_dur_rides['Duration'], hour_bins),'Subscriber Type']).sum()

    total_min_rides = by_minute[['Count', 'Duration']].groupby(['Duration'], as_index = False).sum()
    total_min_rides = total_min_rides.groupby(pd.cut(total_min_rides['Duration'], minute_bins)).sum()
    total_hr_rides = by_hour[['Count', 'Duration']].groupby(['Duration'], as_index = False).sum()
    total_hr_rides = total_hr_rides.groupby(pd.cut(total_hr_rides['Duration'], hour_bins)).sum()

    # --- rides per start hour --------------------------------------------------
    rides_by_hour = trip[['Count', 'Subscriber Type', 'Hour']]
    rides_by_hour = rides_by_hour.groupby(['Hour', 'Subscriber Type'], as_index = False).sum()
    total_rides_by_hour = rides_by_hour[['Hour', 'Count']].groupby(['Hour'], as_index = False).sum()

    return status, trip, weather, total_rides, weather_ave, weekend_rides, hol_rides, weekday_rides, total_weekday_rides, min_dur_rides, hr_dur_rides, total_min_rides, total_hr_rides, rides_by_hour, total_rides_by_hour
In [11]:
# Format the 2014 tables and build the per-date, per-weekday, per-duration and per-hour ride summaries.
status14, trip14, weather14, total_rides14, weather_ave14, weekend_rides14, hol_rides14, weekday_rides14, total_weekday_rides14, min_dur_rides14, hr_dur_rides14, total_min_rides14, total_hr_rides14, rides_by_hour14, total_rides_by_hour14 = format_df(status14, trip14, weather14, '2014')
In [12]:
# Format the 2015 tables and build the per-date, per-weekday, per-duration and per-hour ride summaries.
status15, trip15, weather15, total_rides15, weather_ave15, weekend_rides15, hol_rides15, weekday_rides15, total_weekday_rides15, min_dur_rides15, hr_dur_rides15, total_min_rides15, total_hr_rides15, rides_by_hour15, total_rides_by_hour15 = format_df(status15, trip15, weather15, '2015')
In [13]:
# Format the 2016 tables and build the per-date, per-weekday, per-duration and per-hour ride summaries.
status16, trip16, weather16, total_rides16, weather_ave16, weekend_rides16, hol_rides16, weekday_rides16, total_weekday_rides16, min_dur_rides16, hr_dur_rides16, total_min_rides16, total_hr_rides16, rides_by_hour16, total_rides_by_hour16 = format_df(status16, trip16, weather16, '2016')
In [14]:
# Keep only the 2016 station rows that have no missing value in any column.
station16 = station16.dropna()
In [15]:
# Report (rows, columns) of each 2016 table. The parenthesised single-argument form
# prints the same text under the Python 2 print statement and the Python 3 function.
print('Size of station DF {}'.format(station16.shape))
print('Size of status DF {}'.format(status16.shape))
print('Size of trip DF {}'.format(trip16.shape))
print('Size of weather DF {}'.format(weather16.shape))
In [16]:
# Attach the station attributes to every 2016 trip via the trip's start station id.
merged_st = station16.merge(trip16, on = 'station_id')
In [17]:
# Total riding time in seconds over all merged 2016 trips, reported in several units.
total_duration = merged_st.Duration.sum()
print('number of bike rides between 09/01/2015 and 08/31/2016 : {}'.format(len(merged_st['Trip ID'])))
print('averaging about {} minutes per ride'.format(total_duration/(len(merged_st['Trip ID']) *60)))
print('total riding time is {} hours'.format(total_duration/3600))
print('or {} days'.format(total_duration/(3600 * 24)))
print('or {} years'.format(total_duration / (3600 * 24 * 365)))
In [18]:
def weather_plot(x,y1,y2, y2_label, file_name, title):
    """
    Draws two line series over a shared x axis, each with its own y axis
    Input: x is the data for the x axis (the dates);
           y1 is the data for the left y-axis, labelled 'Total Rides';
           y2 is the data for the right y-axis;
           (y1 and y2 are expected to hold one entry per date, sorted by date)
           y2_label is a string naming the second series and the right axis
           file_name: string of what to name file in plotly account
           title: string, title of the graph
    Output: a graph
    """
    accent = 'rgb(148, 103, 189)'
    rides_trace = go.Scatter(x=x, y=y1, name='Total Rides')
    # yaxis='y2' binds the second series to the right-hand axis defined below.
    second_trace = go.Scatter(x=x, y=y2, name=y2_label, yaxis='y2')
    right_axis = dict(title=y2_label,
                      titlefont=dict(color=accent),
                      tickfont=dict(color=accent),
                      overlaying='y',
                      side='right')
    layout = go.Layout(title=title, yaxis=dict(title='Total Rides'), yaxis2=right_axis)
    fig = go.Figure(data=[rides_trace, second_trace], layout=layout)
    return py.iplot(fig, filename= file_name)
In [19]:
# 2014: daily ride totals against the per-date mean wind speed.
# The weather column name really does start with a space.
weather_plot(total_rides14['Date'], total_rides14['Count'], weather_ave14[' Mean Wind SpeedMPH'],
'Mean Wind Speed MPH', 'Wind and Trips 2014', 'Total Rides and Mean Wind Speed by Date 2014')
Out[19]:
In [20]:
# 2015: daily ride totals against the per-date mean wind speed.
# The weather column name really does start with a space.
weather_plot(total_rides15['Date'], total_rides15['Count'], weather_ave15[' Mean Wind SpeedMPH'],
'Mean Wind Speed MPH', 'Wind and Trips 2015', 'Total Rides and Mean Wind Speed by Date 2015')
Out[20]:
In [21]:
# 2016: daily ride totals against the per-date mean wind speed.
# The weather column name really does start with a space.
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16[' Mean Wind SpeedMPH'],
'Mean Wind Speed MPH', 'Wind and Trips 2016', 'Total Rides and Mean Wind Speed by Date 2016')
Out[21]:
In [22]:
# 2014: daily ride totals against the per-date mean temperature.
weather_plot(total_rides14['Date'],total_rides14['Count'], weather_ave14['Mean TemperatureF'],
'Mean Temperature F', 'Temp and Trips 2014', 'Total Rides and Mean Temperature by Date 2014')
Out[22]:
In [23]:
# 2015: daily ride totals against the per-date mean temperature.
weather_plot(total_rides15['Date'],total_rides15['Count'], weather_ave15['Mean TemperatureF'],
'Mean Temperature F', 'Temp and Trips 2015', 'Total Rides and Mean Temperature by Date 2015')
Out[23]:
In [24]:
# 2016: daily ride totals against the per-date mean temperature.
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16['Mean TemperatureF'],
'Mean Temperature F', 'Temp and Trips 2016', 'Total Rides and Mean Temperature by Date 2016')
Out[24]:
In [25]:
# 2014: daily ride totals against the per-date average precipitation.
weather_plot(total_rides14['Date'],total_rides14['Count'], weather_ave14['PrecipitationIn'], 'Precipitation In',
'Precip and Trips 2014', 'Total Rides and Precipitation by Date 2014')
Out[25]:
In [26]:
# 2015: daily ride totals against the per-date average precipitation.
weather_plot(total_rides15['Date'],total_rides15['Count'], weather_ave15['PrecipitationIn'], 'Precipitation In',
'Precip and Trips 2015',
'Total Rides and Precipitation by Date 2015')
Out[26]:
In [27]:
# 2016: daily ride totals against the per-date average precipitation.
weather_plot(total_rides16['Date'],total_rides16['Count'], weather_ave16['PrecipitationIn'], 'Precipitation In',
'Precip and Trips 2016',
'Total Rides and Precipitation by Date 2016')
Out[27]:
In [28]:
def weekend_hol_plot(x1,x2,y1,y2, y2_label, file_name, title):
    """
    Plots total rides as a line together with a second series drawn as bars
    input: x1, y1: x and y data of the 'Total Rides' line;
           x2, y2: x and y data of the bars;
           y2_label: string, legend name of the bars (also the title of yaxis2);
           file_name: string of what to name file in plotly account;
           title: string, title of the graph
    output: a graph
    NOTE: the layout declares a right-hand yaxis2, but neither trace is attached to it,
          so both series are drawn against the left 'Total Rides' axis.
    """
    accent = 'rgb(148, 103, 189)'
    line_trace = go.Scatter(x=x1, y=y1, name='Total Rides')
    bar_trace = go.Bar(x=x2, y=y2, name=y2_label)
    right_axis = dict(title=y2_label,
                      titlefont=dict(color=accent),
                      tickfont=dict(color=accent),
                      overlaying='y',
                      side='right')
    layout = go.Layout(title=title, yaxis=dict(title='Total Rides'), yaxis2=right_axis)
    fig = go.Figure(data=[line_trace, bar_trace], layout=layout)
    return py.iplot(fig, filename= file_name)
In [29]:
# 2014: daily ride totals (line) with the rides that fell on US federal holidays (bars).
weekend_hol_plot(total_rides14['Date'],hol_rides14['Date'],total_rides14['Count'],hol_rides14['Count'],'Holidays',
'Holidays and Trips 2014', 'Holidays and Trips 2014')
Out[29]:
In [30]:
# 2015: daily ride totals (line) with the rides that fell on US federal holidays (bars).
weekend_hol_plot(total_rides15['Date'],hol_rides15['Date'],total_rides15['Count'],hol_rides15['Count'],'Holidays',
'Holidays and Trips 2015', 'Holidays and Trips 2015')
Out[30]:
In [31]:
# 2016: daily ride totals (line) with the rides that fell on US federal holidays (bars).
weekend_hol_plot(total_rides16['Date'],hol_rides16['Date'],total_rides16['Count'],hol_rides16['Count'],'Holidays',
'Holidays and Trips 2016', 'Holidays and Trips 2016')
Out[31]:
In [32]:
# 2014: daily ride totals (line) with the weekend rides (bars).
# The bar heights must come from the 2014 frame so that they match the 2014 weekend dates.
weekend_hol_plot(total_rides14['Date'],weekend_rides14['Date'],total_rides14['Count'],weekend_rides14['Count'],
'Weekends','Weekends and Trips 2014', 'Weekends and Trips 2014')
Out[32]:
In [33]:
# 2015: daily ride totals (line) with the weekend rides (bars).
weekend_hol_plot(total_rides15['Date'],weekend_rides15['Date'],total_rides15['Count'],weekend_rides15['Count'],
'Weekends','Weekends and Trips 2015', 'Weekends and Trips 2015')
Out[33]:
In [34]:
# 2016: daily ride totals (line) with the weekend rides (bars).
weekend_hol_plot(total_rides16['Date'],weekend_rides16['Date'],total_rides16['Count'],weekend_rides16['Count'],
'Weekends','Weekends and Trips 2016', 'Weekends and Trips 2016')
Out[34]:
In [35]:
def grouped_bar(x1, y1, x2, y2, x3, y3, file_name, title, x_title):
    """
    Draws a grouped bar chart with three bar series named 'Total', 'Customer' and 'Subscriber'.
    Input: x1, y1: data of the 'Total' bars; x2, y2: data of the 'Customer' bars;
           x3, y3: data of the 'Subscriber' bars (the x data of the three series should align);
           file_name: what to name the plotly file, string; title: title of the graph;
           x_title: label for the x-axis
    Output: a grouped bar chart
    """
    series = [('Total', x1, y1), ('Customer', x2, y2), ('Subscriber', x3, y3)]
    bars = [go.Bar(x=xs, y=ys, name=label) for label, xs, ys in series]
    layout = go.Layout(title=title,
                       xaxis=dict(title=x_title),
                       yaxis=dict(title='Total Rides'),
                       barmode='group')
    return py.iplot(go.Figure(data=bars, layout=layout), filename=file_name)
In [36]:
# 2014: rides per day of week - all riders, customers only, subscribers only.
grouped_bar(total_weekday_rides14['Day of Week'],total_weekday_rides14['Count'],
weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Customer']['Day of Week'],
weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Customer']['Count'],
weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Subscriber']['Day of Week'],
weekday_rides14.loc[weekday_rides14['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2014',
'Total Rides by Day of Week 2014', 'Day of Week')
Out[36]:
In [37]:
# 2015: rides per day of week - all riders, customers only, subscribers only.
grouped_bar(total_weekday_rides15['Day of Week'],total_weekday_rides15['Count'],
weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Customer']['Day of Week'],
weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Customer']['Count'],
weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Subscriber']['Day of Week'],
weekday_rides15.loc[weekday_rides15['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2015',
'Total Rides by Day of Week 2015', 'Day of Week')
Out[37]:
In [38]:
# 2016: rides per day of week - all riders, customers only, subscribers only.
grouped_bar(total_weekday_rides16['Day of Week'],total_weekday_rides16['Count'],
weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Customer']['Day of Week'],
weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Customer']['Count'],
weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Subscriber']['Day of Week'],
weekday_rides16.loc[weekday_rides16['Subscriber Type']=='Subscriber']['Count'], 'Day of Week Plot 2016',
'Total Rides by Day of Week 2016', 'Day of Week')
Out[38]:
In [39]:
# 2014: rides per 5-minute duration bin - all riders, customers only, subscribers only.
# All three series reuse the bins of the totals frame as their x values.
grouped_bar(total_min_rides14.index.get_level_values('Duration'),total_min_rides14['Count'],
total_min_rides14.index.get_level_values('Duration'),
min_dur_rides14.loc[(min_dur_rides14.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
total_min_rides14.index.get_level_values('Duration'),
min_dur_rides14.loc[(min_dur_rides14.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'],
'Duration by Minute Plot 2014', 'Ride Duration by Minute 2014', 'Minutes')
Out[39]:
In [40]:
# 2015: rides per 5-minute duration bin - all riders, customers only, subscribers only.
# All three series reuse the bins of the totals frame as their x values.
grouped_bar(total_min_rides15.index.get_level_values('Duration'),total_min_rides15['Count'],
total_min_rides15.index.get_level_values('Duration'),
min_dur_rides15.loc[(min_dur_rides15.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
total_min_rides15.index.get_level_values('Duration'),
min_dur_rides15.loc[(min_dur_rides15.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'],
'Duration by Minute Plot 2015', 'Ride Duration by Minute 2015', 'Minutes')
Out[40]:
In [41]:
# 2016: rides per 5-minute duration bin - all riders, customers only, subscribers only.
# All three series reuse the bins of the totals frame as their x values.
grouped_bar(total_min_rides16.index.get_level_values('Duration'),total_min_rides16['Count'],
total_min_rides16.index.get_level_values('Duration'),
min_dur_rides16.loc[(min_dur_rides16.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
total_min_rides16.index.get_level_values('Duration'),
min_dur_rides16.loc[(min_dur_rides16.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'],
'Duration by Minute Plot 2016', 'Ride Duration by Minute 2016', 'Minutes')
Out[41]:
In [42]:
# 2014: rides per 2-hour duration bin - all riders, customers only, subscribers only.
# All three series reuse the bins of the totals frame as their x values.
grouped_bar(total_hr_rides14.index.get_level_values('Duration'),total_hr_rides14['Count'],
total_hr_rides14.index.get_level_values('Duration'),
hr_dur_rides14.loc[(hr_dur_rides14.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
total_hr_rides14.index.get_level_values('Duration'),
hr_dur_rides14.loc[(hr_dur_rides14.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'],
'Duration by Hour Plot 2014', 'Ride Duration by Hour 2014', 'Hours')
Out[42]:
In [43]:
# 2015: rides per 2-hour duration bin - all riders, customers only, subscribers only.
# All three series reuse the bins of the totals frame as their x values.
grouped_bar(total_hr_rides15.index.get_level_values('Duration'),total_hr_rides15['Count'],
total_hr_rides15.index.get_level_values('Duration'),
hr_dur_rides15.loc[(hr_dur_rides15.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
total_hr_rides15.index.get_level_values('Duration'),
hr_dur_rides15.loc[(hr_dur_rides15.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'],
'Duration by Hour Plot 2015', 'Ride Duration by Hour 2015', 'Hours')
Out[43]:
In [44]:
# 2016: rides per 2-hour duration bin - all riders, customers only, subscribers only.
# All three series reuse the bins of the totals frame as their x values.
grouped_bar(total_hr_rides16.index.get_level_values('Duration'),total_hr_rides16['Count'],
total_hr_rides16.index.get_level_values('Duration'),
hr_dur_rides16.loc[(hr_dur_rides16.index.get_level_values('Subscriber Type') == 'Customer')]['Count'],
total_hr_rides16.index.get_level_values('Duration'),
hr_dur_rides16.loc[(hr_dur_rides16.index.get_level_values('Subscriber Type') == 'Subscriber')]['Count'],
'Duration by Hour Plot 2016', 'Ride Duration by Hour 2016', 'Hours')
Out[44]:
In [45]:
# 2014: rides per start hour of the day - all riders, customers only, subscribers only.
# All three series reuse the hours of the totals frame as their x values.
grouped_bar(total_rides_by_hour14['Hour'],total_rides_by_hour14['Count'],
total_rides_by_hour14['Hour'],
rides_by_hour14.loc[rides_by_hour14['Subscriber Type'] == 'Customer']['Count'],
total_rides_by_hour14['Hour'],
rides_by_hour14.loc[rides_by_hour14['Subscriber Type'] == 'Subscriber']['Count'],
'Rides by Hour Plot 2014', 'Total Rides Per Hour 2014', 'Hour of Day')
Out[45]:
In [46]:
# 2015: rides per start hour of the day - all riders, customers only, subscribers only.
# All three series reuse the hours of the totals frame as their x values.
grouped_bar(total_rides_by_hour15['Hour'],total_rides_by_hour15['Count'],
total_rides_by_hour15['Hour'],
rides_by_hour15.loc[rides_by_hour15['Subscriber Type'] == 'Customer']['Count'],
total_rides_by_hour15['Hour'],
rides_by_hour15.loc[rides_by_hour15['Subscriber Type'] == 'Subscriber']['Count'],
'Rides by Hour Plot 2015', 'Total Rides Per Hour 2015', 'Hour of Day')
Out[46]:
In [47]:
# 2016: rides per start hour of the day - all riders, customers only, subscribers only.
# All three series reuse the hours of the totals frame as their x values.
grouped_bar(total_rides_by_hour16['Hour'],total_rides_by_hour16['Count'],
total_rides_by_hour16['Hour'],
rides_by_hour16.loc[rides_by_hour16['Subscriber Type'] == 'Customer']['Count'],
total_rides_by_hour16['Hour'],
rides_by_hour16.loc[rides_by_hour16['Subscriber Type'] == 'Subscriber']['Count'],
'Rides by Hour Plot 2016', 'Total Rides Per Hour 2016', 'Hour of Day')
Out[47]:
In [48]:
# Share of trips per rider type.
# value_counts() orders its result by frequency, so the counts are looked up by label;
# positional indexing would swap the pie labels whenever customers outnumber subscribers.
totals = merged_st['Subscriber Type'].value_counts()
Subscribers, Customers = totals.get('Subscriber', 0), totals.get('Customer', 0)
fig = {
    'data': [{'labels': ['Subscribers', 'Customers'],
              'values': [Subscribers, Customers],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by customer types'}
}
py.iplot(fig)
Out[48]:
In [49]:
# Share of trips per city of the start station.
# The counts are looked up by city name rather than by rank, so each slice keeps the
# right label even if the ranking of the cities changes.
by_cities = merged_st.landmark.value_counts()
sf, sj, mv, pa = [by_cities.get(city, 0)
                  for city in ('San Francisco', 'San Jose', 'Mountain View', 'Palo Alto')]
fig_cities = {
    'data': [{'labels': ['San Francisco', 'San Jose', 'Mountain View', 'Palo Alto'],
              'values': [sf, sj, mv, pa],
              'type': 'pie'}],
    'layout': {'title': 'Trip counts by cities'}
}
py.iplot(fig_cities)
Out[49]:
In [50]:
# Number of trips that started at each station, most used station first.
# groupby().size() with an explicit column name does not depend on how a given pandas
# version names the columns produced by value_counts().reset_index().
trip_counts = (merged_st.groupby('station_id').size()
               .reset_index(name='trip_counts')
               .sort_values('trip_counts', ascending=False)
               .reset_index(drop=True))
# Add the trip count to the station attributes.
station_merged = pd.merge(station16, trip_counts, on = 'station_id')
In [51]:
# Total ride duration per start station for 2016, longest total first.
start_stations = trip16.loc[:, ['Start Station', 'Duration']]
grouped = start_stations.groupby('Start Station').sum().sort_values(by = 'Duration', ascending = False)
In [52]:
merged_st.name.value_counts().head(5) #most popular five starting stations
Out[52]:
In [53]:
# Give the Caltrain stations the same columns as the bike stations so both can be
# stacked into one frame for the map; the bike-only fields are filled with 0.
calstations = (calstations
               .rename(columns={'caltrain_name' : 'name'})
               .assign(station_type='caltrain', dockcount=0, trip_counts=0,
                       station_id=0, installation=0))
calstations = calstations[['installation','station_id','name', 'lat', 'long', 'dockcount', 'trip_counts', 'station_type', 'passengers']]
In [54]:
# Take an explicit copy so the new columns are added to an independent frame rather
# than to a slice of station_merged (which triggers SettingWithCopyWarning).
df_for_map = station_merged[['installation','station_id','name', 'lat', 'long', 'dockcount', 'trip_counts']].copy()
df_for_map['station_type'] = 'bike_station'
df_for_map['passengers'] = 0
# Stack the Caltrain rows under the bike stations and renumber the rows 0..n-1.
df_for_map = pd.concat([df_for_map, calstations], ignore_index=True)
In [55]:
# The second row of calstations is the reference point that create_linestring draws its lines to.
start_lat, start_long, start_name = calstations.iloc[1][['lat', 'long', 'name']]
In [56]:
def parser(df):
    """
    Lazily converts a pandas DF into one dictionary per row
    input: a dataframe
    output: a generator that yields, for each row in order, a dict mapping
            column name -> value of that row
    NOTE: this function shadows the `parser` module imported at the top of the notebook.
    """
    fields = df.columns
    # range (instead of the Python-2-only xrange) works on both Python 2 and Python 3.
    return (dict(zip(fields, df.iloc[i])) for i in range(len(df)))
def create_map(df):
    """
    Writes the stations of the given data frame to 'bikecaltrain.geojson' as point features
    input: a data frame with the columns name, lat, long, station_type, and
           - for rows whose station_type is 'bike_station': installation, station_id,
             dockcount, trip_counts
           - for all other rows (drawn as rail stations): passengers
    output: a message: "'bikecaltrain.geojson' file has been created, check out your cd"
    """
    def is_zero(value):
        # True for 0, 0.0 and their string forms; anything non-numeric is not zero.
        try:
            return float(value) == 0
        except (TypeError, ValueError):
            return False

    features = []
    for index, line in enumerate(parser(df)):
        #Skip any zero coordinates as this will throw off our map
        if is_zero(line['long']) or is_zero(line['lat']):
            continue
        if line['station_type'] == 'bike_station':
            properties = {'name': line['name'],
                          'dockcount': line['dockcount'],
                          'trip_counts' : line['trip_counts'],
                          'marker-size': 'large',
                          'marker-symbol': 'bicycle'
                          }
            # installation is split on '/' and its third part is used as the year
            year = line['installation'].split('/')[2]
            if line['station_id'] == 70 or line['station_id'] == 69:
                # stations 69 and 70 get their own highlight colour
                # (presumably the ones next to the Caltrain terminal - TODO confirm)
                properties['marker-color'] = '#00cc00'
            elif year == '2013':
                properties['marker-color'] = '#0e2f44'
                properties['fill-opacity'] = 0.3
            # stations installed in any other year keep the default marker colour
        else:
            properties = {'name': line['name'],
                          'passengers': line['passengers'],
                          'marker-color':'#a11f27',
                          'marker-size': 'large',
                          'marker-symbol': 'rail'
                          }
        features.append({'type': 'Feature',
                         'id': index,
                         'properties': properties,
                         'geometry': {'type': 'Point',
                                      'coordinates': (line['long'], line['lat'])}
                         })
    # 'features' is always present, so an empty input still gives a valid collection
    geo_map = {'type': 'FeatureCollection', 'features': features}
    #Now that we've build up our geo_map dictionary, let's save it as geojson file
    with open('bikecaltrain.geojson', 'w') as f:
        f.write(dumps(geo_map))
    return "'bikecaltrain.geojson' file has been created, check out your cd"
def create_linestring(df):
    """
    Writes one line per station of the given data frame to 'lines.geojson'; every line
    connects the station to the reference point (start_long, start_lat)
    input: a data frame with the columns name, dockcount, lat and long
    output: a message: "'lines.geojson' file has been created, check out your cd"
    NOTE: start_long and start_lat should be available in global env.
    """
    def is_zero(value):
        # True for 0, 0.0 and their string forms; anything non-numeric is not zero.
        try:
            return float(value) == 0
        except (TypeError, ValueError):
            return False

    features = []
    for index, line in enumerate(parser(df)):
        #Skip any zero coordinates as this will throw off our map
        if is_zero(line['long']) or is_zero(line['lat']):
            continue
        features.append({'type': 'Feature',
                         'id': index,
                         'properties': {'name': line['name'],
                                        'dockcount': line['dockcount'],
                                        'marker-color':'#a11f27',
                                        'marker-size': 'large'
                                        },
                         'geometry': {'type': 'LineString',
                                      'coordinates': ([line['long'], line['lat']], [start_long, start_lat])}
                         })
    # 'features' is always present, so an empty input still gives a valid collection
    geo_map = {'type': 'FeatureCollection', 'features': features}
    #Now that we've build up our geo_map dictionary, let's save it as geojson file
    with open('lines.geojson', 'w') as f:
        f.write(dumps(geo_map))
    return "'lines.geojson' file has been created, check out your cd"
In [57]:
# Write the bike and Caltrain stations to 'bikecaltrain.geojson' in the working directory.
create_map(df_for_map)
Out[57]:
In [58]:
# Write the lines from every 2016 bike station to the reference point to 'lines.geojson'.
create_linestring(station16)
Out[58]:
In [59]:
def status_for_city(station, status, city = 'San Francisco'):
    """
    keeps the status records of the stations located in one city
    input: station and status dataframes, city (default = 'San Francisco')
    output: the rows of status whose station_id belongs to a station
            whose landmark equals the specified city
    """
    in_city = station.landmark == city
    ids_in_city = station.loc[in_city, 'station_id']
    return status.loc[status.station_id.isin(ids_in_city)]
def groupby_term_mean(status, term):
    """
    returns dataframe grouped by term and average values
    input: status data frame (its 'time' column must hold datetimes),
           term ('month', 'hour' or 'weekday')
    output: dataframe with one row per hour ('00'..'23'), month ('01'..'12') or weekday
            name, holding the average values of the other variables; for 'weekday' the
            rows are ordered Monday..Sunday and carry that rank (1..7) in a 'Sorter' column
    NOTE: the input frame is not modified.
    """
    assert term in ['month', 'hour', 'weekday'], 'term should be either "month", "hour" or "weekday".'
    stamps = status.time.dt
    if term == 'hour':
        keys = stamps.strftime('%H')
    elif term == 'month':
        keys = stamps.strftime('%m')
    elif hasattr(stamps, 'day_name'):
        keys = stamps.day_name()
    else:
        # older pandas versions only offer the weekday_name attribute
        keys = stamps.weekday_name
    # Group by a separate key series instead of writing a helper column into `status`;
    # a leftover column with the same name would collide with the group key, so drop it.
    keys.name = term
    data = status.drop(term, axis=1) if term in status.columns else status
    grouped = data.groupby(keys).mean().reset_index()
    if term == 'weekday':
        # Rank by weekday name, so this also works when some weekdays are absent.
        day_rank = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
                    'Friday': 5, 'Saturday': 6, 'Sunday': 7}
        grouped['Sorter'] = grouped['weekday'].map(day_rank)
        grouped = grouped.sort_values(by = 'Sorter')
    return grouped
def plot_status(grouped, time_format, city = None, station_name = None):
    """
    returns a plot of average bike and dock availability
    input: grouped: dataframe grouped by hour, month or weekday; it must have the columns
                    bikes_available, docks_available and one named like time_format
           time_format: 'month', 'hour' or 'weekday'
           city: city named in the title (default is None)
           station_name: station named in the title; takes precedence over city
                    (default is None)
    output: a plot of average bike and dock availability
    NOTE: the plotly file name is built from city and time_format only, so a plot for a
          single station is stored under the same name as the plot for its city.
    """
    import plotly.plotly as py
    import plotly.graph_objs as go
    assert time_format in ['month', 'hour', 'weekday'], 'time format should be either "month", "hour", or "weekday".'
    # time_format -> (title prefix, x-axis label); looked up by value, as `is` on strings
    # compares object identity and only works when the interpreter happens to reuse the object
    labels = {'hour': ('Daily', 'Hour'),
              'month': ('Yearly', 'Month'),
              'weekday': ('Weekly', 'Week Days')}
    title_time, x_axis_name = labels[time_format]
    time = grouped[time_format]
    bikes_available = grouped.bikes_available
    docks_available = grouped.docks_available
    # Create and style traces
    trace0 = go.Scatter(
        x = time,
        y = bikes_available,
        name = 'Bikes Available',
        line = dict(
            color = ('rgb(205, 12, 24)'),
            width = 4,
            dash = 'dash')
    )
    trace1 = go.Scatter(
        x = time,
        y = docks_available,
        name = 'Docks Available',
        line = dict(
            color = ('rgb(22, 96, 167)'),
            width = 4,
            dash = 'dash')
    )
    data = [trace0, trace1]
    if station_name:
        title_name = '{} Average Counts of Available Bikes and Docks in {}'.format(title_time, station_name)
    elif city:
        title_name = '{} Average Counts of Available Bikes and Docks in {}'.format(title_time, city)
    else:
        title_name = '{} Average Counts of Available Bikes and Docks across all 4 cities'.format(title_time)
    # Edit the layout
    layout = dict(title = title_name,
                  xaxis = dict(title = x_axis_name),
                  yaxis = dict(title = 'Counts'),
                  )
    fig = dict(data=data, layout=layout)
    file_name = 'plot for {} by {}'.format(city, time_format)
    return py.iplot(fig, filename=file_name)
def plot_summary_for_city(city, term, station_id = None):
    """
    returns plot summary for the given city and term
    input: city name, term ('hour', 'month', 'weekday'),
           station_id (default = None): if given, only this station of the city is
           summarised and its name is used in the plot title
    output: a figure
    NOTE: station16 and status16 should be available in global env.
    """
    assert city in ['San Francisco', 'Mountain View', 'Palo Alto', 'San Jose'], 'unavailable city'
    assert term in ['hour', 'month', 'weekday'], 'term should be either "hour", "month", or "weekday".'
    # Check the first element by position; pd.Timestamp is the public name of the type.
    assert isinstance(status16['time'].iloc[0], pd.Timestamp), 'status["time"] type error. convert it time series.'
    city_status = status_for_city(station16, status16, city)
    station_name = None
    if station_id:
        assert station_id in station16.station_id.values, 'invalid station id'
        city_status = city_status[city_status.station_id == station_id]
        # look the name up in station16, the frame the id was validated against
        station_name = station16.name[station16.station_id == station_id].values[0]
    grouped = groupby_term_mean(city_status, term)
    return plot_status(grouped, term, city, station_name)
In [60]:
# Average bikes and docks available in San Francisco by hour of day (2016 data).
plot_summary_for_city('San Francisco', 'hour')
Out[60]:
In [61]:
# Average bikes and docks available in San Jose by hour of day (2016 data).
plot_summary_for_city('San Jose', 'hour')
Out[61]:
In [62]:
# Average bikes and docks available in San Francisco by day of week (2016 data).
plot_summary_for_city('San Francisco', 'weekday')
Out[62]:
In [63]:
# Average bikes and docks available in San Jose by day of week (2016 data).
plot_summary_for_city('San Jose', 'weekday')
Out[63]:
In [64]:
# Average bikes and docks available in San Francisco by month (2016 data).
plot_summary_for_city('San Francisco', 'month')
Out[64]:
In [65]:
# Average bikes and docks available in San Jose by month (2016 data).
plot_summary_for_city('San Jose', 'month')
Out[65]:
In [66]:
#plot_summary_for_city('Mountain View', 'hour')
#plot_summary_for_city('Mountain View', 'weekday')
#plot_summary_for_city('Mountain View', 'month')
In [67]:
#plot_summary_for_city('Palo Alto', 'hour')
#plot_summary_for_city('Palo Alto', 'weekday')
#plot_summary_for_city('Palo Alto', 'month')
In [68]:
def convert_df(station_in, status_in, term, station_id = None):
    """
    creates a DF which is grouped by term and holds the average
    bikes and docks available for one station if specified, otherwise
    the average values over all San Francisco stations
    input: station_in, status_in, term ('hour', 'month' or 'weekday'),
           station_id (default = None)
    output: the grouped DF and the station name (None when no station_id is given)
    """
    # status_for_city is called with its default city, i.e. San Francisco
    selected = status_for_city(station_in, status_in)
    station_name = None
    if station_id:
        assert station_id in station_in.station_id.values, 'invalid station id'
        selected = selected[selected.station_id == station_id]
        is_station = station_in.station_id == station_id
        station_name = station_in.name[is_station].values[0]
    return groupby_term_mean(selected, term), station_name
In [69]:
def compares_years(term, station_id = None):
    """
    compares average available bike and dock counts
    for three years per term for San Francisco,
    if station_id is specified compares for that station.
    input: term ('hour', 'month', or 'weekday'), station_id (default = None)
    output: an interactive figure
    """
    # Group every year with the same helper. The name returned for the last
    # year processed (2016) is the one used in the title.
    yearly_inputs = [
        ('2014', station14, status14),
        ('2015', station15, status15),
        ('2016', station16, status16),
    ]
    grouped_by_year = {}
    station_name = None
    for year, station_df, status_df in yearly_inputs:
        grouped_by_year[year], station_name = convert_df(station_df, status_df, term, station_id)

    if station_name:
        title = 'Average Available Bike and Dock Counts for {}'.format(station_name)
    else:
        title = 'Average Available Bike and Dock Counts for San Francisco'

    # (year, kind, column, colour) in display order. The dropdown masks below
    # are derived from this list, so traces and buttons cannot drift apart.
    trace_specs = [
        ('2015', 'Bikes', 'bikes_available', '#FFD700'),
        ('2015', 'Docks', 'docks_available', '#C0C0C0'),
        ('2016', 'Bikes', 'bikes_available', '#BA8651'),
        ('2016', 'Docks', 'docks_available', '#000000'),
        ('2014', 'Bikes', 'bikes_available', '#b9f442'),
        ('2014', 'Docks', 'docks_available', '#416bf4'),
    ]
    traces = []
    for year, kind, column, color in trace_specs:
        grouped = grouped_by_year[year]
        traces.append(Scatter(
            # Each year is plotted against its own term values, so a year with
            # missing groups is no longer paired with another year's x values.
            x=grouped[term], y=grouped[column].values,
            line=Line(
                color=color,
                width=3
            ),
            name='{}: {}'.format(year, kind)
        ))
    data = Data(traces)

    # A button shows a trace when its label matches the trace's year or kind.
    buttons = []
    for label in ['All', '2015', '2016', '2014', 'Bikes', 'Docks']:
        visible = [label == 'All' or label in (year, kind)
                   for year, kind, _, _ in trace_specs]
        buttons.append(dict(
            args=['visible', visible],
            label=label,
            method='restyle'
        ))

    layout = Layout(
        title=title,
        updatemenus=[
            dict(
                x=-0.05,
                y=1,
                yanchor='top',
                buttons=buttons,
            )
        ],
    )
    fig = Figure(data=data, layout=layout)
    return py.iplot(fig)
In [70]:
# Year-over-year comparison for station id 70, grouped by hour.
compares_years('hour', 70)
Out[70]:
In [71]:
# Same station, grouped by weekday.
compares_years('weekday', 70)
Out[71]:
In [72]:
def route_countFunc(trip_df, station_df):
    """
    Summarise trips per (start station, end station) pair, keeping only trips
    whose start and end stations share a landmark.
    @args:
        trip_df: trip dataframe with 'Trip ID', 'Duration', 'station_id' (start) and 'End Terminal'
        station_df: station dataframe with 'name', 'station_id' and 'landmark'
    @returns:
        routes_count: one row per route with 'Start Station', 'End Station', 'Landmark',
        'Counts' (number of trips) and 'Duration (min)' (mean duration / 60, 2 decimals)
    """
    station_info = station_df[['name', 'station_id', 'landmark']]
    trip_info = trip_df[['Trip ID', 'Duration', 'station_id', 'End Terminal']]

    # Look up the name and landmark of the start station
    with_start = (
        trip_info
        .merge(station_info, how='left', left_on='station_id', right_on='station_id')
        .rename(columns={'landmark': 'start_landmark', 'name': 'Start Station'})
        .drop('station_id', axis=1)
    )

    # Look up the name and landmark of the end station
    with_both = (
        with_start
        .merge(station_info, how='left', left_on='End Terminal', right_on='station_id')
        .rename(columns={'landmark': 'end_landmark', 'name': 'End Station'})
        .drop('station_id', axis=1)
    )

    # Keep trips that start and end in the same landmark
    within_landmark = with_both[with_both['start_landmark'] == with_both['end_landmark']]

    # Trip count and mean duration for every route
    summary = (
        within_landmark
        .groupby(['Start Station', 'End Station', 'start_landmark'])
        .agg({'Trip ID': 'count', 'Duration': 'mean'})
        .reset_index()
        .rename(columns={'Trip ID': 'Counts'})
    )
    summary['Duration'] = summary['Duration'].apply(lambda secs: round((secs/60), 2))

    return summary.rename(columns={'start_landmark': 'Landmark', 'Duration': 'Duration (min)'})
In [73]:
def route_heatmap(trip_df, station_df, landmark):
    """
    This function receives the trip and station dataframes for a specific year
    and plots the trip heatmap for a given landmark
    @args:
        trip_df: trip dataframe
        station_df: station dataframe
        landmark: Could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
    @returns:
        Heatmap graph (rows / y axis = start stations, columns / x axis = end stations)
    """
    routes = route_countFunc(trip_df, station_df)
    df_routes = routes[routes.Landmark == landmark]
    start_station = df_routes['Start Station'].sort_values().unique()
    end_station = df_routes['End Station'].sort_values().unique()

    # One pass over the routes builds a (start, end) -> (count, duration) table,
    # instead of filtering the dataframe twice for every cell of the grid.
    route_stats = dict(
        ((start, end), (count, duration))
        for start, end, count, duration in zip(df_routes['Start Station'].values,
                                               df_routes['End Station'].values,
                                               df_routes['Counts'].values,
                                               df_routes['Duration (min)'].values))

    #create z values for heatmap and text for hover text in the map
    z = []
    trip_text = []
    for start in start_station:
        new_row = []
        text_row = []
        for end in end_station:
            # Station pairs with no recorded trip are shown as zero
            trip_count, average_duration = route_stats.get((start, end), (0, 0))
            text = 'Start Station: {}<br>End Station: {}<br>Trip Counts: {}<br>Average Duration: {} Mins'.format(start, end, trip_count, average_duration)
            text_row.append(text)
            new_row.append(trip_count)
        z.append(new_row)
        trip_text.append(text_row)

    #Plot Heatmap
    # z[i][j] is drawn at (x[j], y[i]). Each row of z is a start station and
    # each column an end station, so y carries the start stations and x the
    # end stations, matching the axis titles below and the hover text.
    heatmap = go.Heatmap(x = end_station,
                         y = start_station,
                         z = z,
                         hoverinfo = 'text',
                         text = trip_text,
                         colorscale='YIGnBu',
                         reversescale=True)
    data = go.Data([heatmap])
    layout = go.Layout(
        title=landmark.upper() + ' TRIP COUNTS',
        titlefont = dict(color='black', size=35, family='monospace'),
        xaxis=dict(
            title='END STATION',
            tickangle = -90,
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        yaxis=dict(
            title='START STATION',
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        width=1000,
        height=1000,
        margin=go.Margin(
            l=300,
            r=50,
            b=300,
            t=100,
            pad=4
        )
    )
    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig)
In [74]:
# Station-to-station trip counts within San Francisco, using the trip16/station16 frames.
route_heatmap(trip16, station16, 'San Francisco')
Out[74]:
In [75]:
def crimeFunc(df, year=2016):
    """
    This function gets the crime data frame and selects the crimes potentially dangerous to bikers and bikes such as:
    'BURGLARY', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'STOLEN PROPERTY', 'VANDALISM', 'VEHICLE THEFT'
    in the requested year
    @args:
        df: crime dataframe with 'Category', 'Date', 'X' (longitude) and 'Y' (latitude) columns.
            The year is read from characters 6-9 of 'Date' (presumably 'MM/DD/YYYY' -- confirm
            against the raw file).
        year: year to keep, as an int or a string (default 2016, the value that was hard-coded before)
    @returns:
        a dataframe with columns 'Category', 'lat', 'lon' for the bike-related crimes of that year
    """
    #List of related crimes
    relatedCrimes = ['BURGLARY', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'STOLEN PROPERTY', 'VANDALISM', 'VEHICLE THEFT']
    bikeCrime = df[df['Category'].isin(relatedCrimes)]
    #Extract year from "Date" column
    crime_year = bikeCrime['Date'].str.slice(6,10)
    #Keep the crimes of the requested year; str() lets callers pass 2016 or '2016'
    selected = bikeCrime.loc[crime_year == str(year), ['Category', 'Y', 'X']]
    return selected.rename(columns = {'X': 'lon', 'Y': 'lat'}).reset_index(drop=True)
In [76]:
def crime_objectFunc(df):
    """
    Build the map trace for bike-related crimes.
    @args:
        df: crime data frame (filtered through crimeFunc before plotting)
    @returns:
        a go.Scattermapbox object with one red marker per crime and the
        crime category as hover text
    """
    crimes = crimeFunc(df)
    marker_style = {'size': 3, 'color': 'red'}
    return go.Scattermapbox(
        lon=crimes['lon'],
        lat=crimes['lat'],
        hoverinfo='text',
        text=crimes['Category'],
        mode='markers',
        marker=marker_style,
        opacity=0.1,
        name='Crime',
    )
In [77]:
def extract_info(zipNum, info_type):
    '''
    This function gets the zipcode and returns either state, city name, latitude, or longitude
    @args:
        zipNum: zipcode (converted with str() before the lookup)
        info_type: 'state', 'city' or 'lat'; any other value (e.g. 'lon') returns the longitude
    @returns:
        the requested field, or the string 'Not valid' when no info exists for the zipcode
    '''
    # Anything that is not state/city/lat falls through to longitude
    field = info_type if info_type in ('state', 'city', 'lat') else 'lon'
    try:
        # Single lookup per call. An unknown zipcode yields an object without
        # these attributes, which surfaces as AttributeError.
        return getattr(zipcode.isequal(str(zipNum)), field)
    except AttributeError:
        return 'Not valid'
In [78]:
def incomeFunc(df):
    """
    Look up state, city, latitude and longitude for the zipcode in the first
    column of the income dataframe and keep the San Francisco rows.
    Note: the four lookup columns ('state', 'city', 'lat', 'lon') are written
    onto the dataframe that was passed in.
    @args:
        df: income dataframe whose first column holds the zipcode
    @returns:
        the rows of df located in San Francisco, CA, with a fresh index
    """
    zip_column = df.iloc[:, 0]
    # One new column per kind of information returned by extract_info
    for field in ('state', 'city', 'lat', 'lon'):
        df[field] = list(map(lambda code: extract_info(str(code), field), zip_column))

    #Extract San Francisco data
    in_san_francisco = (df['state'] == 'CA') & (df['city'] == 'SAN FRANCISCO')
    return df[in_san_francisco].reset_index(drop=True)
In [79]:
def hover_incomeTextFunc(df):
    """
    This function gets the income dataframe and creates the hover texts for the plot
    @args:
        df: income dataframe; columns are read by position:
            0 = zipcode, 1 = median income, 2 = mean income, 3 = population
    @returns:
        one hover text per row of df
    """
    #For hover in the map graph
    # .iloc makes the positional access explicit; integer keys on a
    # label-indexed row (x[0]) are deprecated in pandas.
    text_income = df.apply(lambda x: 'Zipcode: {}<br>Population: {}<br>Median Income: ${}<br>Mean Income: ${}'.format(
        x.iloc[0], x.iloc[3], x.iloc[1], x.iloc[2]), axis=1)
    return text_income
In [80]:
def point_incomeSizeFunc(df):
    """
    Relative marker sizes for the income map.
    @args:
        df: income dataframe with a 'Median' column of comma-formatted numbers
    @returns:
        median income of every row divided by the largest median income
    """
    #Strip the thousands separators, then scale against the maximum
    median_income = df['Median'].str.replace(',', '').map(int)
    largest_median = median_income.max()
    return median_income/largest_median
In [81]:
def point_incomeTransFunc(df):
    """
    Relative marker transparency for the income map.
    @args:
        df: income dataframe with a 'Population' column of comma-formatted numbers
    @returns:
        population of every row divided by the largest population
    """
    #Strip the thousands separators, then scale against the maximum
    population = df['Population'].str.replace(',', '').map(int)
    largest_population = population.max()
    return population/largest_population
In [82]:
def income_objectFunc(df):
    """
    This function gets the income dataframe and creates a scatter map plot object
    @args:
        df: income dataframe
    @returns:
        a scatter map plot object: one marker per San Francisco zipcode, sized by
        median income, with population-based opacity and an income hover text
    """
    # incomeFunc already returns the 'lon' and 'lat' columns. Read them from
    # its result instead of re-running every zipcode lookup twice more.
    SF_income = incomeFunc(df)
    income_graph = go.Scattermapbox(lon = SF_income['lon'],
                                    lat = SF_income['lat'],
                                    hoverinfo = 'text',
                                    text = hover_incomeTextFunc(SF_income),
                                    mode = 'markers',
                                    # NOTE(review): 'light blue' (with a space) is not a CSS colour name;
                                    # 'lightblue' may be what was intended -- confirm before changing.
                                    marker = dict(size = point_incomeSizeFunc(SF_income)*20,
                                                  color = 'light blue'
                                                 ),
                                    # NOTE(review): this passes one value per point as the trace-level
                                    # opacity; per-point opacity may belong in marker -- confirm.
                                    opacity = point_incomeTransFunc(SF_income)/10,
                                    name = 'Median Income')
    return income_graph
In [83]:
def graph_CrimeIncome(income_df, crime_df):
    """
    Draw the median-income markers and the crime markers on one map of San Francisco.
    @args:
        income_df: income dataframe (passed to income_objectFunc)
        crime_df: crime dataframe (passed to crime_objectFunc)
    @returns:
        the result of py.iplot for the combined scatter map
    """
    traces = go.Data([income_objectFunc(income_df), crime_objectFunc(crime_df)])

    # Map view: centred on the coordinates below, no rotation or tilt
    map_view = {
        'accesstoken': mapbox_token,
        'bearing': 0,
        'center': {'lat': 37.773972, 'lon': -122.431297},
        'pitch': 0,
        'zoom': 10,
    }
    layout_spec = {
        'title': 'MEDIAN INCOME AND CRIME MAP IN SAN FRANCISCO',
        'titlefont': {'color': 'black', 'size': 35, 'family': 'monospace'},
        'autosize': True,
        'hovermode': 'closest',
        'mapbox': map_view,
    }
    figure = go.Figure(data=traces, layout=go.Layout(layout_spec))
    return py.iplot(figure)
In [84]:
# Overlay median income and bike-related crime on the San Francisco map.
graph_CrimeIncome(income, crime)
Out[84]:
In [85]:
def routeDock_countFunc(tripDF, stationDF, landmark):
    """
    Total the trips started at every station of a landmark and attach each
    station's dock count.
    @args:
        tripDF: trip dataframe
        stationDF: station dataframe (needs 'name' and 'dockcount')
        landmark: Landmark of interest that could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
    @returns:
        A dataframe with 'Start Station', 'Counts' (sum of the route counts starting
        there) and 'dockcount'
    """
    routes = route_countFunc(tripDF, stationDF)
    landmark_routes = routes[routes.Landmark == landmark]

    # Sum the per-route counts for each start station
    trips_per_station = (
        landmark_routes
        .groupby('Start Station')
        .agg({'Counts': 'sum'})
        .reset_index()
    )

    # Inner join on the station name to pick up the dock count
    docks = stationDF[['name', 'dockcount']]
    with_docks = trips_per_station.merge(docks, how='inner', left_on='Start Station', right_on='name')
    return with_docks.drop('name', axis=1)
In [86]:
def hover_text(df):
    """
    This function gets a dataframe made by function "routeDock_countFunc" and returns the
    text information used in the barplot
    @args:
        df: dataframe made with function "routeDock_countFunc"; columns are read by
            position: 0 = station name, 1 = trip counts, 2 = dock counts
    @returns:
        one hover text per row, including the dock count as a percentage of the trip count
    """
    # .iloc makes the positional access explicit; integer keys on a
    # label-indexed row (x[0]) are deprecated in pandas.
    text = df.apply(lambda x: 'Station: {}<br>Trip Counts: {}<br>Dock Counts: {}<br>Dock/Trip: {}%'.format(
        x.iloc[0], x.iloc[1], int(x.iloc[2]), round(float(x.iloc[2])*100/x.iloc[1], 3)), axis=1)
    return text
In [87]:
def bar_object(tripDF, stationDF, landmark, year):
    """
    Build the bar trace of trips per start station for one year.
    @args:
        tripDF: trip dataframe
        stationDF: station dataframe
        landmark: Landmark of interest that could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
        year: year of interest, either 2014, 2015, 2016 (used as the trace name)
    @returns:
        a go.Bar object with stations on x, trip counts on y and a hover text per bar
    """
    station_totals = routeDock_countFunc(tripDF, stationDF, landmark)
    bar_settings = {
        'x': station_totals['Start Station'],
        'y': station_totals['Counts'],
        'name': year,
        'hoverinfo': 'text',
        'text': hover_text(station_totals),
    }
    return go.Bar(**bar_settings)
In [88]:
def barPlot(landmark):
    """
    This function gets a landmark and plots the number of trips from stations in a landmark. The barplot
    contains the information for three years 2014, 2015, and 2016.
    @args:
        landmark: Landmark of interest that could be "San Francisco", "San Jose", "Palo Alto", and "Mountain View"
    @returns:
        A barplot
    """
    #Bar object: one trace per year, in chronological order
    yearly_frames = [
        (trip14, station14, 2014),
        (trip15, station15, 2015),
        (trip16, station16, 2016),
    ]
    bars = [bar_object(trips, stations, landmark, year)
            for trips, stations, year in yearly_frames]
    data = go.Data(bars)
    #Layout
    layout = go.Layout(
        width=1000,
        height=800,
        hovermode = 'closest',
        title='TOTAL TRIPS FROM STATIONS IN ' + landmark.upper(),
        titlefont = dict(color='black', size=35, family='monospace'),
        xaxis=dict(
            title='STATION',
            tickangle = -90,
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        yaxis=dict(
            title='TRIP COUNTS',
            titlefont=dict(
                family='monospace',
                size=18,
                color='red')),
        margin=go.Margin(
            l=100,
            r=50,
            b=300,
            t=100,
            pad=4),
        # showlegend is a boolean attribute; the string 'True' is not a valid value
        showlegend = True,
        legend = dict(x=0, y=1, orientation='h'))
    #Make the plot
    fig = go.Figure(data = data, layout = layout)
    return py.iplot(fig)
In [89]:
# Trips started per station in San Francisco, one bar series per year (2014-2016).
barPlot('San Francisco')
Out[89]:
In [ ]: