In [7]:
import geopandas as gpd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.legend_handler import HandlerLine2D
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
import numpy as np
import pandas as pd
import seaborn as sns
from shapely.geometry import Point, LineString
import time
%matplotlib inline
mpl.rcParams['figure.figsize'] = 10,10 #set the default map size
mpl.rcParams['patch.linewidth'] = 0.5 #set default polygon line width
mpl.rcParams['markers.fillstyle'] = 'full' #set default marker fill style
In [8]:
# Directories & Data Paths
top_directory = "/Users/jgaboardi/Desktop/Exercise_2/"
raw_data = "/raw_data"
half_1 = "/201402_babs_open_data"
half_2 = "/201408_babs_open_data"
trips_1 = "/201402_trip_data.csv"
trips_2 = "/201408_trip_data.csv"
stations_1 = "/201402_station_data.csv"
stations_2 = "/201408_station_data.csv"
bay_area = "/bayarea_cities/bay_area_cities.shp"
san_jose_area = "/bayarea_cities/San_Jose.shp"
clean_data = "/clean_data"
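The path fragments above each begin with a slash so plain string concatenation works, but os.path.join makes the intent clearer. A minimal sketch under the same layout (raw_dir and trips_1_path below are illustrative names only):
import os
raw_dir = os.path.join(top_directory, "raw_data")
# e.g., the full path to the first half of the trip data
trips_1_path = os.path.join(raw_dir, "201402_babs_open_data", "201402_trip_data.csv")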
In [9]:
# Bay Area Shapes
bay_area_cities = gpd.read_file(top_directory+raw_data+bay_area)
del bay_area_cities["Length__"]
# No Water
bay_area_cities = bay_area_cities[bay_area_cities.CITY != "BAY AND OCEAN"]
# Bay Area Shapes
san_jose = gpd.read_file(top_directory+raw_data+san_jose_area)
del san_jose["INCORP_ID"]
san_jose.columns = ["Area__", "CITY", "geometry"]
# Concatenate San Jose to the other cities
bay_area_cities = pd.concat([bay_area_cities, san_jose])
# To File
#bay_area_cities.to_file(top_directory+clean_data+"/bay_area_cities.shp")
bay_area_cities.head()
Out[9]:
In [10]:
# Set color for polygons
color = ListedColormap('white')
# Plot
bay_area_cities.plot(alpha=1,
colormap=color,
linewidth=.25)
# Title
plt.title('San Francisco Bay Area, CA',
family='Times New Roman',
size=25,
color='k',
backgroundcolor='w',
weight='bold')
plt.grid(False)
# Legend Handles
cities = mpatches.Patch(facecolor='white',
edgecolor='gray',
lw=1,
label='Cities')
# Legend
plt.legend(handles=[cities],
numpoints=1,
loc="lower left")
# North Arrow and 'N' --> Must be changed for different spatial resolutions, etc.
plt.annotate('N', xy=(5900000, 2470000), fontstyle='italic', fontsize='xx-large',
fontweight='heavy', alpha=0.75)
plt.arrow(5910000, 2420000, 0.0, 35000, width=3000, head_width=7250,
head_length=7500, fc='k', ec='k',alpha=0.75,)
plt.annotate('<| ~ 20.0 miles |>', xy=(5900000, 2400000),
fontstyle='italic', fontsize='large', alpha=0.95)
#plt.savefig(top_directory+clean_data+"/bay_area.png")
Out[10]:
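The comment above notes that the hard-coded north arrow and 'N' must be retuned for every map extent. A sketch of a resolution-independent alternative that anchors the annotations in axes-fraction coordinates (the positions here are illustrative):
ax = plt.gca()
# (0, 0) is the lower-left corner of the axes and (1, 1) the upper-right,
# so these placements survive any change in the projected extent
ax.annotate('N', xy=(0.93, 0.95), xycoords='axes fraction',
fontstyle='italic', fontsize='xx-large', fontweight='heavy', alpha=0.75)
ax.annotate('', xy=(0.95, 0.94), xytext=(0.95, 0.86),
xycoords='axes fraction', textcoords='axes fraction',
arrowprops=dict(facecolor='k', width=3, headwidth=9))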
In [11]:
# Initial Coords
wgs = {'init': 'epsg:4326', 'no_defs': True}
# Merge Station Data
half_1_stations = pd.read_csv(top_directory+raw_data+half_1+stations_1)
half_2_stations = pd.read_csv(top_directory+raw_data+half_2+stations_2)
# The second half contains only one new station (index 69), so concatenate just that row
frames = [half_1_stations, half_2_stations[half_2_stations.index == 69]]
whole_year_stations = pd.concat(frames)
# Add Geometry
geometry = [Point(xy) for xy in zip(whole_year_stations["long"], whole_year_stations["lat"])]
whole_year_stations = gpd.GeoDataFrame(whole_year_stations, geometry=geometry, crs=wgs)
whole_year_stations = whole_year_stations.to_crs(epsg=2227)
print "Number of stations: ", len(whole_year_stations)
# To File
#whole_year_stations.to_file(top_directory+clean_data+"/whole_year_stations.shp")
whole_year_stations.head()
Out[11]:
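EPSG:2227 is NAD83 / California State Plane zone 3, measured in US survey feet, so the reprojection can be sanity-checked against a known distance. A quick sketch:
# distances between reprojected station points come back in feet
p0 = whole_year_stations.geometry.iloc[0]
p1 = whole_year_stations.geometry.iloc[1]
print "Station 0 to station 1: ", p0.distance(p1) / 5280., "miles"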
In [12]:
# Base
base = bay_area_cities.plot(legend=True,
colormap=color,
alpha=1,
linewidth=.25)
# Stations
whole_year_stations.plot(ax=base,
marker='o',
color='green',
markersize=10,
alpha=.1,
label="Stations")
# Title
plt.title('San Francisco Bay Area, CA\nBay Area Bike Share Stations',
family='Times New Roman',
size=20,
color='k',
backgroundcolor='w',
weight='bold')
plt.grid(False)
# Legend Handles
cities = mpatches.Patch(facecolor='white',
edgecolor='gray',
lw=1,
label='Cities')
stations = mlines.Line2D([], [],
color='green',
marker='o',
markersize=10,
label='Stations',
alpha=.25,
linestyle='None')
# Legend
plt.legend(handles=[stations, cities],
numpoints=1,
loc="lower left",
shadow=True,
fancybox=True)
# North Arrow and 'N' --> Must be changed for different spatial resolutions, etc.
plt.annotate('N', xy=(5900000, 2470000), fontstyle='italic', fontsize='xx-large',
fontweight='heavy', alpha=0.75)
plt.arrow(5910000, 2420000, 0.0, 35000, width=3000, head_width=7250,
head_length=7500, fc='k', ec='k',alpha=0.75,)
plt.annotate('<| ~ 20.0 miles |>', xy=(5900000, 2400000),
fontstyle='italic', fontsize='large', alpha=0.95)
#plt.savefig(top_directory+clean_data+"/bay_area_and_stations.png")
Out[12]:
In [13]:
# Select the cities with bike share stations
sf_bikes = bay_area_cities[bay_area_cities.CITY == "SAN FRANCISCO"]
palo_alto_bikes = bay_area_cities[bay_area_cities.CITY == "PALO ALTO"]
mountainview_bikes = bay_area_cities[bay_area_cities.CITY == "MOUNTAIN VIEW"]
redwood_bikes = bay_area_cities[bay_area_cities.CITY == "REDWOOD CITY"]
san_jose_bikes = bay_area_cities[bay_area_cities.CITY == "San Jose"] # cased differently in its source shapefile
# Concatenate
bike_share_area = pd.concat([sf_bikes,redwood_bikes,palo_alto_bikes,mountainview_bikes,san_jose_bikes])
#bike_share_area.to_file(top_directory+clean_data+"/bike_share_area.shp")
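The same subset can be pulled in one pass with .isin(); a sketch, normalizing case since San Jose arrived from a differently-cased shapefile:
share_cities = ["SAN FRANCISCO", "PALO ALTO", "MOUNTAIN VIEW",
"REDWOOD CITY", "SAN JOSE"]
bike_share_area = bay_area_cities[bay_area_cities.CITY.str.upper().isin(share_cities)]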
In [14]:
# Base
base = bike_share_area.plot(legend=True,
colormap=color,
alpha=1,
linewidth=.25)
# Stations
whole_year_stations.plot(ax=base,
marker='o',
color='green',
markersize=10,
alpha=.1,
label="Stations")
# Title
plt.title('San Francisco, Mountain View,\nRedwood City, Palo Alto, and San Jose',
family='Times New Roman',
size=20,
color='k',
backgroundcolor='w',
weight='bold')
plt.grid(False)
# Legend Handles
cities = mpatches.Patch(facecolor='white',
edgecolor='gray',
lw=1,
label='Cities')
stations = mlines.Line2D([], [],
color='green',
marker='o',
markersize=10,
label='Stations',
alpha=.25,
linestyle='None')
# Legend
plt.legend(handles=[stations, cities],
numpoints=1,
loc="lower left",
shadow=True,
fancybox=True)
# North Arrow and 'N' --> Must be changed for different spatial resolutions, etc.
plt.annotate('N', xy=(6000000, 1900000), fontstyle='italic', fontsize='xx-large',
fontweight='heavy', alpha=0.75)
plt.arrow(6005000, 1875000, 0.0, 15000, width=1000, head_width=3250,
head_length=4500, fc='k', ec='k',alpha=0.75,)
plt.annotate('<| ~ 10.0 miles |>', xy=(6015000, 1900000),
fontstyle='italic', fontsize='large', alpha=0.95)
#plt.savefig(top_directory+clean_data+"/share_cities_and_stations.png")
Out[14]:
In [15]:
# Merge Trip Data
half_1_trips = pd.read_csv(top_directory+raw_data+half_1+trips_1)
half_2_trips = pd.read_csv(top_directory+raw_data+half_2+trips_2)
# Inconsistent Naming Conventions for 'Subscription Type' Column
# Must rename so halves match
half_2_trips = half_2_trips.rename(columns = {'Subscriber Type':'Subscription Type'})
# Concatenate the halves
frames = [half_1_trips, half_2_trips]
whole_year_trips = pd.concat(frames)
print "Length of whole equal to the combined halfs?",\
len(whole_year_trips) == len(half_1_trips)\
+ len(half_2_trips)
print len(whole_year_trips)
# To File
#whole_year_trips.to_csv(top_directory+clean_data+"/whole_year_trips.csv")
whole_year_trips.head()
Out[15]:
In [16]:
# Merge Trip and Station Data by Start Station
start_station_merge = whole_year_trips.merge(whole_year_stations,
how='right',
left_on="Start Station",
right_on="name")
# Convert Start Date to 'datetime' and set to index
start_station_merge["Start_Date"] = pd.to_datetime(start_station_merge["Start Date"])
del start_station_merge["Start Date"]
start_station_merge["End_Date"] = pd.to_datetime(start_station_merge["End Date"])
del start_station_merge["End Date"]
start_station_merge.index = start_station_merge["Start_Date"]
#start_station_merge.to_csv(top_directory+clean_data+"/start_station_merge.csv")
#http://chrisalbon.com/python/pandas_time_series_basics.html
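Because the merge above uses how='right', every station survives even if it never appears as a trip origin, and any such row carries NaN in the trip columns. A quick check (a sketch):
# any station with no recorded outbound trips shows up as NaN here
print "Stations with no outbound trips: ",\
start_station_merge["Start Station"].isnull().sum()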
In [17]:
start_station_merge.head()[:2]
Out[17]:
In [18]:
start_station_merge['Trip'] = [1]*len(start_station_merge)
count_by_month = start_station_merge['Trip'].groupby(pd.TimeGrouper("M")).sum()
count_by_month.to_csv(top_directory+clean_data+"/count_by_month.csv")
count_by_month
Out[18]:
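pd.TimeGrouper was deprecated in later pandas releases; on a DatetimeIndex the same monthly count can be written with resample (a sketch for newer versions):
# equivalent monthly aggregation without TimeGrouper
count_by_month = start_station_merge['Trip'].resample("M").sum()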
In [19]:
# Plot
count_by_month.plot(linestyle='-',
lw=6,
color='r')
# Title
plt.title('Count of Trips by Month',
family='Times New Roman',
size=20,
color='k',
backgroundcolor='w',
weight='bold')
# Grid
plt.grid(color='k',
which='both',
axis='both',
lw=.5,
linestyle=':')
# X-Label
plt.xlabel("August 2013 through August 2014",
family='Times New Roman',
size=18,
color='k',
backgroundcolor='w',
weight='bold')
# Y-Label
plt.ylabel("Trips",
family='Times New Roman',
size=18,
color='k',
backgroundcolor='w',
weight='bold')
# y-ticks
plt.yticks(family='Times New Roman')
#plt.savefig(top_directory+clean_data+"/count_by_month.png")
Out[19]:
In [20]:
start_station_merge['Trip'] = [1]*len(start_station_merge)
count_by_hour = start_station_merge['Trip'].groupby(start_station_merge.index.hour).sum()
#count_by_hour.to_csv(top_directory+clean_data+"/count_by_hour.csv")
count_by_hour.head()
Out[20]:
In [21]:
# Plot
count_by_hour.plot(linestyle='-',
lw=6,
color='r')
# Title
plt.title('Count of Trips by Hour of Day',
family='Times New Roman',
size=20,
color='k',
backgroundcolor='w',
weight='bold')
# Grid
plt.grid(color='k',
which='both',
axis='both',
lw=.5,
linestyle=':')
# X-Label
plt.xlabel("Time of Day: 12:00am (0) through 11:59pm",
family='Times New Roman',
size=18,
color='k',
backgroundcolor='w',
weight='bold')
# Y-Label
plt.ylabel("Trips",
family='Times New Roman',
size=18,
color='k',
backgroundcolor='w',
weight='bold')
# y-ticks
plt.yticks(family='Times New Roman')
#plt.savefig(top_directory+clean_data+"/count_by_hour.png")
Out[21]:
In [22]:
# OD Start Index __ Name
origin_destination_by_start__NAME = start_station_merge[["Start Station",
"End Station"]]\
.pivot_table(index=["Start Station"],
columns=["End Station"],
aggfunc=len,
fill_value=0,
margins=True,
margins_name="Totals")
origin_destination_by_start__NAME.columns.name = "Destination"
origin_destination_by_start__NAME.index.name = "Origin"
#origin_destination_by_start__NAME.to_csv(top_directory\
# +clean_data\
# +"/origin_destination_by_start__NAME.csv")
In [23]:
origin_destination_by_start__NAME[["2nd at Folsom",
"2nd at South Park",
"2nd at Townsend"]].head()[:3]
Out[23]:
In [24]:
# OD Start Index __ Terminal
origin_destination_by_start__TERMINAL = start_station_merge[["Start Terminal",
"End Terminal"]]\
.pivot_table(index=["Start Terminal"],
columns=["End Terminal"],
aggfunc=len,
dropna=False,
fill_value=0,
margins=True,
margins_name="Totals")
origin_destination_by_start__TERMINAL.columns.name = "Destination"
origin_destination_by_start__TERMINAL.index.name = "Origin"
#origin_destination_by_start__TERMINAL.to_csv(top_directory\
# +clean_data\
# +"/origin_destination_by_start__TERMINAL.csv")
In [25]:
origin_destination_by_start__TERMINAL[[2,3,4]].head()[:3]
Out[25]:
In [26]:
print origin_destination_by_start__NAME.shape
print origin_destination_by_start__TERMINAL.shape
In [27]:
print origin_destination_by_start__TERMINAL['Totals'].head()[:3]
print len(origin_destination_by_start__TERMINAL['Totals'])
In [28]:
originCount_for_histogram = origin_destination_by_start__TERMINAL\
[origin_destination_by_start__TERMINAL.index != "Totals"]
#originCount_for_histogram.to_csv(top_directory\
# +clean_data\
# +"/originCount_for_histogram.csv")
sns.distplot(originCount_for_histogram["Totals"],
bins=20,
kde=False)
# Title
plt.title('Origin-based Count',
family='Times New Roman',
size=30,
color='k',
backgroundcolor='w',
weight='bold')
plt.xlabel('Trip Count',
family='Times New Roman',
size=18,
color='k',
backgroundcolor='w',
weight='bold')
plt.ylabel('Station Count',
family='Times New Roman',
size=18,
color='k',
backgroundcolor='w',
weight='bold')
#plt.savefig(top_directory+clean_data+"/count_by_origin.png")
Out[28]:
In [29]:
destination_origin_by_start__TERMINAL = origin_destination_by_start__TERMINAL.transpose()
destinationCount_for_histogram = destination_origin_by_start__TERMINAL\
[destination_origin_by_start__TERMINAL.index != "Totals"]
#destinationCount_for_histogram.to_csv(top_directory\
# +clean_data\
# +"/destinationCount_for_histogram.csv")
sns.distplot(destinationCount_for_histogram["Totals"],
bins=18,
kde=False,
color='k')
# Title
plt.title('Destination-based Count',
family='Times New Roman',
size=30,
color='k',
backgroundcolor='w',
weight='bold')
plt.xlabel('Trip Count',
family='Times New Roman',
size=18,
color='k',
backgroundcolor='w',
weight='bold')
plt.ylabel('Station Count',
family='Times New Roman',
size=18,
color='k',
backgroundcolor='w',
weight='bold')
#plt.savefig(top_directory+clean_data+"/count_by_destination.png")
Out[29]:
In [30]:
# Create new Weights dataframe to merge with stations in QGIS
weight_counts_df = pd.DataFrame(index=origin_destination_by_start__TERMINAL.index[:-1])
weight_counts_df["Start_Terminal"] = weight_counts_df.index
weight_counts_df['Origin_Weights'] = origin_destination_by_start__TERMINAL['Totals'][:-1]
weight_counts_df['Destination_Weights'] = destination_origin_by_start__TERMINAL['Totals'][:-1]
weight_counts_df['OD_Difference'] = weight_counts_df['Origin_Weights']\
- weight_counts_df['Destination_Weights']
# Save to File
#weight_counts_df.to_csv(top_directory\
# +clean_data\
# +"/weight_counts_df.csv")
In [31]:
# 7am to 9am Origins and Destinations
AM_7_to_9 = start_station_merge.between_time('7:00','9:00')
AM_7_to_9.to_csv(top_directory+clean_data+'/AM_7_to_9.csv')
AM_7_to_9.head()[:2]
Out[31]:
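Note that between_time is inclusive on both endpoints by default, so a trip starting at exactly 9:00:00am is counted in the AM window. To exclude the boundary (a sketch; much newer pandas replaces these flags with an inclusive= argument):
AM_strict = start_station_merge.between_time('7:00', '9:00', include_end=False)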
In [32]:
# AM OD Start Index __ Terminal
AM_OD_by_start = AM_7_to_9[["Start Terminal",
"End Terminal"]]\
.pivot_table(index=["Start Terminal"],
columns=["End Terminal"],
aggfunc=len,
dropna=False,
fill_value=0,
margins=True,
margins_name="Totals")
AM_OD_by_start.columns.name = "Destination"
AM_OD_by_start.index.name = "Origin"
AM_OD_by_start.to_csv(top_directory\
+clean_data\
+"/AM_OD_by_start.csv")
In [33]:
AM_OD_by_start[[2,3,4]].head()[:3]
Out[33]:
In [34]:
# 5pm to 7pm Origins and Destinations
PM_5_to_7 = start_station_merge.between_time('17:00','19:00')
PM_5_to_7.to_csv(top_directory+clean_data+'/PM_5_to_7.csv')
PM_5_to_7.head()[:2]
Out[34]:
In [35]:
# PM OD Start Index __ Terminal
PM_OD_by_start = PM_5_to_7[["Start Terminal",
"End Terminal"]]\
.pivot_table(index=["Start Terminal"],
columns=["End Terminal"],
aggfunc=len,
dropna=False,
fill_value=0,
margins=True,
margins_name="Totals")
PM_OD_by_start.columns.name = "Destination"
PM_OD_by_start.index.name = "Origin"
PM_OD_by_start.to_csv(top_directory\
+clean_data\
+"/PM_OD_by_start.csv")
In [36]:
PM_OD_by_start[[2,3,4]].head()[:3]
Out[36]:
In [37]:
# Create new Weights dataframe to merge with stations in QGIS
AM_PM_weight_counts_df = pd.DataFrame(index=AM_OD_by_start.index[:-1])
AM_PM_weight_counts_df["Start_Terminal"] = AM_PM_weight_counts_df.index
# 7am - 9am
AM_PM_weight_counts_df['AM_Origin_Weights'] = AM_OD_by_start['Totals'][:-1]
AM_PM_weight_counts_df['AM_Destination_Weights'] = AM_OD_by_start.transpose()['Totals'][:-1]
AM_PM_weight_counts_df['AM_OD_Difference'] = AM_PM_weight_counts_df['AM_Origin_Weights']\
- AM_PM_weight_counts_df['AM_Destination_Weights']
# 5pm - 7pm
AM_PM_weight_counts_df['PM_Origin_Weights'] = PM_OD_by_start['Totals'][:-1]
AM_PM_weight_counts_df['PM_Destination_Weights'] = PM_OD_by_start.transpose()['Totals'][:-1]
AM_PM_weight_counts_df['PM_OD_Difference'] = AM_PM_weight_counts_df['PM_Origin_Weights']\
- AM_PM_weight_counts_df['PM_Destination_Weights']
# Save to File
#AM_PM_weight_counts_df.to_csv(top_directory\
# +clean_data\
# +"/AM_PM_weight_counts_df.csv")
AM_PM_weight_counts_df.head()
Out[37]:
In [46]:
# Subscribers
subscribers = start_station_merge[start_station_merge["Subscription Type"]\
== "Subscriber"]
#subscribers.to_csv(top_directory+clean_data+'/subscribers.csv')
subscribers.head()[:2]
Out[46]:
In [53]:
# Subscriber OD Start Index __ Terminal
subscribers_by_start = subscribers[["Start Terminal",
"End Terminal"]]\
.pivot_table(index=["Start Terminal"],
columns=["End Terminal"],
aggfunc=len,
dropna=False,
fill_value=0,
margins=True,
margins_name="Totals")
subscribers_by_start.columns.name = "Destination"
subscribers_by_start.index.name = "Origin"
#subscribers_by_start.to_csv(top_directory\
# +clean_data\
# +"/subscribers_by_start.csv")
In [113]:
# Subscribers Stacked Pairwise
subscribers_pairwise = subscribers_by_start.stack().reset_index()
subscribers_pairwise = subscribers_pairwise[subscribers_pairwise["Origin"] != "Totals"]
subscribers_pairwise = subscribers_pairwise[subscribers_pairwise["Destination"] != "Totals"]
subscribers_pairwise.columns = ["Source", "Target", "Weight"]
subscribers_pairwise = subscribers_pairwise.sort_values(['Weight'], ascending=False)
#subscribers_pairwise.to_csv(top_directory\
# +clean_data\
# +"/subscribers_pairwise.csv")
subscribers_pairwise_top_5 = subscribers_pairwise.head()
subscribers_pairwise_top_5 = subscribers_pairwise_top_5.reset_index(drop=True)
#subscribers_pairwise_top_5.to_csv(top_directory\
# +clean_data\
# +"/subscribers_pairwise_top_5.csv")
subscribers_pairwise_top_5
Out[113]:
In [114]:
# Create a .shp for each OD trip combo SUBSCRIBER
for record in range(len(subscribers_pairwise_top_5)):
subscriber = gpd.GeoDataFrame()
start_station = subscribers_pairwise_top_5.iloc[record].Source
end_station = subscribers_pairwise_top_5.iloc[record].Target
subscriber["station_id"] = [start_station, end_station]
subscriber["Weight"] = [subscribers_pairwise_top_5.iloc[record].Weight] * 2
subscriber = subscriber.merge(whole_year_stations)
subscriber.crs = whole_year_stations.crs
# subscriber.to_file(top_directory\
# +clean_data\
# +"/top_subscriber_"\
# +str(record+1)\
# +".shp")
In [49]:
# Customer
customers = start_station_merge[start_station_merge["Subscription Type"]\
== "Customer"]
#customers.to_csv(top_directory+clean_data+'/customers.csv')
customers.head()[:2]
Out[49]:
In [115]:
# Customers OD Start Index __ Terminal
customers_by_start = customers[["Start Terminal",
"End Terminal"]]\
.pivot_table(index=["Start Terminal"],
columns=["End Terminal"],
aggfunc=len,
dropna=False,
fill_value=0,
margins=True,
margins_name="Totals")
customers_by_start.columns.name = "Destination"
customers_by_start.index.name = "Origin"
#customers_by_start.to_csv(top_directory\
# +clean_data\
# +"/customers_by_start.csv")
In [116]:
# Customers Stacked Pairwise
customers_pairwise = customers_by_start.stack().reset_index()
customers_pairwise = customers_pairwise[customers_pairwise["Origin"] != "Totals"]
customers_pairwise = customers_pairwise[customers_pairwise["Destination"] != "Totals"]
customers_pairwise.columns = ["Source", "Target", "Weight"]
customers_pairwise = customers_pairwise.sort_values(['Weight'], ascending=False)
#customers_pairwise.to_csv(top_directory\
# +clean_data\
# +"/customers_pairwise.csv")
customers_pairwise_top_5 = customers_pairwise.head()
customers_pairwise_top_5 = customers_pairwise_top_5.reset_index(drop=True)
#customers_pairwise_top_5.to_csv(top_directory\
# +clean_data\
# +"/customers_pairwise_top_5.csv")
customers_pairwise_top_5
Out[116]:
In [117]:
# Create a .shp for each OD trip combo CUSTOMER
for record in range(len(customers_pairwise_top_5)):
customer = gpd.GeoDataFrame()
start_station = customers_pairwise_top_5.iloc[record].Source
end_station = customers_pairwise_top_5.iloc[record].Target
customer["station_id"] = [start_station, end_station]
customer["Weight"] = [customers_pairwise_top_5.iloc[record].Weight] * 2
customer = customer.merge(whole_year_stations)
customer.crs = whole_year_stations.crs
# customer.to_file(top_directory\
# +clean_data\
# +"/top_customer_"\
# +str(record+1)\
# +".shp")
In [52]:
# Does the combination of Subscribers and Customers equal the Total Users?
print "Total Users --------------- ", len(start_station_merge)
print "Subscribers + Customers --- ", len(customers) + len(subscribers)
print "Do Subscribers and Customers together account for every trip? ",\
len(customers) + len(subscribers) == len(start_station_merge)