In [2]:
# Imports data from HDF5 file containing AVL/APC data from SFMTA's Muni Bus Network
# Aggregates a number of performance variables into time periods for comparison
# Formats the results for visualisation, using a projected coordinate system
In [3]:
%load_ext autoreload
%autoreload 2
In [4]:
# Imports: stdlib first, then third-party, then local project modules.
import sys

import numpy as np           # BUG FIX: later cells use np.mean but numpy was never imported
import pandas as pd
import matplotlib.pyplot as plt

# Display wide frames in full and format floats as e.g. 1,234.56
pd.options.display.max_columns = 150
pd.options.display.float_format = '{:,.2f}'.format

# Make the local sfdata_wrangler helpers importable.
# NOTE(review): hard-coded absolute Windows path — TODO make configurable.
sys.path.append('C:/CASA/Workspace/sfdata_wrangler/sfdata_wrangler')
from SFMuniDataHelper import SFMuniDataHelper
from DataFrameViewer import DataFrameViewer

sfmuniHelper = SFMuniDataHelper()
viewer = DataFrameViewer()
In [13]:
# Build 'YYYYMM01' stamps for the first day of every month in every year.
# BUG FIX: the originals were sets, whose iteration order is arbitrary, so the
# resulting date list was in nondeterministic order; lists keep it chronological.
years_list = [2008, 2009, 2010, 2011, 2012]
months_list = ['01', '02', '03', '04', '05', '06',
               '07', '08', '09', '10', '11', '12']
year_month_day = []
for year in years_list:
    for month in months_list:
        year_month_day.append(str(year) + month + '01')
# print() form works under both Python 2 and 3 (original used the py2 statement)
print(year_month_day)
In [5]:
# eventually convert filenames to arguments
# Open the HDF5 store of aggregated SFMTA Muni AVL/APC data.
# NOTE(review): hard-coded absolute Windows path — TODO make configurable.
hdffile = "D:/sfbusdatastore/sfmuni_aggregate.h5"
store = pd.HDFStore(hdffile)
# Display the store's table inventory (bare last expression of the cell).
store
Out[5]:
In [140]:
# Parameters for the single month-to-month comparison in the cells below.
# Specify direction, 1 = Inbound (to downtown), 0 = Outbound
direction = 1
# Specify - Weekday = 1, Sat = 2, Sun =3
dow = 1
# Specify TODs {300,600,900,1400,1600,1900,2200}
tod = 600
# Specify months
# YYYYMMDD ints naming the first day of each month; matched against the
# MONTH column of the 'tod_route_stops' table. month_2 is the later month.
month = 20100301
month_2 = 20100601
In [ ]:
In [141]:
# Compare per-stop performance between two months for one DOW/direction/TOD slice.
# Columns of interest shared by both month queries.
cols = ['MONTH', 'QSTOP', 'STOPNAME', 'LAT', 'LON', 'ROUTE', 'TOD',
        'LOAD_DEP', 'CAPACITY', 'ON', 'OFF', 'MAXVEL', 'HEADWAY',
        'ONTIME2', 'ONTIME10']
# month 1 - query and select relevant columns
tod_route_stops_q = store.select('tod_route_stops',
    where=['DOW==' + str(dow), 'MONTH==' + str(month),
           'DIR==' + str(direction), 'TOD==' + str(tod)])
tod_r_s_q = tod_route_stops_q[cols]
# month 2 - same query for the comparison month
tod_route_stops_q2 = store.select('tod_route_stops',
    where=['DOW==' + str(dow), 'MONTH==' + str(month_2),
           'DIR==' + str(direction), 'TOD==' + str(tod)])
tod_r_s_q2 = tod_route_stops_q2[cols]
# outer join on unique stop ID; stops present in only one month get NaNs
tod_r_s_q3 = pd.merge(tod_r_s_q, tod_r_s_q2, on='QSTOP', how='outer')
# average each performance measure per stop/route/month/TOD group
# ('mean' strings fix the original np.mean reference — numpy was never imported)
agg_cols = ['LAT', 'LON', 'ON', 'OFF', 'LOAD_DEP', 'CAPACITY',
            'MAXVEL', 'HEADWAY', 'ONTIME2', 'ONTIME10']
agg_spec = dict((c + s, 'mean') for c in agg_cols for s in ('_x', '_y'))
merged_data = tod_r_s_q3.groupby(
    ['QSTOP', 'ROUTE_x', 'ROUTE_y', 'MONTH_x', 'MONTH_y',
     'TOD_x', 'TOD_y']).agg(agg_spec)
# Flatten the grouped MultiIndex back into ordinary columns.
# (Replaces the previous temp.csv write/read round trip, which was only a
# header-flattening hack.)
merged_data_csv = merged_data.reset_index()
# keep only rows where both months observed the same route at the stop;
# .copy() avoids SettingWithCopyWarning on the column assignments below
merged_data_clean = merged_data_csv[
    merged_data_csv.ROUTE_x == merged_data_csv.ROUTE_y].copy()
# single plot position per stop: mean of the two months' lat/lon
merged_data_clean['LON_xy'] = (merged_data_clean.LON_x + merged_data_clean.LON_y) * 0.5
merged_data_clean['LAT_xy'] = (merged_data_clean.LAT_x + merged_data_clean.LAT_y) * 0.5
merged_data_clean.head()
Out[141]:
In [142]:
# Project the averaged lat/lon into a planar coordinate system for plotting.
# Originally targeted State Plane California Zone 3
# (http://spatialreference.org/ref/epsg/nad83-california-zone-3-ftus/),
# but that did not render in Processing, so UTM zone 10N is used instead.
from pyproj import Proj, transform
import csv
# source coordinates are plain longitude/latitude
inProj = Proj(proj='longlat')
# format to project to
#doesn't seem to work in Processing #outProj = Proj(init='epsg:2227')
#outProj = Proj(init='epsg:3786')
# BUG FIX: original read  datim'WGS84'  — a syntax error; the keyword argument
# datum='WGS84' was intended (the batch cell below already has the correct form).
outProj = Proj(proj='utm', zone=10, datum='WGS84', ellps='WGS84', units='m')
# pyproj transform per row, written to a temporary csv
# (iterating a DataFrame and assigning back proved unreliable)
# NOTE(review): 'wb' is the Python 2 mode for csv files; on Python 3 use
# open(..., 'w', newline='') instead.
with open('projected-coordinates.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                           quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(['i', 'proj_x', 'proj_y'])
    for i, entry in merged_data_clean.iterrows():
        x1, y1 = entry['LON_xy'], entry['LAT_xy']
        x2, y2 = transform(inProj, outProj, x1, y1)
        csvwriter.writerow([i, x2, y2])
# read the projected coordinates back, indexed by the original row label 'i'
# (pd.read_csv replaces the deprecated/removed pd.DataFrame.from_csv)
projected_coordinates = pd.read_csv('projected-coordinates.csv', index_col=0)
#projected_coordinates
# attach proj_x / proj_y to the cleaned data by row index
finished_set = merged_data_clean.join(projected_coordinates)
finished_set.head()
Out[142]:
In [143]:
#save as csv
# Output name encodes the slice, e.g. '600_20100301_to_20100601_dir_1.csv'.
finished_set.to_csv(str(tod)+'_'+str(month)+'_to_'+str(month_2)+'_dir_'+str(direction)+'.csv')
In [144]:
# NOTE(review): this loop writes the SAME finished_set (computed above for one
# specific tod/direction) under every filename — nothing is recomputed per
# combination, so all 12 files would have identical contents. The batch cell
# below (In [6]) does the full per-combination recomputation; this cell looks
# like a leftover and should probably be deleted.
# NOTE(review): direction iterates over {1,2}, but elsewhere in this notebook
# directions are coded 0 (outbound) and 1 (inbound) — confirm intended values.
# NOTE(review): indentation of the loop bodies was lost in this export; as
# written the cell is not runnable.
for tod in {600,900,1400,1600,1900,2200}:
for direction in {1,2}:
finished_set.to_csv(str(tod)+'_'+str(month)+'_to_'+str(month_2)+'_dir_'+str(direction)+'.csv')
In [6]:
# Batch version: run the full month-to-month comparison pipeline for every
# ordered month pair in 2010, both directions and all time-of-day periods,
# writing one csv per (tod, month pair, direction) into data/.
# NOTE(review): the original cell lost all loop-body indentation on export and
# was not runnable as shown; the structure below is reconstructed from the
# identical single-slice cells In[141]/In[142] above.
from pyproj import Proj, transform
import csv

dow = 1                              # weekdays only
# lists (not sets) so iteration order is deterministic and chronological
years_list = [2010]
months_list = ['01', '02', '03', '04', '05', '06',
               '07', '08', '09', '10', '11', '12']

# 'YYYYMM01' stamps; zero padding makes the string comparison below chronological
year_month_day = []
for year in years_list:
    for month in months_list:
        year_month_day.append(str(year) + month + '01')

# loop-invariant setup, hoisted out of the loops:
# columns of interest shared by both month queries
cols = ['MONTH', 'QSTOP', 'STOPNAME', 'LAT', 'LON', 'ROUTE', 'TOD',
        'LOAD_DEP', 'CAPACITY', 'ON', 'OFF', 'MAXVEL', 'HEADWAY',
        'ONTIME2', 'ONTIME10']
# per-stop aggregation spec; 'mean' strings fix the original np.mean
# reference (numpy was never imported in this notebook)
agg_cols = ['LAT', 'LON', 'ON', 'OFF', 'LOAD_DEP', 'CAPACITY',
            'MAXVEL', 'HEADWAY', 'ONTIME2', 'ONTIME10']
agg_spec = dict((c + s, 'mean') for c in agg_cols for s in ('_x', '_y'))
# source coordinates are plain lon/lat; target is UTM zone 10N on WGS84, metres
inProj = Proj(proj='longlat')
outProj = Proj(proj='utm', zone=10, datum='WGS84', ellps='WGS84', units='m')

for month_1 in year_month_day:
    for month_2 in year_month_day:
        if month_2 > month_1:            # each ordered pair once, month_1 earlier
            for direction in [1, 0]:
                for tod in [300, 600, 900, 1400, 1600, 1900, 2200]:
                    # month 1 - query and select relevant columns
                    q1 = store.select('tod_route_stops',
                        where=['DOW==' + str(dow), 'MONTH==' + str(month_1),
                               'DIR==' + str(direction), 'TOD==' + str(tod)])
                    tod_r_s_q = q1[cols]
                    # month 2 - same query for the comparison month
                    q2 = store.select('tod_route_stops',
                        where=['DOW==' + str(dow), 'MONTH==' + str(month_2),
                               'DIR==' + str(direction), 'TOD==' + str(tod)])
                    tod_r_s_q2 = q2[cols]
                    # outer join on stop ID flags stops missing from one month
                    joined = pd.merge(tod_r_s_q, tod_r_s_q2,
                                      on='QSTOP', how='outer')
                    # average each measure per stop/route/month/TOD group, then
                    # flatten the group keys back into columns (replaces the
                    # temp.csv write/read header-flattening hack)
                    merged_data = joined.groupby(
                        ['QSTOP', 'ROUTE_x', 'ROUTE_y', 'MONTH_x', 'MONTH_y',
                         'TOD_x', 'TOD_y']).agg(agg_spec).reset_index()
                    # keep only matching route-route pairs; .copy() avoids
                    # SettingWithCopyWarning on the assignments below
                    merged_data_clean = merged_data[
                        merged_data.ROUTE_x == merged_data.ROUTE_y].copy()
                    # single plot position per stop: mean of the two months
                    merged_data_clean['LON_xy'] = (
                        merged_data_clean.LON_x + merged_data_clean.LON_y) * 0.5
                    merged_data_clean['LAT_xy'] = (
                        merged_data_clean.LAT_x + merged_data_clean.LAT_y) * 0.5
                    # project lon/lat to UTM via a temporary csv (iterating a
                    # DataFrame and assigning back proved unreliable)
                    # NOTE(review): 'wb' is the Python 2 csv mode; on Python 3
                    # use open(..., 'w', newline='')
                    with open('projected-coordinates.csv', 'wb') as csvfile:
                        csvwriter = csv.writer(csvfile, delimiter=',',
                                               quotechar='|',
                                               quoting=csv.QUOTE_MINIMAL)
                        csvwriter.writerow(['i', 'proj_x', 'proj_y'])
                        for i, entry in merged_data_clean.iterrows():
                            x2, y2 = transform(inProj, outProj,
                                               entry['LON_xy'], entry['LAT_xy'])
                            csvwriter.writerow([i, x2, y2])
                    # read back, indexed by the original row label 'i'
                    # (pd.read_csv replaces the deprecated pd.DataFrame.from_csv)
                    projected = pd.read_csv('projected-coordinates.csv',
                                            index_col=0)
                    finished_set = merged_data_clean.join(projected)
                    # one output file per tod / month pair / direction
                    # NOTE(review): the data/ directory must already exist
                    finished_set.to_csv('data/' + str(tod) + '_' + str(month_1)
                                        + '_to_' + str(month_2) + '_dir_'
                                        + str(direction) + '.csv')
In [ ]: