In [2]:
# Imports data from HDF5 file containing AVL/APC data from SFMTA's Muni Bus Network
# Aggregates a number of performance variables into time periods for comparison
# Formats for visualisation in formats using projected coordinate system

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
import matplotlib.pyplot as plt

pd.options.display.max_columns = 150
pd.options.display.float_format='{:,.2f}'.format

import sys
sys.path.append('C:/CASA/Workspace/sfdata_wrangler/sfdata_wrangler')

from SFMuniDataHelper import SFMuniDataHelper
from DataFrameViewer import DataFrameViewer

sfmuniHelper = SFMuniDataHelper()
viewer = DataFrameViewer()

In [13]:
years_list = {2008,2009,2010,2011,2012}
months_list = {'01','02','03','04','05','06','07','08','09','10','11','12'}
year_month_day = []

for year in years_list:
    for month in months_list:
       year_month_day.append(str(year)+month+'01')

print year_month_day


['20081101', '20081001', '20081201', '20080601', '20080701', '20080401', '20080501', '20080301', '20080801', '20080901', '20080201', '20080101', '20091101', '20091001', '20091201', '20090601', '20090701', '20090401', '20090501', '20090301', '20090801', '20090901', '20090201', '20090101', '20101101', '20101001', '20101201', '20100601', '20100701', '20100401', '20100501', '20100301', '20100801', '20100901', '20100201', '20100101', '20111101', '20111001', '20111201', '20110601', '20110701', '20110401', '20110501', '20110301', '20110801', '20110901', '20110201', '20110101', '20121101', '20121001', '20121201', '20120601', '20120701', '20120401', '20120501', '20120301', '20120801', '20120901', '20120201', '20120101']

In [5]:
# eventually convert filenames to arguments
hdffile = "D:/sfbusdatastore/sfmuni_aggregate.h5"
store = pd.HDFStore(hdffile)
store


Out[5]:
<class 'pandas.io.pytables.HDFStore'>
File path: D:/sfbusdatastore/sfmuni_aggregate.h5
/average                      frame_table  (typ->appendable,nrows->28692042,ncols->85,indexers->[index],dc->[MONTH,DOW,ROUTE,PATTCODE,DIR,TRIP,SEQ,NUMDAYS,OBSTRIPS,ROUTEA,VEHNO,SCHOOL,LASTTRIP,NEXTTRIP,HEADWAY,HEADWAY_STD,TOD,QSTOP,STOPNAME,TIMEPOINT,EOL,LAT,LAT_STD,LON,LON_STD,NS,EW,MAXVEL,MAXVEL_STD,MILES,MILES_STD,GODOM,GODOM_STD,VEHMILES,VEHMILES_STD,ON,ON_STD,OFF,OFF_STD,LOAD_ARR,LOAD_ARR_STD,LOAD_DEP,LOAD_DEP_STD,PASSMILES,PASSMILES_STD,PASSHOURS,PASSHOURS_STD,RDBRDNGS,RDBRDNGS_STD,LOADCODE,LOADCODE_STD,CAPACITY,CAPACITY_STD,DOORCYCLES,DOORCYCLES_STD,WHEELCHAIR,WHEELCHAIR_STD,BIKERACK,BIKERACK_STD,TIMESTOP,TIMESTOP_S,TIMESTOP_DEV,TIMESTOP_DEV_STD,DOORCLOSE,DOORCLOSE_S,DOORCLOSE_DEV,DOORCLOSE_DEV_STD,DWELL,DWELL_STD,DWELL_S,PULLOUT,PULLDWELL,PULLDWELL_STD,RUNTIME,RUNTIME_STD,RUNTIME_S,RECOVERY,RECOVERY_STD,RECOVERY_S,DLPMIN,DLPMIN_STD,ONTIME2,ONTIME2_STD,ONTIME10,ONTIME10_STD])
/daily_route_stops            frame_table  (typ->appendable,nrows->987424,ncols->47,indexers->[index],dc->[MONTH,DOW,ROUTE,PATTCODE,DIR,SEQ,NUMDAYS,DAILYTRIPS,TOTTRIPS,OBSTRIPS,ROUTEA,HEADWAY,QSTOP,STOPNAME,TIMEPOINT,EOL,LAT,LON,NS,EW,MAXVEL,MILES,GODOM,VEHMILES,ON,OFF,LOAD_ARR,LOAD_DEP,PASSMILES,PASSHOURS,RDBRDNGS,CAPACITY,DOORCYCLES,WHEELCHAIR,BIKERACK,TIMESTOP_DEV,DOORCLOSE_DEV,DWELL,DWELL_S,PULLDWELL,RUNTIME,RUNTIME_S,RECOVERY,RECOVERY_S,DLPMIN,ONTIME2,ONTIME10])                                                                                                                                                                                                                                                                                                                                                                                                                                        
/daily_routes                 frame_table  (typ->appendable,nrows->12784,ncols->32,indexers->[index],dc->[MONTH,DOW,ROUTE,DIR,NUMDAYS,DAILYTRIPS,TOTTRIPS,OBSTRIPS,ROUTEA,HEADWAY,VEHMILES,ON,OFF,PASSMILES,PASSHOURS,RDBRDNGS,CAPACITY,DOORCYCLES,WHEELCHAIR,BIKERACK,TIMESTOP_DEV,DOORCLOSE_DEV,DWELL,DWELL_S,PULLDWELL,RUNTIME,RUNTIME_S,RECOVERY,RECOVERY_S,DLPMIN,ONTIME2,ONTIME10])                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
/daily_stops                  frame_table  (typ->appendable,nrows->381941,ncols->25,indexers->[index],dc->[MONTH,DOW,NUMDAYS,DAILYTRIPS,TOTTRIPS,OBSTRIPS,QSTOP,STOPNAME,TIMEPOINT,EOL,LAT,LON,ON,OFF,RDBRDNGS,DOORCYCLES,WHEELCHAIR,BIKERACK,TIMESTOP_DEV,DOORCLOSE_DEV,DWELL,DWELL_S,PULLDWELL,ONTIME2,ONTIME10])                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
/daily_system                 frame_table  (typ->appendable,nrows->121,ncols->28,indexers->[index],dc->[MONTH,DOW,NUMDAYS,DAILYTRIPS,TOTTRIPS,OBSTRIPS,VEHMILES,ON,OFF,PASSMILES,PASSHOURS,RDBRDNGS,CAPACITY,DOORCYCLES,WHEELCHAIR,BIKERACK,TIMESTOP_DEV,DOORCLOSE_DEV,DWELL,DWELL_S,PULLDWELL,RUNTIME,RUNTIME_S,RECOVERY,RECOVERY_S,DLPMIN,ONTIME2,ONTIME10])                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
/tod_route_stops              frame_table  (typ->appendable,nrows->3810220,ncols->48,indexers->[index],dc->[TOD,MONTH,DOW,ROUTE,PATTCODE,DIR,SEQ,NUMDAYS,DAILYTRIPS,TOTTRIPS,OBSTRIPS,ROUTEA,HEADWAY,QSTOP,STOPNAME,TIMEPOINT,EOL,LAT,LON,NS,EW,MAXVEL,MILES,GODOM,VEHMILES,ON,OFF,LOAD_ARR,LOAD_DEP,PASSMILES,PASSHOURS,RDBRDNGS,CAPACITY,DOORCYCLES,WHEELCHAIR,BIKERACK,TIMESTOP_DEV,DOORCLOSE_DEV,DWELL,DWELL_S,PULLDWELL,RUNTIME,RUNTIME_S,RECOVERY,RECOVERY_S,DLPMIN,ONTIME2,ONTIME10])                                                                                                                                                                                                                                                                                                                                                                                                                                   
/tod_routes                   frame_table  (typ->appendable,nrows->67767,ncols->33,indexers->[index],dc->[TOD,MONTH,DOW,ROUTE,DIR,NUMDAYS,DAILYTRIPS,TOTTRIPS,OBSTRIPS,ROUTEA,HEADWAY,VEHMILES,ON,OFF,PASSMILES,PASSHOURS,RDBRDNGS,CAPACITY,DOORCYCLES,WHEELCHAIR,BIKERACK,TIMESTOP_DEV,DOORCLOSE_DEV,DWELL,DWELL_S,PULLDWELL,RUNTIME,RUNTIME_S,RECOVERY,RECOVERY_S,DLPMIN,ONTIME2,ONTIME10])                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  
/tod_stops                    frame_table  (typ->appendable,nrows->2367349,ncols->26,indexers->[index],dc->[TOD,MONTH,DOW,NUMDAYS,DAILYTRIPS,TOTTRIPS,OBSTRIPS,QSTOP,STOPNAME,TIMEPOINT,EOL,LAT,LON,ON,OFF,RDBRDNGS,DOORCYCLES,WHEELCHAIR,BIKERACK,TIMESTOP_DEV,DOORCLOSE_DEV,DWELL,DWELL_S,PULLDWELL,ONTIME2,ONTIME10])                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
/tod_system                   frame_table  (typ->appendable,nrows->926,ncols->29,indexers->[index],dc->[TOD,MONTH,DOW,NUMDAYS,DAILYTRIPS,TOTTRIPS,OBSTRIPS,VEHMILES,ON,OFF,PASSMILES,PASSHOURS,RDBRDNGS,CAPACITY,DOORCYCLES,WHEELCHAIR,BIKERACK,TIMESTOP_DEV,DOORCLOSE_DEV,DWELL,DWELL_S,PULLDWELL,RUNTIME,RUNTIME_S,RECOVERY,RECOVERY_S,DLPMIN,ONTIME2,ONTIME10])                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [140]:
# Specify direction, 1 = Inbound (to downtown), 0 = Outbound
direction = 1

# Specify - Weekday = 1, Sat = 2, Sun =3
dow = 1

# Specify TODs {300,600,900,1400,1600,1900,2200}
tod = 600

# Specify months
month = 20100301
month_2 = 20100601

In [ ]:


In [141]:
#month 1 - query and select relevant columns
tod_route_stops_q = store.select('tod_route_stops',where=['DOW=='+str(dow),'MONTH=='+str(month),'DIR=='+str(direction),'TOD=='+str(tod)])
tod_r_s_q = tod_route_stops_q[['MONTH','QSTOP','STOPNAME','LAT','LON','ROUTE','TOD','LOAD_DEP','CAPACITY','ON','OFF','MAXVEL','HEADWAY','ONTIME2','ONTIME10']]

#month 2 - query and select random columns
tod_route_stops_q2 = store.select('tod_route_stops',where=['DOW=='+str(dow),'MONTH=='+str(month_2),'DIR=='+str(direction),'TOD=='+str(tod)])
tod_r_s_q2 = tod_route_stops_q2[['MONTH','QSTOP','STOPNAME','LAT','LON','ROUTE','TOD','LOAD_DEP','CAPACITY','ON','OFF','MAXVEL','HEADWAY','ONTIME2','ONTIME10']]

#perform outer join on unique stop ID, flags stops missing between sets
tod_r_s_q3 = pd.merge(tod_r_s_q, tod_r_s_q2, on='QSTOP', how='outer')

#aggregates calculates the average value for each stop
merged_data = tod_r_s_q3.groupby(['QSTOP','ROUTE_x','ROUTE_y','MONTH_x','MONTH_y','TOD_x','TOD_y']).agg({'LAT_x': np.mean, 'LAT_y': np.mean, 'LON_x': np.mean, 'LON_y': np.mean, 'ON_x': np.mean, 'ON_y': np.mean, 'OFF_x': np.mean, 'OFF_y': np.mean, 'LOAD_DEP_x': np.mean, 'LOAD_DEP_y': np.mean, 'CAPACITY_x': np.mean, 'CAPACITY_y': np.mean, 'MAXVEL_x': np.mean, 'MAXVEL_y': np.mean, 'HEADWAY_x': np.mean, 'HEADWAY_y': np.mean, 'ONTIME2_x': np.mean, 'ONTIME2_y': np.mean, 'ONTIME10_x': np.mean, 'ONTIME10_y': np.mean})
    
#quick way to remove unwanted header formatting
merged_data.to_csv('temp.csv')
merged_data_csv = pd.read_csv('temp.csv')
    
#removes duplicate unmatching route-route pairs
merged_data_clean = merged_data_csv[merged_data_csv.ROUTE_x == merged_data_csv.ROUTE_y]

#calculate average lat/lon for each stop
LON_xy = (merged_data_clean.LON_x + merged_data_clean.LON_y)*0.5 
LAT_xy = (merged_data_clean.LAT_x + merged_data_clean.LAT_y)*0.5

#add as columns to dataframe
merged_data_clean['LON_xy'] = LON_xy
merged_data_clean['LAT_xy'] = LAT_xy

merged_data_clean.head()


Out[141]:
QSTOP ROUTE_x ROUTE_y MONTH_x MONTH_y TOD_x TOD_y OFF_y OFF_x HEADWAY_y HEADWAY_x LON_y LON_x ONTIME2_y CAPACITY_y ONTIME10_x ONTIME10_y CAPACITY_x LOAD_DEP_y LOAD_DEP_x LAT_y LAT_x MAXVEL_x MAXVEL_y ONTIME2_x ON_x ON_y LON_xy LAT_xy
0 390.00 17.00 17.00 2010-03-01 00:00:00 2010-06-01 00:00:00 600.00 600.00 13.00 20.00 27.67 19.71 -122.48 -122.48 0.33 378.00 1.00 1.00 441.00 82.25 54.00 37.72 37.72 24.86 10.04 0.29 3.00 16.75 -122.48 37.72
5 390.00 28.00 28.00 2010-03-01 00:00:00 2010-06-01 00:00:00 600.00 600.00 127.66 105.67 11.29 9.42 -122.48 -122.48 0.97 1,071.00 1.00 1.00 756.00 400.10 319.33 37.72 37.72 24.58 22.61 0.97 160.67 127.65 -122.48 37.72
10 390.00 29.00 29.00 2010-03-01 00:00:00 2010-06-01 00:00:00 600.00 600.00 64.62 58.21 12.24 7.38 -122.48 -122.48 0.86 630.00 0.92 1.00 283.50 232.42 110.96 37.72 37.72 19.05 19.49 0.53 8.67 21.34 -122.48 37.72
15 390.00 528.00 528.00 2010-03-01 00:00:00 2010-06-01 00:00:00 600.00 600.00 102.73 107.50 12.00 10.60 -122.47 -122.48 0.91 567.00 1.00 1.00 630.00 177.26 217.00 37.72 37.72 24.70 24.05 1.00 88.00 71.72 -122.48 37.72
16 392.00 28.00 28.00 2010-03-01 00:00:00 2010-06-01 00:00:00 600.00 600.00 16.91 11.00 11.29 9.31 -122.48 -122.48 0.88 1,071.00 1.00 1.00 819.00 560.72 407.83 37.77 37.77 35.00 32.01 0.63 47.17 59.54 -122.48 37.77

5 rows × 29 columns


In [142]:
#time to project from lat/lon into local projection system
# for this, State Plane California Zone 3
# http://spatialreference.org/ref/epsg/nad83-california-zone-3-ftus/

from pyproj import Proj, transform
import csv

# current format 
inProj = Proj(proj='longlat')

# format to project to
#doesn't seem to work in Processing #outProj = Proj(init='epsg:2227')
#outProj = Proj(init='epsg:3786')
outProj = Proj(proj='utm',zone=10,datim'WGS84',ellps='WGS84',units='m')

# utilise pyproj library to convert latlon to projection, write to temporary csv (dataframe didn't take iterating well)
with open('projected-coordinates.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(['i','proj_x','proj_y'])
    for i,entry in merged_data_clean.iterrows():
        x1,y1 = entry['LON_xy'],entry['LAT_xy']
        x2,y2 = transform(inProj,outProj,x1,y1)
        csvwriter.writerow([i,x2,y2])

projected_coordinates = pd.DataFrame.from_csv('projected-coordinates.csv')
#projected_coordinates


finished_set = merged_data_clean.join(projected_coordinates)
finished_set.head()


Out[142]:
QSTOP ROUTE_x ROUTE_y MONTH_x MONTH_y TOD_x TOD_y OFF_y OFF_x HEADWAY_y HEADWAY_x LON_y LON_x ONTIME2_y CAPACITY_y ONTIME10_x ONTIME10_y CAPACITY_x LOAD_DEP_y LOAD_DEP_x LAT_y LAT_x MAXVEL_x MAXVEL_y ONTIME2_x ON_x ON_y LON_xy LAT_xy proj_x proj_y
0 390.00 17.00 17.00 2010-03-01 00:00:00 2010-06-01 00:00:00 600.00 600.00 13.00 20.00 27.67 19.71 -122.48 -122.48 0.33 378.00 1.00 1.00 441.00 82.25 54.00 37.72 37.72 24.86 10.04 0.29 3.00 16.75 -122.48 37.72 -13,618,620.41 4,194,448.10
5 390.00 28.00 28.00 2010-03-01 00:00:00 2010-06-01 00:00:00 600.00 600.00 127.66 105.67 11.29 9.42 -122.48 -122.48 0.97 1,071.00 1.00 1.00 756.00 400.10 319.33 37.72 37.72 24.58 22.61 0.97 160.67 127.65 -122.48 37.72 -13,618,614.51 4,194,448.20
10 390.00 29.00 29.00 2010-03-01 00:00:00 2010-06-01 00:00:00 600.00 600.00 64.62 58.21 12.24 7.38 -122.48 -122.48 0.86 630.00 0.92 1.00 283.50 232.42 110.96 37.72 37.72 19.05 19.49 0.53 8.67 21.34 -122.48 37.72 -13,618,615.53 4,194,451.93
15 390.00 528.00 528.00 2010-03-01 00:00:00 2010-06-01 00:00:00 600.00 600.00 102.73 107.50 12.00 10.60 -122.47 -122.48 0.91 567.00 1.00 1.00 630.00 177.26 217.00 37.72 37.72 24.70 24.05 1.00 88.00 71.72 -122.48 37.72 -13,618,613.66 4,194,447.28
16 392.00 28.00 28.00 2010-03-01 00:00:00 2010-06-01 00:00:00 600.00 600.00 16.91 11.00 11.29 9.31 -122.48 -122.48 0.88 1,071.00 1.00 1.00 819.00 560.72 407.83 37.77 37.77 35.00 32.01 0.63 47.17 59.54 -122.48 37.77 -13,618,857.57 4,199,297.86

5 rows × 31 columns


In [143]:
#save as csv
finished_set.to_csv(str(tod)+'_'+str(month)+'_to_'+str(month_2)+'_dir_'+str(direction)+'.csv')

In [144]:
for tod in {600,900,1400,1600,1900,2200}:
    for direction in {1,2}:
        finished_set.to_csv(str(tod)+'_'+str(month)+'_to_'+str(month_2)+'_dir_'+str(direction)+'.csv')


1600 1
1600 2
1400 1
1400 2
900 1
900 2
1900 1
1900 2
600 1
600 2
2200 1
2200 2

In [6]:
years_list = {2010}
months_list = {'01','02','03','04','05','06','07','08','09','10','11','12'}
dow = 1
year_month_day = []

for year in years_list:
    for month in months_list:
       year_month_day.append(str(year)+month+'01')

for month_1 in year_month_day:
    for month_2 in year_month_day:
        if month_2 > month_1:
            for direction in {1,0}:   
                for tod in {300,600,900,1400,1600,1900,2200}:
                    #month 1 - query and select relevant columns
                    tod_route_stops_q = store.select('tod_route_stops',where=['DOW=='+str(dow),'MONTH=='+str(month_1),'DIR=='+str(direction),'TOD=='+str(tod)])
                    tod_r_s_q = tod_route_stops_q[['MONTH','QSTOP','STOPNAME','LAT','LON','ROUTE','TOD','LOAD_DEP','CAPACITY','ON','OFF','MAXVEL','HEADWAY','ONTIME2','ONTIME10']]
                    
                    #month 2 - query and select random columns
                    tod_route_stops_q2 = store.select('tod_route_stops',where=['DOW=='+str(dow),'MONTH=='+str(month_2),'DIR=='+str(direction),'TOD=='+str(tod)])
                    tod_r_s_q2 = tod_route_stops_q2[['MONTH','QSTOP','STOPNAME','LAT','LON','ROUTE','TOD','LOAD_DEP','CAPACITY','ON','OFF','MAXVEL','HEADWAY','ONTIME2','ONTIME10']]
                    
                    #perform outer join on unique stop ID, flags stops missing between sets
                    tod_r_s_q3 = pd.merge(tod_r_s_q, tod_r_s_q2, on='QSTOP', how='outer')
                    
                    #aggregates calculates the average value for each stop
                    merged_data = tod_r_s_q3.groupby(['QSTOP','ROUTE_x','ROUTE_y','MONTH_x','MONTH_y','TOD_x','TOD_y']).agg({'LAT_x': np.mean, 'LAT_y': np.mean, 'LON_x': np.mean, 'LON_y': np.mean, 'ON_x': np.mean, 'ON_y': np.mean, 'OFF_x': np.mean, 'OFF_y': np.mean, 'LOAD_DEP_x': np.mean, 'LOAD_DEP_y': np.mean, 'CAPACITY_x': np.mean, 'CAPACITY_y': np.mean, 'MAXVEL_x': np.mean, 'MAXVEL_y': np.mean, 'HEADWAY_x': np.mean, 'HEADWAY_y': np.mean, 'ONTIME2_x': np.mean, 'ONTIME2_y': np.mean, 'ONTIME10_x': np.mean, 'ONTIME10_y': np.mean})
                        
                    #quick way to remove unwanted header formatting
                    merged_data.to_csv('temp.csv')
                    merged_data_csv = pd.read_csv('temp.csv')
                        
                    #removes duplicate unmatching route-route pairs
                    merged_data_clean = merged_data_csv[merged_data_csv.ROUTE_x == merged_data_csv.ROUTE_y]
                    
                    #calculate average lat/lon for each stop
                    LON_xy = (merged_data_clean.LON_x + merged_data_clean.LON_y)*0.5 
                    LAT_xy = (merged_data_clean.LAT_x + merged_data_clean.LAT_y)*0.5
                    
                    #add as columns to dataframe
                    merged_data_clean['LON_xy'] = LON_xy
                    merged_data_clean['LAT_xy'] = LAT_xy
                    
                    merged_data_clean.head()
                    
                    
                    #time to project from lat/lon into local projection system
                    # for this, State Plane California Zone 3
                    # http://spatialreference.org/ref/epsg/nad83-california-zone-3-ftus/
                    
                    from pyproj import Proj, transform
                    import csv
                    
                    # current format
                    inProj = Proj(proj='longlat')
                    
                    # format to project to
                    #doesn't seem to work in Processing #outProj = Proj(init='epsg:2227')
                    #outProj = Proj(init='epsg:3786')
                    outProj = Proj(proj='utm',zone=10,datum='WGS84',ellps='WGS84',units='m')
                    
                    # utilise pyproj library to convert latlon to projection, write to temporary csv (dataframe didn't take iterating well)
                    with open('projected-coordinates.csv', 'wb') as csvfile:
                        csvwriter = csv.writer(csvfile, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
                        csvwriter.writerow(['i','proj_x','proj_y'])
                        for i,entry in merged_data_clean.iterrows():
                            x1,y1 = entry['LON_xy'],entry['LAT_xy']
                            x2,y2 = transform(inProj,outProj,x1,y1)
                            csvwriter.writerow([i,x2,y2])
                    
                    projected_coordinates = pd.DataFrame.from_csv('projected-coordinates.csv')
                    #projected_coordinates
                    
                    
                    finished_set = merged_data_clean.join(projected_coordinates)
                    finished_set.head()
        
                    finished_set.to_csv('data/'+str(tod)+'_'+str(month_1)+'_to_'+str(month_2)+'_dir_'+str(direction)+'.csv')

In [ ]: