Building on the API Exploration Notebook and the Filtering Observed Arrivals notebook. Let's explore a different approach for analyzing the data. Note that, I modified the api scraper to only retrieve the soonest time from the next subway API. This should (hopefully) help with some of the issues we were previously having. I made a new database and ran the API for a few hours on Sunday polling only St. George station (station_id == 10) at a poll frequency of once every 10 seconds. I will post the data online so that others can try it out.
Created by Rami on May 6/2018
In [36]:
import datetime
from psycopg2 import connect
import configparser
import pandas as pd
import pandas.io.sql as pandasql
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider
In [12]:
%matplotlib qt
In [13]:
try:
con.close()
except:
print("No existing connection... moving on")
In [14]:
CONFIG = configparser.ConfigParser(interpolation=None)
CONFIG.read('../db.cfg')
dbset = CONFIG['DBSETTINGS']
con = connect(**dbset)
In [15]:
sql = '''SELECT requestid, stationid, lineid, create_date, request_date, station_char, subwayline, system_message_type,
timint, traindirection, trainid, train_message
FROM requests
INNER JOIN ntas_data USING (requestid)
WHERE request_date >= '2017-06-14'::DATE + interval '6 hours 5 minutes'
AND request_date < '2017-06-14'::DATE + interval '29 hours'
AND stationid = 10
AND traindirection = 'South'
ORDER BY request_date
'''
In [26]:
stg_south = pandasql.read_sql(sql, con)
In [27]:
stg_south
Out[27]:
In [28]:
stg_south_resamp = stg_south[stg_south.index % 3 == 0]
In [29]:
stg_south_resamp
Out[29]:
Now we need to process the data to extract some useful information from the raw ntas_data. To do this we're going to go row by row through the table shown above to get arrival times, departure times and wait times.
arrival_times
are the times at which a train arrives at St. George station
departure_times
are the times at which a train leaves St. George station
all_wait_times
are all the reported wait times from every API call (which in this case is every 10 seconds)
expected_wait_times
are the expected wait times immediately after a train has departed the station. They represent the worst case wait times.
In [30]:
arrival_times = []
departure_times = []
all_wait_times = []
all_time_stamps = []
expected_wait_times = []
prev_arrival_train_id = -1
for index, row in stg_south_resamp.iterrows():
if index == 0:
prev_departure_train_id = row['trainid']
all_wait_times.append(row['timint'])
all_time_stamps.append(row['create_date'])
if (row['trainid'] != prev_arrival_train_id):
arrival_times.append(row['create_date'])
prev_arrival_train_id = row['trainid']
#elif (row['trainid'] != prev_departure_train_id):
departure_times.append(row['create_date'])
expected_wait_times.append(row['timint'])
#prev_departure_train_id = row['trainid']
We can look at all the reported wait times. While this is somewhat interesting, it doesn't tell us very much
In [206]:
plt.plot(all_time_stamps,all_wait_times)
plt.xlabel('Time')
plt.xticks(fontsize=10, rotation=45)
plt.ylabel('Wait Time (mins)')
plt.title('All reported wait times at St. George')
plt.savefig('all_wait_times.png', dpi=500)
plt.show()
In [286]:
def timeToArrival(all_time_stamps,all_wait_times,arrival_times):
actual_wait_times = []
i = 0
k = 0
arrival_time = arrival_times[i]
for time in all_time_stamps:
if (all_wait_times[k] == 0):
actual_wait_times.append(arrival_times[0]-arrival_times[0])
k+=1
continue
while ((arrival_time - time).total_seconds() < 0):
i+=1
if (i > (len(arrival_times) -1)):
break
arrival_time = arrival_times[i]
actual_wait_times.append(arrival_time - time)
k+=1
return actual_wait_times
print(len(all_time_stamps[0:-1]))
actual_wait_times_all = timeToArrival(all_time_stamps,all_wait_times,arrival_times)
In [295]:
def sliding_window_filter(input_mat,window_size,overlap):
average_time = []
max_time = []
for i in range(0,len(input_mat)-window_size,overlap):
window = input_mat[i:(i+window_size)]
average_time.append(np.mean(window))
max_time.append(np.mean(window))
return average_time #, max_time
window_size = 30
overlap = 25
#average_time, max_time = sliding_window_filter(all_wait_times,window_size, overlap)
#times = all_time_stamps[0:len(all_time_stamps)-window_size:overlap]
#times = all_time_stamps[0:len(actual_wait_times_all)]
times = all_time_stamps[0:len(all_time_stamps)-window_size:overlap]
plt.plot(times,np.floor(sliding_window_filter(convert_timedelta_to_mins(actual_wait_times_all),window_size,overlap)))
#average_time, max_time = sliding_window_filter(convert_timedelta_to_mins(actual_wait_times_all),window_size, overlap)
plt.plot(times,np.ceil(sliding_window_filter(all_wait_times,window_size,overlap)))
plt.xlabel('Time')
plt.xticks(fontsize=10, rotation=45)
plt.ylabel('Wait Time (mins)')
plt.title('All reported wait times at St. George')
plt.show()
In [289]:
class sliding_figure:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider
def __init__(self,all_time_stamps,all_wait_times):
self.fig, self.ax = plt.subplots()
plt.subplots_adjust(bottom=0.25)
self.t = all_time_stamps;
self.s = all_wait_times;
self.l, = plt.plot(self.t,self.s)
self.y_min = 0.0;
self.y_max = max(self.s)
plt.axis([self.t[0], self.t[100], self.y_min, self.y_max])
x_dt = self.t[100] - self.t[0]
self.axcolor = 'lightgoldenrodyellow'
self.axpos = plt.axes([0.2, 0.1, 0.65, 0.03], facecolor=axcolor)
self.spos = Slider(self.axpos, 'Pos', matplotlib.dates.date2num(self.t[0]), matplotlib.dates.date2num(self.t[-1]))
#self.showPlot()
# pretty date names
plt.gcf().autofmt_xdate()
self.plt = plt
#self.showPlot()
def update(self,val):
pos = self.spos.val
self.xmax_time = matplotlib.dates.num2date(pos) + x_dt
self.xmin_time = pos
self.ax.axis([self.xmin_time, self.xmax_time, self.y_min, self.y_max])
fig.canvas.draw_idle()
def showPlot(self):
self.spos.on_changed(self.update)
self.plt.show()
In [296]:
wait_times_figure = sliding_figure(all_time_stamps, all_wait_times)
wait_times_figure.showPlot()
In [84]:
def time_delta(times):
delta_times = []
for n in range(0,len(times)-1):
time_diff = times[n+1] - times[n]
delta_times.append(time_diff/np.timedelta64(1, 's'))
return delta_times
In [85]:
delta_times = time_delta(arrival_times)
In [86]:
#delta_times
In [162]:
plt.plot(arrival_times[:-1],np.multiply(delta_times,1/60.0))
plt.xlabel('Time')
plt.xticks(fontsize=10, rotation=45)
plt.ylabel('Headway (mins)')
plt.title('Headway between trains as they approach St. George')
plt.savefig('headway.png', dpi=500)
In [88]:
time_at_station = np.subtract(departure_times[:],arrival_times[:])
In [89]:
#time_at_station
In [133]:
def convert_timedelta_to_mins(mat):
result = []
for element in mat:
result.append((element/np.timedelta64(1, 'm')))
return result
In [91]:
time_at_station_mins = convert_timedelta_to_mins(time_at_station)
In [92]:
plt.plot(departure_times,time_at_station_mins)
plt.xlabel('Time')
plt.xticks(fontsize=10, rotation=45)
plt.ylabel('Duration of time at station (mins)')
plt.title('Duration of time that trains spend at St. George Station')
Out[92]:
In [93]:
#expected_wait_times
In [165]:
plt.plot(arrival_times,expected_wait_times)
plt.ylabel('Expected Wait Time (mins)')
plt.xticks(fontsize=10, rotation=45)
plt.xlabel('Time')
plt.title('Worst-case expected wait times for next train at St. George')
plt.savefig('expected_wait_times.png', dpi=500)
In [ ]:
It's instructive if we can look at the actual worst-case wait time and compare this to the expected worst case wait time. In this case, we will also consider the actual worst-case wait time as the time between when a train departs and the next train arrives (i.e the difference between the arrival time and the previous departed time)
In [142]:
actual_wait_times = np.subtract(arrival_times[1:],arrival_times[:-1])
In [143]:
actual_wait_times_mins = convert_timedelta_to_mins(actual_wait_times)
In [166]:
plt.plot(arrival_times[1:],actual_wait_times_mins,color = 'C1')
plt.xlabel('Time')
plt.xticks(fontsize=10, rotation=45)
plt.ylabel('Actual wait time (mins)')
plt.title('Worst-case actual wait times for next train at St. George')
plt.savefig('actual_wait_times.png', dpi=500)
plt.show()
print(len(actual_wait_times_mins))
In [177]:
window_size = 15
overlap = 14
average_time, max_time = sliding_window_filter(actual_wait_times_mins ,window_size, overlap)
print(len(average_time))
times = arrival_times[0:len(all_time_stamps)-window_size:overlap]
print(len(times))
plt.plot(times[1:],average_time)
plt.plot(times[1:],max_time)
plt.xlabel('Time')
plt.xticks(fontsize=10, rotation=45)
plt.ylabel('Wait Time (mins)')
plt.title('All reported wait times at St. George')
plt.show()
In [139]:
print(len(expected_wait_times))
print(len(arrival_times))
print(len(arrival_times))
print(len(actual_wait_times_mins))
type(arrival_times[1])
arrival_times_pdt = []
for item in arrival_times:
arrival_times_pdt.append(datetime.time(item.to_pydatetime().hour,item.to_pydatetime().minute))
arrival_times_pdt[2]
Out[139]:
In [188]:
plt.plot(arrival_times,expected_wait_times)
plt.plot(arrival_times[1:],np.floor(actual_wait_times_mins[:]))
#plt.legend(['Expected Wait for Next Train','Actual Wait Time for Next Train'],
# bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('Time')
plt.xticks(fontsize=10, rotation=45)
plt.ylabel('Wait Time (mins)')
plt.title('Comparing actual and expected wait times at St. George')
lgd = plt.legend(['Expected Wait for Next Train','Actual Wait Time for Next Train'],
bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.savefig('actual_and_expected_wait_times.png', bbox_extra_artists=(lgd,), bbox_inches='tight', dpi=700)
In [285]:
window_size = 15
overlap = 12
average_time = sliding_window_filter(actual_wait_times_mins,window_size, overlap)
print(len(average_time))
times = arrival_times[0:len(all_time_stamps)-window_size:overlap]
print(len(times))
plt.plot(times[1:],np.floor(average_time))
average_time = sliding_window_filter(np.ceil(expected_wait_times),window_size, overlap)
plt.plot(times[1:],np.floor(average_time))
plt.xlabel('Time')
plt.xticks(fontsize=10, rotation=45)
plt.ylabel('Wait Time (mins)')
plt.title('All reported wait times at St. George')
plt.show()
lgd = plt.legend(['Actual Wait for Next Train','Expected Wait Time for Next Train'])
We can also plot all the reported wait times too!
In [170]:
plt.plot(departure_times,expected_wait_times)
plt.plot(departure_times[1:],actual_wait_times_mins)
plt.plot(all_time_stamps,all_wait_times)
plt.legend(['Expected Wait for Next Train','Actual Wait Time for Next Train','All Reported Wait Times'],
bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('Time')
plt.xticks(fontsize=10, rotation=45)
plt.ylabel('Wait Time (mins)')
plt.title('Comparing actual and expected wait times at St. George')
Out[170]:
We can also look at how long trains spend at St. George
In [24]:
plt.plot(all_time_stamps,all_wait_times)
plt.plot(arrival_times[:],time_at_station_mins)
plt.title('Durtion of time trains spend at St.George')
plt.xlabel('Time')
plt.xticks(fontsize=10, rotation=90)
plt.ylabel('Time (mins)')
plt.legend(['All Reported Wait Times','Time train spends at station (mins)'],
bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Out[24]:
In [ ]: