Comparison can be made with the official statistics: http://www.infrabel.be/nl/over-infrabel/stiptheid/rapporten/2015/2
In [ ]:
from railfetcher import *
from datetime import datetime, date, timedelta, time
from dateutil import parser
from mpl_toolkits.basemap import Basemap
import pymysql
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
In [ ]:
def toUnix(datetime):
unix = datetime.strftime('%s')
return unix
def convertDatetime(unix):
dt = datetime.fromtimestamp(unix)
return dt
def convertDate(unix):
d = date.fromtimestamp(unix)
return d
In [ ]:
class RailDatabase():
def __init__(self, isNew):
if isNew:
self.conn = pymysql.connect(host='localhost', port=3306,
user='jnevens', passwd='XXX', db='newrailDB')
else:
self.conn = pymysql.connect(host='localhost', port=3306,
user='jnevens', passwd='XXX', db='oldrailDB')
def getAllRoutes(self, date):
C = self.conn.cursor()
C.execute('SELECT route_id FROM route WHERE date = %s', (date,))
rows = C.fetchall()
C.close()
return rows
def getStops(self, routeID):
C = self.conn.cursor()
C.execute('SELECT * FROM stop WHERE route_id = %s', (routeID,))
rows = C.fetchall()
C.close()
return rows
def getLastStop(self, routeID):
C = self.conn.cursor()
C.execute('SELECT * FROM stop WHERE route_id = %s ORDER BY arrival_datetime',
(routeID,))
rows = C.fetchall()
last = rows[-1:]
C.close()
return last
def getStations(self):
C = self.conn.cursor()
C.execute('SELECT name_nl FROM station')
rows = C.fetchall()
C.close()
return rows
def getStationName(self, stationID):
C = self.conn.cursor()
C.execute('SELECT name_nl FROM station WHERE station_id = %s', (stationID,))
row = C.fetchone()
C.close()
return row[0]
def getStationLoc(self, nameNL):
C = self.conn.cursor()
C.execute('SELECT latitude, longitude FROM station WHERE name_nl = %s', (nameNL,))
row = C.fetchone()
C.close()
return row
class Config():
def __init__(self, isNew):
self.new = isNew
def period(self):
if self.new:
return (date(2014, 12, 15), date(2015, 3, 22))
else:
return (date(2014, 10, 27), date(2014, 12, 14))
In [ ]:
conf = Config(False)
start, stop = conf.period()
diff = stop - start
period = pd.date_range(start, stop)
zeros = np.zeros((diff.days+1, 2))
db = RailDatabase(False)
#DataFrame with worst-case and avg delay on entire network, per day
#Delay is computed in a worst-case scenario, i.e. max delay of a train
#divided by the amount of trains on that day.
#In avg-case scenario, i.e. avg of avg of arrival and avg of departure delay
#divided by the amount of trains on that day.
delays = pd.DataFrame(zeros, index = period, columns = ['Worst case', 'Avg case'])
while(start <= stop):
t = time(0, 0, 0)
dt = datetime.combine(start, t)
#Get all routes for a specified date.
#Every trainID rides only once each day,
#so there is no point in asking all the trainIDs first
routes = db.getAllRoutes(toUnix(dt))
#Instead of keeping a list of worst_delays {which is O(n) in memory},
#the worst_delay is accumulated and
#divided by the amount of data points {which is O(1) in memory}
total_worst_delay = 0
worst_delay_count = 0
total_avg_delay = 0
avg_delay_count = 0
for routeRow in routes:
routeID = routeRow[0]
stops = db.getStops(routeID)
max_delay = 0
total_arrival_delay = 0
arrival_delay_count = 1
total_departure_delay = 0
departure_delay_count = 1
for stopRow in stops:
arrival_detected = stopRow[3]
departure_detected = stopRow[6]
arrival_delay = stopRow[2]
departure_delay = stopRow[5]
if arrival_detected:
total_arrival_delay += arrival_delay
arrival_delay_count += 1
if departure_detected:
total_departure_delay += departure_delay
departure_delay_count += 1
if max(arrival_delay, departure_delay) > max_delay:
max_delay = max(arrival_delay, departure_delay)
total_worst_delay += max_delay
worst_delay_count += 1
avg_arrival = float(total_arrival_delay) / float(arrival_delay_count)
avg_departure = float(total_departure_delay) / float(departure_delay_count)
total_avg_delay += np.mean([avg_arrival, avg_departure])
avg_delay_count += 1
key = start.isoformat()
delays.loc[key, 'Worst case'] = float(total_worst_delay) / float(worst_delay_count)
delays.loc[key, 'Avg case'] = float(total_avg_delay) / float(avg_delay_count)
delta = timedelta(days=1)
start = start + delta
delays
In [ ]:
delays.describe()
In [ ]:
rolling = pd.rolling_mean(delays, 4, center=True)
ax_delays = delays.plot(x_compat=True, style='--', color=['r', 'b'])
rolling.plot(color=['r','b'], ax=ax_delays, legend=0)
plt.title('Delays per day on entire network (old timetable)')
plt.xlabel('Date')
plt.ylabel('Minutes')
plt.savefig('./../../Paper/plots/old_delay_per_day.png')
In [ ]:
conf = Config(False)
start, stop = conf.period()
diff = stop - start
period = pd.date_range(start, stop)
zeros = np.zeros((diff.days+1, 2))
db = RailDatabase(False)
#DataFrame with the number of delayed trains per day
#A train is considered delayed when it suffers a delay of more than 5 minutes at any stop
#The 2nd column is the percentage of delayed trains on that day.
perctDelays = pd.DataFrame(zeros, index = period, columns = ['Delayed', 'Percentage'])
while(start <= stop):
t = time(0, 0, 0)
dt = datetime.combine(start, t)
#Get all routes for a specified date.
#Every trainID rides only once each day,
#so there is no point in asking all the trainIDs first
routes = db.getAllRoutes(toUnix(dt))
key = start.isoformat()
count = len(routes)
#Get all stops for the route
for routeRow in routes:
routeID = routeRow[0]
stops = db.getStops(routeID)
#If a stop is found in the route, with > 5min delay
#then this train is considered delayed!
for stopRow in stops:
arrival_detected = stopRow[3]
departure_detected = stopRow[6]
arrival_delay = stopRow[2]
departure_delay = stopRow[5]
if arrival_detected:
if arrival_delay > 5:
perctDelays.loc[key, 'Delayed'] += 1
break
elif departure_detected:
if departure_delay > 5:
perctDelays.loc[key, 'Delayed'] += 1
break
perctDelays.loc[key,'Percentage']=(float(perctDelays.loc[key,'Delayed'])/float(count))*100.
delta = timedelta(days=1)
start = start + delta
perctDelays
In [ ]:
nov = perctDelays[5:35]
nov.describe()
In [ ]:
perctDelays.describe()
In [ ]:
fig = plt.figure()
plt.bar(perctDelays.index, perctDelays['Percentage'], color='b')
plt.title('Percentage of delayed trains per day (old timetable)')
plt.xlabel('Date')
plt.ylabel('Percentage')
plt.ylim(ymax=60)
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/old_percentage_delays.png')
In [ ]:
conf = Config(False)
start, stop = conf.period()
period = pd.date_range(start, stop)
db = RailDatabase(False)
#Series with the amount of cancelled trains, per day
#A train is considered cancelled when arrival is not detected at its final stop
cancels = pd.Series(0, index = period)
while(start <= stop):
t = time(0, 0, 0)
dt = datetime.combine(start, t)
key = start.isoformat()
#Get all routes for a specified date.
#Every trainID rides only once each day,
#so there is no point in asking all the trainIDs first
routes = db.getAllRoutes(toUnix(dt))
for routeRow in routes:
routeID = routeRow[0]
lastStop = db.getLastStop(routeID)
for stopRow in lastStop:
arrival_detected = stopRow[3]
if not arrival_detected:
cancels[key] += 1
delta = timedelta(days=1)
start = start + delta
cancels
In [ ]:
nov = cancels[5:35]
nov.sum()
In [ ]:
cancels['2014-11-24'] = 0
cancels['2014-12-01'] = 0
cancels['2014-12-08'] = 0
cancels['2014-12-11'] = 0
In [ ]:
cancels.describe()
In [ ]:
fig = plt.figure()
plt.bar(cancels.index, cancels, color='b')
plt.title('Cancelled trains per day (old timetable) (no strike)')
plt.xlabel('Date')
plt.ylabel('Cancelled trains')
plt.ylim(ymax=350)
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/old_cancels_per_day_no_strike.png')
In [ ]:
conf = Config(False)
period = np.arange(24)
sr = pd.Series(0, index=period)
start, stop = conf.period()
while start <= stop:
weekday = start.weekday()
if weekday < 5:
t = time(0, 0, 0)
dt = datetime.combine(start, t)
db = RailDatabase(False)
routes = db.getAllRoutes(toUnix(dt))
routeCount = 0
tmp = pd.Series(0, index=period)
for routeRow in routes:
routeID = routeRow[0]
stops = db.getStops(routeID)
includeRoute = False
for stopRow in stops:
arrival_datetime = stopRow[1]
arrival_delay = stopRow[2]
arrival_detected = stopRow[3]
departure_datetime = stopRow[4]
departure_delay = stopRow[5]
departure_detected = stopRow[6]
if arrival_detected:
if arrival_delay > 0:
key = convertDatetime(arrival_datetime)
key = key.time().hour
tmp[key] += arrival_delay
includeRoute = True
elif departure_detected:
if departure_delay > 0:
key = convertDatetime(departure_datetime)
key = key.time().hour
tmp[key] += departure_delay
includeRoute = True
if includeRoute:
routeCount += 1
if routeCount == 0:
routeCount = 1
calcAverage = lambda x: float(x) / float(routeCount)
tmp = tmp.map(calcAverage)
for key in period:
a = sr[key]
b = tmp[key]
if a == 0:
sr[key] = b
elif b != 0:
sr[key] = np.mean([a, b])
delta = timedelta(days=1)
start = start + delta
sr
In [ ]:
sr.describe()
In [ ]:
plt.figure()
rolling = pd.rolling_mean(sr, 3, center=True)
ax_delays = sr.plot(style='--', color='b')
rolling.plot(color='b', ax=ax_delays, legend=0)
plt.xticks(np.arange(0, 24, 2))
plt.title('Delays per hour on entire network (old timetable)')
plt.xlabel('Hour')
plt.ylabel('Minutes')
plt.ylim(ymax=10)
plt.savefig('./../../Paper/plots/old_delay_per_hour.png')
In [ ]:
db = RailDatabase(False)
stations = db.getAllStations()
m = Basemap(resolution='i', projection='merc',
llcrnrlat=49.0, urcrnrlat=52.0, llcrnrlon=1., urcrnrlon=8.0, lat_ts=51.0)
m.drawcountries()
m.drawcoastlines()
m.fillcontinents()
for stationRow in stations:
x = stationRow[5]
y = stationRow[6]
lat, lon = m(y, x)
m.plot(lat, lon, 'b.', alpha=0.5)
plt.title('Stations')
plt.savefig('./plots/stations.pdf')
plt.show()
In [ ]:
conf = Config(True)
start, stop = conf.period()
diff = stop - start
period = pd.date_range(start, stop)
zeros = np.zeros((diff.days+1, 2))
db = RailDatabase(True)
#DataFrame with worst-case and avg delay on entire network, per day
#Delay is computed in a worst-case scenario, i.e. max delay of a train
#and in avg-case scenario, i.e. max of avg of arrival and avg of departure delay
delays = pd.DataFrame(zeros, index = period, columns = ['Worst case', 'Avg case'])
while(start <= stop):
t = time(0, 0, 0)
dt = datetime.combine(start, t)
#Get all routes for a specified date.
#Every trainID rides only once each day,
#so there is no point in asking all the trainIDs first
routes = db.getAllRoutes(toUnix(dt))
#Instead of keeping a list of worst_delays {which is O(n) in memory},
#the worst_delay is accumulated and
#divided by the amount of data points {which is O(1) in memory}
total_worst_delay = 0
worst_delay_count = 0
total_avg_delay = 0
avg_delay_count = 0
for routeRow in routes:
routeID = routeRow[0]
stops = db.getStops(routeID)
max_delay = 0
total_arrival_delay = 0
arrival_delay_count = 1
total_departure_delay = 0
departure_delay_count = 1
for stopRow in stops:
arrival_detected = stopRow[3]
departure_detected = stopRow[6]
arrival_delay = stopRow[2]
departure_delay = stopRow[5]
if arrival_detected:
total_arrival_delay += arrival_delay
arrival_delay_count += 1
if departure_detected:
total_departure_delay += departure_delay
departure_delay_count += 1
if max(arrival_delay, departure_delay) > max_delay:
max_delay = max(arrival_delay, departure_delay)
total_worst_delay += max_delay
worst_delay_count += 1
avg_arrival = float(total_arrival_delay) / float(arrival_delay_count)
avg_departure = float(total_departure_delay) / float(departure_delay_count)
total_avg_delay += np.mean([avg_arrival, avg_departure])
avg_delay_count += 1
key = start.isoformat()
delays.loc[key, 'Worst case'] = float(total_worst_delay) / float(worst_delay_count)
delays.loc[key, 'Avg case'] = float(total_avg_delay) / float(avg_delay_count)
delta = timedelta(days=1)
start = start + delta
delays
In [ ]:
delays.describe()
In [ ]:
rolling = pd.rolling_mean(delays, 4, center=True)
ax_delays = delays.plot(x_compat=True, style='--', color=['r', 'b'])
rolling.plot(color=['r','b'], ax=ax_delays, legend=0)
plt.title('Delay on entire network (new timetable)')
plt.xlabel('Date')
plt.ylabel('Minutes')
plt.ylim(ymax=9)
plt.savefig('./../../Paper/plots/new_delay_per_day.png')
In [ ]:
conf = Config(True)
start, stop = conf.period()
diff = stop - start
period = pd.date_range(start, stop)
zeros = np.zeros((diff.days+1, 2))
db = RailDatabase(True)
#DataFrame with the number of delayed trains per day
#A train is considered delayed when it suffers a delay of more than 5 minutes at any stop
#The 2nd column is the percentage of delayed trains on that day.
perctDelays = pd.DataFrame(zeros, index = period, columns = ['Delayed', 'Percentage'])
while(start <= stop):
t = time(0, 0, 0)
dt = datetime.combine(start, t)
#Get all routes for a specified date.
#Every trainID rides only once each day,
#so there is no point in asking all the trainIDs first
routes = db.getAllRoutes(toUnix(dt))
key = start.isoformat()
count = len(routes)
#Get all stops for the route
for routeRow in routes:
routeID = routeRow[0]
stops = db.getStops(routeID)
#If a stop is found in the route, with > 5min delay
#then this train is considered delayed!
for stopRow in stops:
arrival_detected = stopRow[3]
departure_detected = stopRow[6]
arrival_delay = stopRow[2]
departure_delay = stopRow[5]
if arrival_detected:
if arrival_delay > 5:
perctDelays.loc[key, 'Delayed'] += 1
break
elif departure_detected:
if departure_delay > 5:
perctDelays.loc[key, 'Delayed'] += 1
break
perctDelays.loc[key,'Percentage']=(float(perctDelays.loc[key,'Delayed'])/float(count))*100.
delta = timedelta(days=1)
start = start + delta
perctDelays
In [ ]:
jan = perctDelays[17:48]
feb = perctDelays[49:76]
jan.describe()
In [ ]:
perctDelays.describe()
In [ ]:
fig = plt.figure()
plt.bar(perctDelays.index, perctDelays['Percentage'], color='r')
plt.title('Percentage of delayed trains per day (new timetable)')
plt.xlabel('Date')
plt.ylabel('Percentage')
plt.ylim(ymax=60)
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/new_percentage_delays.png')
In [ ]:
conf = Config(True)
start, stop = conf.period()
period = pd.date_range(start, stop)
db = RailDatabase(True)
#Series with the amount of cancelled trains, per day
#A train is considered cancelled when arrival is not detected at its final stop
cancels = pd.Series(0, index = period)
while(start <= stop):
t = time(0, 0, 0)
dt = datetime.combine(start, t)
key = start.isoformat()
#Get all routes for a specified date.
#Every trainID rides only once each day,
#so there is no point in asking all the trainIDs first
routes = db.getAllRoutes(toUnix(dt))
for routeRow in routes:
routeID = routeRow[0]
lastStop = db.getLastStop(routeID)
for stopRow in lastStop:
arrival_detected = stopRow[3]
if not arrival_detected:
cancels[key] += 1
delta = timedelta(days=1)
start = start + delta
cancels
In [ ]:
cancels['2014-12-15'] = 0
In [ ]:
jan = cancels[17:17+31]
jan.sum()
In [ ]:
cancels.describe()
In [ ]:
fig = plt.figure()
plt.bar(cancels.index, cancels, color='r')
plt.title('Cancelled trains per day (new timetable)')
plt.xlabel('Date')
plt.ylabel('Cancelled trains')
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/new_cancels_per_day.png')
In [ ]:
c = cancels[1:]
fig = plt.figure()
plt.bar(c.index, c, color='r')
plt.title('Cancelled trains per day (new timetable) (no strike)')
plt.xlabel('Date')
plt.ylabel('Cancelled trains')
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/new_cancels_per_day_no_strike.png')
In [ ]:
conf = Config(True)
period = np.arange(24)
sr = pd.Series(0, index=period)
start = date(2014, 12, 15)
stop = date(2015, 3, 22)
while start <= stop:
weekday = start.weekday()
if weekday < 5:
t = time(0, 0, 0)
dt = datetime.combine(start, t)
db = RailDatabase(True)
routes = db.getAllRoutes(toUnix(dt))
routeCount = 0
tmp = pd.Series(0, index=period)
for routeRow in routes:
routeID = routeRow[0]
stops = db.getStops(routeID)
includeRoute = False
for stopRow in stops:
arrival_datetime = stopRow[1]
arrival_delay = stopRow[2]
arrival_detected = stopRow[3]
departure_datetime = stopRow[4]
departure_delay = stopRow[5]
departure_detected = stopRow[6]
if arrival_detected:
if arrival_delay > 0:
key = convertDatetime(arrival_datetime)
key = key.time().hour
tmp[key] += arrival_delay
includeRoute = True
elif departure_detected:
if departure_delay > 0:
key = convertDatetime(departure_datetime)
key = key.time().hour
tmp[key] += departure_delay
includeRoute = True
if includeRoute:
routeCount += 1
if routeCount == 0:
routeCount = 1
calcAverage = lambda x: float(x) / float(routeCount)
tmp = tmp.map(calcAverage)
for key in period:
a = sr[key]
b = tmp[key]
if a == 0:
sr[key] = b
elif b != 0:
sr[key] = np.mean([a, b])
delta = timedelta(days=1)
start = start + delta
sr
In [ ]:
sr.describe()
In [ ]:
plt.figure()
rolling = pd.rolling_mean(sr, 3, center=True)
ax_delays = sr.plot(style='--', color='r')
rolling.plot(color='r', ax=ax_delays, legend=0)
plt.xticks(np.arange(0, 24, 2))
plt.title('Delays per hour on entire network (new timetable)')
plt.xlabel('Hour')
plt.ylabel('Minutes')
plt.ylim(ymax=10)
plt.savefig('./../../Paper/plots/new_delay_per_hour.png')
In [ ]: