In [1]:
from railfetcher import *
from datetime import datetime, date, timedelta, time
from dateutil import parser
from mpl_toolkits.basemap import Basemap
from scipy.stats.stats import pearsonr
import pickle
import pymysql
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
In [2]:
def toUnix(dt):
    """Convert a naive local-time datetime to a Unix-timestamp string.

    Fixes: the original used strftime('%s'), which is platform-dependent
    (unsupported on Windows and undocumented in CPython); for a naive
    datetime, timestamp() produces the same local-time epoch value.
    The parameter was also renamed from `datetime`, which shadowed the
    imported datetime class (all in-file callers pass it positionally).

    Returns: str of the integer Unix timestamp (string kept for backward
    compatibility with callers that interpolate it into SQL).
    """
    return str(int(dt.timestamp()))
def convertDatetime(unix):
    """Return the local datetime corresponding to a Unix timestamp."""
    return datetime.fromtimestamp(unix)
def convertDate(unix):
    """Return the local calendar date corresponding to a Unix timestamp."""
    return date.fromtimestamp(unix)
In [3]:
class RailDatabase():
    """Read-only accessor for the rail MySQL databases (old vs. new timetable).

    Row layouts assumed by index-based access elsewhere in this notebook
    (TODO confirm against the actual schema):
      route row: [0]=route_id, [2]=train_id
      stop row:  [2]=arrival delay, [3]=arrival detected,
                 [5]=departure delay, [6]=departure detected
    """

    def __init__(self, isNew):
        """Connect to 'newrailDB' when isNew is truthy, else 'oldrailDB'."""
        # SECURITY: credentials are hardcoded in the notebook; move them to
        # environment variables or a config file before sharing.
        db = 'newrailDB' if isNew else 'oldrailDB'
        self.conn = pymysql.connect(host='localhost', port=3306, user='jnevens', passwd='Panda85?', db=db)

    def _fetchall(self, query, args):
        # Run a query and return all rows.  The `with` block closes the
        # cursor even when execute() raises (the original leaked cursors
        # on error).
        with self.conn.cursor() as C:
            C.execute(query, args)
            return C.fetchall()

    def _fetchone(self, query, args):
        # Run a query and return the first row, or None when there is none.
        with self.conn.cursor() as C:
            C.execute(query, args)
            return C.fetchone()

    def getAllRoutes(self, date):
        """All route rows for the given date (Unix-timestamp string)."""
        return self._fetchall('SELECT * FROM route WHERE date = %s', (date,))

    def getRoute(self, trainID, date):
        """The route row for a train on a date, or None if the train did not run."""
        return self._fetchone('SELECT * FROM route WHERE train_id = %s AND date = %s', (trainID, date))

    def getStops(self, routeID):
        """All stop rows of a route, ordered by arrival time."""
        return self._fetchall('SELECT * FROM stop WHERE route_id = %s ORDER BY arrival_datetime', (routeID,))

    def getLastStop(self, routeID):
        """The chronologically last stop, as a one-element (or empty) slice.

        Kept as a slice rather than a bare row for backward compatibility
        with the original return shape.
        """
        rows = self.getStops(routeID)
        return rows[-1:]

    def getFirstStop(self, routeID):
        """The chronologically first stop row; raises IndexError if the route has no stops."""
        return self.getStops(routeID)[0]

    def getStationName(self, stationID):
        """One-tuple holding the Dutch station name, or None if unknown."""
        return self._fetchone('SELECT name_nl FROM station WHERE station_id = %s', (stationID,))
class Config():
    """Holds the analysis period for the new vs. old timetable datasets."""

    def __init__(self, isNew):
        # True selects the new-timetable period, False the old one.
        self.new = isNew

    def period(self):
        """Return (start_date, end_date) of the analysis window, both inclusive."""
        if not self.new:
            return (date(2014, 10, 27), date(2014, 12, 14))
        return (date(2014, 12, 16), date(2015, 2, 16))
In [4]:
#Metric is average delay over all the stations
def avg_delay_metric(isNew):
    """Per-train series of average delays, one value per weekday route run.

    For every weekday in the configured period and every route run that day,
    the average arrival delay and average departure delay over all *detected*
    stops are computed; the route's value is the mean of the two when both
    exist, otherwise whichever one exists.

    Bug fix: the original never reset avg_arrival/avg_departure/avg_delay
    between routes, so a route without detected stops appended a stale value
    from a previous route (or raised NameError on the very first route).  It
    also used `!= 0` checks that conflated "no detected stops" with "average
    delay of exactly 0".  Averages are now tracked with None for "no data",
    so a perfectly punctual route correctly contributes 0, and routes with
    no detected stops at all are skipped.

    Returns: dict mapping train_id -> list of average delays (one entry per
    day the train ran with at least one detected stop).
    """
    conf = Config(isNew)
    start, stop = conf.period()
    db = RailDatabase(isNew)
    trains = dict()
    one_day = timedelta(days=1)
    while start <= stop:
        if start.weekday() < 5:  # Monday..Friday only
            dt = datetime.combine(start, time(0, 0, 0))
            for routeRow in db.getAllRoutes(toUnix(dt)):
                routeID = routeRow[0]
                trainID = routeRow[2]
                total_arrival_delay = 0
                arrival_delay_count = 0
                total_departure_delay = 0
                departure_delay_count = 0
                for stopRow in db.getStops(routeID):
                    # stop row layout: [2]=arrival delay, [3]=arrival detected,
                    # [5]=departure delay, [6]=departure detected
                    if stopRow[3]:
                        total_arrival_delay += stopRow[2]
                        arrival_delay_count += 1
                    if stopRow[6]:
                        total_departure_delay += stopRow[5]
                        departure_delay_count += 1
                # None means "no detected stops of this kind" — distinct from
                # an average delay of 0.
                avg_arrival = None
                avg_departure = None
                if arrival_delay_count != 0:
                    avg_arrival = float(total_arrival_delay) / float(arrival_delay_count)
                if departure_delay_count != 0:
                    avg_departure = float(total_departure_delay) / float(departure_delay_count)
                if avg_arrival is not None and avg_departure is not None:
                    avg_delay = np.mean([avg_arrival, avg_departure])
                elif avg_arrival is not None:
                    avg_delay = avg_arrival
                elif avg_departure is not None:
                    avg_delay = avg_departure
                else:
                    continue  # no delay information at all for this route
                trains.setdefault(trainID, []).append(avg_delay)
        start = start + one_day
    return trains
def acc_delay_metric(isNew):
    """Per-train series of accumulated delays, one value per weekday route run.

    For every weekday in the configured period, each route's delays are
    summed over all stops: when both arrival and departure were detected the
    larger of the two delays counts, otherwise whichever one was detected.

    Returns: dict mapping train_id -> list of daily accumulated delays.
    """
    conf = Config(isNew)
    day, last_day = conf.period()
    db = RailDatabase(isNew)
    trains = dict()
    step = timedelta(days=1)
    while day <= last_day:
        if day.weekday() < 5:  # weekdays only
            midnight = datetime.combine(day, time(0, 0, 0))
            for route_row in db.getAllRoutes(toUnix(midnight)):
                route_id, train_id = route_row[0], route_row[2]
                total_delay = 0
                for stop_row in db.getStops(route_id):
                    # row layout: [2]=arrival delay, [3]=arrival detected,
                    # [5]=departure delay, [6]=departure detected
                    arr_detected, dep_detected = stop_row[3], stop_row[6]
                    arr_delay, dep_delay = stop_row[2], stop_row[5]
                    if arr_detected and dep_detected:
                        total_delay += max(arr_delay, dep_delay)
                    elif arr_detected:
                        total_delay += arr_delay
                    elif dep_detected:
                        total_delay += dep_delay
                if train_id in trains:
                    trains[train_id].append(total_delay)
                else:
                    trains[train_id] = [total_delay]
        day = day + step
    return trains
def binary_metric(isNew):
    """Per-train binary indicator per weekday route run.

    A day is flagged 1 when any stop on the train's route had a detected
    arrival or departure delay above 5, else 0.

    Returns: dict mapping train_id -> list of 0/1 flags.
    """
    conf = Config(isNew)
    day, last_day = conf.period()
    db = RailDatabase(isNew)
    trains = dict()
    step = timedelta(days=1)
    while day <= last_day:
        if day.weekday() < 5:  # weekdays only
            midnight = datetime.combine(day, time(0, 0, 0))
            for route_row in db.getAllRoutes(toUnix(midnight)):
                route_id, train_id = route_row[0], route_row[2]
                delayed = 0
                for stop_row in db.getStops(route_id):
                    # row layout: [2]=arrival delay, [3]=arrival detected,
                    # [5]=departure delay, [6]=departure detected
                    arrival_late = stop_row[3] and stop_row[2] > 5
                    departure_late = stop_row[6] and stop_row[5] > 5
                    if arrival_late or departure_late:
                        delayed = 1
                        break
                if train_id in trains:
                    trains[train_id].append(delayed)
                else:
                    trains[train_id] = [delayed]
        day = day + step
    return trains
def binary_metric_train(isNew, train, tresh=5):
    """Daily binary delay flags for a single train over the configured period.

    A day counts as delayed (1) when any stop on the train's route had a
    detected arrival or departure delay above `tresh`, else 0.

    Bug fix: db.getRoute returns None when the train did not run on a given
    day; the original crashed with a TypeError on `route[0]`.  Such days are
    now skipped.  NOTE(review): skipping can make this list shorter than
    those of trains that ran every day — callers correlating series should
    verify equal lengths.

    Args:
        isNew: select the new (True) or old (False) timetable database/period.
        train: train identifier as stored in route.train_id.
        tresh: delay threshold above which a stop counts as delayed.

    Returns: list of 0/1 flags, one per weekday on which the train ran.
    """
    conf = Config(isNew)
    start, stop = conf.period()
    db = RailDatabase(isNew)
    metric = []
    while start <= stop:
        if start.weekday() < 5:  # weekdays only
            dt = datetime.combine(start, time(0, 0, 0))
            route = db.getRoute(train, toUnix(dt))
            if route is not None:  # train may not have run this day
                delayed = 0
                for stopRow in db.getStops(route[0]):
                    # row layout: [2]=arrival delay, [3]=arrival detected,
                    # [5]=departure delay, [6]=departure detected
                    if stopRow[3] and stopRow[2] > tresh:
                        delayed = 1
                        break
                    if stopRow[6] and stopRow[5] > tresh:
                        delayed = 1
                        break
                metric.append(delayed)
        start = start + timedelta(days=1)
    return metric
In [19]:
#Period: 16/12/14 -> 16/02/15
#Avg delay metric
#Period: 16/12/14 -> 16/02/15
#Avg delay metric
avg_delays = avg_delay_metric(True)
# Keep only trains with a value for every one of the 45 weekdays in the
# period, so all series have equal length for the pairwise correlation.
N_WEEKDAYS = 45
for idx in list(avg_delays.keys()):
    if len(avg_delays[idx]) != N_WEEKDAYS:
        del avg_delays[idx]
sze = len(avg_delays)
matrix = np.empty((sze, sze))
matrix.fill(np.nan)
# list(...) around .keys() keeps this working on Python 3, where a dict
# view cannot be passed to np.sort directly.
train_ids = np.sort(list(avg_delays.keys()))
avgcorr = pd.DataFrame(matrix, index=train_ids, columns=train_ids, dtype=float)
# Fill the lower triangle with |Pearson r| between each pair of trains;
# exact 0/1 correlations are treated as degenerate and left NaN.
for i in avg_delays:
    for j in avg_delays:
        if i > j:
            corr = abs(pearsonr(avg_delays[i], avg_delays[j])[0])
            if corr > 0 and corr < 1:
                avgcorr.loc[i, j] = corr
avgcorr
Out[19]:
In [78]:
# Summary statistics of all pairwise avg-delay correlations; unstack()
# flattens the matrix to a Series and describe() skips the NaN entries.
t = avgcorr.unstack()
t.describe()
Out[78]:
In [20]:
#Period: 16/12/14 -> 16/02/15
#acc delay metric
#Period: 16/12/14 -> 16/02/15
#acc delay metric
acc_delays = acc_delay_metric(True)
# Keep only trains with a value for every one of the 45 weekdays in the
# period, so all series have equal length for the pairwise correlation.
N_WEEKDAYS = 45
for idx in list(acc_delays.keys()):
    if len(acc_delays[idx]) != N_WEEKDAYS:
        del acc_delays[idx]
sze = len(acc_delays)
matrix = np.empty((sze, sze))
matrix.fill(np.nan)
# list(...) around .keys() keeps this working on Python 3, where a dict
# view cannot be passed to np.sort directly.
train_ids = np.sort(list(acc_delays.keys()))
acccorr = pd.DataFrame(matrix, index=train_ids, columns=train_ids, dtype=float)
# Fill the lower triangle with |Pearson r| between each pair of trains;
# exact 0/1 correlations are treated as degenerate and left NaN.
for i in acc_delays:
    for j in acc_delays:
        if i > j:
            corr = abs(pearsonr(acc_delays[i], acc_delays[j])[0])
            if corr > 0 and corr < 1.0:
                acccorr.loc[i, j] = corr
acccorr
Out[20]:
In [65]:
# Summary statistics of all pairwise accumulated-delay correlations;
# unstack() flattens the matrix and describe() skips the NaN entries.
t = acccorr.unstack()
t.describe()
Out[65]:
In [5]:
# Binary delay metric over the same period.
bdelays = binary_metric(True)
# Keep only trains with a value for every one of the 45 weekdays in the
# period, so all series have equal length for the pairwise correlation.
N_WEEKDAYS = 45
for idx in list(bdelays.keys()):
    if len(bdelays[idx]) != N_WEEKDAYS:
        del bdelays[idx]
sze = len(bdelays)
matrix = np.empty((sze, sze))
matrix.fill(np.nan)
# list(...) around .keys() keeps this working on Python 3, where a dict
# view cannot be passed to np.sort directly.
train_ids = np.sort(list(bdelays.keys()))
bincorr = pd.DataFrame(matrix, index=train_ids, columns=train_ids, dtype=float)
# Fill the lower triangle with |Pearson r| between each pair of trains;
# exact 0/1 correlations are treated as degenerate and left NaN.
for i in bdelays:
    for j in bdelays:
        if i > j:
            corr = abs(pearsonr(bdelays[i], bdelays[j])[0])
            if corr > 0 and corr < 1:
                bincorr.loc[i, j] = corr
bincorr
Out[5]:
In [6]:
# Summary statistics of all pairwise binary-metric correlations;
# unstack() flattens the matrix and describe() skips the NaN entries.
t = bincorr.unstack()
t.describe()
Out[6]:
In [ ]:
# Load previously computed results; `with` guarantees the file handle is
# closed (the original never closed it).
# SECURITY NOTE: pickle.load can execute arbitrary code — only load
# trusted, locally produced files.
with open('./pickles/bres.pkl', 'rb') as bres_file:
    bres = pickle.load(bres_file)
In [7]:
# For each train A, the binary metric is computed several times, each with
# a different threshold.  For each threshold, the correlation with all
# other trains must be computed.  If the sum of the correlations between
# train A and train B over all thresholds exceeds (for example) 1, that
# correlation should be kept.
trains = bincorr.index.tolist()
tresholds = [10, 20, 30, 40, 50, 60]
# from_product yields the same (train, threshold) ordering as the original
# nested loops over trains and tresholds.
index = pd.MultiIndex.from_product([trains, tresholds], names=['trains', 'tresholds'])
df = pd.DataFrame(np.zeros((len(index), len(trains))), index=index, columns=trains, dtype=float)
In [8]:
# Baseline binary metric (default threshold) for every retained train.
metrics = {train_id: binary_metric_train(True, str(train_id)) for train_id in trains}
In [9]:
# Binary metric for every train at every threshold:
# treshold_metrics[train][threshold] -> list of 0/1 flags.
treshold_metrics = {
    train_id: {th: binary_metric_train(True, str(train_id), tresh=th) for th in tresholds}
    for train_id in trains
}
In [10]:
# Correlate each train's thresholded metric with every other train's
# baseline metric; a train against itself is left NaN.
for i in trains:
    for j in tresholds:
        trA = treshold_metrics[i][j]
        for k in trains:
            if k == i:
                df.loc[(i, j), k] = np.nan
                continue
            df.loc[(i, j), k] = abs(pearsonr(trA, metrics[k])[0])
In [11]:
df
Out[11]:
In [ ]:
# Load a previously pickled correlation frame; `with` guarantees the file
# handle is closed (the original never closed it).
# NOTE(review): this overwrites the `df` computed in the cells above with
# the pickled copy — hidden state if run out of order.
# SECURITY NOTE: pickle.load can execute arbitrary code — only load
# trusted, locally produced files.
with open('./pickles/data.pkl', 'rb') as data_file:
    df = pickle.load(data_file)
In [ ]:
# Collect ordered train pairs whose correlations, summed over all
# thresholds, exceed 2.
lst = []
for train_a in trains:
    for train_b in trains:
        # df.loc[train_a] selects that train's rows (one per threshold);
        # [train_b] is the column, so .sum() totals over thresholds.
        if df.loc[train_a][train_b].sum() > 2:
            lst.append(str(train_a) + '-' + str(train_b))
lst
In [ ]:
df