In [1]:
    
from railfetcher import *
from datetime import datetime, date, timedelta, time
from dateutil import parser
from mpl_toolkits.basemap import Basemap
from scipy.stats.stats import pearsonr
import pickle
import pymysql
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
    
In [2]:
    
def toUnix(datetime):
    unix = datetime.strftime('%s')
    return unix
def convertDatetime(unix):
    dt = datetime.fromtimestamp(unix)
    return dt
def convertDate(unix):
    d = date.fromtimestamp(unix)
    return d
    
In [3]:
    
class RailDatabase():
    def __init__(self, isNew):
        if isNew:
            self.conn = pymysql.connect(host='localhost', port=3306, user='jnevens', passwd='Panda85?', db='newrailDB')
        else:
            self.conn = pymysql.connect(host='localhost', port=3306, user='jnevens', passwd='Panda85?', db='oldrailDB')
    def getAllRoutes(self, date):
        C = self.conn.cursor()
        C.execute('SELECT * FROM route WHERE date = %s', (date,))
        rows = C.fetchall()
        C.close()
        return rows
    
    def getRoute(self, trainID, date):
        C = self.conn.cursor()
        C.execute('SELECT * FROM route WHERE train_id = %s AND date = %s', (trainID, date))
        row = C.fetchone()
        C.close()
        return row
    def getStops(self, routeID):
        C = self.conn.cursor()
        C.execute('SELECT * FROM stop WHERE route_id = %s ORDER BY arrival_datetime', (routeID,))
        rows = C.fetchall()
        C.close()
        return rows
    def getLastStop(self, routeID):
        C = self.conn.cursor()
        C.execute('SELECT * FROM stop WHERE route_id = %s ORDER BY arrival_datetime', (routeID,))
        rows = C.fetchall()
        last = rows[-1:]
        C.close()
        return last
    
    def getFirstStop(self, routeID):
        C = self.conn.cursor()
        C.execute('SELECT * FROM stop WHERE route_id = %s ORDER BY arrival_datetime', (routeID,))
        rows = C.fetchall()
        first = rows[0]
        C.close()
        return first
    
    def getStationName(self, stationID):
        C = self.conn.cursor()
        C.execute('SELECT name_nl FROM station WHERE station_id = %s', (stationID,))
        row = C.fetchone()
        C.close()
        return row
    
class Config():
    def __init__(self, isNew):
        self.new = isNew
        
    def period(self):
        if self.new:
            return (date(2014, 12, 16), date(2015, 2, 16))
        else:
            return (date(2014, 10, 27), date(2014, 12, 14))
    
In [4]:
    
#Metric is average delay over all the stations
def avg_delay_metric(isNew):
    conf = Config(isNew)
    start, stop = conf.period()
    db = RailDatabase(isNew)
    trains = dict()
    while start <= stop:
        weekday = start.weekday()
        if weekday < 5:
            t = time(0, 0, 0)
            dt = datetime.combine(start, t)
            routes = db.getAllRoutes(toUnix(dt))
            for routeRow in routes:
                routeID = routeRow[0]
                trainID = routeRow[2]
                stops = db.getStops(routeID)
                total_arrival_delay = 0
                arrival_delay_count = 0
                total_departure_delay = 0
                departure_delay_count = 0
                for stopRow in stops:
                    arrival_detected = stopRow[3]
                    departure_detected = stopRow[6]
                    arrival_delay = stopRow[2]
                    departure_delay = stopRow[5]
                    if arrival_detected:
                        total_arrival_delay += arrival_delay
                        arrival_delay_count += 1
                    if departure_detected:
                        total_departure_delay += departure_delay
                        departure_delay_count += 1
                if arrival_delay_count != 0:
                    avg_arrival = float(total_arrival_delay) / float(arrival_delay_count) 
                if departure_delay_count != 0:
                    avg_departure = float(total_departure_delay) / float(departure_delay_count)
                    
                if avg_arrival != 0 and avg_departure != 0:
                    avg_delay = np.mean([avg_arrival, avg_departure])
                elif avg_arrival != 0:
                    avg_delay = avg_arrival
                elif avg_departure != 0:
                    avg_delay = avg_departure
                else:
                    pass
                
                if trainID in trains:
                    trains[trainID].append(avg_delay)
                else:
                    trains[trainID] = [avg_delay]
                    
        delta = timedelta(days=1)
        start = start + delta          
    return trains
def acc_delay_metric(isNew):
    conf = Config(isNew)
    start, stop = conf.period()
    db = RailDatabase(isNew)
    trains = dict()
    while start <= stop:
        weekday = start.weekday()
        if weekday < 5:
            t = time(0, 0, 0)
            dt = datetime.combine(start, t)
            routes = db.getAllRoutes(toUnix(dt))
            for routeRow in routes:
                routeID = routeRow[0]
                trainID = routeRow[2]
                stops = db.getStops(routeID)
                total_delay = 0
                for stopRow in stops:
                    arrival_detected = stopRow[3]
                    departure_detected = stopRow[6]
                    arrival_delay = stopRow[2]
                    departure_delay = stopRow[5]
                    if arrival_detected and departure_detected:
                        total_delay += max(arrival_delay, departure_delay)
                    elif arrival_detected:
                        total_delay += arrival_delay
                    elif departure_detected:
                        total_delay += departure_delay
                if trainID in trains:
                    trains[trainID].append(total_delay)
                else:
                    trains[trainID] = [total_delay]    
        delta = timedelta(days=1)
        start = start + delta            
    return trains
def binary_metric(isNew):
    conf = Config(isNew)
    start, stop = conf.period()
    db = RailDatabase(isNew)
    trains = dict()
    while start <= stop:
        weekday = start.weekday()
        if weekday < 5:
            t = time(0, 0, 0)
            dt = datetime.combine(start, t)
            routes = db.getAllRoutes(toUnix(dt))
            for routeRow in routes:
                routeID = routeRow[0]
                trainID = routeRow[2]
                stops = db.getStops(routeID)
                delayed = 0
                for stopRow in stops:
                    arrival_detected = stopRow[3]
                    departure_detected = stopRow[6]
                    arrival_delay = stopRow[2]
                    departure_delay = stopRow[5]
                    if arrival_detected:
                        if arrival_delay > 5:
                            delayed = 1
                            break
                    if departure_detected:
                        if departure_delay > 5:
                            delayed = 1
                            break
                if trainID in trains:
                    trains[trainID].append(delayed)
                else:
                    trains[trainID] = [delayed]
        delta = timedelta(days=1)
        start = start + delta
    return trains
def binary_metric_train(isNew, train, tresh=5):
    conf = Config(isNew)
    start, stop = conf.period()
    db = RailDatabase(isNew)
    metric = []
    while start <= stop:
        weekday = start.weekday()
        if weekday < 5:
            t = time(0, 0, 0)
            dt = datetime.combine(start, t)
            route = db.getRoute(train, toUnix(dt))
            routeID = route[0]
            stops = db.getStops(routeID)
            delayed = 0
            for stopRow in stops:
                arrival_detected = stopRow[3]
                departure_detected = stopRow[6]
                arrival_delay = stopRow[2]
                departure_delay = stopRow[5]
                if arrival_detected:
                    if arrival_delay > tresh:
                        delayed = 1
                        break
                if departure_detected:
                    if departure_delay > tresh:
                        delayed = 1
                        break
            metric.append(delayed)
        delta = timedelta(days=1)
        start = start + delta
    return metric
    
In [19]:
    
#Period: 16/12/14 -> 16/02/15
#Avg delay metric
avg_delays = avg_delay_metric(True)
copy = dict(avg_delays)
for idx in copy:
    lst = copy[idx]
    if len(lst) != 45:
        del avg_delays[idx]
sze = len(avg_delays)
matrix = np.empty((sze, sze))
matrix.fill(np.nan)
avgcorr = pd.DataFrame(matrix, index=np.sort(avg_delays.keys()), columns=np.sort(avg_delays.keys()), dtype=float)
for i in avg_delays:
    for j in avg_delays:
        if i > j:
            corr = abs(pearsonr(avg_delays[i], avg_delays[j])[0])
            if corr > 0 and corr < 1:
                avgcorr.loc[i, j] = corr
                
avgcorr
    
    Out[19]:
In [78]:
    
t = avgcorr.unstack()
t.describe()
    
    Out[78]:
In [20]:
    
#Period: 16/12/14 -> 16/02/15
#acc delay metric
acc_delays = acc_delay_metric(True)
copy = dict(acc_delays)
for idx in copy:
    lst = copy[idx]
    if len(lst) != 45:
        del acc_delays[idx]
        
sze = len(acc_delays)
matrix = np.empty((sze, sze))
matrix.fill(np.nan)
acccorr = pd.DataFrame(matrix, index=np.sort(acc_delays.keys()), columns=np.sort(acc_delays.keys()), dtype=float)
for i in acc_delays:
    for j in acc_delays:
        if i > j:
            corr = abs(pearsonr(acc_delays[i], acc_delays[j])[0])
            if corr > 0 and corr < 1.0:
                acccorr.loc[i, j] = corr
                
acccorr
    
    Out[20]:
In [65]:
    
t = acccorr.unstack()
t.describe()
    
    Out[65]:
In [5]:
    
bdelays = binary_metric(True)
copy = dict(bdelays)
for idx in copy:
    lst = copy[idx]
    if len(lst) != 45:
        del bdelays[idx]
sze = len(bdelays)
matrix = np.empty((sze, sze))
matrix.fill(np.nan)
bincorr = pd.DataFrame(matrix, index=np.sort(bdelays.keys()), columns=np.sort(bdelays.keys()), dtype=float)
for i in bdelays:
    for j in bdelays:
        if i > j:
            corr = abs(pearsonr(bdelays[i], bdelays[j])[0])
            if corr > 0 and corr < 1:
                bincorr.loc[i, j] = corr
                
bincorr
    
    Out[5]:
In [6]:
    
t = bincorr.unstack()
t.describe()
    
    Out[6]:
In [ ]:
    
bres_file = open('./pickles/bres.pkl', 'rb')
bres = pickle.load(bres_file)
    
In [7]:
    
#Voor elke trein A bereken ik de binaire metriek verschillende keren, telkens met een verschillende treshold.
#Voor elke treshold moet de correlatie met alle andere treinen berekent worden. Indien de som van de correlaties
#tussen trein A en trein B voor alle tresholds groter dan (bijvoorbeeld) 1 is, 
#dan moet deze correlatie bijgehouden worden.
trains = bincorr.index.tolist()
tresholds = [10, 20, 30, 40, 50, 60]
tuples = []
for i in trains:
    for j in tresholds:
        tuples.append((i, j))
        
index = pd.MultiIndex.from_tuples(tuples, names=['trains', 'tresholds'])
df = pd.DataFrame(np.zeros((len(index), len(trains))), index=index, columns=trains, dtype=float)
    
In [8]:
    
metrics = dict()
for i in trains:
    m = binary_metric_train(True, str(i))
    metrics[i] = m
    
In [9]:
    
treshold_metrics = dict()
for i in trains:
    treshold_metrics[i] = dict()
    for j in tresholds:
        m = binary_metric_train(True, str(i), tresh=j)
        treshold_metrics[i][j] = m
    
In [10]:
    
for i in trains:
    for j in tresholds:
        trA = treshold_metrics[i][j]
        for k in trains:
            if k != i:
                trB = metrics[k]
                corr = abs(pearsonr(trA, trB)[0])
                df.loc[(i, j), k] = corr
            else:
                df.loc[(i, j), k] = np.nan
    
In [11]:
    
df
    
    Out[11]:
In [ ]:
    
data_file = open('./pickles/data.pkl', 'rb')
df = pickle.load(data_file)
    
In [ ]:
    
lst = []
for i in trains:
    for j in trains:
        s = df.loc[i][j].sum()
        if s > 2:
            lst.append(str(i) + '-' + str(j))
lst
    
In [ ]:
    
df