In [1]:
from pymongo import MongoClient
from datetime import datetime, date, timedelta, time
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.basemap import Basemap
matplotlib.style.use('ggplot')
import pymysql
In [2]:
def toUnix(dt):
    """Convert a (naive, local-time) datetime to a Unix-timestamp string.

    Fixes vs. the original:
      * the parameter was named `datetime`, shadowing the datetime class
        imported at the top of the file;
      * strftime('%s') is a non-portable glibc extension (fails on
        Windows); timestamp() is the portable equivalent for naive
        local-time datetimes and produces the same epoch seconds.

    Returns the seconds-since-epoch value as a string, matching the
    original strftime('%s') output used as a DB key.
    """
    return str(int(dt.timestamp()))
def convertDatetime(unix):
    """Turn a Unix timestamp (seconds since epoch) into a local datetime."""
    return datetime.fromtimestamp(unix)
def convertDate(unix):
    """Turn a Unix timestamp (seconds since epoch) into a local calendar date."""
    return date.fromtimestamp(unix)
In [3]:
class RailDatabase():
    """Read-only accessor for the MySQL rail database.

    `isNew` selects the database with the new timetable data
    ('newrailDB') versus the old one ('oldrailDB').
    """

    def __init__(self, isNew):
        # SECURITY NOTE(review): credentials are hardcoded in source.
        # They should come from environment variables or a config file;
        # kept as-is here so existing set-ups keep working.
        db_name = 'newrailDB' if isNew else 'oldrailDB'
        self.conn = pymysql.connect(host='localhost', port=3306,
                                    user='jnevens', passwd='Panda85?',
                                    db=db_name)

    def getAllRoutes(self, date):
        """Return all (route_id,) rows for the given date (unix-time key).

        The `with` block closes the cursor even when execute() raises;
        the original leaked cursors on error.
        """
        with self.conn.cursor() as cur:
            cur.execute('SELECT route_id FROM route WHERE date = %s', (date,))
            return cur.fetchall()

    def getStops(self, routeID):
        """Return every stop row belonging to the given route."""
        with self.conn.cursor() as cur:
            cur.execute('SELECT * FROM stop WHERE route_id = %s', (routeID,))
            return cur.fetchall()

    def getLastStop(self, routeID):
        """Return a 1-element (or empty) sequence holding the route's final stop.

        Uses ORDER BY ... DESC LIMIT 1 so only one row is transferred,
        instead of fetching the whole route and slicing off the last row.
        """
        with self.conn.cursor() as cur:
            cur.execute('SELECT * FROM stop WHERE route_id = %s '
                        'ORDER BY arrival_datetime DESC LIMIT 1', (routeID,))
            return cur.fetchall()

    def getAllStations(self):
        """Return every station row."""
        with self.conn.cursor() as cur:
            cur.execute('SELECT * FROM station')
            return cur.fetchall()
class TweetDatabase():
    """Wraps the MongoDB tweet collections for the old/new timetable periods."""

    def __init__(self, isNew):
        self.new = isNew
        self.conn = MongoClient().data_science

    def connect(self):
        """Return the tweet collection matching the configured period."""
        collection = self.conn.new_tweets if self.new else self.conn.old_tweets
        return collection
class Config():
    """Holds which data set (old or new timetable) the analysis covers."""

    def __init__(self, isNew):
        self.new = isNew

    def period(self):
        """Return the inclusive (start, stop) dates of the configured data set."""
        if self.new:
            return (date(2014, 12, 15), date(2015, 3, 22))
        return (date(2014, 10, 27), date(2014, 12, 14))
In [4]:
def tweetsPerDay(isNew):
    """Count tweets per calendar day over the configured period.

    Returns a pandas Series indexed by day, with one tweet count per day.
    """
    collection = TweetDatabase(isNew).connect()
    first, last = Config(isNew).period()
    counts = pd.Series(0, index=pd.date_range(first, last))
    for tweet in collection.find():
        # Timestamps arrive as strings; drop the timezone like the rest
        # of the notebook does.
        created = parser.parse(tweet['created_at'], ignoretz=True)
        counts[created.date()] += 1
    return counts
def delayPerDay(isNew):
    """Compute worst-case and average-case network delay per day.

    For every day in the configured period:
      * 'Worst case' — mean over routes of each route's maximum stop delay;
      * 'Avg case'   — mean over routes of the mean of the route's average
        arrival delay and average departure delay (detected stops only).

    Returns a DataFrame indexed by day with columns
    ['Worst case', 'Avg case'].

    Fixes vs. the original:
      * delay counters started at 1, silently biasing every average low
        by one phantom data point; they now start at 0 and divisions are
        guarded explicitly;
      * chained-indexing assignment (df[col][key] = ...) is replaced with
        .loc, which is reliable under pandas copy-on-write;
      * a day with no routes now yields 0 instead of ZeroDivisionError.
    """
    conf = Config(isNew)
    start, stop = conf.period()
    period = pd.date_range(start, stop)
    db = RailDatabase(isNew)
    delays = pd.DataFrame(np.zeros((len(period), 2)), index=period,
                          columns=['Worst case', 'Avg case'])
    while start <= stop:
        dt = datetime.combine(start, time(0, 0, 0))
        # Every trainID rides only once per day, so asking the routes of
        # the day directly is enough.
        routes = db.getAllRoutes(toUnix(dt))
        total_worst = 0.0
        total_avg = 0.0
        route_count = 0
        for routeRow in routes:
            stops = db.getStops(routeRow[0])
            max_delay = 0
            arr_total, arr_count = 0, 0
            dep_total, dep_count = 0, 0
            for stopRow in stops:
                # Column layout assumed from usage elsewhere in this file:
                # [2]=arrival delay, [3]=arrival detected,
                # [5]=departure delay, [6]=departure detected.
                arrival_delay, arrival_detected = stopRow[2], stopRow[3]
                departure_delay, departure_detected = stopRow[5], stopRow[6]
                if arrival_detected:
                    arr_total += arrival_delay
                    arr_count += 1
                if departure_detected:
                    dep_total += departure_delay
                    dep_count += 1
                # Worst case considers every stop, detected or not,
                # matching the original logic.
                max_delay = max(max_delay, arrival_delay, departure_delay)
            avg_arrival = float(arr_total) / arr_count if arr_count else 0.0
            avg_departure = float(dep_total) / dep_count if dep_count else 0.0
            total_worst += max_delay
            total_avg += np.mean([avg_arrival, avg_departure])
            route_count += 1
        key = start.isoformat()
        if route_count:
            delays.loc[key, 'Worst case'] = total_worst / route_count
            delays.loc[key, 'Avg case'] = total_avg / route_count
        start = start + timedelta(days=1)
    return delays
def percentageDelays(isNew):
    """Count delayed trains per day, plus the percentage of delayed trains.

    A train counts as delayed when any stop shows a delay of more than
    5 minutes (arrival or departure, where detected).

    Returns a DataFrame indexed by day with columns
    ['Delayed', 'Percentage'].

    Fixes vs. the original:
      * the original `elif departure_detected` skipped the departure
        check entirely whenever arrival was detected, so a train with a
        small arrival delay but a >5 min departure delay was never
        counted — contradicting the stated rule; both are now checked;
      * chained-indexing assignment replaced with .loc (reliable under
        pandas copy-on-write);
      * a day with no routes now leaves 0% instead of raising
        ZeroDivisionError.
    """
    conf = Config(isNew)
    start, stop = conf.period()
    period = pd.date_range(start, stop)
    db = RailDatabase(isNew)
    perctDelays = pd.DataFrame(np.zeros((len(period), 2)), index=period,
                               columns=['Delayed', 'Percentage'])
    while start <= stop:
        dt = datetime.combine(start, time(0, 0, 0))
        routes = db.getAllRoutes(toUnix(dt))
        key = start.isoformat()
        delayed = 0
        for routeRow in routes:
            stops = db.getStops(routeRow[0])
            for stopRow in stops:
                # [2]=arrival delay, [3]=arrival detected,
                # [5]=departure delay, [6]=departure detected.
                late_arrival = stopRow[3] and stopRow[2] > 5
                late_departure = stopRow[6] and stopRow[5] > 5
                if late_arrival or late_departure:
                    delayed += 1
                    break  # one late stop is enough for this train
        perctDelays.loc[key, 'Delayed'] = delayed
        if len(routes):
            perctDelays.loc[key, 'Percentage'] = delayed * 100.0 / len(routes)
        start = start + timedelta(days=1)
    return perctDelays
def cancelPerDay(isNew):
    """Count cancelled trains per day.

    A train is considered cancelled when arrival was never detected at
    its final stop.  Returns a pandas Series indexed by day.
    """
    first, last = Config(isNew).period()
    cancels = pd.Series(0, index=pd.date_range(first, last))
    db = RailDatabase(isNew)
    one_day = timedelta(days=1)
    day = first
    while day <= last:
        midnight = datetime.combine(day, time(0, 0, 0))
        key = day.isoformat()
        # Every trainID rides only once per day, so the day's routes are
        # queried directly.
        for routeRow in db.getAllRoutes(toUnix(midnight)):
            for stopRow in db.getLastStop(routeRow[0]):
                if not stopRow[3]:  # arrival not detected at final stop
                    cancels[key] += 1
        day = day + one_day
    return cancels
def numberOfTweets(isNew):
    """Return the total number of tweets in the selected collection.

    Fix: Cursor.count() was deprecated in PyMongo 3.x and removed in
    PyMongo 4; Collection.count_documents({}) is the supported
    replacement and avoids materialising a cursor just to count it.
    """
    collection = TweetDatabase(isNew).connect()
    return collection.count_documents({})
In [6]:
# Build the per-day series for the OLD timetable (isNew=False).
# The delay/percentage variants are left commented out — presumably they
# were run in other sessions; TODO confirm before deleting.
tpd = tweetsPerDay(False)
#delays = delayPerDay(False)
cancels = cancelPerDay(False)
#percentage = percentageDelays(False)
In [7]:
# Manually zero out four specific days — presumably known-bad data
# (e.g. scraper outages producing spurious "cancellations") rather than
# real counts; TODO confirm provenance of these dates.
cancels['2014-11-24'] = 0
cancels['2014-12-01'] = 0
cancels['2014-12-08'] = 0
cancels['2014-12-11'] = 0
In [8]:
#worst = delays['Worst case']
#perc = percentage['Percentage']
# Align tweets and cancellations on the same daily index.
# Fix: len(tpd.index) replaces the hardcoded 49, so this also works for
# periods of a different length.
df = pd.DataFrame(np.zeros((len(tpd.index), 2)), index=tpd.index,
                  columns=['Tweets', 'Cancelled trains'])
df['Tweets'] = tpd
df['Cancelled trains'] = cancels
#fig = plt.figure()
#df.plot(kind='scatter', x='Tweets', y='Delays')
#z = np.polyfit(df['Tweets'], df['Delays'], 1)
#p = np.poly1d(z)
#plt.plot(df['Tweets'], p(df['Tweets']), 'r-')
#plt.title('Tweets and Delayed trains (old timetable)')
#plt.xlabel('Tweets')
#plt.ylabel('Delays')
#plt.savefig('./../../Paper/plots/old_corr_tweets_delays.png')
# Scatter of tweets vs. cancelled trains, with a degree-1 least-squares
# fit overlaid in red.
# Fix: the stray `fig = plt.figure()` is removed — DataFrame.plot creates
# its own figure, so the original left an orphan blank figure behind.
df.plot(kind='scatter', x='Tweets', y='Cancelled trains')
z = np.polyfit(df['Tweets'], df['Cancelled trains'], 1)
p = np.poly1d(z)
plt.plot(df['Tweets'], p(df['Tweets']), 'r-')
plt.title('Tweets and Cancelled trains (old timetable)')
plt.xlabel('Tweets')
plt.ylabel('Cancelled trains')
plt.savefig('./../../Paper/plots/old_corr_tweets_cancels.png')
#df.plot(kind='scatter', x='Tweets', y='Percentage')
#z = np.polyfit(df['Tweets'], df['Percentage'], 1)
#p = np.poly1d(z)
#plt.plot(df['Tweets'], p(df['Tweets']), 'r-')
#plt.title('Tweets and Percentage of delayed trains (old schedule)')
#plt.xlabel('Tweets')
#plt.ylabel('Delayed trains')
#plt.savefig('./plots/old_corr_tweets_percentage.pdf')
#plt.show()
In [9]:
def delaysPerHour(isNew):
    """Average positive delay per hour-of-day, weekdays only (Mon-Fri).

    Returns a pandas Series indexed 0..23 (hour of day) with per-route
    average delay minutes, combined across days.

    Fix vs. the original: the RailDatabase connection was re-created
    inside the day loop (one new MySQL connection per weekday); it is
    now opened once for the whole run.  All other logic is preserved.
    """
    conf = Config(isNew)
    period = np.arange(24)
    sr = pd.Series(0, index=period)
    start, stop = conf.period()
    db = RailDatabase(isNew)
    while start <= stop:
        if start.weekday() < 5:  # weekdays only
            dt = datetime.combine(start, time(0, 0, 0))
            routes = db.getAllRoutes(toUnix(dt))
            routeCount = 0
            tmp = pd.Series(0, index=period)
            for routeRow in routes:
                stops = db.getStops(routeRow[0])
                includeRoute = False
                for stopRow in stops:
                    # [1]=arrival unix time, [2]=arrival delay,
                    # [3]=arrival detected, [4]=departure unix time,
                    # [5]=departure delay, [6]=departure detected.
                    arrival_datetime, arrival_delay, arrival_detected = stopRow[1:4]
                    departure_datetime, departure_delay, departure_detected = stopRow[4:7]
                    # NOTE(review): the elif means departure delays are
                    # ignored whenever arrival was detected — possibly
                    # to avoid double counting a stop; confirm intent.
                    if arrival_detected:
                        if arrival_delay > 0:
                            hour = convertDatetime(arrival_datetime).time().hour
                            tmp[hour] += arrival_delay
                            includeRoute = True
                    elif departure_detected:
                        if departure_delay > 0:
                            hour = convertDatetime(departure_datetime).time().hour
                            tmp[hour] += departure_delay
                            includeRoute = True
                if includeRoute:
                    routeCount += 1
            if routeCount == 0:
                routeCount = 1  # avoid dividing by zero on empty days
            tmp = tmp.map(lambda x: float(x) / float(routeCount))
            # Combine this day into the running series: the first
            # non-zero value is taken as-is, later days are pairwise
            # averaged in.  NOTE(review): this weights recent days more
            # heavily than a true mean over all days — confirm intent.
            for hour in period:
                a = sr[hour]
                b = tmp[hour]
                if a == 0:
                    sr[hour] = b
                elif b != 0:
                    sr[hour] = np.mean([a, b])
        start = start + timedelta(days=1)
    return sr
def tweetsPerHour(isNew):
    """Count tweets per hour-of-day, weekdays only (Mon-Fri).

    Returns a pandas Series indexed 0..23 with tweet counts.
    """
    counts = pd.Series(0, index=np.arange(24))
    collection = TweetDatabase(isNew).connect()
    for tweet in collection.find():
        created = parser.parse(tweet['created_at'], ignoretz=True)
        if created.weekday() < 5:
            counts[created.time().hour] += 1
    return counts
# Hourly profiles for the OLD timetable (isNew=False).
delays = delaysPerHour(False)
tweets = tweetsPerHour(False)
In [10]:
plt.figure()
# Smooth both hourly series with a centred 3-hour rolling mean.
# Fix: pd.rolling_mean() was removed from pandas (0.23+); the
# Series.rolling(...).mean() method is the supported replacement.
rolling_delay = delays.rolling(3, center=True).mean()
rolling_tweet = tweets.rolling(3, center=True).mean()
# Delays on the left axis (blue), tweets on a secondary right axis (red).
ax_delays = delays.plot(style='--', color='b', label='Delays')
rolling_delay.plot(color='b', ax=ax_delays, label='Rolling mean')
ax_delays.set_ylabel('Minutes')
# Fix: the ymax= keyword was removed from matplotlib; top= replaces it.
ax_delays.set_ylim(top=10)
ax_delays.legend(loc=1)
ax_tweets = tweets.plot(style='--', color='r', secondary_y=True, label='Tweets')
rolling_tweet.plot(color='r', ax=ax_tweets, label='Rolling mean')
ax_tweets.set_ylabel('Tweets')
ax_tweets.set_ylim(top=2500)
ax_tweets.legend(loc=2)
plt.xticks(np.arange(0, 24, 2))
plt.title('Tweets and Delays per hour (old timetable)')
plt.xlabel('Hour')
plt.savefig('./../../Paper/plots/old_corr_tweets_delays_hour.png')
In [ ]: