In [1]:
from pymongo import MongoClient
from datetime import datetime, date, timedelta, time
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.basemap import Basemap
matplotlib.style.use('ggplot')
import pymysql
In [2]:
def toUnix(dt):
    """Convert a (naive, local-time) datetime to a Unix-timestamp string.

    Fixes vs. the original:
      * the parameter was named `datetime`, shadowing the datetime class
        imported at the top of the file;
      * strftime('%s') is a non-portable glibc extension (fails on
        Windows); timestamp() is the portable equivalent for naive
        local-time datetimes and produces the same epoch seconds.

    Returns the seconds-since-epoch value as a string, matching the
    original strftime('%s') output used as a DB key.
    """
    return str(int(dt.timestamp()))
def convertDatetime(unix):
    """Turn a Unix timestamp (seconds since epoch) into a local datetime."""
    return datetime.fromtimestamp(unix)
def convertDate(unix):
    """Turn a Unix timestamp (seconds since epoch) into a local calendar date."""
    return date.fromtimestamp(unix)
In [3]:
class RailDatabase():
    """Read-only accessor for the MySQL rail database.

    `isNew` selects the database with the new timetable data
    ('newrailDB') versus the old one ('oldrailDB').
    """

    def __init__(self, isNew):
        # SECURITY NOTE(review): credentials are hardcoded in source.
        # They should come from environment variables or a config file;
        # kept as-is here so existing set-ups keep working.
        db_name = 'newrailDB' if isNew else 'oldrailDB'
        self.conn = pymysql.connect(host='localhost', port=3306,
                                    user='jnevens', passwd='Panda85?',
                                    db=db_name)

    def getAllRoutes(self, date):
        """Return all (route_id,) rows for the given date (unix-time key).

        The `with` block closes the cursor even when execute() raises;
        the original leaked cursors on error.
        """
        with self.conn.cursor() as cur:
            cur.execute('SELECT route_id FROM route WHERE date = %s', (date,))
            return cur.fetchall()

    def getStops(self, routeID):
        """Return every stop row belonging to the given route."""
        with self.conn.cursor() as cur:
            cur.execute('SELECT * FROM stop WHERE route_id = %s', (routeID,))
            return cur.fetchall()

    def getLastStop(self, routeID):
        """Return a 1-element (or empty) sequence holding the route's final stop.

        Uses ORDER BY ... DESC LIMIT 1 so only one row is transferred,
        instead of fetching the whole route and slicing off the last row.
        """
        with self.conn.cursor() as cur:
            cur.execute('SELECT * FROM stop WHERE route_id = %s '
                        'ORDER BY arrival_datetime DESC LIMIT 1', (routeID,))
            return cur.fetchall()

    def getAllStations(self):
        """Return every station row."""
        with self.conn.cursor() as cur:
            cur.execute('SELECT * FROM station')
            return cur.fetchall()
class TweetDatabase():
    """Wraps the MongoDB tweet collections for the old/new timetable periods."""

    def __init__(self, isNew):
        self.new = isNew
        self.conn = MongoClient().data_science

    def connect(self):
        """Return the tweet collection matching the configured period."""
        collection = self.conn.new_tweets if self.new else self.conn.old_tweets
        return collection
class Config():
    """Holds which data set (old or new timetable) the analysis covers."""

    def __init__(self, isNew):
        self.new = isNew

    def period(self):
        """Return the inclusive (start, stop) dates of the configured data set."""
        if self.new:
            return (date(2014, 12, 15), date(2015, 3, 22))
        return (date(2014, 10, 27), date(2014, 12, 14))
In [4]:
def tweetsPerDay(isNew):
    """Count tweets per calendar day over the configured period.

    Returns a pandas Series indexed by day, with one tweet count per day.
    """
    collection = TweetDatabase(isNew).connect()
    first, last = Config(isNew).period()
    counts = pd.Series(0, index=pd.date_range(first, last))
    for tweet in collection.find():
        # Timestamps arrive as strings; drop the timezone like the rest
        # of the notebook does.
        created = parser.parse(tweet['created_at'], ignoretz=True)
        counts[created.date()] += 1
    return counts
def delayPerDay(isNew):
    """Compute worst-case and average-case network delay per day.

    For every day in the configured period:
      * 'Worst case' — mean over routes of each route's maximum stop delay;
      * 'Avg case'   — mean over routes of the mean of the route's average
        arrival delay and average departure delay (detected stops only).

    Returns a DataFrame indexed by day with columns
    ['Worst case', 'Avg case'].

    Fixes vs. the original:
      * delay counters started at 1, silently biasing every average low
        by one phantom data point; they now start at 0 and divisions are
        guarded explicitly;
      * chained-indexing assignment (df[col][key] = ...) is replaced with
        .loc, which is reliable under pandas copy-on-write;
      * a day with no routes now yields 0 instead of ZeroDivisionError.
    """
    conf = Config(isNew)
    start, stop = conf.period()
    period = pd.date_range(start, stop)
    db = RailDatabase(isNew)
    delays = pd.DataFrame(np.zeros((len(period), 2)), index=period,
                          columns=['Worst case', 'Avg case'])
    while start <= stop:
        dt = datetime.combine(start, time(0, 0, 0))
        # Every trainID rides only once per day, so asking the routes of
        # the day directly is enough.
        routes = db.getAllRoutes(toUnix(dt))
        total_worst = 0.0
        total_avg = 0.0
        route_count = 0
        for routeRow in routes:
            stops = db.getStops(routeRow[0])
            max_delay = 0
            arr_total, arr_count = 0, 0
            dep_total, dep_count = 0, 0
            for stopRow in stops:
                # Column layout assumed from usage elsewhere in this file:
                # [2]=arrival delay, [3]=arrival detected,
                # [5]=departure delay, [6]=departure detected.
                arrival_delay, arrival_detected = stopRow[2], stopRow[3]
                departure_delay, departure_detected = stopRow[5], stopRow[6]
                if arrival_detected:
                    arr_total += arrival_delay
                    arr_count += 1
                if departure_detected:
                    dep_total += departure_delay
                    dep_count += 1
                # Worst case considers every stop, detected or not,
                # matching the original logic.
                max_delay = max(max_delay, arrival_delay, departure_delay)
            avg_arrival = float(arr_total) / arr_count if arr_count else 0.0
            avg_departure = float(dep_total) / dep_count if dep_count else 0.0
            total_worst += max_delay
            total_avg += np.mean([avg_arrival, avg_departure])
            route_count += 1
        key = start.isoformat()
        if route_count:
            delays.loc[key, 'Worst case'] = total_worst / route_count
            delays.loc[key, 'Avg case'] = total_avg / route_count
        start = start + timedelta(days=1)
    return delays
def percentageDelays(isNew):
    """Count delayed trains per day, plus the percentage of delayed trains.

    A train counts as delayed when any stop shows a delay of more than
    5 minutes (arrival or departure, where detected).

    Returns a DataFrame indexed by day with columns
    ['Delayed', 'Percentage'].

    Fixes vs. the original:
      * the original `elif departure_detected` skipped the departure
        check entirely whenever arrival was detected, so a train with a
        small arrival delay but a >5 min departure delay was never
        counted — contradicting the stated rule; both are now checked;
      * chained-indexing assignment replaced with .loc (reliable under
        pandas copy-on-write);
      * a day with no routes now leaves 0% instead of raising
        ZeroDivisionError.
    """
    conf = Config(isNew)
    start, stop = conf.period()
    period = pd.date_range(start, stop)
    db = RailDatabase(isNew)
    perctDelays = pd.DataFrame(np.zeros((len(period), 2)), index=period,
                               columns=['Delayed', 'Percentage'])
    while start <= stop:
        dt = datetime.combine(start, time(0, 0, 0))
        routes = db.getAllRoutes(toUnix(dt))
        key = start.isoformat()
        delayed = 0
        for routeRow in routes:
            stops = db.getStops(routeRow[0])
            for stopRow in stops:
                # [2]=arrival delay, [3]=arrival detected,
                # [5]=departure delay, [6]=departure detected.
                late_arrival = stopRow[3] and stopRow[2] > 5
                late_departure = stopRow[6] and stopRow[5] > 5
                if late_arrival or late_departure:
                    delayed += 1
                    break  # one late stop is enough for this train
        perctDelays.loc[key, 'Delayed'] = delayed
        if len(routes):
            perctDelays.loc[key, 'Percentage'] = delayed * 100.0 / len(routes)
        start = start + timedelta(days=1)
    return perctDelays
def cancelPerDay(isNew):
    """Count cancelled trains per day.

    A train is considered cancelled when arrival was never detected at
    its final stop.  Returns a pandas Series indexed by day.
    """
    first, last = Config(isNew).period()
    cancels = pd.Series(0, index=pd.date_range(first, last))
    db = RailDatabase(isNew)
    one_day = timedelta(days=1)
    day = first
    while day <= last:
        midnight = datetime.combine(day, time(0, 0, 0))
        key = day.isoformat()
        # Every trainID rides only once per day, so the day's routes are
        # queried directly.
        for routeRow in db.getAllRoutes(toUnix(midnight)):
            for stopRow in db.getLastStop(routeRow[0]):
                if not stopRow[3]:  # arrival not detected at final stop
                    cancels[key] += 1
        day = day + one_day
    return cancels
def numberOfTweets(isNew):
    """Return the total number of tweets in the selected collection.

    Fix: Cursor.count() was deprecated in PyMongo 3.x and removed in
    PyMongo 4; Collection.count_documents({}) is the supported
    replacement and avoids materialising a cursor just to count it.
    """
    collection = TweetDatabase(isNew).connect()
    return collection.count_documents({})
In [6]:
# Build the per-day series for the OLD timetable (isNew=False).
# The delay/percentage variants are left commented out — presumably they
# were run in other sessions; TODO confirm before deleting.
tpd = tweetsPerDay(False)
#delays = delayPerDay(False)
cancels = cancelPerDay(False)
#percentage = percentageDelays(False)
In [7]:
# Manually zero out four specific days — presumably known-bad data
# (e.g. scraper outages producing spurious "cancellations") rather than
# real counts; TODO confirm provenance of these dates.
cancels['2014-11-24'] = 0
cancels['2014-12-01'] = 0
cancels['2014-12-08'] = 0
cancels['2014-12-11'] = 0
In [8]:
#worst = delays['Worst case']
#perc = percentage['Percentage']
# Align tweets and cancellations on the same daily index.
# Fix: len(tpd.index) replaces the hardcoded 49, so this also works for
# periods of a different length.
df = pd.DataFrame(np.zeros((len(tpd.index), 2)), index=tpd.index,
                  columns=['Tweets', 'Cancelled trains'])
df['Tweets'] = tpd
df['Cancelled trains'] = cancels
#fig = plt.figure()
#df.plot(kind='scatter', x='Tweets', y='Delays')
#z = np.polyfit(df['Tweets'], df['Delays'], 1)
#p = np.poly1d(z)
#plt.plot(df['Tweets'], p(df['Tweets']), 'r-')
#plt.title('Tweets and Delayed trains (old timetable)')
#plt.xlabel('Tweets')
#plt.ylabel('Delays')
#plt.savefig('./../../Paper/plots/old_corr_tweets_delays.png')
# Scatter of tweets vs. cancelled trains, with a degree-1 least-squares
# fit overlaid in red.
# Fix: the stray `fig = plt.figure()` is removed — DataFrame.plot creates
# its own figure, so the original left an orphan blank figure behind.
df.plot(kind='scatter', x='Tweets', y='Cancelled trains')
z = np.polyfit(df['Tweets'], df['Cancelled trains'], 1)
p = np.poly1d(z)
plt.plot(df['Tweets'], p(df['Tweets']), 'r-')
plt.title('Tweets and Cancelled trains (old timetable)')
plt.xlabel('Tweets')
plt.ylabel('Cancelled trains')
plt.savefig('./../../Paper/plots/old_corr_tweets_cancels.png')
#df.plot(kind='scatter', x='Tweets', y='Percentage')
#z = np.polyfit(df['Tweets'], df['Percentage'], 1)
#p = np.poly1d(z)
#plt.plot(df['Tweets'], p(df['Tweets']), 'r-')
#plt.title('Tweets and Percentage of delayed trains (old schedule)')
#plt.xlabel('Tweets')
#plt.ylabel('Delayed trains')
#plt.savefig('./plots/old_corr_tweets_percentage.pdf')
#plt.show()
In [9]:
def delaysPerHour(isNew):
    """Average positive delay per hour-of-day, weekdays only (Mon-Fri).

    Returns a pandas Series indexed 0..23 (hour of day) with per-route
    average delay minutes, combined across days.

    Fix vs. the original: the RailDatabase connection was re-created
    inside the day loop (one new MySQL connection per weekday); it is
    now opened once for the whole run.  All other logic is preserved.
    """
    conf = Config(isNew)
    period = np.arange(24)
    sr = pd.Series(0, index=period)
    start, stop = conf.period()
    db = RailDatabase(isNew)
    while start <= stop:
        if start.weekday() < 5:  # weekdays only
            dt = datetime.combine(start, time(0, 0, 0))
            routes = db.getAllRoutes(toUnix(dt))
            routeCount = 0
            tmp = pd.Series(0, index=period)
            for routeRow in routes:
                stops = db.getStops(routeRow[0])
                includeRoute = False
                for stopRow in stops:
                    # [1]=arrival unix time, [2]=arrival delay,
                    # [3]=arrival detected, [4]=departure unix time,
                    # [5]=departure delay, [6]=departure detected.
                    arrival_datetime, arrival_delay, arrival_detected = stopRow[1:4]
                    departure_datetime, departure_delay, departure_detected = stopRow[4:7]
                    # NOTE(review): the elif means departure delays are
                    # ignored whenever arrival was detected — possibly
                    # to avoid double counting a stop; confirm intent.
                    if arrival_detected:
                        if arrival_delay > 0:
                            hour = convertDatetime(arrival_datetime).time().hour
                            tmp[hour] += arrival_delay
                            includeRoute = True
                    elif departure_detected:
                        if departure_delay > 0:
                            hour = convertDatetime(departure_datetime).time().hour
                            tmp[hour] += departure_delay
                            includeRoute = True
                if includeRoute:
                    routeCount += 1
            if routeCount == 0:
                routeCount = 1  # avoid dividing by zero on empty days
            tmp = tmp.map(lambda x: float(x) / float(routeCount))
            # Combine this day into the running series: the first
            # non-zero value is taken as-is, later days are pairwise
            # averaged in.  NOTE(review): this weights recent days more
            # heavily than a true mean over all days — confirm intent.
            for hour in period:
                a = sr[hour]
                b = tmp[hour]
                if a == 0:
                    sr[hour] = b
                elif b != 0:
                    sr[hour] = np.mean([a, b])
        start = start + timedelta(days=1)
    return sr
def tweetsPerHour(isNew):
    """Count tweets per hour-of-day, weekdays only (Mon-Fri).

    Returns a pandas Series indexed 0..23 with tweet counts.
    """
    counts = pd.Series(0, index=np.arange(24))
    collection = TweetDatabase(isNew).connect()
    for tweet in collection.find():
        created = parser.parse(tweet['created_at'], ignoretz=True)
        if created.weekday() < 5:
            counts[created.time().hour] += 1
    return counts
# Hourly profiles for the OLD timetable (isNew=False).
delays = delaysPerHour(False)
tweets = tweetsPerHour(False)
In [10]:
plt.figure()
# Smooth both hourly series with a centred 3-hour rolling mean.
# Fix: pd.rolling_mean() was removed from pandas (0.23+); the
# Series.rolling(...).mean() method is the supported replacement.
rolling_delay = delays.rolling(3, center=True).mean()
rolling_tweet = tweets.rolling(3, center=True).mean()
# Delays on the left axis (blue), tweets on a secondary right axis (red).
ax_delays = delays.plot(style='--', color='b', label='Delays')
rolling_delay.plot(color='b', ax=ax_delays, label='Rolling mean')
ax_delays.set_ylabel('Minutes')
# Fix: the ymax= keyword was removed from matplotlib; top= replaces it.
ax_delays.set_ylim(top=10)
ax_delays.legend(loc=1)
ax_tweets = tweets.plot(style='--', color='r', secondary_y=True, label='Tweets')
rolling_tweet.plot(color='r', ax=ax_tweets, label='Rolling mean')
ax_tweets.set_ylabel('Tweets')
ax_tweets.set_ylim(top=2500)
ax_tweets.legend(loc=2)
plt.xticks(np.arange(0, 24, 2))
plt.title('Tweets and Delays per hour (old timetable)')
plt.xlabel('Hour')
plt.savefig('./../../Paper/plots/old_corr_tweets_delays_hour.png')
In [ ]: