In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from IPython.display import Image
%matplotlib inline
In [2]:
rawData = pd.read_csv('../data/trenes-despachados.csv', sep = ';')
rawData.drop(['FR1_REGIST','FR1_KM','FR1_KMV'], axis = 1, inplace = True)
rawData.columns = ['date','line','dayType','travelId','trainId',
'notDispach1','notDispach1Details','notDispach2','notDispach2Details',
'amountCars1','amountCars2','dispached1','dispached2',
'time1','time2']
# split data into two dataframes, one for each terminal
t1 = rawData.drop(['notDispach2','notDispach2Details','amountCars2','dispached2','time2'], axis = 1)
t1['terminal']=1
t2 = rawData.drop(['notDispach1','notDispach1Details','amountCars2','dispached1','time1'], axis = 1)
t2['terminal']=2
#remane columns
t1.columns = ['date','line','dayType','travelId','trainId','notDispach',
'notDispachDetails','amountCars','dispached','time','terminal']
t2.columns = t1.columns
# join by rows
cleanData = t1.append(t2,ignore_index=True).reindex()
#format times
cleanData['depart'] = cleanData.date + ' ' + cleanData.time
cleanData.date = pd.to_datetime(cleanData.date, format='%d/%m/%Y')
cleanData.depart = pd.to_datetime(cleanData.depart, format='%d/%m/%Y %H:%M:%S')
#sort values for lag
cleanData.sort_values(by=['line','terminal','depart'],inplace=True)
cleanData.reset_index(drop=True, inplace=True)
#only dispatched trains
delayData = cleanData.loc[cleanData.dispached == 'S' ,['line','depart']].copy()
delayData['departLag'] = delayData.depart.shift(1)
delayData['delay'] = delayData.depart - delayData.departLag
#create days of each depart, a variable stating wich delays are between diferent days, removing those delay times
delayData['departDay'] = delayData.depart.map(lambda x: x.day)
delayData['departLagDay'] = delayData.departLag.map(lambda x: x.day)
delayData['departDifferentDay'] = ~(delayData.departDay == delayData.departLagDay)
delayData.delay[delayData.departDifferentDay] = pd.NaT
delayData.drop(['departDay','departLag','departLagDay','departDifferentDay'], axis = 1 , inplace=True)
#remove outliers based on each line top 0.5%
delayData.groupby(by=delayData.line)
delayByLineOutliers = delayData.delay[~delayData.delay.isnull()].groupby(by=delayData.line).quantile(0.995)
delayDataA = delayData[delayData.line == 'A']
delayDataA.delay[delayDataA.delay > delayByLineOutliers['A']] = pd.NaT
delayDataB = delayData[delayData.line == 'B']
delayDataB.delay[delayDataB.delay > delayByLineOutliers['B']] = pd.NaT
delayDataC = delayData[delayData.line == 'C']
delayDataC.delay[delayDataC.delay > delayByLineOutliers['C']] = pd.NaT
delayDataD = delayData[delayData.line == 'D']
delayDataD.delay[delayDataD.delay > delayByLineOutliers['D']] = pd.NaT
delayDataE = delayData[delayData.line == 'E']
delayDataE.delay[delayDataE.delay > delayByLineOutliers['E']] = pd.NaT
delayDataH = delayData[delayData.line == 'H']
delayDataH.delay[delayDataH.delay > delayByLineOutliers['H']] = pd.NaT
delayDataP = delayData[delayData.line == 'P']
delayDataP.delay[delayDataP.delay > delayByLineOutliers['P']] = pd.NaT
#Append in a full dataset
delayDataFinal = delayDataA.append(delayDataB)
delayDataFinal = delayDataFinal.append(delayDataC)
delayDataFinal = delayDataFinal.append(delayDataD)
delayDataFinal = delayDataFinal.append(delayDataE)
delayDataFinal = delayDataFinal.append(delayDataH)
delayDataFinal = delayDataFinal.append(delayDataP)
#convert timedelta into integers en create a month category, remove unnecesary columns
delayDataFinal['delayInt'] = delayDataFinal.delay / np.timedelta64(1, 'm')
delayDataFinal['month'] = delayDataFinal.depart.map(lambda x: str(x.year) + '-' + str(x.month))
delayDataFinal['year'] = delayDataFinal.depart.map(lambda x: x.year)
delayDataFinal['month2'] = delayDataFinal.depart.map(lambda x: x.month)
delayDataFinal['departHour'] = delayDataFinal.depart.map(lambda x: x.hour)
delayDataFinal.drop(['line','depart'], axis = 1 , inplace=True)
# Merge into subway final data
subwayData = pd.merge(cleanData,delayDataFinal,how='left',left_index=True, right_index=True)
In [3]:
subwayData.delayInt[~subwayData.delayInt.isnull()].describe()
Out[3]:
In [4]:
#colors for each line
lineColors = {'A':'dodgerblue','B':'red','C':'mediumblue','D':'forestgreen','E':'purple','H':'yellow','P':'coral'}
In [5]:
fig = plt.figure(figsize=(16,6))
ax = fig.add_subplot(1,2,1)
ax = sns.boxplot(subwayData.delayInt,orient = 'v')
ax.set_title('Delay time in all lines')
ax.set_ylabel('Delay time in minutes as integers')
ax2 = fig.add_subplot(1,2,2)
ax2 = sns.boxplot(subwayData.line,subwayData.delayInt,palette=lineColors)
ax2.set_title('Delay time in all lines')
ax2.set_ylabel('Delay time in minutes as integers')
ax2.set_xlabel('Line')
Out[5]:
The first plot shows that the median waiting time between one train and the other in the BA subway as a whole, is 5 minutes. This changes when we take a look to what happens on a line by line basis. The C line is the line with the best frequency (3:41 minutes). Next is the B line (04:07 minutes) followed closely by the D line (04:15 minutes). Also within 4 minutes, we find the the A line (04:53 minutes). Already in the 7 minutes we hace the E line (07:10 minutes) and the the H line (7:30 minutes). Finally the P line (10 minutes).
In [6]:
lines = subwayData.line.unique()
waitingTimeByLine = subwayData.delay.groupby(by=subwayData.line).describe()
for line in lines:
print line,'median:',waitingTimeByLine[line]['50%'],line,'mean:',waitingTimeByLine[line]['mean']
Now, Mayor Larreta's public commitment was to reduce the waiting time for each subway to 3 minutes during roush hour. Classic rush hours can be defined between 8 and 10 am for the morning rush hour and between 5 to 7 pm for the evening rush hour. Also, it is necessary to take into account the terminal stations where we measure the waiting time. Buenos Aires City has a radial shape with a clear defined center. Over this underlying functional reality of the urban space, the subway lines were designed with two distintc types: radial lines (A,B,D,E,P), which connect city center with the most peripheral neighborhoods; and circumferential lines (only C and H), which go around city center and intersect the remaining lines perpendicularly. Therefore, while in the circumferential lines both rush hours periods are important (morning and evening) when considering the waiting time for a subway train, in the radial lines the really important one changes accordingly to the time of day. We want to check the waiting time on the peripheral terminal stations during the morning rush and on the center terminal stations during the evening rush.
In [7]:
Image("../img/baSubwayMap.png")
Out[7]:
In [8]:
fig = plt.figure(figsize=(16,6))
ax = fig.add_subplot(1,1,1)
ax = sns.boxplot(subwayData.month,subwayData.delayInt,color='grey')
ax.artists[0].set_facecolor('firebrick')
ax.artists[1].set_facecolor('firebrick')
ax.artists[12].set_facecolor('firebrick')
ax.artists[13].set_facecolor('firebrick')
ax.set_title('Wating time by month')
ax.set_xlabel('Month')
ax.set_ylabel('Waiting time')
Out[8]:
The plot shows that during January and February, the most frequent holiday months in Argentina, waiting time increases, but afterwards, rapidly converge towards the 5 minutes median waiting time. There is no clear tendency of decreasing time towards the 3 minute goal, taking all the lines, in all the terminals during every hour. It remains to be seen if, after accounting for the terminals and rush hours, this goal is reached.
In [9]:
#create masks for rush hours during workdays
workdayNotNan = (subwayData.dayType == 'H') & (~subwayData.delay.isnull())
subwayData['rushMorning'] = (subwayData.departHour >= 8) & (subwayData.departHour <= 10) & workdayNotNan
subwayData['rushEvening'] = (subwayData.departHour >= 17) & (subwayData.departHour <= 19) & workdayNotNan
In [10]:
print subwayData.shape
waitingTime = subwayData.copy().loc[subwayData.rushEvening |subwayData.rushMorning,['line','month','terminal','delay','delayInt','rushMorning','rushEvening']]
waitingTimeNew = subwayData.copy().loc[subwayData.rushEvening |subwayData.rushMorning,['line','month','year','month2','terminal','delay','delayInt','rushMorning','rushEvening']]
print waitingTime.shape
waitingTime.head()
Out[10]:
In [11]:
stationsNames = {'A':['','San Pedrito','Plaza de Mayo'],
'B':['','Juan Manuel de Rosas','Leandro N. Alem'],
'C':['','Constitucion','Retiro'],
'D':['','Congreso de Tucuman','Catedral'],
'E':['','Plaza de los Virreyes','Bolivar'],
'H':['','Las Heras','Hospitales'],
'P':['','',''],}
In [12]:
def plotLineTimeSeries(line):
fig = plt.figure(figsize=(16,6))
plotId = 1
color = lineColors[line]
for i in [1,2]:
terminal = stationsNames[line][i]
if line == 'P' and i == 2 :
break
#print terminal
plotMask = (waitingTime.line == line) & (waitingTime.terminal == i)
#print sum(plotMask), len(plotMask)
if (line != 'C' and line != 'H'): #radial lines selection for rush hour depending on terminal
if i == 1:
plotMask = plotMask & waitingTime.rushMorning
rushTime = 'Morning'
else:
plotMask = plotMask & waitingTime.rushEvening
rushTime = 'Afternoon'
else:
plotMask = (plotMask & (waitingTime.rushEvening | waitingTime.rushMorning))
rushTime = 'Morning and Afternoon'
ax = fig.add_subplot(1,2,plotId)
ax = sns.boxplot(waitingTime.month[plotMask],waitingTime.delayInt[plotMask],color=color)
ax.set_xlabel('Month')
ax.set_ylabel('Delay (minutes)')
ax.set_xticklabels(waitingTime.month[plotMask].unique(),rotation='vertical')
ax.set_title('Line ' + line + ' - ' + terminal + ' - ' + rushTime)
fig.savefig('../img/plotLine' + line + '.png')
plotId += 1
In [14]:
plotLineTimeSeries(line='A')
In [44]:
def plotLineTimeSeries2(line,bootTraces = False):
fig = plt.figure(figsize=(16,6))
plotId = 1
color = lineColors[line]
for i in [1,2]:
terminal = stationsNames[line][i]
if line == 'P' and i == 2 :
break
#print terminal
plotMask = (waitingTime.line == line) & (waitingTime.terminal == i)
#print sum(plotMask), len(plotMask)
if (line != 'C' and line != 'H'): #radial lines selection for rush hour depending on terminal
if i == 1:
plotMask = plotMask & waitingTime.rushMorning
rushTime = 'Morning'
else:
plotMask = plotMask & waitingTime.rushEvening
rushTime = 'Afternoon'
else:
plotMask = (plotMask & (waitingTime.rushEvening | waitingTime.rushMorning))
rushTime = 'Morning and Afternoon'
waitingTime2 = waitingTime.loc[plotMask,:]
n = waitingTime2.month.value_counts().values.min()
baseParaMergear = waitingTime2.iloc[0:1,:].copy()
for mes in waitingTime2.month.unique():
insumo = waitingTime2.loc[waitingTime2.month==mes,:].sample(n=n).copy()
#print mes
baseParaMergear = pd.concat([baseParaMergear,insumo])
baseParaMergear.index = range(baseParaMergear.shape[0])
baseParaMergear.drop(0,axis=0,inplace=True)
baseParaMergear['uniqueId'] = baseParaMergear.groupby(['month']).cumcount()
baseParaMergear.index = range(baseParaMergear.shape[0])
month2 = []
iden = 1
for i in range(baseParaMergear.shape[0]):
if i==0:
month2.append(iden)
#print iden
else:
if baseParaMergear.month[i] == baseParaMergear.month[i-1]:
month2.append(iden)
else:
iden += 1
month2.append(iden)
#print iden
baseParaMergear['month2'] = month2
ax = fig.add_subplot(1,2,plotId)
if bootTraces:
ax = sns.tsplot(time="month2", value="delayInt",unit="uniqueId", condition="line",data=baseParaMergear,
color=color, err_style="boot_traces", n_boot=500)
else:
ax = sns.tsplot(time="month2", value="delayInt",unit="uniqueId", condition="line",data=baseParaMergear,
ci=[50,75,95], color=color,estimator=np.median)
ax.set_xlabel('Month')
ax.set_ylabel('Delay (minutes)')
ax.set_xticklabels(['Ene-2016','May-2015','Oct-2015','Mar-2016','Ago-2016'],rotation='vertical')
ax.set_title('Line ' + line + ' - ' + terminal + ' - ' + rushTime)
#fig.savefig('../img/plotLine' + line + '.png')
plotId += 1
In [50]:
plotLineTimeSeries2(line='A',bootTraces = False)
In [51]:
plotLineTimeSeries2(line='A',bootTraces = True)
In [49]:
plotLineTimeSeries2(line='E',bootTraces = False)
In [48]:
plotLineTimeSeries2(line='E',bootTraces = True)
In [43]:
fig = plt.figure(figsize=(20,40))
plotId = 1
for lineId in range(len(lines)):
line = lines[lineId]
color = lineColors[line]
#print line, color
for i in [1,2]:
terminal = stationsNames[line][i]
if line == 'P' and i == 2 :
break
#print terminal
plotMask = (waitingTime.line == line) & (waitingTime.terminal == i)
#print sum(plotMask), len(plotMask)
if (line != 'C' and line != 'H'): #radial lines selection for rush hour depending on terminal
if i == 1:
plotMask = plotMask & waitingTime.rushMorning
rushTime = 'Morning'
else:
plotMask = plotMask & waitingTime.rushEvening
rushTime = 'Afternoon'
else:
plotMask = (plotMask & (waitingTime.rushEvening | waitingTime.rushMorning))
rushTime = 'Morning and Afternoon'
ax = fig.add_subplot(7,2,plotId)
ax = sns.boxplot(waitingTime.month[plotMask],waitingTime.delayInt[plotMask],color=color)
ax.set_xlabel('Month')
ax.set_ylabel('Delay (minutes)')
ax.set_xticklabels(waitingTime.month[plotMask].unique(),rotation='vertical')
ax.set_title('Line ' + line + ' - ' + terminal + ' - ' + rushTime)
plotId += 1
fig.savefig('../img/plotAllLines.png')
In [ ]: