In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
df = pd.read_excel('data/delay.xlsx',sheetname='Sheet 1')
df.head()
Out[2]:
Date
Time
Day
Station
Code
Min Delay
Min Gap
Bound
Line
Vehicle
0
2014-01-01
02:06
Wednesday
HIGH PARK STATION
SUDP
3
7
W
BD
5001
1
2014-01-01
02:40
Wednesday
SHEPPARD STATION
MUNCA
0
0
NaN
YU
0
2
2014-01-01
03:10
Wednesday
LANSDOWNE STATION
SUDP
3
8
W
BD
5116
3
2014-01-01
03:20
Wednesday
BLOOR STATION
MUSAN
5
10
S
YU
5386
4
2014-01-01
03:29
Wednesday
DUFFERIN STATION
MUPAA
0
0
E
BD
5174
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69043 entries, 0 to 69042
Data columns (total 10 columns):
Date 69043 non-null datetime64[ns]
Time 69043 non-null object
Day 69043 non-null object
Station 69043 non-null object
Code 69043 non-null object
Min Delay 69043 non-null int64
Min Gap 69043 non-null int64
Bound 54249 non-null object
Line 68745 non-null object
Vehicle 69043 non-null int64
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 5.3+ MB
In [4]:
df.drop(['Code', 'Vehicle'], axis=1,inplace=True)
df.head()
Out[4]:
Date
Time
Day
Station
Min Delay
Min Gap
Bound
Line
0
2014-01-01
02:06
Wednesday
HIGH PARK STATION
3
7
W
BD
1
2014-01-01
02:40
Wednesday
SHEPPARD STATION
0
0
NaN
YU
2
2014-01-01
03:10
Wednesday
LANSDOWNE STATION
3
8
W
BD
3
2014-01-01
03:20
Wednesday
BLOOR STATION
5
10
S
YU
4
2014-01-01
03:29
Wednesday
DUFFERIN STATION
0
0
E
BD
In [5]:
df['Time'] = pd.to_datetime(df['Time'],format='%H:%M').dt.time
In [6]:
#check if str to time conversion worked by outputting a time cell
df.iloc[0,1]
Out[6]:
datetime.time(2, 6)
On weekdays and Saturdays, trains run every few minutes from about 6 a.m. until 1:30 a.m.
On Sundays, they run from about 8 a.m. to 1:30 a.m.
In [7]:
df['Hour'] = df['Time'].map(lambda x: x.hour)
df.head()
Out[7]:
Date
Time
Day
Station
Min Delay
Min Gap
Bound
Line
Hour
0
2014-01-01
02:06:00
Wednesday
HIGH PARK STATION
3
7
W
BD
2
1
2014-01-01
02:40:00
Wednesday
SHEPPARD STATION
0
0
NaN
YU
2
2
2014-01-01
03:10:00
Wednesday
LANSDOWNE STATION
3
8
W
BD
3
3
2014-01-01
03:20:00
Wednesday
BLOOR STATION
5
10
S
YU
3
4
2014-01-01
03:29:00
Wednesday
DUFFERIN STATION
0
0
E
BD
3
In [8]:
MF = ['Monday', 'Tuesday', 'Wednesday', 'Thursday','Friday']
SS = ['Saturday','Sunday']
weekday = df.loc[(df['Hour'] >= 6) & (df['Day'].isin(MF))]
# omitting 6AM-8AM bc I want the weekend data to have the same number of rows
weekend = df.loc[(df['Hour'] >= 8) & (df['Day'].isin(SS))]
In [9]:
# going to combine all the days in the week too..
ttc = df.loc[((df['Hour'] >= 6) & (df['Day'].isin(MF))) | ((df['Hour'] >= 8) & (df['Day'].isin(SS)))]
ttc.head()
Out[9]:
Date
Time
Day
Station
Min Delay
Min Gap
Bound
Line
Hour
5
2014-01-01
07:31:00
Wednesday
NORTH YORK CENTRE STAT
0
0
NaN
YU
7
6
2014-01-01
07:32:00
Wednesday
SHEPPARD STATION
0
0
NaN
YU
7
7
2014-01-01
07:34:00
Wednesday
RUNNYMEDE STATION
0
0
NaN
BD
7
8
2014-01-01
07:34:00
Wednesday
QUEEN STATION EAST SID
0
0
NaN
YU
7
9
2014-01-01
07:53:00
Wednesday
ST ANDREW STATION
0
0
NaN
YU
7
In [10]:
# np.set_printoptions(threshold=np.nan)
# ttc['Station'].unique()
In [12]:
# ttc[ttc['Station'].str.contains("\(")]
In [13]:
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x174216ae748>
In [14]:
df.loc[df['Bound'].isnull()]
Out[14]:
Date
Time
Day
Station
Min Delay
Min Gap
Bound
Line
Hour
1
2014-01-01
02:40:00
Wednesday
SHEPPARD STATION
0
0
NaN
YU
2
5
2014-01-01
07:31:00
Wednesday
NORTH YORK CENTRE STAT
0
0
NaN
YU
7
6
2014-01-01
07:32:00
Wednesday
SHEPPARD STATION
0
0
NaN
YU
7
7
2014-01-01
07:34:00
Wednesday
RUNNYMEDE STATION
0
0
NaN
BD
7
8
2014-01-01
07:34:00
Wednesday
QUEEN STATION EAST SID
0
0
NaN
YU
7
9
2014-01-01
07:53:00
Wednesday
ST ANDREW STATION
0
0
NaN
YU
7
10
2014-01-01
08:20:00
Wednesday
WELLESLEY STATION
0
0
NaN
YU
8
17
2014-01-01
09:10:00
Wednesday
WILSON CARHOUSE (10 TR
0
0
NaN
YU
9
33
2014-01-01
18:37:00
Wednesday
DUNDAS WEST STATION
0
0
NaN
BD
18
39
2014-01-01
23:22:00
Wednesday
VICTORIA PARK STATION
0
0
NaN
BD
23
45
2014-01-02
05:18:00
Thursday
BROADVIEW STATION
0
0
NaN
BD
5
46
2014-01-02
05:20:00
Thursday
LANSDOWNE STATION
0
0
NaN
BD
5
63
2014-01-02
08:29:00
Thursday
DUNDAS WEST STATION
0
0
NaN
BD
8
73
2014-01-02
11:30:00
Thursday
YONGE UNIVERSITY SPADI
0
0
NaN
YU
11
83
2014-01-02
14:28:00
Thursday
BLOOR STATION SOUTH BO
0
0
NaN
YU
14
91
2014-01-02
16:14:00
Thursday
WELLESLEY STATION
0
0
NaN
YU
16
100
2014-01-02
20:38:00
Thursday
RUNNYMEDE STATION
0
0
NaN
BD
20
101
2014-01-02
21:00:00
Thursday
YONGE/UNIVERSITY/SPADI
0
0
NaN
YU
21
104
2014-01-02
22:00:00
Thursday
YONGE/UNIVERSITY/SPADI
0
0
NaN
YU
22
108
2014-01-02
23:17:00
Thursday
OSGOODE STATION
0
0
NaN
YU
23
109
2014-01-02
23:42:00
Thursday
SHEPPARD STATION
0
0
NaN
YU
23
121
2014-01-03
05:46:00
Friday
DUNDAS STATION EAST SI
0
0
NaN
YU
5
125
2014-01-03
06:19:00
Friday
PAPE STATION
0
0
NaN
BD
6
140
2014-01-03
11:00:00
Friday
YONGE UNIVERSITY SPADI
0
0
NaN
YU
11
142
2014-01-03
11:30:00
Friday
BLOOR DANFORTH SUBWAY
0
0
NaN
BD
11
154
2014-01-03
15:48:00
Friday
DUNDAS STATION
0
0
NaN
YU
15
158
2014-01-03
18:01:00
Friday
BATHURST STATION
0
0
NaN
BD
18
159
2014-01-03
18:25:00
Friday
COXWELL STATION
0
0
NaN
BD
18
165
2014-01-03
20:41:00
Friday
UNION STATION
0
0
NaN
YU
20
166
2014-01-03
20:43:00
Friday
DONLANDS STATION
0
0
NaN
BD
20
...
...
...
...
...
...
...
...
...
...
68917
2017-04-27
22:40:00
Thursday
DUNDAS WEST STATION
0
0
NaN
BD
22
68918
2017-04-27
23:11:00
Thursday
FINCH STATION
0
0
NaN
YU
23
68923
2017-04-27
00:00:00
Thursday
BLOOR-DANFORTH SUBWAY
0
0
NaN
BD
0
68947
2017-04-28
08:20:00
Friday
LANSDOWNE STATION
0
0
NaN
BD
8
68950
2017-04-28
10:01:00
Friday
DUFFERIN STATION
0
0
NaN
BD
10
68951
2017-04-28
10:22:00
Friday
PAPE STATION
0
0
NaN
BD
10
68962
2017-04-28
17:14:00
Friday
NORTH YORK CTR STATION
0
0
NaN
YU
17
68965
2017-04-28
18:53:00
Friday
KIPLING STATION
0
0
NaN
BD
18
68970
2017-04-28
20:40:00
Friday
VICTORIA PARK STATION
0
0
NaN
BD
20
68974
2017-04-28
22:00:00
Friday
YONGE/UNIVERSITY-BLOOR
0
0
NaN
YU/BD
22
68975
2017-04-28
23:02:00
Friday
ST GEORGE BEDFORD
0
0
NaN
YU
23
68976
2017-04-28
23:14:00
Friday
YONGE-UNIVERSITY SUBWA
0
0
NaN
YU
23
68977
2017-04-28
23:41:00
Friday
BLOOR DANFORTH SUBWAY
0
0
NaN
BD
23
68980
2017-04-28
01:57:00
Friday
MAIN STREET STATION
0
0
NaN
BD
1
68990
2017-04-28
21:51:00
Friday
ELLESMERE STATION
0
0
NaN
SRT
21
68994
2017-04-29
10:51:00
Saturday
DUNDAS WEST STATION
0
0
NaN
BD
10
68995
2017-04-29
11:08:00
Saturday
BLOOR DANFORTH SUBWAY
0
0
NaN
BD
11
69003
2017-04-29
12:29:00
Saturday
EGLINTON STATION
0
0
NaN
YU
12
69004
2017-04-29
13:58:00
Saturday
CASTLE FRANK STATION
0
0
NaN
BD
13
69006
2017-04-29
16:31:00
Saturday
VICTORIA PARK STATION
0
0
NaN
BD
16
69013
2017-04-29
22:00:00
Saturday
YONGE-UNIVERSITY AND B
0
0
NaN
YU/BD
22
69017
2017-04-29
01:05:00
Saturday
WILSON GARAGE
0
0
NaN
YU
1
69022
2017-04-30
08:00:00
Sunday
YONGE/UNIVERSITY LINE
0
0
NaN
YU
8
69029
2017-04-30
16:03:00
Sunday
ST GEORGE YUS STATION
0
0
NaN
YU
16
69030
2017-04-30
16:44:00
Sunday
KENNEDY BD STATION
0
0
NaN
BD
16
69034
2017-04-30
18:32:00
Sunday
OSGOODE STATION
0
0
NaN
YU
18
69035
2017-04-30
19:50:00
Sunday
BLOOR/DANFORTH LINE
0
0
NaN
BD
19
69037
2017-04-30
20:29:00
Sunday
KEELE STATION
0
0
NaN
BD
20
69038
2017-04-30
22:00:00
Sunday
YONGE-UNIVERSITY AND B
0
0
NaN
YU/BD
22
69039
2017-04-30
22:13:00
Sunday
BLOOR-DANFORTH LINE
0
0
NaN
BD
22
14794 rows × 9 columns
In [15]:
df.loc[df['Line'].isnull()]
Out[15]:
Date
Time
Day
Station
Min Delay
Min Gap
Bound
Line
Hour
69
2014-01-02
08:59:00
Thursday
BLOOR
0
0
S
NaN
8
325
2014-01-07
09:13:00
Tuesday
ST GEORGE STATION
0
0
N
NaN
9
486
2014-01-09
14:31:00
Thursday
FINCH STATION
0
0
NaN
NaN
14
677
2014-01-13
06:30:00
Monday
QUEEN STATION - ALBERT
0
0
NaN
NaN
6
905
2014-01-17
08:13:00
Friday
DON MILLS STN
0
0
NaN
NaN
8
1178
2014-01-22
05:10:00
Wednesday
YONGE STATION
0
0
NaN
NaN
5
1420
2014-01-27
05:15:00
Monday
QUEEN ALBERT
0
0
NaN
NaN
5
1605
2014-01-29
09:38:00
Wednesday
MCBRIEN BUILDING
0
0
NaN
NaN
9
1611
2014-01-29
12:36:00
Wednesday
RONCEVALLES DIVISION
0
0
NaN
NaN
12
1738
2014-01-31
10:38:00
Friday
KING AND BATHRUST
0
0
NaN
NaN
10
2244
2014-02-07
19:06:00
Friday
DUNDAS WEST STATION
3
6
W
NaN
19
2391
2014-02-10
11:30:00
Monday
YONGE UNIVERSITY SPADI
0
0
NaN
NaN
11
2541
2014-02-12
18:08:00
Wednesday
KEELE STATION
0
0
NaN
NaN
18
3356
2014-02-24
11:30:00
Monday
BLOOR DANFORTH SUBWAY
0
0
NaN
NaN
11
3421
2014-02-25
11:30:00
Tuesday
YONGE-UNIVERSITY-SPADI
0
0
NaN
NaN
11
3818
2014-03-03
12:00:00
Monday
YONGE UNIVERSITY SPADI
0
0
NaN
NaN
12
3912
2014-03-04
23:23:00
Tuesday
GREENWOOD T/S BUILDING
0
0
NaN
NaN
23
3929
2014-03-05
05:16:00
Wednesday
QUEEN ALBERT
0
0
NaN
NaN
5
4071
2014-03-07
10:51:00
Friday
YONGE
0
0
W
NaN
10
4072
2014-03-07
11:30:00
Friday
YONGE UNIVERSITY SPADI
0
0
NaN
NaN
11
4078
2014-03-07
14:10:00
Friday
COXWELL STATION
0
0
NaN
NaN
14
4355
2014-03-11
11:30:00
Tuesday
YONGE UNIVERSITY SPADI
0
0
NaN
NaN
11
4853
2014-03-19
05:48:00
Wednesday
DANFORTH GARAGE
0
0
NaN
NaN
5
5852
2014-04-07
05:08:00
Monday
CHRISTIE STATION
0
0
NaN
NaN
5
5967
2014-04-08
14:00:00
Tuesday
TRANSIT CONTROL CENTRE
0
0
NaN
NaN
14
6264
2014-04-13
21:16:00
Sunday
FINCH STATION
0
0
NaN
NaN
21
6455
2014-04-16
19:29:00
Wednesday
BLOOR DANFORTH SUBWAY
0
0
NaN
NaN
19
6741
2014-04-22
13:51:00
Tuesday
ROYAL STATION STATION
0
0
NaN
NaN
13
7085
2014-04-29
11:30:00
Tuesday
WILSON GARAGE
0
0
NaN
NaN
11
7173
2014-05-01
06:47:00
Thursday
SPADINA STATION
0
0
NaN
NaN
6
...
...
...
...
...
...
...
...
...
...
59430
2016-11-02
22:09:00
Wednesday
OSSINGTON STATION
0
0
NaN
NaN
22
59433
2016-11-02
22:54:00
Wednesday
WILSON CARHOUSE
0
0
NaN
NaN
22
59761
2016-11-08
09:27:00
Tuesday
GREENWOOD CARHOUSE
0
0
NaN
NaN
9
60133
2016-11-15
09:08:00
Tuesday
INGLIS BLDG
0
0
NaN
NaN
9
60637
2016-11-22
10:17:00
Tuesday
WILSON LOWER
0
0
NaN
NaN
10
60872
2016-11-25
10:28:00
Friday
WILSON TRACK AND STRUC
0
0
NaN
NaN
10
60999
2016-11-27
19:05:00
Sunday
ST CLAIR STATION
0
0
E
NaN
19
61199
2016-11-30
09:34:00
Wednesday
GREENWOOD CARHOUSE
0
0
NaN
NaN
9
61208
2016-11-30
14:25:00
Wednesday
WARDEN STATION
0
0
NaN
NaN
14
62115
2016-12-15
10:15:00
Thursday
GREENWOOD SHOP
0
0
NaN
NaN
10
62916
2016-12-29
10:37:00
Thursday
BLOOR STATION
0
0
S
NaN
10
63653
2017-01-11
21:58:00
Wednesday
WARDEN STATION
0
0
W
NaN
21
63731
2017-01-12
21:35:00
Thursday
SHEPPARD BOOTH 1
0
0
NaN
NaN
21
63919
2017-01-17
08:23:00
Tuesday
DUNCAN SHOPS
0
0
NaN
NaN
8
64437
2017-01-28
11:06:00
Saturday
BLOOR/DANFORTH LINE
0
0
NaN
NaN
11
64951
2017-02-08
08:00:00
Wednesday
PLANT OPS BLDG
0
0
NaN
NaN
8
65338
2017-02-15
08:49:00
Wednesday
FINCH STATION
0
0
NaN
NaN
8
65672
2017-02-21
16:53:00
Tuesday
KENNEDY SRT STATION
0
0
NaN
NaN
16
65839
2017-02-25
05:56:00
Saturday
GREENWOOD WYE
0
0
W
NaN
5
65842
2017-02-25
10:58:00
Saturday
BLOOR/DANFORTH LINE
0
0
NaN
NaN
10
65976
2017-02-28
16:06:00
Tuesday
MCBRIEN BLDG - 1900 YO
0
0
NaN
NaN
16
66106
2017-03-02
18:50:00
Thursday
KENNEDY BD STATION
0
0
NaN
NaN
18
66408
2017-03-09
11:52:00
Thursday
BATHURST STATION
0
0
NaN
NaN
11
66449
2017-03-10
05:51:00
Friday
BLOOR STATION
0
0
S
NaN
5
66467
2017-03-10
16:02:00
Friday
WILSON CARHOUSE
0
0
NaN
NaN
16
66663
2017-03-14
14:14:00
Tuesday
GUNN BUILDING
0
0
NaN
NaN
14
66893
2017-03-18
15:58:00
Saturday
KENNEDY STATION
0
0
NaN
NaN
15
67000
2017-03-20
00:21:00
Monday
JANE STATION
0
0
NaN
NaN
0
68257
2017-04-15
05:35:00
Saturday
QUEEN STATION ALBERT
0
0
NaN
NaN
5
68327
2017-04-16
16:01:00
Sunday
ST GEORGE-BEDFORD
0
0
NaN
NaN
16
298 rows × 9 columns
A lot of rows with no line information have stations that are not for commuter use, so it's safe to drop those rows.
In [16]:
ttc = ttc.loc[~df['Line'].isnull()]
In [17]:
ttc.describe()
Out[17]:
Min Delay
Min Gap
Hour
count
60959.000000
60959.000000
60959.000000
mean
1.805410
2.715087
14.415378
std
8.621229
8.710495
5.125181
min
0.000000
0.000000
6.000000
25%
0.000000
0.000000
10.000000
50%
0.000000
0.000000
15.000000
75%
3.000000
5.000000
19.000000
max
999.000000
999.000000
23.000000
In [18]:
# there is a max of 999 minutes of delay.. drop it
# in fact, drop all delays > 2 hours
ttc = ttc.loc[ttc['Min Delay'] < 120]
In [19]:
ttc.shape
Out[19]:
(60935, 9)
In [20]:
ttc.loc[(ttc['Line']=='YU') & (ttc['Day']=='Friday')].plot.line(x='Time',y='Min Delay', marker='o', alpha=0.3,figsize=(12,7))
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x174216de7f0>
Content source: CivicTechTO/ttc_subway_times
Similar notebooks: