In [171]:
handle = '/Users/datascientist/jared/MTA_delay_prediction/google_transit/stop_times.txt'

In [172]:
import pandas as pd
df = pd.read_csv(handle)

In [173]:
df.columns


Out[173]:
Index([u'trip_id', u'arrival_time', u'departure_time', u'stop_id', u'stop_sequence', u'stop_headsign', u'pickup_type', u'drop_off_type', u'shape_dist_traveled'], dtype='object')

In [174]:
df.shape


Out[174]:
(530632, 9)

In [175]:
df = df.drop(['stop_headsign', u'pickup_type', u'drop_off_type', u'shape_dist_traveled'],1)

In [176]:
df.columns


Out[176]:
Index([u'trip_id', u'arrival_time', u'departure_time', u'stop_id', u'stop_sequence'], dtype='object')

In [177]:
def get_day(id_string):
    return id_string.split('_')[0][-3:]

In [178]:
df['trip_day'] = df['trip_id'].apply(get_day)

In [207]:
df_mon = df[df['trip_day']=='WKD']
df_tue = df[df['trip_day']=='WKD']
df_wed = df[df['trip_day']=='WKD']
df_thu = df[df['trip_day']=='WKD']
df_fri = df[df['trip_day']=='WKD']
df_sat = df[df['trip_day']=='SAT']
df_sun = df[df['trip_day']=='SUN']

In [180]:
max(df_tue['arrival_time'])


Out[180]:
'27:49:00'

In [181]:
df_sat.shape


Out[181]:
(162438, 6)

In [208]:
df_mon['trip_day'] = df_mon['trip_day'].apply(lambda x:'MON')
df_tue['trip_day'] = df_tue['trip_day'].apply(lambda x:'TUE')
df_wed['trip_day'] = df_wed['trip_day'].apply(lambda x:'WED')
df_thu['trip_day'] = df_thu['trip_day'].apply(lambda x:'THU')
df_fri['trip_day'] = df_fri['trip_day'].apply(lambda x:'FRI')

In [209]:
df_mon['arrival_time'] = df_mon['arrival_time'].apply(lambda x: pd.to_timedelta(x))
df_tue['arrival_time'] = df_tue['arrival_time'].apply(lambda x: pd.to_timedelta(x))
df_wed['arrival_time'] = df_wed['arrival_time'].apply(lambda x: pd.to_timedelta(x))
df_thu['arrival_time'] = df_thu['arrival_time'].apply(lambda x: pd.to_timedelta(x))
df_fri['arrival_time'] = df_fri['arrival_time'].apply(lambda x: pd.to_timedelta(x))
df_sat['arrival_time'] = df_sat['arrival_time'].apply(lambda x: pd.to_timedelta(x))
df_sun['arrival_time'] = df_sun['arrival_time'].apply(lambda x: pd.to_timedelta(x))

In [210]:
df_mon['departure_time'] = df_mon['departure_time'].apply(lambda x: pd.to_timedelta(x))
df_tue['departure_time'] = df_tue['departure_time'].apply(lambda x: pd.to_timedelta(x))
df_wed['departure_time'] = df_wed['departure_time'].apply(lambda x: pd.to_timedelta(x))
df_thu['departure_time'] = df_thu['departure_time'].apply(lambda x: pd.to_timedelta(x))
df_fri['departure_time'] = df_fri['departure_time'].apply(lambda x: pd.to_timedelta(x))
df_sat['departure_time'] = df_sat['departure_time'].apply(lambda x: pd.to_timedelta(x))
df_sun['departure_time'] = df_sun['departure_time'].apply(lambda x: pd.to_timedelta(x))

In [211]:
overtime_mon = df_mon[df_mon['arrival_time']>=pd.to_timedelta('24:00:00')]
overtime_tue = df_tue[df_tue['arrival_time']>=pd.to_timedelta('24:00:00')]
overtime_wed = df_wed[df_wed['arrival_time']>=pd.to_timedelta('24:00:00')]
overtime_thu = df_thu[df_thu['arrival_time']>=pd.to_timedelta('24:00:00')]
overtime_fri = df_fri[df_fri['arrival_time']>=pd.to_timedelta('24:00:00')]
overtime_sat = df_sat[df_sat['arrival_time']>=pd.to_timedelta('24:00:00')]
overtime_sun = df_sun[df_sun['arrival_time']>=pd.to_timedelta('24:00:00')]

In [212]:
df_mon = df_mon[df_mon['arrival_time']<pd.to_timedelta('24:00:00')]
df_tue = df_tue[df_tue['arrival_time']<pd.to_timedelta('24:00:00')]
df_wed = df_wed[df_wed['arrival_time']<pd.to_timedelta('24:00:00')]
df_thu = df_thu[df_thu['arrival_time']<pd.to_timedelta('24:00:00')]
df_fri = df_fri[df_fri['arrival_time']<pd.to_timedelta('24:00:00')]
df_sat = df_sat[df_sat['arrival_time']<pd.to_timedelta('24:00:00')]
df_sun = df_sun[df_sun['arrival_time']<pd.to_timedelta('24:00:00')]

In [213]:
overtime_mon['arrival_time'] = overtime_mon['arrival_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_tue['arrival_time'] = overtime_tue['arrival_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_wed['arrival_time'] = overtime_wed['arrival_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_thu['arrival_time'] = overtime_thu['arrival_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_fri['arrival_time'] = overtime_fri['arrival_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_sat['arrival_time'] = overtime_sat['arrival_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_sun['arrival_time'] = overtime_sun['arrival_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))

In [166]:
max(overtime_mon['arrival_time'])


Out[166]:
Timedelta('0 days 03:49:00')

In [214]:
overtime_mon['departure_time'] = overtime_mon['departure_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_tue['departure_time'] = overtime_tue['departure_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_wed['departure_time'] = overtime_wed['departure_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_thu['departure_time'] = overtime_thu['departure_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_fri['departure_time'] = overtime_fri['departure_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_sat['departure_time'] = overtime_sat['departure_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))
overtime_sun['departure_time'] = overtime_sun['departure_time'].apply(lambda x: x-pd.to_timedelta('24:00:00'))

In [157]:
overtime_sun.head()


Out[157]:
trip_id arrival_time departure_time stop_id stop_sequence trip_day
7263 A20141207SUN_140400_3..N01R 00:03:00 00:03:00 132N 24 SUN
7264 A20141207SUN_140400_3..N01R 00:05:30 00:05:30 128N 25 SUN
7265 A20141207SUN_140400_3..N01R 00:06:30 00:06:30 127N 26 SUN
7266 A20141207SUN_140400_3..N01R 00:10:00 00:10:00 123N 27 SUN
7267 A20141207SUN_140400_3..N01R 00:12:30 00:12:30 120N 28 SUN

In [203]:
df_tue.shape


Out[203]:
(217696, 6)

In [ ]:
# Drop over columns

In [215]:
min(overtime_mon['arrival_time'])


Out[215]:
Timedelta('0 days 00:00:00')

In [216]:
max(df_tue['arrival_time'])


Out[216]:
Timedelta('0 days 23:59:30')

In [217]:
df_mon.append(overtime_sun);
df_tue.append(overtime_mon);
df_wed.append(overtime_tue);
df_thu.append(overtime_wed);
df_fri.append(overtime_thu);
df_sat.append(overtime_fri);
df_sun.append(overtime_sat);

In [218]:
max(df_mon['arrival_time'])


Out[218]:
Timedelta('0 days 23:59:30')

In [219]:
new_df = pd.DataFrame()
new_df = new_df.append([df_mon,df_tue,df_wed,df_thu,df_fri,df_sat,df_sun])

In [220]:
new_df.shape


Out[220]:
(1384379, 6)

In [221]:
new_df['arrival_time'] = new_df['arrival_time'].apply(lambda x: pd.to_datetime(x))
new_df['departure_time'] = new_df['departure_time'].apply(lambda x: pd.to_datetime(x))

In [222]:
new_df.to_csv('/Users/datascientist/jared/MTA_delay_prediction/google_transit/corrected_stop_times.txt')