In [ ]:
import sys
import re
import time
import datetime
import pandas as pd
import numpy as np
import func
# inline plot
%matplotlib inline
In [ ]:
all_trains = pd.read_csv("data/weather-trains_2015-06-08 16:57:272016-01-25 23:42:14.csv",sep='\t',error_bad_lines=False)
delayed = pd.read_csv("data/merged_delay.csv",sep='\t',error_bad_lines=False)
del delayed['Unnamed: 0']
del all_trains['Unnamed: 0']
del all_trains['time_pa']
In [ ]:
print all_trains.shape
print delayed.shape
In [ ]:
delayed.columns.values
In [ ]:
# merged = all_trains.merge(atw,left_on='time_pa',right_on='timestamp',how='outer')
# print merged.shape
# merged.head(5)
In [ ]:
# all_trains.merge(delayed,left_on=['train_id'])
In [ ]:
all_trains['timestamp'] = pd.to_datetime(all_trains['timestamp'],format="%Y-%m-%d %H:%M")
delayed['timestamp'] = pd.to_datetime(delayed['timestamp'],format="%Y-%m-%d %H:%M")
In [ ]:
max_date = delayed['timestamp'].max()
min_date = delayed['timestamp'].min()
print "Delayed"
print max_date
print min_date
In [ ]:
max_date = all_trains['timestamp'].max()
min_date = all_trains['timestamp'].min()
print "All_trains"
print max_date
print min_date
In [ ]:
all_trains.shape
In [ ]:
all_trains['timestamp'][0] + datetime.timedelta(days=1)
In [ ]:
import calendar
def populate_all_trains(t_row):
t=t_row['timestamp']
# next_mo = True if t.day >= calendar.monthrange(t.year,t.month)[1] else False
nd = t_row['timestamp']+ datetime.timedelta(days=1)
# year2 = t.year if next_mo is False & t.month != 12 else t.year+1
# month2 = t.month if next_mo is False else (t.month + 1 if (t.month+1)<=12 else 1)
# day2 = t.day+1 if next_mo is False else 1
d = delayed[(delayed['timestamp'] > datetime.datetime(t.year,t.month,t.day))
& (delayed['timestamp'] <= datetime.datetime(nd.year,nd.month,nd.day))
& (delayed['train_id_x'] == t_row['train_id'])]
if d.empty is False:
return pd.Series([
d.t_bullet.max(),
d.t_limited.max(),
d.uid.max(),
d.is_delay.max(),
d.delay_minor.max(),
d.delay_med.max(),
d.delay_major.max(),
d.delay_catastrophic.max(),
d.is_backlog.max(),
d.is_canceled.max(),
d.is_passing.max(),
d.is_accident.max(),
d.is_medical.max(),
d.is_mechanical.max(),
d.is_customer.max(),
d.is_event.max()])
else:
return pd.Series([
1 if str(t_row.train_id)[0] == 3 else 0,
1 if str(t_row.train_id)[0] == 2 else 0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0])
In [ ]:
test = all_trains.apply(lambda x:populate_all_trains(x),axis=1)
In [ ]:
test.columns = ['is_bullet','is_limited','tweet_id','is_delay','del_min','del_med','del_maj',
'del_cat','is_backlog','is_canceled','is_passing','is_accident','is_medical',
'is_mechanical','is_customer','is_event']
In [ ]:
merged = pd.concat([all_trains, test], axis=1)
In [ ]:
merged.timestamp[100].weekday()
In [ ]:
def get_days(r):
#Monday is 0 and Sunday is 6.
return pd.Series([
1 if r['timestamp'].weekday() == 0 else 0, #monday
1 if r['timestamp'].weekday() == 1 else 0,
1 if r['timestamp'].weekday() == 2 else 0,
1 if r['timestamp'].weekday() == 3 else 0,
1 if r['timestamp'].weekday() == 4 else 0,
1 if r['timestamp'].weekday() == 5 else 0,
1 if r['timestamp'].weekday() == 6 else 0 #sunday
])
In [ ]:
days = all_trains.apply(lambda x:get_days(x),axis=1)
days.columns = ['d_monday','d_tuesday','d_wednesday','d_thursday','d_friday','d_saturday','d_sunday',]
In [ ]:
merged = pd.concat([merged, days], axis=1)
In [ ]:
filename = "./data/merged_concat_final.csv"
merged.to_csv(filename, sep='\t', encoding='utf-8')
print filename
In [ ]:
merged.groupby('timestamp')
In [ ]:
merged[(merged['is_delay']==1)]
In [ ]: