In [ ]:
import sys
import re
import time
import datetime
import pandas as pd
import numpy as np
import func
# inline plot
%matplotlib inline

In [ ]:
all_trains = pd.read_csv("data/weather-trains_2015-06-08 16:57:272016-01-25 23:42:14.csv",sep='\t',error_bad_lines=False)
delayed = pd.read_csv("data/merged_delay.csv",sep='\t',error_bad_lines=False)
del delayed['Unnamed: 0']
del all_trains['Unnamed: 0']
del all_trains['time_pa']

In [ ]:
print all_trains.shape
print delayed.shape

In [ ]:
delayed.columns.values

In [ ]:
# merged = all_trains.merge(atw,left_on='time_pa',right_on='timestamp',how='outer')
# print merged.shape
# merged.head(5)

In [ ]:
# all_trains.merge(delayed,left_on=['train_id'])

In [ ]:
all_trains['timestamp'] = pd.to_datetime(all_trains['timestamp'],format="%Y-%m-%d %H:%M")
delayed['timestamp'] = pd.to_datetime(delayed['timestamp'],format="%Y-%m-%d %H:%M")

In [ ]:
max_date = delayed['timestamp'].max()
min_date = delayed['timestamp'].min()
print "Delayed"
print max_date
print min_date

In [ ]:
max_date = all_trains['timestamp'].max()
min_date = all_trains['timestamp'].min()
print "All_trains"
print max_date
print min_date

In [ ]:
all_trains.shape

In [ ]:
all_trains['timestamp'][0] + datetime.timedelta(days=1)

In [ ]:
import calendar
def populate_all_trains(t_row):
    t=t_row['timestamp']
#     next_mo = True if t.day >= calendar.monthrange(t.year,t.month)[1] else False
    nd = t_row['timestamp']+ datetime.timedelta(days=1)
    
#     year2 = t.year if next_mo is False & t.month != 12 else t.year+1
#     month2 = t.month if next_mo is False else (t.month + 1 if (t.month+1)<=12 else 1)
#     day2 = t.day+1 if next_mo is False else 1
    
    d = delayed[(delayed['timestamp'] > datetime.datetime(t.year,t.month,t.day)) 
           & (delayed['timestamp'] <= datetime.datetime(nd.year,nd.month,nd.day))
            & (delayed['train_id_x'] == t_row['train_id'])]

    if d.empty is False:
        return pd.Series([
            d.t_bullet.max(),
            d.t_limited.max(),
            d.uid.max(),
            d.is_delay.max(),
            d.delay_minor.max(),
            d.delay_med.max(),
            d.delay_major.max(),
            d.delay_catastrophic.max(),
            d.is_backlog.max(),
            d.is_canceled.max(),
            d.is_passing.max(),
            d.is_accident.max(),
            d.is_medical.max(),
            d.is_mechanical.max(),
            d.is_customer.max(),
            d.is_event.max()])
    else:
        return pd.Series([
            1 if str(t_row.train_id)[0] == 3 else 0,
            1 if str(t_row.train_id)[0] == 2 else 0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0])

In [ ]:
test = all_trains.apply(lambda x:populate_all_trains(x),axis=1)

In [ ]:
test.columns = ['is_bullet','is_limited','tweet_id','is_delay','del_min','del_med','del_maj',
                'del_cat','is_backlog','is_canceled','is_passing','is_accident','is_medical',
                'is_mechanical','is_customer','is_event']

In [ ]:
merged = pd.concat([all_trains, test], axis=1)

In [ ]:
merged.timestamp[100].weekday()

In [ ]:
def get_days(r):
    #Monday is 0 and Sunday is 6.
        return pd.Series([
            1 if r['timestamp'].weekday() == 0 else 0, #monday
            1 if r['timestamp'].weekday() == 1 else 0, 
            1 if r['timestamp'].weekday() == 2 else 0,
            1 if r['timestamp'].weekday() == 3 else 0,
            1 if r['timestamp'].weekday() == 4 else 0,
            1 if r['timestamp'].weekday() == 5 else 0,
            1 if r['timestamp'].weekday() == 6 else 0 #sunday
    ])

In [ ]:
days = all_trains.apply(lambda x:get_days(x),axis=1)
days.columns = ['d_monday','d_tuesday','d_wednesday','d_thursday','d_friday','d_saturday','d_sunday',]

In [ ]:
merged = pd.concat([merged, days], axis=1)

In [ ]:
filename = "./data/merged_concat_final.csv"
merged.to_csv(filename, sep='\t', encoding='utf-8')
print filename

In [ ]:
merged.groupby('timestamp')

In [ ]:
merged[(merged['is_delay']==1)]

In [ ]: