In [1]:
import sys
import re
import time
import datetime
# Requires for ipython to pick up on twitter?
sys.path.append('/Library/Python/2.7/site-packages/')
import twitter
import pandas as pd
import func
# import pyowm # Historical API is paid

# inline plot
%matplotlib inline


/Users/albarron/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [2]:
#%load 'data/raw-twt2016-01-26-14/21/09.csv'
truth = pd.read_csv("data/truth_tweets.csv",sep=',',error_bad_lines=False)
twts = pd.read_csv("data/formated_twts.csv",sep=',',error_bad_lines=False)
weather = pd.read_csv("data/weather-add-twt2016-03-06-21:26:39.csv",sep='\t',error_bad_lines=False)

In [3]:
del truth['Unnamed: 0']
truth.head()


Out[3]:
id text train_id is_delay delay_minor delay_med delay_major delay_catastrophic is_backlog is_canceled is_passing is_accident is_medical is_mechanical is_customer is_event
0 691768068385718275 #NB155 is 22 mins late. #Caltrain 155 1 0 0 1 0 0 0 0 0 0 0 0 0
1 690667494906814464 #NB 151 is running 10 mins behind #Caltrain 151 1 1 0 0 0 0 0 0 0 0 0 0 0
2 689936168893329408 #NB151 is 10 mins down at San Mateo. #Caltrain 151 1 1 0 0 0 0 0 0 0 0 0 0 0
3 689509733640732672 #SB138 will board on the northbound platform a... 138 0 0 0 0 0 0 0 0 0 0 0 0 0
4 689485484838416388 #SB216 delayed 13 minutes at SAT\n#NB225 delay... 216 1 0 1 0 0 0 0 0 0 0 0 0 0

In [4]:
del twts['Unnamed: 0']
twts.head(3)


Out[4]:
created_at favorite_count hashtags id in_reply_to_screen_name retweet_count text day_of_week day_of_month month time_of_day topic_train t_bullet t_limited t_northbound timestamp train_id tweet_id
0 2016-01-25 23:42:14 1 NB155, Caltrain 691768068385718275 NaN 2 #NB155 is 22 mins late. #Caltrain 0 25 1 error '155' 0 0 1 2016-01-25 23:42:14 155 691768068385718275
1 2016-01-22 22:48:57 NaN NB, Caltrain 690667494906814464 NaN 2 #NB 151 is running 10 mins behind #Caltrain 4 22 1 error '151' 0 0 1 2016-01-22 22:48:57 151 690667494906814464
2 2016-01-20 22:22:55 1 NB151, Caltrain 689936168893329408 NaN 2 #NB151 is 10 mins down at San Mateo. #Caltrain 2 20 1 error '151' 0 0 1 2016-01-20 22:22:55 151 689936168893329408

In [5]:
truth['uid'] = truth.apply(lambda x: int(str(x.train_id)+str(x.id)),axis=1)
twts['uid'] = truth.apply(lambda x: int(str(x.train_id)+str(x.id)),axis=1)

In [6]:
# df = twts.merge(truth,on=['id','train_id'],how='inner') # now have uid
df = twts.merge(truth,on='uid',how='inner')
df.shape


Out[6]:
(584, 35)

In [ ]:
# wt = weather[['id','temp','precipiation','visability','windspeed','humidity','cloudcover']]

In [ ]:
# wt.head(2)

In [ ]:
# df = df.merge(wt,left_on='tweet_id',right_on='id',how='outer')
# df.shape

In [8]:
df.head()


Out[8]:
created_at favorite_count hashtags id_x in_reply_to_screen_name retweet_count text_x day_of_week day_of_month month ... delay_major delay_catastrophic is_backlog is_canceled is_passing is_accident is_medical is_mechanical is_customer is_event
0 2016-01-25 23:42:14 1 NB155, Caltrain 691768068385718275 NaN 2 #NB155 is 22 mins late. #Caltrain 0 25 1 ... 1 0 0 0 0 0 0 0 0 0
1 2016-01-22 22:48:57 NaN NB, Caltrain 690667494906814464 NaN 2 #NB 151 is running 10 mins behind #Caltrain 4 22 1 ... 0 0 0 0 0 0 0 0 0 0
2 2016-01-20 22:22:55 1 NB151, Caltrain 689936168893329408 NaN 2 #NB151 is 10 mins down at San Mateo. #Caltrain 2 20 1 ... 0 0 0 0 0 0 0 0 0 0
3 2016-01-19 18:08:25 1 SB138, Caltrain 689509733640732672 NaN 2 #SB138 will board on the northbound platform a... 1 19 1 ... 0 0 0 0 0 0 0 0 0 0
4 2016-01-19 16:32:04 2 SB216, NB225, NB329, Caltrain 689485484838416388 NaN 4 #SB216 delayed 13 minutes at SAT\n#NB225 delay... 1 19 1 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 35 columns


In [9]:
df.columns.values


Out[9]:
array(['created_at', 'favorite_count', 'hashtags', 'id_x',
       'in_reply_to_screen_name', 'retweet_count', 'text_x', 'day_of_week',
       'day_of_month', 'month', 'time_of_day', 'topic_train', 't_bullet',
       't_limited', 't_northbound', 'timestamp', 'train_id_x', 'tweet_id',
       'uid', 'id_y', 'text_y', 'train_id_y', 'is_delay', 'delay_minor',
       'delay_med', 'delay_major', 'delay_catastrophic', 'is_backlog',
       'is_canceled', 'is_passing', 'is_accident', 'is_medical',
       'is_mechanical', 'is_customer', 'is_event'], dtype=object)

In [10]:
del df['created_at']
del df['favorite_count']
del df['hashtags']
del df['in_reply_to_screen_name']
del df['retweet_count']
del df['text_y']
del df['train_id_y']
del df['id_y']
del df['id_x']

In [11]:
df.corr()


Out[11]:
day_of_week day_of_month month t_bullet t_limited t_northbound train_id_x tweet_id delay_minor delay_med delay_major delay_catastrophic is_backlog is_canceled is_passing is_accident is_medical is_mechanical is_customer is_event
day_of_week 1.000000 -0.064527 0.032273 0.038234 -0.015425 -0.060972 0.119804 -0.074660 -0.050183 -0.041209 0.017500 0.035520 0.026806 -0.031875 0.043513 0.031715 0.050550 -0.041497 -0.044278 -0.030502
day_of_month -0.064527 1.000000 -0.167183 0.030566 0.050568 0.053409 0.023401 -0.226543 0.097248 0.077398 0.079371 -0.074655 -0.182253 -0.012005 0.016955 0.025270 -0.101009 -0.087499 0.020955 0.009100
month 0.032273 -0.167183 1.000000 -0.104911 0.057563 0.073191 0.005346 0.000845 -0.011825 0.028640 -0.038361 0.012834 -0.049368 -0.000624 0.000035 0.041778 -0.233609 0.039058 0.048913 0.000024
t_bullet 0.038234 0.030566 -0.104911 1.000000 -0.406237 -0.003214 0.657885 -0.066599 -0.021947 0.033449 -0.057969 0.095674 0.006079 0.056771 -0.005847 0.042806 -0.089497 0.049790 0.005040 -0.035980
t_limited -0.015425 0.050568 0.057563 -0.406237 1.000000 -0.049954 0.133330 -0.094702 -0.036353 -0.018982 0.005696 -0.039966 0.101631 -0.034239 0.073836 0.005562 -0.038704 -0.008069 -0.066965 -0.012378
t_northbound -0.060972 0.053409 0.073191 -0.003214 -0.049954 1.000000 0.006201 0.045560 0.008760 -0.058795 0.022023 0.018741 0.080351 -0.002916 0.076562 0.066943 -0.103233 -0.026955 -0.045545 0.003113
train_id_x 0.119804 0.023401 0.005346 0.657885 0.133330 0.006201 1.000000 -0.111896 -0.089655 -0.009991 -0.008451 0.065518 0.015809 0.031720 0.015133 0.047335 -0.054273 0.037056 -0.051603 -0.009753
tweet_id -0.074660 -0.226543 0.000845 -0.066599 -0.094702 0.045560 -0.111896 1.000000 -0.067052 -0.130514 -0.017715 0.047553 0.083153 -0.036222 0.132490 -0.051096 0.296229 -0.129205 -0.005532 -0.070528
delay_minor -0.050183 0.097248 -0.011825 -0.021947 -0.036353 0.008760 -0.089655 -0.067052 1.000000 -0.294072 -0.109563 -0.080127 -0.129274 -0.059536 -0.091557 -0.089406 -0.118606 -0.062823 0.002914 0.101125
delay_med -0.041209 0.077398 0.028640 0.033449 -0.018982 -0.058795 -0.009991 -0.130514 -0.294072 1.000000 -0.150462 -0.110038 -0.106410 -0.116536 -0.086000 -0.162627 -0.005602 0.206398 0.224643 0.078902
delay_major 0.017500 0.079371 -0.038361 -0.057969 0.005696 0.022023 -0.008451 -0.017715 -0.109563 -0.150462 1.000000 -0.040997 0.030199 -0.043418 -0.046845 0.040378 -0.028929 0.109105 0.036739 -0.032805
delay_catastrophic 0.035520 -0.074655 0.012834 0.095674 -0.039966 0.018741 0.065518 0.047553 -0.080127 -0.110038 -0.040997 1.000000 0.030178 -0.031753 -0.034259 0.170691 -0.044381 -0.038042 -0.016581 -0.023991
is_backlog 0.026806 -0.182253 -0.049368 0.006079 0.101631 0.080351 0.015809 0.083153 -0.129274 -0.106410 0.030199 0.030178 1.000000 -0.067674 0.093118 -0.072046 -0.094586 -0.020911 -0.037555 -0.051131
is_canceled -0.031875 -0.012005 -0.000624 0.056771 -0.034239 -0.002916 0.031720 -0.036222 -0.059536 -0.116536 -0.043418 -0.031753 -0.067674 1.000000 -0.036282 0.015404 -0.047002 0.069861 -0.056113 -0.025408
is_passing 0.043513 0.016955 0.000035 -0.005847 0.073836 0.076562 0.015133 0.132490 -0.091557 -0.086000 -0.046845 -0.034259 0.093118 -0.036282 1.000000 -0.059208 -0.050711 0.001630 -0.060542 -0.027413
is_accident 0.031715 0.025270 0.041778 0.042806 0.005562 0.066943 0.047335 -0.051096 -0.089406 -0.162627 0.040378 0.170691 -0.072046 0.015404 -0.059208 1.000000 0.001065 -0.080839 -0.091570 0.004398
is_medical 0.050550 -0.101009 -0.233609 -0.089497 -0.038704 -0.103233 -0.054273 0.296229 -0.118606 -0.005602 -0.028929 -0.044381 -0.094586 -0.047002 -0.050711 0.001065 1.000000 -0.100631 -0.052983 0.016859
is_mechanical -0.041497 -0.087499 0.039058 0.049790 -0.008069 -0.026955 0.037056 -0.129205 -0.062823 0.206398 0.109105 -0.038042 -0.020911 0.069861 0.001630 -0.080839 -0.100631 1.000000 0.185693 -0.054399
is_customer -0.044278 0.020955 0.048913 0.005040 -0.066965 -0.045545 -0.051603 -0.005532 0.002914 0.224643 0.036739 -0.016581 -0.037555 -0.056113 -0.060542 -0.091570 -0.052983 0.185693 1.000000 0.002621
is_event -0.030502 0.009100 0.000024 -0.035980 -0.012378 0.003113 -0.009753 -0.070528 0.101125 0.078902 -0.032805 -0.023991 -0.051131 -0.025408 -0.027413 0.004398 0.016859 -0.054399 0.002621 1.000000

In [12]:
del df['text_x']

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:
df.tail()

In [ ]:


In [13]:
filename = "./data/merged_delay.csv".format(date=datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
df.to_csv(filename, sep='\t', encoding='utf-8')

In [ ]: