notebook.community

Edit and run



In [1]:

    
import sys
import re
import time
import datetime
# Requires for ipython to pick up on twitter?
sys.path.append('/Library/Python/2.7/site-packages/')
import twitter
import pandas as pd
import func
# import pyowm # Historical API is paid

# inline plot
%matplotlib inline









    



/Users/albarron/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')



In [2]:

    
#%load 'data/raw-twt2016-01-26-14/21/09.csv'
truth = pd.read_csv("data/truth_tweets.csv",sep=',',error_bad_lines=False)
twts = pd.read_csv("data/formated_twts.csv",sep=',',error_bad_lines=False)
weather = pd.read_csv("data/weather-add-twt2016-03-06-21:26:39.csv",sep='\t',error_bad_lines=False)



In [3]:

    
del truth['Unnamed: 0']
truth.head()









    Out[3]:






  
    
      
      id
      text
      train_id
      is_delay
      delay_minor
      delay_med
      delay_major
      delay_catastrophic
      is_backlog
      is_canceled
      is_passing
      is_accident
      is_medical
      is_mechanical
      is_customer
      is_event
    
  
  
    
      0
      691768068385718275
      #NB155 is 22 mins late. #Caltrain
      155
      1
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      690667494906814464
      #NB 151 is running 10 mins behind #Caltrain
      151
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      689936168893329408
      #NB151 is 10 mins down at San Mateo. #Caltrain
      151
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      689509733640732672
      #SB138 will board on the northbound platform a...
      138
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      689485484838416388
      #SB216 delayed 13 minutes at SAT\n#NB225 delay...
      216
      1
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0



In [4]:

    
del twts['Unnamed: 0']
twts.head(3)









    Out[4]:






  
    
      
      created_at
      favorite_count
      hashtags
      id
      in_reply_to_screen_name
      retweet_count
      text
      day_of_week
      day_of_month
      month
      time_of_day
      topic_train
      t_bullet
      t_limited
      t_northbound
      timestamp
      train_id
      tweet_id
    
  
  
    
      0
      2016-01-25 23:42:14
      1
      NB155, Caltrain
      691768068385718275
      NaN
      2
      #NB155 is 22 mins late. #Caltrain
      0
      25
      1
      error
      '155'
      0
      0
      1
      2016-01-25 23:42:14
      155
      691768068385718275
    
    
      1
      2016-01-22 22:48:57
      NaN
      NB, Caltrain
      690667494906814464
      NaN
      2
      #NB 151 is running 10 mins behind #Caltrain
      4
      22
      1
      error
      '151'
      0
      0
      1
      2016-01-22 22:48:57
      151
      690667494906814464
    
    
      2
      2016-01-20 22:22:55
      1
      NB151, Caltrain
      689936168893329408
      NaN
      2
      #NB151 is 10 mins down at San Mateo. #Caltrain
      2
      20
      1
      error
      '151'
      0
      0
      1
      2016-01-20 22:22:55
      151
      689936168893329408



In [5]:

    
truth['uid'] = truth.apply(lambda x: int(str(x.train_id)+str(x.id)),axis=1)
twts['uid'] = truth.apply(lambda x: int(str(x.train_id)+str(x.id)),axis=1)



In [6]:

    
# df = twts.merge(truth,on=['id','train_id'],how='inner') # now have uid
df = twts.merge(truth,on='uid',how='inner')
df.shape









    Out[6]:





(584, 35)



In [ ]:

    
# wt = weather[['id','temp','precipiation','visability','windspeed','humidity','cloudcover']]



In [ ]:

    
# wt.head(2)



In [ ]:

    
# df = df.merge(wt,left_on='tweet_id',right_on='id',how='outer')
# df.shape



In [8]:

    
df.head()









    Out[8]:






  
    
      
      created_at
      favorite_count
      hashtags
      id_x
      in_reply_to_screen_name
      retweet_count
      text_x
      day_of_week
      day_of_month
      month
      ...
      delay_major
      delay_catastrophic
      is_backlog
      is_canceled
      is_passing
      is_accident
      is_medical
      is_mechanical
      is_customer
      is_event
    
  
  
    
      0
      2016-01-25 23:42:14
      1
      NB155, Caltrain
      691768068385718275
      NaN
      2
      #NB155 is 22 mins late. #Caltrain
      0
      25
      1
      ...
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      2016-01-22 22:48:57
      NaN
      NB, Caltrain
      690667494906814464
      NaN
      2
      #NB 151 is running 10 mins behind #Caltrain
      4
      22
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      2016-01-20 22:22:55
      1
      NB151, Caltrain
      689936168893329408
      NaN
      2
      #NB151 is 10 mins down at San Mateo. #Caltrain
      2
      20
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      2016-01-19 18:08:25
      1
      SB138, Caltrain
      689509733640732672
      NaN
      2
      #SB138 will board on the northbound platform a...
      1
      19
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      2016-01-19 16:32:04
      2
      SB216, NB225, NB329, Caltrain
      689485484838416388
      NaN
      4
      #SB216 delayed 13 minutes at SAT\n#NB225 delay...
      1
      19
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 35 columns



In [9]:

    
df.columns.values









    Out[9]:





array(['created_at', 'favorite_count', 'hashtags', 'id_x',
       'in_reply_to_screen_name', 'retweet_count', 'text_x', 'day_of_week',
       'day_of_month', 'month', 'time_of_day', 'topic_train', 't_bullet',
       't_limited', 't_northbound', 'timestamp', 'train_id_x', 'tweet_id',
       'uid', 'id_y', 'text_y', 'train_id_y', 'is_delay', 'delay_minor',
       'delay_med', 'delay_major', 'delay_catastrophic', 'is_backlog',
       'is_canceled', 'is_passing', 'is_accident', 'is_medical',
       'is_mechanical', 'is_customer', 'is_event'], dtype=object)



In [10]:

    
del df['created_at']
del df['favorite_count']
del df['hashtags']
del df['in_reply_to_screen_name']
del df['retweet_count']
del df['text_y']
del df['train_id_y']
del df['id_y']
del df['id_x']



In [11]:

    
df.corr()









    Out[11]:






  
    
      
      day_of_week
      day_of_month
      month
      t_bullet
      t_limited
      t_northbound
      train_id_x
      tweet_id
      delay_minor
      delay_med
      delay_major
      delay_catastrophic
      is_backlog
      is_canceled
      is_passing
      is_accident
      is_medical
      is_mechanical
      is_customer
      is_event
    
  
  
    
      day_of_week
      1.000000
      -0.064527
      0.032273
      0.038234
      -0.015425
      -0.060972
      0.119804
      -0.074660
      -0.050183
      -0.041209
      0.017500
      0.035520
      0.026806
      -0.031875
      0.043513
      0.031715
      0.050550
      -0.041497
      -0.044278
      -0.030502
    
    
      day_of_month
      -0.064527
      1.000000
      -0.167183
      0.030566
      0.050568
      0.053409
      0.023401
      -0.226543
      0.097248
      0.077398
      0.079371
      -0.074655
      -0.182253
      -0.012005
      0.016955
      0.025270
      -0.101009
      -0.087499
      0.020955
      0.009100
    
    
      month
      0.032273
      -0.167183
      1.000000
      -0.104911
      0.057563
      0.073191
      0.005346
      0.000845
      -0.011825
      0.028640
      -0.038361
      0.012834
      -0.049368
      -0.000624
      0.000035
      0.041778
      -0.233609
      0.039058
      0.048913
      0.000024
    
    
      t_bullet
      0.038234
      0.030566
      -0.104911
      1.000000
      -0.406237
      -0.003214
      0.657885
      -0.066599
      -0.021947
      0.033449
      -0.057969
      0.095674
      0.006079
      0.056771
      -0.005847
      0.042806
      -0.089497
      0.049790
      0.005040
      -0.035980
    
    
      t_limited
      -0.015425
      0.050568
      0.057563
      -0.406237
      1.000000
      -0.049954
      0.133330
      -0.094702
      -0.036353
      -0.018982
      0.005696
      -0.039966
      0.101631
      -0.034239
      0.073836
      0.005562
      -0.038704
      -0.008069
      -0.066965
      -0.012378
    
    
      t_northbound
      -0.060972
      0.053409
      0.073191
      -0.003214
      -0.049954
      1.000000
      0.006201
      0.045560
      0.008760
      -0.058795
      0.022023
      0.018741
      0.080351
      -0.002916
      0.076562
      0.066943
      -0.103233
      -0.026955
      -0.045545
      0.003113
    
    
      train_id_x
      0.119804
      0.023401
      0.005346
      0.657885
      0.133330
      0.006201
      1.000000
      -0.111896
      -0.089655
      -0.009991
      -0.008451
      0.065518
      0.015809
      0.031720
      0.015133
      0.047335
      -0.054273
      0.037056
      -0.051603
      -0.009753
    
    
      tweet_id
      -0.074660
      -0.226543
      0.000845
      -0.066599
      -0.094702
      0.045560
      -0.111896
      1.000000
      -0.067052
      -0.130514
      -0.017715
      0.047553
      0.083153
      -0.036222
      0.132490
      -0.051096
      0.296229
      -0.129205
      -0.005532
      -0.070528
    
    
      delay_minor
      -0.050183
      0.097248
      -0.011825
      -0.021947
      -0.036353
      0.008760
      -0.089655
      -0.067052
      1.000000
      -0.294072
      -0.109563
      -0.080127
      -0.129274
      -0.059536
      -0.091557
      -0.089406
      -0.118606
      -0.062823
      0.002914
      0.101125
    
    
      delay_med
      -0.041209
      0.077398
      0.028640
      0.033449
      -0.018982
      -0.058795
      -0.009991
      -0.130514
      -0.294072
      1.000000
      -0.150462
      -0.110038
      -0.106410
      -0.116536
      -0.086000
      -0.162627
      -0.005602
      0.206398
      0.224643
      0.078902
    
    
      delay_major
      0.017500
      0.079371
      -0.038361
      -0.057969
      0.005696
      0.022023
      -0.008451
      -0.017715
      -0.109563
      -0.150462
      1.000000
      -0.040997
      0.030199
      -0.043418
      -0.046845
      0.040378
      -0.028929
      0.109105
      0.036739
      -0.032805
    
    
      delay_catastrophic
      0.035520
      -0.074655
      0.012834
      0.095674
      -0.039966
      0.018741
      0.065518
      0.047553
      -0.080127
      -0.110038
      -0.040997
      1.000000
      0.030178
      -0.031753
      -0.034259
      0.170691
      -0.044381
      -0.038042
      -0.016581
      -0.023991
    
    
      is_backlog
      0.026806
      -0.182253
      -0.049368
      0.006079
      0.101631
      0.080351
      0.015809
      0.083153
      -0.129274
      -0.106410
      0.030199
      0.030178
      1.000000
      -0.067674
      0.093118
      -0.072046
      -0.094586
      -0.020911
      -0.037555
      -0.051131
    
    
      is_canceled
      -0.031875
      -0.012005
      -0.000624
      0.056771
      -0.034239
      -0.002916
      0.031720
      -0.036222
      -0.059536
      -0.116536
      -0.043418
      -0.031753
      -0.067674
      1.000000
      -0.036282
      0.015404
      -0.047002
      0.069861
      -0.056113
      -0.025408
    
    
      is_passing
      0.043513
      0.016955
      0.000035
      -0.005847
      0.073836
      0.076562
      0.015133
      0.132490
      -0.091557
      -0.086000
      -0.046845
      -0.034259
      0.093118
      -0.036282
      1.000000
      -0.059208
      -0.050711
      0.001630
      -0.060542
      -0.027413
    
    
      is_accident
      0.031715
      0.025270
      0.041778
      0.042806
      0.005562
      0.066943
      0.047335
      -0.051096
      -0.089406
      -0.162627
      0.040378
      0.170691
      -0.072046
      0.015404
      -0.059208
      1.000000
      0.001065
      -0.080839
      -0.091570
      0.004398
    
    
      is_medical
      0.050550
      -0.101009
      -0.233609
      -0.089497
      -0.038704
      -0.103233
      -0.054273
      0.296229
      -0.118606
      -0.005602
      -0.028929
      -0.044381
      -0.094586
      -0.047002
      -0.050711
      0.001065
      1.000000
      -0.100631
      -0.052983
      0.016859
    
    
      is_mechanical
      -0.041497
      -0.087499
      0.039058
      0.049790
      -0.008069
      -0.026955
      0.037056
      -0.129205
      -0.062823
      0.206398
      0.109105
      -0.038042
      -0.020911
      0.069861
      0.001630
      -0.080839
      -0.100631
      1.000000
      0.185693
      -0.054399
    
    
      is_customer
      -0.044278
      0.020955
      0.048913
      0.005040
      -0.066965
      -0.045545
      -0.051603
      -0.005532
      0.002914
      0.224643
      0.036739
      -0.016581
      -0.037555
      -0.056113
      -0.060542
      -0.091570
      -0.052983
      0.185693
      1.000000
      0.002621
    
    
      is_event
      -0.030502
      0.009100
      0.000024
      -0.035980
      -0.012378
      0.003113
      -0.009753
      -0.070528
      0.101125
      0.078902
      -0.032805
      -0.023991
      -0.051131
      -0.025408
      -0.027413
      0.004398
      0.016859
      -0.054399
      0.002621
      1.000000



In [12]:

    
del df['text_x']



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
df.tail()



In [ ]:



In [13]:

    
filename = "./data/merged_delay.csv".format(date=datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
df.to_csv(filename, sep='\t', encoding='utf-8')



In [ ]:

	id	text	train_id	is_delay	delay_minor	delay_med	delay_major
0	691768068385718275	#NB155 is 22 mins late. #Caltrain	155	1	0	0	1
1	690667494906814464	#NB 151 is running 10 mins behind #Caltrain	151	1	1	0	0
2	689936168893329408	#NB151 is 10 mins down at San Mateo. #Caltrain	151	1	1	0	0
3	689509733640732672	#SB138 will board on the northbound platform a...	138	0	0	0	0
4	689485484838416388	#SB216 delayed 13 minutes at SAT\n#NB225 delay...	216	1	0	1	0

	created_at	favorite_count	hashtags	id	in_reply_to_screen_name	retweet_count	text	day_of_week	day_of_month	month	time_of_day	topic_train	t_northbound	timestamp	train_id	tweet_id
0	2016-01-25 23:42:14	1	NB155, Caltrain	691768068385718275	NaN	2	#NB155 is 22 mins late. #Caltrain	0	25	1	error	'155'	1	2016-01-25 23:42:14	155	691768068385718275
1	2016-01-22 22:48:57	NaN	NB, Caltrain	690667494906814464	NaN	2	#NB 151 is running 10 mins behind #Caltrain	4	22	1	error	'151'	1	2016-01-22 22:48:57	151	690667494906814464
2	2016-01-20 22:22:55	1	NB151, Caltrain	689936168893329408	NaN	2	#NB151 is 10 mins down at San Mateo. #Caltrain	2	20	1	error	'151'	1	2016-01-20 22:22:55	151	689936168893329408

	day_of_week	day_of_month	month	t_bullet	t_limited	t_northbound	train_id_x	tweet_id	delay_minor	delay_med	delay_major	delay_catastrophic	is_backlog	is_canceled	is_passing	is_accident	is_medical	is_mechanical	is_customer	is_event
day_of_week	1.000000	-0.064527	0.032273	0.038234	-0.015425	-0.060972	0.119804	-0.074660	-0.050183	-0.041209	0.017500	0.035520	0.026806	-0.031875	0.043513	0.031715	0.050550	-0.041497	-0.044278	-0.030502
day_of_month	-0.064527	1.000000	-0.167183	0.030566	0.050568	0.053409	0.023401	-0.226543	0.097248	0.077398	0.079371	-0.074655	-0.182253	-0.012005	0.016955	0.025270	-0.101009	-0.087499	0.020955	0.009100
month	0.032273	-0.167183	1.000000	-0.104911	0.057563	0.073191	0.005346	0.000845	-0.011825	0.028640	-0.038361	0.012834	-0.049368	-0.000624	0.000035	0.041778	-0.233609	0.039058	0.048913	0.000024
t_bullet	0.038234	0.030566	-0.104911	1.000000	-0.406237	-0.003214	0.657885	-0.066599	-0.021947	0.033449	-0.057969	0.095674	0.006079	0.056771	-0.005847	0.042806	-0.089497	0.049790	0.005040	-0.035980
t_limited	-0.015425	0.050568	0.057563	-0.406237	1.000000	-0.049954	0.133330	-0.094702	-0.036353	-0.018982	0.005696	-0.039966	0.101631	-0.034239	0.073836	0.005562	-0.038704	-0.008069	-0.066965	-0.012378
t_northbound	-0.060972	0.053409	0.073191	-0.003214	-0.049954	1.000000	0.006201	0.045560	0.008760	-0.058795	0.022023	0.018741	0.080351	-0.002916	0.076562	0.066943	-0.103233	-0.026955	-0.045545	0.003113
train_id_x	0.119804	0.023401	0.005346	0.657885	0.133330	0.006201	1.000000	-0.111896	-0.089655	-0.009991	-0.008451	0.065518	0.015809	0.031720	0.015133	0.047335	-0.054273	0.037056	-0.051603	-0.009753
tweet_id	-0.074660	-0.226543	0.000845	-0.066599	-0.094702	0.045560	-0.111896	1.000000	-0.067052	-0.130514	-0.017715	0.047553	0.083153	-0.036222	0.132490	-0.051096	0.296229	-0.129205	-0.005532	-0.070528
delay_minor	-0.050183	0.097248	-0.011825	-0.021947	-0.036353	0.008760	-0.089655	-0.067052	1.000000	-0.294072	-0.109563	-0.080127	-0.129274	-0.059536	-0.091557	-0.089406	-0.118606	-0.062823	0.002914	0.101125
delay_med	-0.041209	0.077398	0.028640	0.033449	-0.018982	-0.058795	-0.009991	-0.130514	-0.294072	1.000000	-0.150462	-0.110038	-0.106410	-0.116536	-0.086000	-0.162627	-0.005602	0.206398	0.224643	0.078902
delay_major	0.017500	0.079371	-0.038361	-0.057969	0.005696	0.022023	-0.008451	-0.017715	-0.109563	-0.150462	1.000000	-0.040997	0.030199	-0.043418	-0.046845	0.040378	-0.028929	0.109105	0.036739	-0.032805
delay_catastrophic	0.035520	-0.074655	0.012834	0.095674	-0.039966	0.018741	0.065518	0.047553	-0.080127	-0.110038	-0.040997	1.000000	0.030178	-0.031753	-0.034259	0.170691	-0.044381	-0.038042	-0.016581	-0.023991
is_backlog	0.026806	-0.182253	-0.049368	0.006079	0.101631	0.080351	0.015809	0.083153	-0.129274	-0.106410	0.030199	0.030178	1.000000	-0.067674	0.093118	-0.072046	-0.094586	-0.020911	-0.037555	-0.051131
is_canceled	-0.031875	-0.012005	-0.000624	0.056771	-0.034239	-0.002916	0.031720	-0.036222	-0.059536	-0.116536	-0.043418	-0.031753	-0.067674	1.000000	-0.036282	0.015404	-0.047002	0.069861	-0.056113	-0.025408
is_passing	0.043513	0.016955	0.000035	-0.005847	0.073836	0.076562	0.015133	0.132490	-0.091557	-0.086000	-0.046845	-0.034259	0.093118	-0.036282	1.000000	-0.059208	-0.050711	0.001630	-0.060542	-0.027413
is_accident	0.031715	0.025270	0.041778	0.042806	0.005562	0.066943	0.047335	-0.051096	-0.089406	-0.162627	0.040378	0.170691	-0.072046	0.015404	-0.059208	1.000000	0.001065	-0.080839	-0.091570	0.004398
is_medical	0.050550	-0.101009	-0.233609	-0.089497	-0.038704	-0.103233	-0.054273	0.296229	-0.118606	-0.005602	-0.028929	-0.044381	-0.094586	-0.047002	-0.050711	0.001065	1.000000	-0.100631	-0.052983	0.016859
is_mechanical	-0.041497	-0.087499	0.039058	0.049790	-0.008069	-0.026955	0.037056	-0.129205	-0.062823	0.206398	0.109105	-0.038042	-0.020911	0.069861	0.001630	-0.080839	-0.100631	1.000000	0.185693	-0.054399
is_customer	-0.044278	0.020955	0.048913	0.005040	-0.066965	-0.045545	-0.051603	-0.005532	0.002914	0.224643	0.036739	-0.016581	-0.037555	-0.056113	-0.060542	-0.091570	-0.052983	0.185693	1.000000	0.002621
is_event	-0.030502	0.009100	0.000024	-0.035980	-0.012378	0.003113	-0.009753	-0.070528	0.101125	0.078902	-0.032805	-0.023991	-0.051131	-0.025408	-0.027413	0.004398	0.016859	-0.054399	0.002621	1.000000