Dictionary

train_direction = 0 south, 1 north traintype = 0 Local, 1 Limited, 2 Bullet train



In [1]:

    
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import re
import random
import operator
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif

from func import *

# inline plot
%matplotlib inline
#%%javascript
#IPython.OutputArea.auto_scroll_threshold = 9999;









    



/Users/albarron/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')



In [2]:

    
#%load 'data/raw-twt2016-01-26-14/21/09.csv'
df = pd.read_csv("data/raw-twt2016-01-26-14-21-09.csv",sep='\t',error_bad_lines=False)
# df.head(5)
print len(df.index)
list(df.columns.values)









    



3199






    Out[2]:





['Unnamed: 0',
 'created_at',
 'favorite_count',
 'favorited',
 'hashtags',
 'id',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'lang',
 'media',
 'place',
 'retweet_count',
 'retweeted',
 'retweeted_status',
 'source',
 'text',
 'truncated',
 'urls',
 'user',
 'user_mentions']

Cleanin' the data



In [3]:

    
# Fill in blank hashtags
df = df.where((pd.notnull(df)), np.nan)
df["hashtags"].fillna('')

# Add some date/time things
df["created_at"] = pd.to_datetime(df["created_at"], errors='coerce')

df["day_of_week"] = df["created_at"].apply(lambda x: x.weekday())
df["day_of_month"] = df["created_at"].apply(lambda x: x.day)
df["month"] = df["created_at"].apply(lambda x: x.month)
df["time_of_day"] = df["created_at"].apply(lambda x: get_time_of_day(x))

tod_Dummy = pd.get_dummies(df['time_of_day'])
print(tod_Dummy.head(5))
print tod_Dummy.count()
# del tod_Dummy['shutdown']

# df['in_reply_to_screen_name'].fillna(-1)
# df['in_reply_to_status_id'].fillna(-1)
# df['in_reply_to_user_id'].fillna(-1)
# df['retweeted_status'].fillna(-1)
# df['retweeted'].fillna(-1)
df['retweet_count'].fillna(np.nan)
df['favorite_count'].fillna(np.nan)
df["hashtags"].fillna(np.nan)
df["hashtags"] = df["hashtags"].apply(lambda x: str(x)[1:-1])
df.loc[df["hashtags"]=='a',"hashtags"] = ''
list(df.columns.values)









    



   error  rush_evening  rush_morning  workday
0      1             0             0        0
1      0             1             0        0
2      0             1             0        0
3      0             1             0        0
4      0             1             0        0
error           3199
rush_evening    3199
rush_morning    3199
workday         3199
dtype: int64






    Out[3]:





['Unnamed: 0',
 'created_at',
 'favorite_count',
 'favorited',
 'hashtags',
 'id',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'lang',
 'media',
 'place',
 'retweet_count',
 'retweeted',
 'retweeted_status',
 'source',
 'text',
 'truncated',
 'urls',
 'user',
 'user_mentions',
 'day_of_week',
 'day_of_month',
 'month',
 'time_of_day']



In [4]:

    
#Potentially remove, just cleaning for analysis sake
del df['Unnamed: 0']
del df['truncated']
del df['user_mentions']
del df['urls']
del df['source']
del df['lang']
del df['place']
del df['favorited']
del df['media']
del df['user']

# More likely to remove
del df['in_reply_to_status_id']
del df['in_reply_to_user_id']
del df['retweeted']
del df['retweeted_status']
len(df)









    Out[4]:





3199



In [5]:

    
df.plot(x='created_at', y='day_of_week', kind='hist')
# fdf = df[["created_at","id","text","hashtags"]]
# str(fdf









    Out[5]:





<matplotlib.axes._subplots.AxesSubplot at 0x103c20910>

Let's start getting some more detailed data from the trips as well



In [6]:

    
# df['favorite_count'] = df['favorite_count'].astype(np.int64)
# df['retweet_count'] = df['retweet_count'].astype(np.int64)
# df['text'] = df['text'].astype(str)
# df['id'] = df['id'].astype(np.int64)
# df['day_of_week'] = df['day_of_week'].astype(np.int64)
# df['day_of_month'] = df['day_of_month'].astype(np.int64)
# df['month'] = df['month'].astype(np.int64)
# df['time_of_day'] = df['time_of_day'].astype(np.int64)
df.loc[df["hashtags"]=='on',"hashtags"] = np.nan
df.convert_objects(convert_numeric=True)
df.dtypes
len(df)









    



/Users/albarron/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.






    Out[6]:





3199



In [7]:

    
# Pull out potential trains from both hashtags and text
df["topic_train"] = df["text"].apply(lambda x: check_train_id(x))
df["topic_train"] = df["topic_train"].apply(lambda x: str(x)[1:-1])
df["topic_train"].fillna(np.nan)
df.head(5)









    Out[7]:






  
    
      
      created_at
      favorite_count
      hashtags
      id
      in_reply_to_screen_name
      retweet_count
      text
      day_of_week
      day_of_month
      month
      time_of_day
      topic_train
    
  
  
    
      0
      2016-01-26 20:32:15
      6
      SanFrancisco
      692082643022680064
      NaN
      7
      NOTICE: Ped &amp; Bike detours in place for Ma...
      1
      26
      1
      error
      
    
    
      1
      2016-01-26 19:41:32
      NaN
      
      692069881559134208
      therealwall
      NaN
      @therealwall After the end of the concert we w...
      1
      26
      1
      rush_evening
      
    
    
      2
      2016-01-26 19:28:52
      NaN
      SB50
      692066695838498816
      AemalTheAFGHAN
      NaN
      @AemalTheAFGHAN @BKDenverSports We're glad to ...
      1
      26
      1
      rush_evening
      '50'
    
    
      3
      2016-01-26 18:12:35
      1
      Sorry, Headphones
      692047497238175744
      4c4d
      NaN
      @4c4d Oh man. We love that, too. Our favorite ...
      1
      26
      1
      rush_evening
      
    
    
      4
      2016-01-26 17:53:20
      7
      
      692042650933862401
      NaN
      18
      Pssst, hey, regular Caltrain riders: expect a ...
      1
      26
      1
      rush_evening



In [8]:

    
len(df)









    Out[8]:





3199



In [9]:

    
# pd.pivot_table(
#   df,values='values',
#   index=['month'],
#   columns=['day_of_week'])

First, a word about the below code. In the accompanying func.py there is a function called parse_train that returns a pandas.Series object. For some reason, when it's returned from a map or apply, it seems to get cast as a string. When applied to a list or a dataframe, this string gets turned into a single field in the row, OR divided into several rows, throwing the count off.

To get around this, I return the results of the parse_train function and then CAST it back to a series. This adds a weird 0 index, which I delete. I then fill in the plethora of NaNs and recombine it with the primary dataframe.

For context, previous iterations included df['topic_train'].apply(lambda x:parse_train(x)) which would return a pd.Series object with str versions of the returned pd.Series from parse_train



In [10]:

    
ret = []

def parse_train(t):
# x should be a list with train codes eg 123
# {"id": "123", "type:" "bullet", direction: "south"}

    try:
        s = t['topic_train'].split(',')
    except:
        return t['topic_train']
    if s[0] == '':
#         print ""
        return np.nan
    for x in s:
#         print "Iter",x[1:-1]
        q = {}
        # Check train id
#         x = parse_train_id(x)
        x = str(x)
        x = re.sub('[^0-9]','', x)
        if len(x)<3: continue

        # 1 = north, 0 = south
        q["t_northbound"] = 1 if int(x[2]) in [1,3,5,7,9] else 0
        q['t_limited'] = 0
        q['t_bullet'] = 0
        
        if x[0] == '1':
            q['t_limited'] = 0
        elif x[0] == '2':
            q["t_limited"] = 1 # limited
        elif x[0] == '3':
            q["t_bullet"] = 1 # bullet
        else:
            q['t_limited'] = 0

        ret.append({'tweet_id': t['id'],
                    'timestamp': t['created_at'], 
                    'train_id': int(x),
                    't_northbound':q["t_northbound"], 
                    't_limited': q["t_limited"],
                    't_bullet': q['t_bullet']})
    return s



In [11]:

    
# Let's then filter those train topics into details
# Btw this is jank as fuck.

# red = df[['id','created_at','topic_train']]
red = df.apply(lambda x:parse_train(x),axis=1)
print "red return:",len(red)
print "ret return,",len(ret)
#red
tf = pd.DataFrame(ret)
tf.head(5)

#events = pd.DataFrame([pd.Series(x) for x in red.apply(parse_train)])
#events
#del new.iloc[0]
#new.fillna('')
#df.combine_first(new)









    



red return: 3199
ret return, 528






    Out[11]:






  
    
      
      t_bullet
      t_limited
      t_northbound
      timestamp
      train_id
      tweet_id
    
  
  
    
      0
      0
      0
      1
      2016-01-25 23:42:14
      155
      691768068385718275
    
    
      1
      0
      0
      1
      2016-01-22 22:48:57
      151
      690667494906814464
    
    
      2
      0
      0
      1
      2016-01-20 22:22:55
      151
      689936168893329408
    
    
      3
      0
      0
      0
      2016-01-19 18:08:25
      138
      689509733640732672
    
    
      4
      0
      1
      0
      2016-01-19 16:32:04
      216
      689485484838416388



In [12]:

    
print df.loc[df['topic_train'] != '',['topic_train','text']]









    



              topic_train                                               text
2                    '50'  @AemalTheAFGHAN @BKDenverSports We're glad to ...
8                    '50'  @iamsridhar we are adding extra trains &amp; c...
16                  '155'                  #NB155 is 22 mins late. #Caltrain
21                   '50'  RT @smctd: #Caltrain Devs: NEW Super Bowl sche...
22               '50', ''  Headed to #SantaClara for #SB50? Visit our SB ...
29                   '50'  RT @Broncos: We #BeatThePatriots!\n\nSee you i...
30                   '50'  We look forward to getting #CAR @Panthers, #DE...
32                  '151'        #NB 151 is running 10 mins behind #Caltrain
33                     ''  Trains 236-254 will board on the NB platform t...
44                  '151'     #NB151 is 10 mins down at San Mateo. #Caltrain
51                  '138'  #SB138 will board on the northbound platform a...
55    '216', '225', '329'  #SB216 delayed 13 minutes at SAT\n#NB225 delay...
58                  '323'  #NB323 reported car on tracks at Charleston Av...
61                   '50'  RT @chMtnViewPD: Stuff to know if you plan on ...
72                  '371'  #NB371 is currently 11 mins late at Cal Ave. #...
74                  '371'  #NB371 will pass &amp; be ahead of 269 at Redw...
75                     ''                     @bassemabdouni the train is NB
77                  '371'  #NB371 will depart San Jose on track 2. #Caltrain
78                  '269'         #NB269 will depart ahead of 371. #Caltrain
93           '217', '319'  #NB217 is 12 mi s late. #NB319 departed SJ ahe...
94                  '156'       #SB156 is 10 mins late out of TAM. #Caltrain
96                  '150'  #SB150 10 mins late @ SUN due to fare evader. ...
102                  '50'  RT @scsb50: We are excited to be working with ...
107             '323', ''  #NB323 – 10min late @ SBR. #DoorProblems. #Cal...
122                 '366'  375/277 @ PA\n279 @ MV 86 mins late\nNB366 alm...
127                 '366'                  NB366 currently at MIL. #Caltrain
136          '273', '366'  SB273 turned southbound @ SUN.\n375 &amp; 277 ...
143                '', ''  @Awkward_Caiti Some, but problem is the SB tra...
156                    ''  273 is turning southbound from Sunnyvale to SJ...
157          '375', '277'  #NB375 7 #NB277 coupling at LAW due to crew sh...
...                   ...                                                ...
3008                '268'  #SB268 held at SJ for #370 PAX transfer due to...
3009                '272'  #SB272 – 17 min late @ HIL Due to signal issue...
3011                '370'                #SB370 – 11 min late HIL. #Caltrain
3014                '151'        #NB151 was 10min late out of BEL. #Caltrain
3016                '142'            #SB142 is 12 mins late at PA. #Caltrain
3025                '376'  #SB376 - 21mins late @ PA. 376 ran around 274 ...
3026                '376'  #SB376 will depart SF approx. 10-15min late du...
3028                '156'  #SB156 will be about 15min late into CAP. #Cal...
3029                '258'                   #SB258 - 12 mins late. #Caltrain
3030                '258'         #SB258 departed SF 12 mins late. #Caltrain
3031                '254'               #SB254-14 mins late @ RWC. #Caltrain
3032                '155'   #NB155 – 10 min late out of San Mateo. #Caltrain
3035                '152'  #SB152-10min late @ RWC. Mechanical delay. #Ca...
3039                '146'       #SB146 – 13” mins late out of SMT. #Caltrain
3054                '104'                  #SB104 is 23 mins late. #Caltrain
3066                '221'    #NB221 is running about 15 mins late. #Caltrain
3067                '221'  RT @chenmor3: @Caltrain_News NB221 just arrive...
3068                '221'  #NB221 departed Gilroy 11 late. Still rolling ...
3070                   ''  Congratulations #DubNation @warriors #BayArea ...
3072                '380'  #SB380 reported a near miss with a trespasser ...
3077                '269'  #NB269 departed PA 10 LATE. 269 is at bike cap...
3082                '142'  #SB142 is 18 mins late out of Millbrae. #Caltrain
3107         '440', '442'  Mechanical problem with SB 440. It will return...
3108                '371'  RT @BikesOnCaltrain: #NB371 will run with a 5-...
3109                '152'                  #SB152 is 11 mins late. #Caltrain
3146                '135'  #NB135 is delayed 10 mins out of Redwood City....
3149                '323'  RT @BikesOnCaltrain: #NB323 departed San Jose ...
3164                '267'          #NB267 stopped at MPK with engine issues.
3176                '233'   #NB233 is 10 mins delayed at Millbrae. #Caltrain
3188                   ''  Final Twitter Update:\n274-13 mins late @ LAW\...

[620 rows x 2 columns]



In [13]:

    
len(tf)









    Out[13]:





528



In [ ]:



In [14]:

    
len(tf)









    Out[14]:





528



In [15]:

    
df = df.merge(tf, left_on='id',right_on='tweet_id',how='right')



In [16]:

    
df.groupby(['time_of_day','month']).mean()









    Out[16]:






  
    
      
      
      favorite_count
      id
      retweet_count
      day_of_week
      day_of_month
      t_bullet
      t_limited
      t_northbound
      train_id
      tweet_id
    
    
      time_of_day
      month
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      error
      1
      1.722222
      6.869895e+17
      3.280000
      2.480000
      12.360000
      0.320000
      0.200000
      0.480000
      242.720000
      6.869895e+17
    
    
      6
      1.437500
      6.126169e+17
      2.000000
      3.567568
      21.216216
      0.324324
      0.378378
      0.297297
      286.351351
      6.126169e+17
    
    
      7
      2.470588
      6.223543e+17
      4.953488
      2.558140
      18.162791
      0.348837
      0.441860
      0.511628
      281.720930
      6.223543e+17
    
    
      8
      2.480000
      6.328767e+17
      4.076923
      2.826923
      16.057692
      0.269231
      0.461538
      0.346154
      263.076923
      6.328767e+17
    
    
      9
      2.777778
      6.439176e+17
      3.909091
      2.314286
      15.514286
      0.128571
      0.471429
      0.671429
      271.014286
      6.439176e+17
    
    
      10
      1.857143
      6.527556e+17
      4.622222
      2.688889
      9.866667
      0.166667
      0.466667
      0.455556
      252.122222
      6.527556e+17
    
    
      11
      2.700000
      6.652811e+17
      3.562500
      1.515152
      13.272727
      0.060606
      0.393939
      0.484848
      210.303030
      6.652811e+17
    
    
      12
      2.000000
      6.747415e+17
      2.476190
      3.681818
      9.727273
      0.272727
      0.318182
      0.454545
      288.454545
      6.747415e+17
    
    
      rush_evening
      1
      1.875000
      6.881407e+17
      3.750000
      0.750000
      15.250000
      0.375000
      0.250000
      0.625000
      228.875000
      6.881407e+17
    
    
      6
      4.666667
      6.125047e+17
      1.818182
      1.090909
      20.545455
      0.181818
      0.454545
      0.545455
      211.818182
      6.125047e+17
    
    
      7
      1.333333
      6.225477e+17
      2.100000
      1.750000
      18.250000
      0.050000
      0.550000
      0.550000
      198.050000
      6.225477e+17
    
    
      8
      1.400000
      6.352651e+17
      2.428571
      2.285714
      22.285714
      0.000000
      0.071429
      0.785714
      146.785714
      6.352651e+17
    
    
      9
      2.000000
      6.447491e+17
      1.640000
      1.840000
      17.520000
      0.080000
      0.240000
      0.800000
      174.800000
      6.447491e+17
    
    
      10
      1.250000
      6.529508e+17
      2.210526
      2.210526
      10.157895
      0.105263
      0.315789
      0.684211
      219.210526
      6.529508e+17
    
    
      11
      1.200000
      6.663183e+17
      4.100000
      0.700000
      16.000000
      0.100000
      0.200000
      0.900000
      174.700000
      6.663183e+17
    
    
      12
      1.800000
      6.740357e+17
      3.700000
      1.700000
      7.300000
      0.100000
      0.300000
      0.500000
      184.300000
      6.740357e+17
    
    
      rush_morning
      10
      1.000000
      6.520161e+17
      1.000000
      3.000000
      8.000000
      0.000000
      0.000000
      1.000000
      199.000000
      6.520161e+17
    
    
      workday
      1
      1.000000
      6.872953e+17
      6.000000
      2.000000
      13.000000
      0.500000
      0.500000
      1.000000
      268.000000
      6.872953e+17
    
    
      6
      1.000000
      6.114410e+17
      1.571429
      1.714286
      17.714286
      0.285714
      0.571429
      0.714286
      232.714286
      6.114410e+17
    
    
      7
      2.333333
      6.223477e+17
      5.800000
      1.300000
      17.800000
      0.300000
      0.700000
      0.500000
      247.900000
      6.223477e+17
    
    
      8
      NaN
      6.342669e+17
      2.333333
      2.666667
      19.666667
      0.333333
      0.666667
      0.333333
      252.333333
      6.342669e+17
    
    
      9
      1.714286
      6.434607e+17
      5.933333
      2.866667
      14.066667
      0.266667
      0.666667
      0.866667
      242.466667
      6.434607e+17
    
    
      10
      2.000000
      6.525711e+17
      4.000000
      3.000000
      9.166667
      0.500000
      0.000000
      0.666667
      373.666667
      6.525711e+17
    
    
      12
      NaN
      6.724328e+17
      4.666667
      3.000000
      3.000000
      0.000000
      1.000000
      1.000000
      220.333333
      6.724328e+17



In [17]:

    
list(df.columns.values)









    Out[17]:





['created_at',
 'favorite_count',
 'hashtags',
 'id',
 'in_reply_to_screen_name',
 'retweet_count',
 'text',
 'day_of_week',
 'day_of_month',
 'month',
 'time_of_day',
 'topic_train',
 't_bullet',
 't_limited',
 't_northbound',
 'timestamp',
 'train_id',
 'tweet_id']



In [18]:

    
df.plot(x='time_of_day',y='day_of_week',kind='hist')









    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x11009f490>



In [26]:

    
# pd.scatter_matrix(df,alpha=0.1,figsize=(15,15), diagonal='hist');



In [20]:

    
df.groupby('month').describe()









    Out[20]:






  
    
      
      
      day_of_month
      day_of_week
      favorite_count
      id
      retweet_count
      t_bullet
      t_limited
      t_northbound
      train_id
      tweet_id
    
    
      month
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      count
      35.000000
      35.000000
      28.000000
      3.500000e+01
      35.000000
      35.000000
      35.000000
      35.000000
      35.000000
      3.500000e+01
    
    
      mean
      13.057143
      2.057143
      1.714286
      6.872701e+17
      3.542857
      0.342857
      0.228571
      0.542857
      241.000000
      6.872701e+17
    
    
      std
      4.862479
      1.493965
      1.462042
      1.777861e+15
      3.860378
      0.481594
      0.426043
      0.505433
      94.183176
      1.777861e+15
    
    
      min
      4.000000
      0.000000
      1.000000
      6.840946e+17
      1.000000
      0.000000
      0.000000
      0.000000
      138.000000
      6.840946e+17
    
    
      25%
      9.500000
      1.000000
      1.000000
      6.861119e+17
      2.000000
      0.000000
      0.000000
      0.000000
      146.500000
      6.861119e+17
    
    
      50%
      12.000000
      1.000000
      1.000000
      6.867537e+17
      3.000000
      0.000000
      0.000000
      1.000000
      225.000000
      6.867537e+17
    
    
      75%
      15.000000
      4.000000
      2.000000
      6.878042e+17
      4.000000
      1.000000
      0.000000
      1.000000
      326.000000
      6.878042e+17
    
    
      max
      25.000000
      4.000000
      8.000000
      6.917681e+17
      24.000000
      1.000000
      1.000000
      1.000000
      375.000000
      6.917681e+17
    
    
      6
      count
      55.000000
      55.000000
      20.000000
      5.500000e+01
      55.000000
      55.000000
      55.000000
      55.000000
      55.000000
      5.500000e+01
    
    
      mean
      20.636364
      2.836364
      1.900000
      6.124448e+17
      1.909091
      0.290909
      0.418182
      0.400000
      264.618182
      6.124448e+17
    
    
      std
      5.873240
      1.853562
      1.586124
      2.110101e+15
      1.126659
      0.458368
      0.497807
      0.494413
      88.131739
      2.110101e+15
    
    
      min
      8.000000
      0.000000
      1.000000
      6.079546e+17
      1.000000
      0.000000
      0.000000
      0.000000
      104.000000
      6.079546e+17
    
    
      25%
      17.500000
      1.000000
      1.000000
      6.113585e+17
      1.000000
      0.000000
      0.000000
      0.000000
      205.500000
      6.113585e+17
    
    
      50%
      20.000000
      3.000000
      1.000000
      6.120688e+17
      2.000000
      0.000000
      0.000000
      0.000000
      264.000000
      6.120688e+17
    
    
      75%
      26.000000
      4.500000
      2.250000
      6.144020e+17
      3.000000
      1.000000
      1.000000
      1.000000
      344.000000
      6.144020e+17
    
    
      max
      30.000000
      6.000000
      6.000000
      6.156911e+17
      7.000000
      1.000000
      1.000000
      1.000000
      442.000000
      6.156911e+17
    
    
      7
      count
      73.000000
      73.000000
      26.000000
      7.300000e+01
      73.000000
      73.000000
      73.000000
      73.000000
      73.000000
      7.300000e+01
    
    
      mean
      18.136986
      2.164384
      2.192308
      6.224064e+17
      4.287671
      0.260274
      0.506849
      0.520548
      254.164384
      6.224064e+17
    
    
      std
      9.194375
      1.280340
      1.744001
      3.334705e+15
      9.187067
      0.441821
      0.503413
      0.503035
      77.017677
      3.334705e+15
    
    
      min
      1.000000
      0.000000
      1.000000
      6.160466e+17
      1.000000
      0.000000
      0.000000
      0.000000
      134.000000
      6.160466e+17
    
    
      25%
      14.000000
      1.000000
      1.000000
      6.207510e+17
      1.000000
      0.000000
      0.000000
      0.000000
      216.000000
      6.207510e+17
    
    
      50%
      20.000000
      2.000000
      2.000000
      6.231497e+17
      2.000000
      0.000000
      1.000000
      1.000000
      258.000000
      6.231497e+17
    
    
      75%
      25.000000
      3.000000
      2.750000
      6.247474e+17
      3.000000
      1.000000
      1.000000
      1.000000
      313.000000
      6.247474e+17
    
    
      max
      31.000000
      5.000000
      8.000000
      6.272640e+17
      60.000000
      1.000000
      1.000000
      1.000000
      386.000000
      6.272640e+17
    
    
      8
      count
      69.000000
      69.000000
      30.000000
      6.900000e+01
      69.000000
      69.000000
      69.000000
      69.000000
      69.000000
      6.900000e+01
    
    
      mean
      17.478261
      2.710145
      2.300000
      6.334217e+17
      3.666667
      0.217391
      0.391304
      0.434783
      239.014493
      6.334217e+17
    
    
      std
      9.827346
      1.572759
      1.263547
      3.632823e+15
      4.676810
      0.415493
      0.491618
      0.499360
      85.203475
      3.632823e+15
    
    
      min
      1.000000
      0.000000
      1.000000
      6.272702e+17
      1.000000
      0.000000
      0.000000
      0.000000
      135.000000
      6.272702e+17
    
    
      25%
      10.000000
      1.000000
      1.000000
      6.308697e+17
      2.000000
      0.000000
      0.000000
      0.000000
      150.000000
      6.308697e+17
    
    
      50%
      19.000000
      3.000000
      2.000000
      6.341380e+17
      2.000000
      0.000000
      0.000000
      0.000000
      257.000000
      6.341380e+17
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      9
      std
      9.011368
      1.341858
      3.257780
      3.245817e+15
      5.066955
      0.344745
      0.499291
      0.447400
      91.119182
      3.245817e+15
    
    
      min
      1.000000
      0.000000
      1.000000
      6.385084e+17
      1.000000
      0.000000
      0.000000
      0.000000
      135.000000
      6.385084e+17
    
    
      25%
      9.000000
      1.250000
      1.000000
      6.414256e+17
      2.000000
      0.000000
      0.000000
      0.000000
      152.000000
      6.414256e+17
    
    
      50%
      14.000000
      2.000000
      1.000000
      6.435046e+17
      2.000000
      0.000000
      0.000000
      1.000000
      254.000000
      6.435046e+17
    
    
      75%
      23.000000
      3.000000
      3.000000
      6.467907e+17
      4.000000
      0.000000
      1.000000
      1.000000
      279.000000
      6.467907e+17
    
    
      max
      30.000000
      5.000000
      20.000000
      6.493411e+17
      30.000000
      1.000000
      1.000000
      1.000000
      449.000000
      6.493411e+17
    
    
      10
      count
      116.000000
      116.000000
      49.000000
      1.160000e+02
      116.000000
      116.000000
      116.000000
      116.000000
      116.000000
      1.160000e+02
    
    
      mean
      9.862069
      2.629310
      1.795918
      6.527716e+17
      4.163793
      0.172414
      0.413793
      0.508621
      252.560345
      6.527716e+17
    
    
      std
      7.033951
      1.130834
      1.731805
      2.555516e+15
      4.816537
      0.379378
      0.494649
      0.502095
      80.750748
      2.555516e+15
    
    
      min
      1.000000
      0.000000
      1.000000
      6.496141e+17
      1.000000
      0.000000
      0.000000
      0.000000
      134.000000
      6.496141e+17
    
    
      25%
      7.000000
      2.000000
      1.000000
      6.515610e+17
      2.000000
      0.000000
      0.000000
      0.000000
      190.750000
      6.515610e+17
    
    
      50%
      8.000000
      3.000000
      1.000000
      6.520133e+17
      3.000000
      0.000000
      0.000000
      1.000000
      262.000000
      6.520133e+17
    
    
      75%
      14.000000
      3.000000
      2.000000
      6.541057e+17
      4.000000
      0.000000
      1.000000
      1.000000
      284.750000
      6.541057e+17
    
    
      max
      30.000000
      5.000000
      11.000000
      6.602246e+17
      33.000000
      1.000000
      1.000000
      1.000000
      425.000000
      6.602246e+17
    
    
      11
      count
      43.000000
      43.000000
      25.000000
      4.300000e+01
      42.000000
      43.000000
      43.000000
      43.000000
      43.000000
      4.300000e+01
    
    
      mean
      13.906977
      1.325581
      2.400000
      6.655223e+17
      3.690476
      0.069767
      0.348837
      0.581395
      202.023256
      6.655223e+17
    
    
      std
      8.733662
      1.209505
      1.957890
      3.138674e+15
      3.032250
      0.257770
      0.482243
      0.499169
      71.815401
      3.138674e+15
    
    
      min
      2.000000
      0.000000
      1.000000
      6.612370e+17
      1.000000
      0.000000
      0.000000
      0.000000
      135.000000
      6.612370e+17
    
    
      25%
      7.500000
      0.000000
      1.000000
      6.632601e+17
      2.000000
      0.000000
      0.000000
      0.000000
      146.000000
      6.632601e+17
    
    
      50%
      12.000000
      1.000000
      2.000000
      6.649104e+17
      3.000000
      0.000000
      0.000000
      1.000000
      155.000000
      6.649104e+17
    
    
      75%
      20.000000
      2.000000
      3.000000
      6.676899e+17
      3.750000
      0.000000
      1.000000
      1.000000
      266.000000
      6.676899e+17
    
    
      max
      30.000000
      4.000000
      7.000000
      6.713959e+17
      15.000000
      1.000000
      1.000000
      1.000000
      375.000000
      6.713959e+17
    
    
      12
      count
      35.000000
      35.000000
      16.000000
      3.500000e+01
      34.000000
      35.000000
      35.000000
      35.000000
      35.000000
      3.500000e+01
    
    
      mean
      8.457143
      3.057143
      1.937500
      6.743419e+17
      3.029412
      0.200000
      0.371429
      0.514286
      252.857143
      6.743419e+17
    
    
      std
      7.636379
      1.764734
      1.340087
      2.747556e+15
      2.110368
      0.405840
      0.490241
      0.507093
      103.178356
      2.747556e+15
    
    
      min
      1.000000
      0.000000
      1.000000
      6.717135e+17
      1.000000
      0.000000
      0.000000
      0.000000
      134.000000
      6.717135e+17
    
    
      25%
      3.500000
      2.000000
      1.000000
      6.724021e+17
      2.000000
      0.000000
      0.000000
      0.000000
      149.000000
      6.724021e+17
    
    
      50%
      5.000000
      3.000000
      1.500000
      6.729805e+17
      2.000000
      0.000000
      0.000000
      1.000000
      233.000000
      6.729805e+17
    
    
      75%
      11.500000
      4.500000
      2.250000
      6.754842e+17
      3.000000
      0.000000
      1.000000
      1.000000
      344.000000
      6.754842e+17
    
    
      max
      29.000000
      6.000000
      6.000000
      6.819313e+17
      10.000000
      1.000000
      1.000000
      1.000000
      448.000000
      6.819313e+17
    
  

64 rows × 10 columns



In [21]:

    
train = df[df['train_id'] > 0]



In [22]:

    
train.groupby('day_of_week').count()









    Out[22]:






  
    
      
      created_at
      favorite_count
      hashtags
      id
      in_reply_to_screen_name
      retweet_count
      text
      day_of_month
      month
      time_of_day
      topic_train
      t_bullet
      t_limited
      t_northbound
      timestamp
      train_id
      tweet_id
    
    
      day_of_week
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      54
      27
      54
      54
      0
      49
      54
      54
      54
      54
      54
      54
      54
      54
      54
      54
      54
    
    
      1
      102
      48
      102
      102
      0
      102
      102
      102
      102
      102
      102
      102
      102
      102
      102
      102
      102
    
    
      2
      135
      59
      135
      135
      0
      135
      135
      135
      135
      135
      135
      135
      135
      135
      135
      135
      135
    
    
      3
      117
      44
      117
      117
      0
      117
      117
      117
      117
      117
      117
      117
      117
      117
      117
      117
      117
    
    
      4
      75
      24
      75
      75
      0
      75
      75
      75
      75
      75
      75
      75
      75
      75
      75
      75
      75
    
    
      5
      48
      32
      48
      48
      0
      48
      48
      48
      48
      48
      48
      48
      48
      48
      48
      48
      48
    
    
      6
      5
      5
      5
      5
      0
      4
      5
      5
      5
      5
      5
      5
      5
      5
      5
      5
      5



In [23]:

    
train.groupby('month').count()









    Out[23]:






  
    
      
      created_at
      favorite_count
      hashtags
      id
      in_reply_to_screen_name
      retweet_count
      text
      day_of_week
      day_of_month
      time_of_day
      topic_train
      t_bullet
      t_limited
      t_northbound
      timestamp
      train_id
      tweet_id
    
    
      month
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      35
      28
      35
      35
      0
      35
      35
      35
      35
      35
      35
      35
      35
      35
      35
      35
      35
    
    
      6
      55
      20
      55
      55
      0
      55
      55
      55
      55
      55
      55
      55
      55
      55
      55
      55
      55
    
    
      7
      73
      26
      73
      73
      0
      73
      73
      73
      73
      73
      73
      73
      73
      73
      73
      73
      73
    
    
      8
      69
      30
      69
      69
      0
      69
      69
      69
      69
      69
      69
      69
      69
      69
      69
      69
      69
    
    
      9
      110
      45
      110
      110
      0
      106
      110
      110
      110
      110
      110
      110
      110
      110
      110
      110
      110
    
    
      10
      116
      49
      116
      116
      0
      116
      116
      116
      116
      116
      116
      116
      116
      116
      116
      116
      116
    
    
      11
      43
      25
      43
      43
      0
      42
      43
      43
      43
      43
      43
      43
      43
      43
      43
      43
      43
    
    
      12
      35
      16
      35
      35
      0
      34
      35
      35
      35
      35
      35
      35
      35
      35
      35
      35
      35



In [24]:

    
train.groupby('time_of_day').count()









    Out[24]:






  
    
      
      created_at
      favorite_count
      hashtags
      id
      in_reply_to_screen_name
      retweet_count
      text
      day_of_week
      day_of_month
      month
      topic_train
      t_bullet
      t_limited
      t_northbound
      timestamp
      train_id
      tweet_id
    
    
      time_of_day
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      error
      372
      185
      372
      372
      0
      366
      372
      372
      372
      372
      372
      372
      372
      372
      372
      372
      372
    
    
      rush_evening
      117
      38
      117
      117
      0
      117
      117
      117
      117
      117
      117
      117
      117
      117
      117
      117
      117
    
    
      rush_morning
      1
      1
      1
      1
      0
      1
      1
      1
      1
      1
      1
      1
      1
      1
      1
      1
      1
    
    
      workday
      46
      15
      46
      46
      0
      46
      46
      46
      46
      46
      46
      46
      46
      46
      46
      46
      46



In [25]:

    
df.corr()









    Out[25]:






  
    
      
      favorite_count
      id
      retweet_count
      day_of_week
      day_of_month
      month
      t_bullet
      t_limited
      t_northbound
      train_id
      tweet_id
    
  
  
    
      favorite_count
      1.000000
      -0.056542
      0.551203
      0.100412
      -0.013727
      0.057577
      0.184392
      0.019418
      0.071410
      0.182412
      -0.056542
    
    
      id
      -0.056542
      1.000000
      0.024774
      -0.083374
      -0.201225
      0.119187
      -0.048254
      -0.091769
      0.070766
      -0.083925
      1.000000
    
    
      retweet_count
      0.551203
      0.024774
      1.000000
      0.019303
      -0.058542
      0.030757
      0.096546
      0.100636
      -0.041501
      0.148645
      0.024774
    
    
      day_of_week
      0.100412
      -0.083374
      0.019303
      1.000000
      -0.063651
      0.022254
      0.047302
      -0.022492
      -0.061165
      0.132644
      -0.083374
    
    
      day_of_month
      -0.013727
      -0.201225
      -0.058542
      -0.063651
      1.000000
      -0.204438
      0.014090
      0.057579
      0.053588
      -0.007253
      -0.201225
    
    
      month
      0.057577
      0.119187
      0.030757
      0.022254
      -0.204438
      1.000000
      -0.145967
      0.041591
      0.045134
      -0.039853
      0.119187
    
    
      t_bullet
      0.184392
      -0.048254
      0.096546
      0.047302
      0.014090
      -0.145967
      1.000000
      -0.416707
      -0.010226
      0.651110
      -0.048254
    
    
      t_limited
      0.019418
      -0.091769
      0.100636
      -0.022492
      0.057579
      0.041591
      -0.416707
      1.000000
      -0.041419
      0.110541
      -0.091769
    
    
      t_northbound
      0.071410
      0.070766
      -0.041501
      -0.061165
      0.053588
      0.045134
      -0.010226
      -0.041419
      1.000000
      0.008933
      0.070766
    
    
      train_id
      0.182412
      -0.083925
      0.148645
      0.132644
      -0.007253
      -0.039853
      0.651110
      0.110541
      0.008933
      1.000000
      -0.083925
    
    
      tweet_id
      -0.056542
      1.000000
      0.024774
      -0.083374
      -0.201225
      0.119187
      -0.048254
      -0.091769
      0.070766
      -0.083925
      1.000000



In [ ]:

	created_at	favorite_count	hashtags	id	in_reply_to_screen_name	retweet_count	text	day_of_week	day_of_month	month	time_of_day	topic_train
0	2016-01-26 20:32:15	6	SanFrancisco	692082643022680064	NaN	7	NOTICE: Ped & Bike detours in place for Ma...	1	26	1	error
1	2016-01-26 19:41:32	NaN		692069881559134208	therealwall	NaN	@therealwall After the end of the concert we w...	1	26	1	rush_evening
2	2016-01-26 19:28:52	NaN	SB50	692066695838498816	AemalTheAFGHAN	NaN	@AemalTheAFGHAN @BKDenverSports We're glad to ...	1	26	1	rush_evening	'50'
3	2016-01-26 18:12:35	1	Sorry, Headphones	692047497238175744	4c4d	NaN	@4c4d Oh man. We love that, too. Our favorite ...	1	26	1	rush_evening
4	2016-01-26 17:53:20	7		692042650933862401	NaN	18	Pssst, hey, regular Caltrain riders: expect a ...	1	26	1	rush_evening

	t_limited	t_northbound	timestamp	train_id	tweet_id
0	0	1	2016-01-25 23:42:14	155	691768068385718275
1	0	1	2016-01-22 22:48:57	151	690667494906814464
2	0	1	2016-01-20 22:22:55	151	689936168893329408
3	0	0	2016-01-19 18:08:25	138	689509733640732672
4	1	0	2016-01-19 16:32:04	216	689485484838416388

		favorite_count	id	retweet_count	day_of_week	day_of_month	t_bullet	t_limited	t_northbound	train_id	tweet_id
time_of_day	month
error	1	1.722222	6.869895e+17	3.280000	2.480000	12.360000	0.320000	0.200000	0.480000	242.720000	6.869895e+17
	6	1.437500	6.126169e+17	2.000000	3.567568	21.216216	0.324324	0.378378	0.297297	286.351351	6.126169e+17
	7	2.470588	6.223543e+17	4.953488	2.558140	18.162791	0.348837	0.441860	0.511628	281.720930	6.223543e+17
	8	2.480000	6.328767e+17	4.076923	2.826923	16.057692	0.269231	0.461538	0.346154	263.076923	6.328767e+17
	9	2.777778	6.439176e+17	3.909091	2.314286	15.514286	0.128571	0.471429	0.671429	271.014286	6.439176e+17
	10	1.857143	6.527556e+17	4.622222	2.688889	9.866667	0.166667	0.466667	0.455556	252.122222	6.527556e+17
	11	2.700000	6.652811e+17	3.562500	1.515152	13.272727	0.060606	0.393939	0.484848	210.303030	6.652811e+17
	12	2.000000	6.747415e+17	2.476190	3.681818	9.727273	0.272727	0.318182	0.454545	288.454545	6.747415e+17
rush_evening	1	1.875000	6.881407e+17	3.750000	0.750000	15.250000	0.375000	0.250000	0.625000	228.875000	6.881407e+17
	6	4.666667	6.125047e+17	1.818182	1.090909	20.545455	0.181818	0.454545	0.545455	211.818182	6.125047e+17
	7	1.333333	6.225477e+17	2.100000	1.750000	18.250000	0.050000	0.550000	0.550000	198.050000	6.225477e+17
	8	1.400000	6.352651e+17	2.428571	2.285714	22.285714	0.000000	0.071429	0.785714	146.785714	6.352651e+17
	9	2.000000	6.447491e+17	1.640000	1.840000	17.520000	0.080000	0.240000	0.800000	174.800000	6.447491e+17
	10	1.250000	6.529508e+17	2.210526	2.210526	10.157895	0.105263	0.315789	0.684211	219.210526	6.529508e+17
	11	1.200000	6.663183e+17	4.100000	0.700000	16.000000	0.100000	0.200000	0.900000	174.700000	6.663183e+17
	12	1.800000	6.740357e+17	3.700000	1.700000	7.300000	0.100000	0.300000	0.500000	184.300000	6.740357e+17
rush_morning	10	1.000000	6.520161e+17	1.000000	3.000000	8.000000	0.000000	0.000000	1.000000	199.000000	6.520161e+17
workday	1	1.000000	6.872953e+17	6.000000	2.000000	13.000000	0.500000	0.500000	1.000000	268.000000	6.872953e+17
	6	1.000000	6.114410e+17	1.571429	1.714286	17.714286	0.285714	0.571429	0.714286	232.714286	6.114410e+17
	7	2.333333	6.223477e+17	5.800000	1.300000	17.800000	0.300000	0.700000	0.500000	247.900000	6.223477e+17
	8	NaN	6.342669e+17	2.333333	2.666667	19.666667	0.333333	0.666667	0.333333	252.333333	6.342669e+17
	9	1.714286	6.434607e+17	5.933333	2.866667	14.066667	0.266667	0.666667	0.866667	242.466667	6.434607e+17
	10	2.000000	6.525711e+17	4.000000	3.000000	9.166667	0.500000	0.000000	0.666667	373.666667	6.525711e+17
	12	NaN	6.724328e+17	4.666667	3.000000	3.000000	0.000000	1.000000	1.000000	220.333333	6.724328e+17

		day_of_month	day_of_week	favorite_count	id	retweet_count	t_bullet	t_limited	t_northbound	train_id	tweet_id
month
1	count	35.000000	35.000000	28.000000	3.500000e+01	35.000000	35.000000	35.000000	35.000000	35.000000	3.500000e+01
	mean	13.057143	2.057143	1.714286	6.872701e+17	3.542857	0.342857	0.228571	0.542857	241.000000	6.872701e+17
	std	4.862479	1.493965	1.462042	1.777861e+15	3.860378	0.481594	0.426043	0.505433	94.183176	1.777861e+15
	min	4.000000	0.000000	1.000000	6.840946e+17	1.000000	0.000000	0.000000	0.000000	138.000000	6.840946e+17
	25%	9.500000	1.000000	1.000000	6.861119e+17	2.000000	0.000000	0.000000	0.000000	146.500000	6.861119e+17
	50%	12.000000	1.000000	1.000000	6.867537e+17	3.000000	0.000000	0.000000	1.000000	225.000000	6.867537e+17
	75%	15.000000	4.000000	2.000000	6.878042e+17	4.000000	1.000000	0.000000	1.000000	326.000000	6.878042e+17
	max	25.000000	4.000000	8.000000	6.917681e+17	24.000000	1.000000	1.000000	1.000000	375.000000	6.917681e+17
6	count	55.000000	55.000000	20.000000	5.500000e+01	55.000000	55.000000	55.000000	55.000000	55.000000	5.500000e+01
	mean	20.636364	2.836364	1.900000	6.124448e+17	1.909091	0.290909	0.418182	0.400000	264.618182	6.124448e+17
	std	5.873240	1.853562	1.586124	2.110101e+15	1.126659	0.458368	0.497807	0.494413	88.131739	2.110101e+15
	min	8.000000	0.000000	1.000000	6.079546e+17	1.000000	0.000000	0.000000	0.000000	104.000000	6.079546e+17
	25%	17.500000	1.000000	1.000000	6.113585e+17	1.000000	0.000000	0.000000	0.000000	205.500000	6.113585e+17
	50%	20.000000	3.000000	1.000000	6.120688e+17	2.000000	0.000000	0.000000	0.000000	264.000000	6.120688e+17
	75%	26.000000	4.500000	2.250000	6.144020e+17	3.000000	1.000000	1.000000	1.000000	344.000000	6.144020e+17
	max	30.000000	6.000000	6.000000	6.156911e+17	7.000000	1.000000	1.000000	1.000000	442.000000	6.156911e+17
7	count	73.000000	73.000000	26.000000	7.300000e+01	73.000000	73.000000	73.000000	73.000000	73.000000	7.300000e+01
	mean	18.136986	2.164384	2.192308	6.224064e+17	4.287671	0.260274	0.506849	0.520548	254.164384	6.224064e+17
	std	9.194375	1.280340	1.744001	3.334705e+15	9.187067	0.441821	0.503413	0.503035	77.017677	3.334705e+15
	min	1.000000	0.000000	1.000000	6.160466e+17	1.000000	0.000000	0.000000	0.000000	134.000000	6.160466e+17
	25%	14.000000	1.000000	1.000000	6.207510e+17	1.000000	0.000000	0.000000	0.000000	216.000000	6.207510e+17
	50%	20.000000	2.000000	2.000000	6.231497e+17	2.000000	0.000000	1.000000	1.000000	258.000000	6.231497e+17
	75%	25.000000	3.000000	2.750000	6.247474e+17	3.000000	1.000000	1.000000	1.000000	313.000000	6.247474e+17
	max	31.000000	5.000000	8.000000	6.272640e+17	60.000000	1.000000	1.000000	1.000000	386.000000	6.272640e+17
8	count	69.000000	69.000000	30.000000	6.900000e+01	69.000000	69.000000	69.000000	69.000000	69.000000	6.900000e+01
	mean	17.478261	2.710145	2.300000	6.334217e+17	3.666667	0.217391	0.391304	0.434783	239.014493	6.334217e+17
	std	9.827346	1.572759	1.263547	3.632823e+15	4.676810	0.415493	0.491618	0.499360	85.203475	3.632823e+15
	min	1.000000	0.000000	1.000000	6.272702e+17	1.000000	0.000000	0.000000	0.000000	135.000000	6.272702e+17
	25%	10.000000	1.000000	1.000000	6.308697e+17	2.000000	0.000000	0.000000	0.000000	150.000000	6.308697e+17
	50%	19.000000	3.000000	2.000000	6.341380e+17	2.000000	0.000000	0.000000	0.000000	257.000000	6.341380e+17
...	...	...	...	...	...	...	...	...	...	...	...
9	std	9.011368	1.341858	3.257780	3.245817e+15	5.066955	0.344745	0.499291	0.447400	91.119182	3.245817e+15
	min	1.000000	0.000000	1.000000	6.385084e+17	1.000000	0.000000	0.000000	0.000000	135.000000	6.385084e+17
	25%	9.000000	1.250000	1.000000	6.414256e+17	2.000000	0.000000	0.000000	0.000000	152.000000	6.414256e+17
	50%	14.000000	2.000000	1.000000	6.435046e+17	2.000000	0.000000	0.000000	1.000000	254.000000	6.435046e+17
	75%	23.000000	3.000000	3.000000	6.467907e+17	4.000000	0.000000	1.000000	1.000000	279.000000	6.467907e+17
	max	30.000000	5.000000	20.000000	6.493411e+17	30.000000	1.000000	1.000000	1.000000	449.000000	6.493411e+17
10	count	116.000000	116.000000	49.000000	1.160000e+02	116.000000	116.000000	116.000000	116.000000	116.000000	1.160000e+02
	mean	9.862069	2.629310	1.795918	6.527716e+17	4.163793	0.172414	0.413793	0.508621	252.560345	6.527716e+17
	std	7.033951	1.130834	1.731805	2.555516e+15	4.816537	0.379378	0.494649	0.502095	80.750748	2.555516e+15
	min	1.000000	0.000000	1.000000	6.496141e+17	1.000000	0.000000	0.000000	0.000000	134.000000	6.496141e+17
	25%	7.000000	2.000000	1.000000	6.515610e+17	2.000000	0.000000	0.000000	0.000000	190.750000	6.515610e+17
	50%	8.000000	3.000000	1.000000	6.520133e+17	3.000000	0.000000	0.000000	1.000000	262.000000	6.520133e+17
	75%	14.000000	3.000000	2.000000	6.541057e+17	4.000000	0.000000	1.000000	1.000000	284.750000	6.541057e+17
	max	30.000000	5.000000	11.000000	6.602246e+17	33.000000	1.000000	1.000000	1.000000	425.000000	6.602246e+17
11	count	43.000000	43.000000	25.000000	4.300000e+01	42.000000	43.000000	43.000000	43.000000	43.000000	4.300000e+01
	mean	13.906977	1.325581	2.400000	6.655223e+17	3.690476	0.069767	0.348837	0.581395	202.023256	6.655223e+17
	std	8.733662	1.209505	1.957890	3.138674e+15	3.032250	0.257770	0.482243	0.499169	71.815401	3.138674e+15
	min	2.000000	0.000000	1.000000	6.612370e+17	1.000000	0.000000	0.000000	0.000000	135.000000	6.612370e+17
	25%	7.500000	0.000000	1.000000	6.632601e+17	2.000000	0.000000	0.000000	0.000000	146.000000	6.632601e+17
	50%	12.000000	1.000000	2.000000	6.649104e+17	3.000000	0.000000	0.000000	1.000000	155.000000	6.649104e+17
	75%	20.000000	2.000000	3.000000	6.676899e+17	3.750000	0.000000	1.000000	1.000000	266.000000	6.676899e+17
	max	30.000000	4.000000	7.000000	6.713959e+17	15.000000	1.000000	1.000000	1.000000	375.000000	6.713959e+17
12	count	35.000000	35.000000	16.000000	3.500000e+01	34.000000	35.000000	35.000000	35.000000	35.000000	3.500000e+01
	mean	8.457143	3.057143	1.937500	6.743419e+17	3.029412	0.200000	0.371429	0.514286	252.857143	6.743419e+17
	std	7.636379	1.764734	1.340087	2.747556e+15	2.110368	0.405840	0.490241	0.507093	103.178356	2.747556e+15
	min	1.000000	0.000000	1.000000	6.717135e+17	1.000000	0.000000	0.000000	0.000000	134.000000	6.717135e+17
	25%	3.500000	2.000000	1.000000	6.724021e+17	2.000000	0.000000	0.000000	0.000000	149.000000	6.724021e+17
	50%	5.000000	3.000000	1.500000	6.729805e+17	2.000000	0.000000	0.000000	1.000000	233.000000	6.729805e+17
	75%	11.500000	4.500000	2.250000	6.754842e+17	3.000000	0.000000	1.000000	1.000000	344.000000	6.754842e+17
	max	29.000000	6.000000	6.000000	6.819313e+17	10.000000	1.000000	1.000000	1.000000	448.000000	6.819313e+17

	created_at	favorite_count	hashtags	id	in_reply_to_screen_name	retweet_count	text	day_of_month	month	time_of_day	topic_train	t_bullet	t_limited	t_northbound	timestamp	train_id	tweet_id
day_of_week
0	54	27	54	54	0	49	54	54	54	54	54	54	54	54	54	54	54
1	102	48	102	102	0	102	102	102	102	102	102	102	102	102	102	102	102
2	135	59	135	135	0	135	135	135	135	135	135	135	135	135	135	135	135
3	117	44	117	117	0	117	117	117	117	117	117	117	117	117	117	117	117
4	75	24	75	75	0	75	75	75	75	75	75	75	75	75	75	75	75
5	48	32	48	48	0	48	48	48	48	48	48	48	48	48	48	48	48
6	5	5	5	5	0	4	5	5	5	5	5	5	5	5	5	5	5

	favorite_count	id	retweet_count	day_of_week	day_of_month	month	t_bullet	t_limited	t_northbound	train_id	tweet_id
favorite_count	1.000000	-0.056542	0.551203	0.100412	-0.013727	0.057577	0.184392	0.019418	0.071410	0.182412	-0.056542
id	-0.056542	1.000000	0.024774	-0.083374	-0.201225	0.119187	-0.048254	-0.091769	0.070766	-0.083925	1.000000
retweet_count	0.551203	0.024774	1.000000	0.019303	-0.058542	0.030757	0.096546	0.100636	-0.041501	0.148645	0.024774
day_of_week	0.100412	-0.083374	0.019303	1.000000	-0.063651	0.022254	0.047302	-0.022492	-0.061165	0.132644	-0.083374
day_of_month	-0.013727	-0.201225	-0.058542	-0.063651	1.000000	-0.204438	0.014090	0.057579	0.053588	-0.007253	-0.201225
month	0.057577	0.119187	0.030757	0.022254	-0.204438	1.000000	-0.145967	0.041591	0.045134	-0.039853	0.119187
t_bullet	0.184392	-0.048254	0.096546	0.047302	0.014090	-0.145967	1.000000	-0.416707	-0.010226	0.651110	-0.048254
t_limited	0.019418	-0.091769	0.100636	-0.022492	0.057579	0.041591	-0.416707	1.000000	-0.041419	0.110541	-0.091769
t_northbound	0.071410	0.070766	-0.041501	-0.061165	0.053588	0.045134	-0.010226	-0.041419	1.000000	0.008933	0.070766
train_id	0.182412	-0.083925	0.148645	0.132644	-0.007253	-0.039853	0.651110	0.110541	0.008933	1.000000	-0.083925
tweet_id	-0.056542	1.000000	0.024774	-0.083374	-0.201225	0.119187	-0.048254	-0.091769	0.070766	-0.083925	1.000000