Dictionary

train_direction = 0 south, 1 north traintype = 0 Local, 1 Limited, 2 Bullet train


In [1]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import re
import random
import operator
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif

from func import *

# inline plot
%matplotlib inline
#%%javascript
#IPython.OutputArea.auto_scroll_threshold = 9999;

In [2]:
#%load 'data/raw-twt2016-01-26-14/21/09.csv'
df = pd.read_csv("data/raw-twt2016-01-26-14-21-09.csv",sep='\t',error_bad_lines=False)
# df.head(5)
print len(df.index)
list(df.columns.values)


3199
Out[2]:
['Unnamed: 0',
 'created_at',
 'favorite_count',
 'favorited',
 'hashtags',
 'id',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'lang',
 'media',
 'place',
 'retweet_count',
 'retweeted',
 'retweeted_status',
 'source',
 'text',
 'truncated',
 'urls',
 'user',
 'user_mentions']

Cleanin' the data


In [3]:
# Fill in blank hashtags
df = df.where((pd.notnull(df)), np.nan)
df["hashtags"].fillna('')

# Add some date/time things
df["created_at"] = pd.to_datetime(df["created_at"], errors='coerce')

df["day_of_week"] = df["created_at"].apply(lambda x: x.weekday())
df["day_of_month"] = df["created_at"].apply(lambda x: x.day)
df["month"] = df["created_at"].apply(lambda x: x.month)
df["time_of_day"] = df["created_at"].apply(lambda x: get_time_of_day(x))

tod_Dummy = pd.get_dummies(df['time_of_day'])
print(tod_Dummy.head(5))
# del tod_Dummy['shutdown']

# df['in_reply_to_screen_name'].fillna(-1)
# df['in_reply_to_status_id'].fillna(-1)
# df['in_reply_to_user_id'].fillna(-1)
# df['retweeted_status'].fillna(-1)
# df['retweeted'].fillna(-1)
df['retweet_count'].fillna(np.nan)
df['favorite_count'].fillna(np.nan)
df["hashtags"].fillna(np.nan)
df["hashtags"] = df["hashtags"].apply(lambda x: str(x)[1:-1])
df.loc[df["hashtags"]=='a',"hashtags"] = ''
list(df.columns.values)


   error  rush_evening  rush_morning  workday
0      1             0             0        0
1      0             1             0        0
2      0             1             0        0
3      0             1             0        0
4      0             1             0        0
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-3-dd98f2448e95> in <module>()
     13 tod_Dummy = pd.get_dummies(df['time_of_day'])
     14 print(tod_Dummy.head(5))
---> 15 del tod_Dummy['shutdown']
     16 
     17 # df['in_reply_to_screen_name'].fillna(-1)

/Users/albarron/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in __delitem__(self, key)
   1341             # there was no match, this call should raise the appropriate
   1342             # exception:
-> 1343             self._data.delete(key)
   1344 
   1345         # delete from the caches

/Users/albarron/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in delete(self, item)
   3157         Delete selected item (items if non-unique) in-place.
   3158         """
-> 3159         indexer = self.items.get_loc(item)
   3160 
   3161         is_deleted = np.zeros(self.shape[0], dtype=np.bool_)

/Users/albarron/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_loc(self, key, method, tolerance)
   1690                 raise ValueError('tolerance argument only valid if using pad, '
   1691                                  'backfill or nearest lookups')
-> 1692             return self._engine.get_loc(_values_from_object(key))
   1693 
   1694         indexer = self.get_indexer([key], method=method,

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3979)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3843)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12265)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12216)()

KeyError: 'shutdown'

In [ ]:
#Potentially remove, just cleaning for analysis sake
del df['Unnamed: 0']
del df['truncated']
del df['user_mentions']
del df['urls']
del df['source']
del df['lang']
del df['place']
del df['favorited']
del df['media']
del df['user']

# More likely to remove
del df['in_reply_to_status_id']
del df['in_reply_to_user_id']
del df['retweeted']
del df['retweeted_status']
len(df)

In [ ]:
df.plot(x='created_at', y='day_of_week', kind='hist')
# fdf = df[["created_at","id","text","hashtags"]]
# str(fdf

Let's start getting some more detailed data from the trips as well


In [ ]:
# df['favorite_count'] = df['favorite_count'].astype(np.int64)
# df['retweet_count'] = df['retweet_count'].astype(np.int64)
# df['text'] = df['text'].astype(str)
# df['id'] = df['id'].astype(np.int64)
# df['day_of_week'] = df['day_of_week'].astype(np.int64)
# df['day_of_month'] = df['day_of_month'].astype(np.int64)
# df['month'] = df['month'].astype(np.int64)
# df['time_of_day'] = df['time_of_day'].astype(np.int64)
df.loc[df["hashtags"]=='on',"hashtags"] = np.nan
df.convert_objects(convert_numeric=True)
df.dtypes
len(df)

In [ ]:
# Pull out potential trains from both hashtags and text
df["topic_train"] = df["text"].apply(lambda x: check_train_id(x))
df["topic_train"] = df["topic_train"].apply(lambda x: str(x)[1:-1])
df["topic_train"].fillna(np.nan)
df.head(5)

In [ ]:
len(df)

In [ ]:
# pd.pivot_table(
#   df,values='values',
#   index=['month'],
#   columns=['day_of_week'])

First, a word about the below code. In the accompanying func.py there is a function called parse_train that returns a pandas.Series object. For some reason, when it's returned from a map or apply, it seems to get cast as a string. When applied to a list or a dataframe, this string gets turned into a single field in the row, OR divided into several rows, throwing the count off.

To get around this, I return the results of the parse_train function and then CAST it back to a series. This adds a weird 0 index, which I delete. I then fill in the plethora of NaNs and recombine it with the primary dataframe.

For context, previous iterations included df['topic_train'].apply(lambda x:parse_train(x)) which would return a pd.Series object with str versions of the returned pd.Series from parse_train


In [ ]:
ret = []

def parse_train(t):
# x should be a list with train codes eg 123
# {"id": "123", "type:" "bullet", direction: "south"}

    try:
        s = t['topic_train'].split(',')
    except:
        return t['topic_train']
    if s[0] == '':
#         print ""
        return np.nan
    for x in s:
#         print "Iter",x[1:-1]
        q = {}
        # Check train id
#         x = parse_train_id(x)
        x = str(x)
        x = re.sub('[^0-9]','', x)
        if len(x)<3: continue

        # 1 = north, 0 = south
        q["t_northbound"] = 1 if int(x[2]) in [1,3,5,7,9] else 0
        q['t_limited'] = 0
        q['t_bullet'] = 0
        
        if x[0] == '1':
            q['t_limited'] = 0
        elif x[0] == '2':
            q["t_limited"] = 1 # limited
        elif x[0] == '3':
            q["t_bullet"] = 1 # bullet
        else:
            q['t_limited'] = 0

        ret.append({'tweet_id': t['id'],
                    'timestamp': t['created_at'], 
                    'train_id': int(x),
                    't_northbound':q["t_northbound"], 
                    't_limited': q["t_limited"],
                    't_bullet': q['t_bullet']})
    return s

In [ ]:
# Let's then filter those train topics into details
# Btw this is jank as fuck.

# red = df[['id','created_at','topic_train']]
red = df.apply(lambda x:parse_train(x),axis=1)
print "red return:",len(red)
print "ret return,",len(ret)
#red
tf = pd.DataFrame(ret)
tf.head(5)

#events = pd.DataFrame([pd.Series(x) for x in red.apply(parse_train)])
#events
#del new.iloc[0]
#new.fillna('')
#df.combine_first(new)

In [ ]:
print df.loc[df['topic_train'] != '',['topic_train','text']]

In [ ]:
len(tf)

In [ ]:


In [ ]:
len(tf)

In [ ]:
df = df.merge(tf, left_on='id',right_on='tweet_id',how='right')

In [ ]:
df.groupby(['time_of_day','month']).mean()

In [ ]:
list(df.columns.values)

In [ ]:
df.plot(x='time_of_day',y='day_of_week',kind='hist')

In [ ]:
pd.scatter_matrix(df,alpha=0.1,figsize=(15,15), diagonal='hist');

In [ ]:
df.groupby('month').describe()

In [ ]:
df[df['train_id'] > 0].groupby('day_of_week').count()

In [ ]:
df[df['train_id'] > 0].groupby('month').count()

In [ ]:
df[df['train_id'] > 0].groupby('time_of_day').count()

In [ ]:
df.corr()

In [ ]: