Dictionary

train_direction = 0 south, 1 north traintype = 0 Local, 1 Limited, 2 Bullet train


In [ ]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import re
import random
import operator

from func import *

# inline plot
%matplotlib inline
#%%javascript
#IPython.OutputArea.auto_scroll_threshold = 9999;

In [ ]:
#%load 'data/raw-twt2016-01-26-14/21/09.csv'
df = pd.read_csv("data/raw-twt2016-01-26-14-21-09.csv",sep='\t',error_bad_lines=False)
# df.head(5)
print len(df.index)
list(df.columns.values)

Cleanin' the data


In [ ]:
# Fill in blank hashtags
df = df.where((pd.notnull(df)), np.nan)
df["hashtags"].fillna('')

# Add some date/time things
df["created_at"] = pd.to_datetime(df["created_at"], errors='coerce')

df["day_of_week"] = df["created_at"].apply(lambda x: x.weekday())
df["day_of_month"] = df["created_at"].apply(lambda x: x.day)
df["month"] = df["created_at"].apply(lambda x: x.month)
df["isRushHour"] = df["created_at"].apply(lambda x: get_time_of_day(x))

# del tod_Dummy['shutdown']

# df['in_reply_to_screen_name'].fillna(-1)
# df['in_reply_to_status_id'].fillna(-1)
# df['in_reply_to_user_id'].fillna(-1)
# df['retweeted_status'].fillna(-1)
# df['retweeted'].fillna(-1)
df['retweet_count'].fillna(np.nan)
df['favorite_count'].fillna(np.nan)
df["hashtags"].fillna(np.nan)
df["hashtags"] = df["hashtags"].apply(lambda x: str(x)[1:-1])
df.loc[df["hashtags"]=='a',"hashtags"] = ''
#list(df.columns.values)

In [ ]:
#Potentially remove, just cleaning for analysis sake
del df['Unnamed: 0']
# del df['truncated']
del df['user_mentions']
del df['urls']
del df['source']
del df['lang']
del df['place']
del df['favorited']
del df['media']
del df['user']

# More likely to remove
del df['in_reply_to_status_id']
del df['in_reply_to_user_id']
del df['retweeted']
del df['retweeted_status']
len(df)

Let's start getting some more detailed data from the trips as well


In [ ]:
# df['favorite_count'] = df['favorite_count'].astype(np.int64)
# df['retweet_count'] = df['retweet_count'].astype(np.int64)
# df['text'] = df['text'].astype(str)
# df['id'] = df['id'].astype(np.int64)
# df['day_of_week'] = df['day_of_week'].astype(np.int64)
# df['day_of_month'] = df['day_of_month'].astype(np.int64)
# df['month'] = df['month'].astype(np.int64)
# df['time_of_day'] = df['time_of_day'].astype(np.int64)
df.loc[df["hashtags"]=='on',"hashtags"] = np.nan
df.convert_objects(convert_numeric=True)
df.dtypes
len(df)

In [ ]:
# Pull out potential trains from both hashtags and text
df["topic_train"] = df["text"].apply(lambda x: check_train_id(x))
df["topic_train"] = df["topic_train"].apply(lambda x: str(x)[1:-1])
df["topic_train"].fillna(np.nan)
df.head(2)

First, a word about the below code. In the accompanying func.py there is a function called parse_train that returns a pandas.Series object. For some reason, when it's returned from a map or apply, it seems to get cast as a string. When applied to a list or a dataframe, this string gets turned into a single field in the row, OR divided into several rows, throwing the count off.

To get around this, I return the results of the parse_train function and then CAST it back to a series. This adds a weird 0 index, which I delete. I then fill in the plethora of NaNs and recombine it with the primary dataframe.

For context, previous iterations included df['topic_train'].apply(lambda x:parse_train(x)) which would return a pd.Series object with str versions of the returned pd.Series from parse_train


In [ ]:
ret = []

def parse_train(t):
# Revised this function to work with categorical variables
# x should be a list with train codes eg 123
# {"id": "123", "type:" "bullet", direction: "south"}

    try:
        s = t['topic_train'].split(',')
    except:
        return t['topic_train']
    if s[0] == '':
        return np.nan
    for x in s:
        q = {}
        x = str(x)
        x = re.sub('[^0-9]','', x)
        if len(x)<3: continue

        # 1 = north, 0 = south
        q["t_northbound"] = 1 if int(x[2]) in [1,3,5,7,9] else 0
        q['t_limited'] = 0
        q['t_bullet'] = 0
        
        if x[0] == '1':
            q['t_limited'] = 0
        elif x[0] == '2':
            q["t_limited"] = 1 # limited
        elif x[0] == '3':
            q["t_bullet"] = 1 # bullet
        else:
            q['t_limited'] = 0

        ret.append({'tweet_id': t['id'],
                    'timestamp': t['created_at'], 
                    'train_id': int(x),
                    't_northbound':q["t_northbound"], 
                    't_limited': q["t_limited"],
                    't_bullet': q['t_bullet']})
    return s

In [ ]:
# Let's then filter those train topics into details
# Btw this is jank as fuck.

# red = df[['id','created_at','topic_train']]
red = df.apply(lambda x:parse_train(x),axis=1)
print "red return:",len(red)
print "ret return,",len(ret)
#red
tf = pd.DataFrame(ret)
tf.head(5)

#events = pd.DataFrame([pd.Series(x) for x in red.apply(parse_train)])
#events
#del new.iloc[0]
#new.fillna('')
#df.combine_first(new)

In [ ]:
print df.loc[df['topic_train'] != '',['topic_train','text']]

In [ ]:
len(tf)

In [ ]:
# Merge on tweet ID,
df = df.merge(tf, left_on='id',right_on='tweet_id',how='right')

In [ ]:
# Okay, let's try and get a yae or nae to delay and see our hit rate.

In [ ]:
# Only events that have train mentioned
trains = df[df['train_id'] > 0]

In [ ]:
# d
filename = "./data/formated_twts.csv"
df.to_csv(filename, sep=',', encoding='utf-8')

The next step: manually clean the data.

This is what the caltrain is for.

The fields:

  • is_delay: 1 = true, 0 false (ignore 0)
  • delay_minor: >10min = 1 else 0
  • delay_med: <10m & >20m = 1 else 0
  • delay_major: <20m & >40m = 1 else 0
  • delay_catastrophic:<40m = 1 else 0
  • is_backlog: multiple trains IN SAME DIRECTION delayed or passing else 0
  • is_canceled: Train Canceled else 0
  • is_passing: Train indicates passing/rescheduling else 0

NOTE: is_delay with no indication of magnitude means UNKNOWN but reported


In [ ]: