In [ ]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import re
import random
import operator
from func import *
# inline plot
%matplotlib inline
#%%javascript
#IPython.OutputArea.auto_scroll_threshold = 9999;
In [ ]:
#%load 'data/raw-twt2016-01-26-14/21/09.csv'
df = pd.read_csv("data/raw-twt2016-01-26-14-21-09.csv",sep='\t',error_bad_lines=False)
# df.head(5)
print len(df.index)
list(df.columns.values)
Cleanin' the data
In [ ]:
# Fill in blank hashtags
df = df.where((pd.notnull(df)), np.nan)
df["hashtags"].fillna('')
# Add some date/time things
df["created_at"] = pd.to_datetime(df["created_at"], errors='coerce')
df["day_of_week"] = df["created_at"].apply(lambda x: x.weekday())
df["day_of_month"] = df["created_at"].apply(lambda x: x.day)
df["month"] = df["created_at"].apply(lambda x: x.month)
df["isRushHour"] = df["created_at"].apply(lambda x: get_time_of_day(x))
# del tod_Dummy['shutdown']
# df['in_reply_to_screen_name'].fillna(-1)
# df['in_reply_to_status_id'].fillna(-1)
# df['in_reply_to_user_id'].fillna(-1)
# df['retweeted_status'].fillna(-1)
# df['retweeted'].fillna(-1)
df['retweet_count'].fillna(np.nan)
df['favorite_count'].fillna(np.nan)
df["hashtags"].fillna(np.nan)
df["hashtags"] = df["hashtags"].apply(lambda x: str(x)[1:-1])
df.loc[df["hashtags"]=='a',"hashtags"] = ''
#list(df.columns.values)
In [ ]:
#Potentially remove, just cleaning for analysis sake
del df['Unnamed: 0']
# del df['truncated']
del df['user_mentions']
del df['urls']
del df['source']
del df['lang']
del df['place']
del df['favorited']
del df['media']
del df['user']
# More likely to remove
del df['in_reply_to_status_id']
del df['in_reply_to_user_id']
del df['retweeted']
del df['retweeted_status']
len(df)
Let's start getting some more detailed data from the trips as well
In [ ]:
# df['favorite_count'] = df['favorite_count'].astype(np.int64)
# df['retweet_count'] = df['retweet_count'].astype(np.int64)
# df['text'] = df['text'].astype(str)
# df['id'] = df['id'].astype(np.int64)
# df['day_of_week'] = df['day_of_week'].astype(np.int64)
# df['day_of_month'] = df['day_of_month'].astype(np.int64)
# df['month'] = df['month'].astype(np.int64)
# df['time_of_day'] = df['time_of_day'].astype(np.int64)
df.loc[df["hashtags"]=='on',"hashtags"] = np.nan
df.convert_objects(convert_numeric=True)
df.dtypes
len(df)
In [ ]:
# Pull out potential trains from both hashtags and text
df["topic_train"] = df["text"].apply(lambda x: check_train_id(x))
df["topic_train"] = df["topic_train"].apply(lambda x: str(x)[1:-1])
df["topic_train"].fillna(np.nan)
df.head(2)
First, a word about the below code.
In the accompanying func.py
there is a function called parse_train that returns a pandas.Series
object. For some reason, when it's returned from a map or apply, it seems to get cast as a string. When applied to a list or a dataframe, this string gets turned into a single field in the row, OR divided into several rows, throwing the count off.
To get around this, I return the results of the parse_train function and then CAST it back to a series. This adds a weird 0 index, which I delete. I then fill in the plethora of NaNs and recombine it with the primary dataframe.
For context, previous iterations included
df['topic_train'].apply(lambda x:parse_train(x))
which would return a pd.Series object with str
versions of the returned pd.Series from parse_train
In [ ]:
ret = []
def parse_train(t):
# Revised this function to work with categorical variables
# x should be a list with train codes eg 123
# {"id": "123", "type:" "bullet", direction: "south"}
try:
s = t['topic_train'].split(',')
except:
return t['topic_train']
if s[0] == '':
return np.nan
for x in s:
q = {}
x = str(x)
x = re.sub('[^0-9]','', x)
if len(x)<3: continue
# 1 = north, 0 = south
q["t_northbound"] = 1 if int(x[2]) in [1,3,5,7,9] else 0
q['t_limited'] = 0
q['t_bullet'] = 0
if x[0] == '1':
q['t_limited'] = 0
elif x[0] == '2':
q["t_limited"] = 1 # limited
elif x[0] == '3':
q["t_bullet"] = 1 # bullet
else:
q['t_limited'] = 0
ret.append({'tweet_id': t['id'],
'timestamp': t['created_at'],
'train_id': int(x),
't_northbound':q["t_northbound"],
't_limited': q["t_limited"],
't_bullet': q['t_bullet']})
return s
In [ ]:
# Let's then filter those train topics into details
# Btw this is jank as fuck.
# red = df[['id','created_at','topic_train']]
red = df.apply(lambda x:parse_train(x),axis=1)
print "red return:",len(red)
print "ret return,",len(ret)
#red
tf = pd.DataFrame(ret)
tf.head(5)
#events = pd.DataFrame([pd.Series(x) for x in red.apply(parse_train)])
#events
#del new.iloc[0]
#new.fillna('')
#df.combine_first(new)
In [ ]:
print df.loc[df['topic_train'] != '',['topic_train','text']]
In [ ]:
len(tf)
In [ ]:
# Merge on tweet ID,
df = df.merge(tf, left_on='id',right_on='tweet_id',how='right')
In [ ]:
# Okay, let's try and get a yae or nae to delay and see our hit rate.
In [ ]:
# Only events that have train mentioned
trains = df[df['train_id'] > 0]
In [ ]:
# d
filename = "./data/formated_twts.csv"
df.to_csv(filename, sep=',', encoding='utf-8')
The next step: manually clean the data.
This is what the caltrain is for.
The fields:
is_delay
: 1 = true, 0 false (ignore 0)delay_minor
: >10min = 1 else 0delay_med
: <10m & >20m = 1 else 0delay_major
: <20m & >40m = 1 else 0delay_catastrophic
:<40m = 1 else 0is_backlog
: multiple trains IN SAME DIRECTION delayed or passing else 0is_canceled
: Train Canceled else 0is_passing
: Train indicates passing/rescheduling else 0NOTE: is_delay
with no indication of magnitude means UNKNOWN but reported
In [ ]: