In [1]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import re
import random
import operator
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from func import *
# inline plot
%matplotlib inline
#%%javascript
#IPython.OutputArea.auto_scroll_threshold = 9999;
In [2]:
#%load data/raw-twt2016-01-26-14-21-09.csv
df = pd.read_csv("data/raw-twt2016-01-26-14-21-09.csv",sep='\t',error_bad_lines=False)
# df.head(5)
print(len(df.index))
list(df.columns.values)
Out[2]:
Cleanin' the data
In [3]:
# Normalize missing values and fill in blank hashtags
df = df.where(pd.notnull(df), np.nan)
df["hashtags"] = df["hashtags"].fillna('')
# Add some date/time things
df["created_at"] = pd.to_datetime(df["created_at"], errors='coerce')
df["day_of_week"] = df["created_at"].apply(lambda x: x.weekday())
df["day_of_month"] = df["created_at"].apply(lambda x: x.day)
df["month"] = df["created_at"].apply(lambda x: x.month)
df["time_of_day"] = df["created_at"].apply(lambda x: get_time_of_day(x))
tod_Dummy = pd.get_dummies(df['time_of_day'])
print(tod_Dummy.head(5))
# del tod_Dummy['shutdown']
# df['in_reply_to_screen_name'].fillna(-1)
# df['in_reply_to_status_id'].fillna(-1)
# df['in_reply_to_user_id'].fillna(-1)
# df['retweeted_status'].fillna(-1)
# df['retweeted'].fillna(-1)
df["hashtags"] = df["hashtags"].apply(lambda x: str(x)[1:-1])  # strip the list brackets
df.loc[df["hashtags"] == 'a', "hashtags"] = ''  # str(nan)[1:-1] == 'a'; clear any stragglers
list(df.columns.values)
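The get_time_of_day helper used above lives in the accompanying func.py and isn't shown in this notebook. Here's a minimal sketch of what such a helper might look like, assuming it bins the tweet's hour into commute-oriented buckets; the bucket names and hour boundaries are guesses, though the commented-out del tod_Dummy['shutdown'] above suggests the real version has a 'shutdown' bucket for the overnight hours when no trains run.
In [ ]:
# Hypothetical stand-in for func.get_time_of_day -- the real bucket
# names and hour boundaries may differ.
def get_time_of_day(ts):
    if pd.isnull(ts):
        return np.nan
    h = ts.hour
    if 5 <= h < 10:
        return 'morning_commute'
    elif 10 <= h < 16:
        return 'midday'
    elif 16 <= h < 20:
        return 'evening_commute'
    elif 20 <= h < 24:
        return 'evening'
    else:
        return 'shutdown'  # roughly 12am-5am, when no trains run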
In [ ]:
#Potentially remove, just cleaning for analysis sake
del df['Unnamed: 0']
del df['truncated']
del df['user_mentions']
del df['urls']
del df['source']
del df['lang']
del df['place']
del df['favorited']
del df['media']
del df['user']
# More likely to remove
del df['in_reply_to_status_id']
del df['in_reply_to_user_id']
del df['retweeted']
del df['retweeted_status']
len(df)
In [ ]:
df.plot(x='created_at', y='day_of_week', kind='hist')
# fdf = df[["created_at","id","text","hashtags"]]
# str(fdf
Let's start extracting some more detailed data about the trains as well
In [ ]:
# df['favorite_count'] = df['favorite_count'].astype(np.int64)
# df['retweet_count'] = df['retweet_count'].astype(np.int64)
# df['text'] = df['text'].astype(str)
# df['id'] = df['id'].astype(np.int64)
# df['day_of_week'] = df['day_of_week'].astype(np.int64)
# df['day_of_month'] = df['day_of_month'].astype(np.int64)
# df['month'] = df['month'].astype(np.int64)
# df['time_of_day'] = df['time_of_day'].astype(np.int64)
df.loc[df["hashtags"]=='on',"hashtags"] = np.nan
df.convert_objects(convert_numeric=True)
df.dtypes
len(df)
In [ ]:
# Pull out potential trains from both hashtags and text
df["topic_train"] = df["text"].apply(lambda x: check_train_id(x))
df["topic_train"] = df["topic_train"].apply(lambda x: str(x)[1:-1])  # strip the list brackets
df.head(5)
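check_train_id is also defined in func.py. A plausible sketch, assuming it scans a tweet for three-digit Caltrain train numbers and returns them as a list (which is why the str(x)[1:-1] above strips the list's brackets):
In [ ]:
# Hypothetical stand-in for func.check_train_id -- assumes train ids are
# three-digit numbers (1xx local, 2xx limited, 3xx bullet) in the text.
def check_train_id(text):
    return re.findall(r'\b[1-3][0-9]{2}\b', str(text))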
In [ ]:
len(df)
In [ ]:
# pd.pivot_table(
# df,values='values',
# index=['month'],
# columns=['day_of_week'])
First, a word about the code below. The accompanying func.py defines a function called parse_train that returns a pandas.Series object. For some reason, when that Series comes back from a map or apply, it seems to get cast to a string; applied to a list or a DataFrame, the string either collapses into a single field in the row or gets split across several rows, throwing the count off.
To get around this, I take the results of the parse_train function and cast them back to a Series. That adds a spurious 0 index, which I delete; I then fill in the plethora of NaNs and recombine the result with the primary DataFrame. For context, previous iterations used df['topic_train'].apply(lambda x: parse_train(x)), which returned a pd.Series of str versions of the Series that parse_train produced.
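Here is a minimal, self-contained illustration of that workaround, with a toy parser standing in for the real one (every name in this cell is hypothetical):
In [ ]:
# Toy version of the recombination step; the real parse_train is defined
# in the next cell and operates on the full df.
toy = pd.DataFrame({'id': [1, 2], 'topic_train': ["'123','456'", "'321'"]})

def toy_parse(x):
    # Stand-in parser: split the comma-separated codes, keep digits only
    return [re.sub('[^0-9]', '', c) for c in x.split(',')]

# Cast each result back to a Series so the codes land in separate columns
parsed = pd.DataFrame([pd.Series(toy_parse(x)) for x in toy['topic_train']])
parsed.columns = ['train_%d' % c for c in parsed.columns]
parsed = parsed.fillna('')   # fill in the plethora of NaNs
toy.join(parsed)             # recombine with the primary frame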
In [ ]:
ret = []

def parse_train(t):
    # t is a dataframe row; t['topic_train'] holds comma-separated train
    # codes, e.g. "123". Each parsed code becomes a dict like
    # {"id": "123", "type": "bullet", "direction": "south"}
    try:
        s = t['topic_train'].split(',')
    except AttributeError:
        # topic_train was NaN (a float), not a string
        return t['topic_train']
    if s[0] == '':
        return np.nan
    for x in s:
        q = {}
        # Keep only the digits of the train id
        x = re.sub('[^0-9]', '', str(x))
        if len(x) < 3:
            continue
        # Odd third digit = northbound, even = southbound
        q["t_northbound"] = 1 if int(x[2]) in [1, 3, 5, 7, 9] else 0
        # First digit encodes service type: 2xx = limited, 3xx = bullet
        q["t_limited"] = 1 if x[0] == '2' else 0
        q["t_bullet"] = 1 if x[0] == '3' else 0
        # Accumulate one row per train mention in the module-level list
        ret.append({'tweet_id': t['id'],
                    'timestamp': t['created_at'],
                    'train_id': int(x),
                    't_northbound': q["t_northbound"],
                    't_limited': q["t_limited"],
                    't_bullet': q['t_bullet']})
    return s
In [ ]:
# Let's then filter those train topics into details
# Btw this is jank as fuck.
# red = df[['id','created_at','topic_train']]
red = df.apply(lambda x: parse_train(x), axis=1)
print("red return:", len(red))
print("ret return:", len(ret))
#red
tf = pd.DataFrame(ret)
tf.head(5)
#events = pd.DataFrame([pd.Series(x) for x in red.apply(parse_train)])
#events
#del new.iloc[0]
#new.fillna('')
#df.combine_first(new)
In [ ]:
print(df.loc[df['topic_train'] != '', ['topic_train','text']])
In [ ]:
len(tf)
In [ ]:
# Attach the per-train rows back to their source tweets; the right join
# keeps only tweets that mention at least one train
df = df.merge(tf, left_on='id', right_on='tweet_id', how='right')
In [ ]:
df.groupby(['time_of_day','month']).mean()
In [ ]:
list(df.columns.values)
In [ ]:
df.plot(x='time_of_day',y='day_of_week',kind='hist')
In [ ]:
pd.scatter_matrix(df,alpha=0.1,figsize=(15,15), diagonal='hist');
In [ ]:
df.groupby('month').describe()
In [ ]:
df[df['train_id'] > 0].groupby('day_of_week').count()
In [ ]:
df[df['train_id'] > 0].groupby('month').count()
In [ ]:
df[df['train_id'] > 0].groupby('time_of_day').count()
In [ ]:
df.corr()
In [ ]: