Grab as many tweets as we can from the @caltrain_news Twitter feed, export them to a raw CSV, and to a formatted CSV for import into another notebook
In [107]:
import sys
import re
import time
import datetime
# Requires for ipython to pick up on twitter?
sys.path.append('/Library/Python/2.7/site-packages/')
import twitter
import pandas as pd
import func
# inline plot
%matplotlib inline
In [101]:
# Load the four Twitter API credentials from a local CSV that is kept out
# of version control, and build an authenticated python-twitter client.
# Assumes keys.csv has one credential per row in a column named 'string',
# ordered: consumer key, consumer secret, access token, access token
# secret -- TODO confirm against keys.csv.
keys = pd.read_csv('keys.csv') # hidden from github
api = twitter.Api(consumer_key=keys.iloc[0].string,
consumer_secret=keys.iloc[1].string,
access_token_key=keys.iloc[2].string,
access_token_secret=keys.iloc[3].string)
In [99]:
# Sanity-check the credentials before pulling data; the authenticated
# account's profile is echoed as the cell output (raises if auth failed).
api.VerifyCredentials()
Out[99]:
In [23]:
# IPython introspection: display the GetUserTimeline docstring.
# NOTE(review): dev-only helper -- consider removing from the final notebook.
api.GetUserTimeline?
In [48]:
# Page backwards through the @caltrain_news timeline in batches of 200
# (the per-call maximum), up to the 3,200-tweet history limit the API
# imposes. Each batch is appended to `twt` as a list of Status objects.
twt = []
for i in range(3200 // 200):  # floor division: works on Python 2 and 3
    if i == 0:
        # First call: start from the most recent tweet
        twt.append(api.GetUserTimeline(screen_name='caltrain_news', count=200))
    else:
        # Subsequent calls: continue from just below the oldest tweet seen
        # so far. max_id is INCLUSIVE, so subtract 1 to avoid refetching
        # the previous batch's last tweet as a duplicate.
        oldest_id = twt[-1][-1].id
        twt.append(api.GetUserTimeline(screen_name='caltrain_news',
                                       count=200,
                                       max_id=oldest_id - 1))
    time.sleep(.5)  # be polite: stay clear of the rate limit
In [11]:
Out[11]:
In [49]:
# The API calls above return batches of up to 200 Status objects each;
# flatten the batches into a single list of plain dicts, then build one
# DataFrame from it in a single shot.
tweet_records = []
for batch in twt:
    tweet_records.extend(status.AsDict() for status in batch)
df = pd.DataFrame(tweet_records)
In [50]:
# Eyeball the first few rows of the flattened tweet frame
df.head(3)
Out[50]:
In [51]:
# Summary statistics over the collected tweets
df.describe()
Out[51]:
In [52]:
# Timestamp the export so repeated runs don't clobber earlier dumps.
# Use %H%M%S rather than %H:%M:%S -- ':' is illegal in Windows filenames
# and is rewritten by the macOS Finder, so keep it out of the path.
filename = "./data/raw-twt{date}.csv".format(
    date=datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S"))
# Tab-separated + utf-8 so commas and non-ASCII in tweet text survive
df.to_csv(filename, sep='\t', encoding='utf-8')
In [ ]: