Grab as many tweets as the API allows from the @caltrain_news Twitter feed, export them to a raw CSV, and write a formatted CSV for import into another notebook.
In [107]:
    
import sys
import time
import datetime
# Needed so IPython can find the twitter package
sys.path.append('/Library/Python/2.7/site-packages/')
import twitter
import pandas as pd
import func
# inline plot
%matplotlib inline
    
In [101]:
    
keys = pd.read_csv('keys.csv')  # credentials file, kept out of the GitHub repo
api = twitter.Api(consumer_key=keys.iloc[0].string,
                  consumer_secret=keys.iloc[1].string,
                  access_token_key=keys.iloc[2].string,
                  access_token_secret=keys.iloc[3].string)
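The layout of keys.csv is implied by the iloc/.string indexing above but never shown. A hypothetical layout, assuming a single column named string with the four credentials in order:

    string
    <consumer key>
    <consumer secret>
    <access token key>
    <access token secret>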
    
In [99]:
    
api.VerifyCredentials()
    
    Out[99]:
In [23]:
    
api.GetUserTimeline?
    
In [48]:
    
twt = []
for i in range(3200 // 200):  # the API caps user timelines at 3,200 tweets, 200 per call
    if i == 0:
        # First call: start from the most recent tweet
        twt.append(api.GetUserTimeline(screen_name='caltrain_news', count=200))
    else:
        # Subsequent calls: page backwards from the oldest tweet seen so far.
        # max_id is inclusive, so subtract 1 to avoid refetching that tweet.
        next_id = twt[-1][-1].id
        twt.append(api.GetUserTimeline(screen_name='caltrain_news', count=200, max_id=next_id - 1))
    time.sleep(.5)
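Paging backwards with max_id, rather than page offsets, is the cursoring approach Twitter's timeline docs recommend: new tweets arriving mid-crawl shift page boundaries, but id-based cursors stay stable.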
    
In [49]:
    
# A bit clumsy: flatten the batches of up to 200 Status objects
# into a single list, converting each to a dict on the fly,
# then build the DataFrame in one shot
df = pd.DataFrame([t.AsDict() for call in twt for t in call])
    
In [50]:
    
df.head(3)
    
    Out[50]:
In [51]:
    
df.describe()
    
    Out[51]:
In [52]:
    
# Timestamped filename; no colons, which are not portable across filesystems
filename = "./data/raw-twt{date}.csv".format(date=datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S"))
# Tab-separated so commas inside tweet text don't break the file
df.to_csv(filename, sep='\t', encoding='utf-8')
    
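The formatted export promised at the top never made it into this notebook. Below is a minimal sketch of what it could look like, assuming the downstream notebook only needs parsed timestamps and tweet text; the created_at and text columns come from python-twitter's AsDict() output, but the column selection and the fmt-twt filename are assumptions, not the original code.

In [ ]:
    
# Hypothetical formatted export -- column choice and filename are assumptions
fmt = df[['created_at', 'text']].copy()
# Parse Twitter's timestamp strings into real datetimes for the next notebook
fmt['created_at'] = pd.to_datetime(fmt['created_at'])
fmt = fmt.sort_values('created_at').reset_index(drop=True)
fmt_file = "./data/fmt-twt{date}.csv".format(date=datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S"))
fmt.to_csv(fmt_file, sep='\t', encoding='utf-8', index=False)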