Grab as many tweets as we can from the @caltrain_news Twitter feed, then export them to a raw CSV and to a formatted CSV for import into another notebook.


In [107]:
import sys
import re
import time
import datetime
# Required for IPython to pick up the twitter package?
sys.path.append('/Library/Python/2.7/site-packages/')
import twitter
import pandas as pd
import func

# inline plot
%matplotlib inline

In [101]:
# Twitter credentials live in a local CSV that is kept out of github
keys = pd.read_csv('keys.csv')

# Rows 0-3 of the `string` column are passed, in this order, as the
# keyword arguments named below.
credential_names = ('consumer_key', 'consumer_secret',
                    'access_token_key', 'access_token_secret')
api = twitter.Api(**{name: keys.iloc[row].string
                     for row, name in enumerate(credential_names)})

In [99]:
# Sanity check on the credentials handed to twitter.Api; on success the
# cell displays the returned twitter User object.
api.VerifyCredentials()


Out[99]:
<twitter.user.User at 0x1180f6850>

In [23]:
# IPython help lookup (trailing `?`) for the timeline call used by the
# download loop; a development aid only, it produces no data.
api.GetUserTimeline?

In [48]:
# Page backwards through the @caltrain_news timeline, one batch per API call.
# `twt` ends up as a list of batches (each batch is the list of tweet objects
# returned by a single GetUserTimeline call).
twt = []
for i in range(3200 // 200):  # total tweets / tweets per call, defined by api
    if i == 0:
        # If the first call, just start from most recent
        batch = api.GetUserTimeline(screen_name='caltrain_news', count=200)
    else:
        # max_id is inclusive, so request ids strictly below the oldest tweet
        # fetched so far; passing that id itself would return the tweet again.
        max_id = min(status.id for status in twt[-1]) - 1
        batch = api.GetUserTimeline(screen_name='caltrain_news', count=200,
                                    max_id=max_id)
    if not batch:
        # Nothing older is available; stop instead of indexing an empty batch
        break
    twt.append(batch)
    time.sleep(.5)  # short pause between consecutive API calls

In [11]:



Out[11]:
200

In [49]:
# `twt` holds one list of tweet objects per API call. Walk the batches in
# order, turn every tweet into a plain dict, and collect them in a single
# flat list so pandas can build one row per tweet.
tweet_records = []
for batch in twt:
    tweet_records.extend(status.AsDict() for status in batch)

df = pd.DataFrame(tweet_records)

In [50]:
# Peek at the first three rows to see which columns AsDict() produced
df.head(3)


Out[50]:
created_at favorite_count favorited hashtags id in_reply_to_screen_name in_reply_to_status_id in_reply_to_user_id lang media place retweet_count retweeted retweeted_status source text truncated urls user user_mentions
0 Tue Jan 26 20:32:15 +0000 2016 6 False [SanFrancisco] 692082643022680064 NaN NaN NaN en [{u'expanded_url': u'http://twitter.com/Caltra... NaN 7 False NaN <a href="https://about.twitter.com/products/tw... NOTICE: Ped &amp; Bike detours in place for Ma... False {u'https://t.co/hcYGYF5L5S': u'https://www.sfm... {u'id': 456808166, u'verified': True, u'profil... NaN
1 Tue Jan 26 19:41:32 +0000 2016 NaN False NaN 692069881559134208 therealwall 6.920673e+17 46136761 en NaN NaN NaN False NaN <a href="https://about.twitter.com/products/tw... @therealwall After the end of the concert we w... False {u'https://t.co/3f9VEAaGTY': u'http://www.calt... {u'id': 456808166, u'verified': True, u'profil... [{u'screen_name': u'therealwall', u'id': 46136...
2 Tue Jan 26 19:28:52 +0000 2016 NaN False [SB50] 692066695838498816 AemalTheAFGHAN 6.920578e+17 291505788 en NaN NaN NaN False NaN <a href="https://about.twitter.com/products/tw... @AemalTheAFGHAN @BKDenverSports We're glad to ... False {u'https://t.co/fgMOSXplzZ': u'http://www.calt... {u'id': 456808166, u'verified': True, u'profil... [{u'screen_name': u'AemalTheAFGHAN', u'id': 29...

In [51]:
# Summary statistics per column; the `count` row shows how many tweets
# actually carry a value for each field.
df.describe()


Out[51]:
favorite_count favorited id in_reply_to_status_id in_reply_to_user_id retweet_count retweeted truncated
count 1509.000000 3199 3.199000e+03 1.270000e+03 1.300000e+03 1946.000000 3199 3199
mean 3.513585 0 6.484877e+17 6.474381e+17 6.174353e+08 18.703494 0 0
std 6.693903 0 2.243800e+16 2.323574e+16 1.026750e+09 467.130516 0 0
min 1.000000 False 6.069953e+17 6.069949e+17 3.632000e+03 1.000000 False False
25% 1.000000 0 6.308106e+17 6.265296e+17 2.001160e+07 1.000000 0 0
50% 2.000000 0 6.497296e+17 6.494642e+17 1.069600e+08 3.000000 0 0
75% 3.000000 0 6.664330e+17 6.626031e+17 5.472453e+08 4.000000 0 0
max 116.000000 False 6.920826e+17 6.920673e+17 4.265436e+09 18479.000000 False False

In [52]:
# Stamp the export with the current local time so each run writes its own file.
# NOTE(review): the stamp contains ':' characters, which are not valid in
# Windows filenames -- confirm this only ever runs on macOS/Linux.
export_stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
filename = "./data/raw-twt{date}.csv".format(date=export_stamp)

# The file is tab-separated despite its .csv suffix, and the DataFrame index
# is written as the first column (to_csv default).
df.to_csv(filename, sep='\t', encoding='utf-8')

In [ ]: