notebook.community

Edit and run

Grab as many tweets as we can from the @caltrain_news Twitter feed, export them to a raw CSV, and to a formatted CSV for import into another notebook.



In [107]:

    
import sys
import re
import time
import datetime
# Required for IPython to pick up the twitter package?
sys.path.append('/Library/Python/2.7/site-packages/')
import twitter
import pandas as pd
import func

# inline plot
%matplotlib inline



In [101]:

    
# Read the OAuth credentials from a local CSV that is kept out of version control.
keys = pd.read_csv('keys.csv')

# Rows 0-3 of the `string` column hold, in this order, the values for the
# twitter.Api keyword arguments named below.
credential_args = ('consumer_key', 'consumer_secret',
                   'access_token_key', 'access_token_secret')
api = twitter.Api(**{arg: keys.iloc[row].string
                     for row, arg in enumerate(credential_args)})



In [99]:

    
# Check the credentials before making any timeline requests; on this run the
# call returned a twitter.user.User object.
api.VerifyCredentials()









    Out[99]:





<twitter.user.User at 0x1180f6850>



In [23]:

    
api.GetUserTimeline?



In [48]:

    
# Page backwards through the @caltrain_news timeline, newest tweets first.
# `twt` ends up as a list of batches, each batch being the list of statuses
# returned by one GetUserTimeline call (up to 200 per call).
twt = []
for i in range(3200 // 200):  # 3200-tweet cap / 200 per call, both defined by api
    if i == 0:
        # If the first call, just start from most recent
        batch = api.GetUserTimeline(screen_name='caltrain_news', count=200)
    else:
        # max_id is inclusive, so request ids strictly below the oldest tweet
        # fetched so far; otherwise that tweet comes back again as a duplicate.
        # Read the id from the status object instead of regex-scraping its
        # string form, which silently depended on ids being exactly 18 digits.
        next_id = twt[-1][-1].id - 1
        batch = api.GetUserTimeline(screen_name='caltrain_news', count=200, max_id=next_id)
    if not batch:
        # Nothing older is available; stop rather than index into an empty
        # batch on the next iteration.
        break
    twt.append(batch)
    time.sleep(.5)  # brief pause between consecutive API calls



In [11]:









    Out[11]:





200



In [49]:

    
# Every API call produced its own batch of up to 200 tweets. Walk the
# batches in order, convert each status to a dict, and build the frame
# from the resulting flat list of records.
records = []
for call in twt:
    for t in call:
        records.append(t.AsDict())
df = pd.DataFrame(records)



In [50]:

    
# Peek at the first three rows to see which columns AsDict() produced.
df.head(3)









    Out[50]:






  
    
      
      created_at
      favorite_count
      favorited
      hashtags
      id
      in_reply_to_screen_name
      in_reply_to_status_id
      in_reply_to_user_id
      lang
      media
      place
      retweet_count
      retweeted
      retweeted_status
      source
      text
      truncated
      urls
      user
      user_mentions
    
  
  
    
      0
      Tue Jan 26 20:32:15 +0000 2016
      6
      False
      [SanFrancisco]
      692082643022680064
      NaN
      NaN
      NaN
      en
      [{u'expanded_url': u'http://twitter.com/Caltra...
      NaN
      7
      False
      NaN
      <a href="https://about.twitter.com/products/tw...
      NOTICE: Ped &amp; Bike detours in place for Ma...
      False
      {u'https://t.co/hcYGYF5L5S': u'https://www.sfm...
      {u'id': 456808166, u'verified': True, u'profil...
      NaN
    
    
      1
      Tue Jan 26 19:41:32 +0000 2016
      NaN
      False
      NaN
      692069881559134208
      therealwall
      6.920673e+17
      46136761
      en
      NaN
      NaN
      NaN
      False
      NaN
      <a href="https://about.twitter.com/products/tw...
      @therealwall After the end of the concert we w...
      False
      {u'https://t.co/3f9VEAaGTY': u'http://www.calt...
      {u'id': 456808166, u'verified': True, u'profil...
      [{u'screen_name': u'therealwall', u'id': 46136...
    
    
      2
      Tue Jan 26 19:28:52 +0000 2016
      NaN
      False
      [SB50]
      692066695838498816
      AemalTheAFGHAN
      6.920578e+17
      291505788
      en
      NaN
      NaN
      NaN
      False
      NaN
      <a href="https://about.twitter.com/products/tw...
      @AemalTheAFGHAN @BKDenverSports We're glad to ...
      False
      {u'https://t.co/fgMOSXplzZ': u'http://www.calt...
      {u'id': 456808166, u'verified': True, u'profil...
      [{u'screen_name': u'AemalTheAFGHAN', u'id': 29...



In [51]:

    
# Summary statistics (count, mean, std, quartiles, min/max) per column.
df.describe()









    Out[51]:






  
    
      
      favorite_count
      favorited
      id
      in_reply_to_status_id
      in_reply_to_user_id
      retweet_count
      retweeted
      truncated
    
  
  
    
      count
      1509.000000
      3199
      3.199000e+03
      1.270000e+03
      1.300000e+03
      1946.000000
      3199
      3199
    
    
      mean
      3.513585
      0
      6.484877e+17
      6.474381e+17
      6.174353e+08
      18.703494
      0
      0
    
    
      std
      6.693903
      0
      2.243800e+16
      2.323574e+16
      1.026750e+09
      467.130516
      0
      0
    
    
      min
      1.000000
      False
      6.069953e+17
      6.069949e+17
      3.632000e+03
      1.000000
      False
      False
    
    
      25%
      1.000000
      0
      6.308106e+17
      6.265296e+17
      2.001160e+07
      1.000000
      0
      0
    
    
      50%
      2.000000
      0
      6.497296e+17
      6.494642e+17
      1.069600e+08
      3.000000
      0
      0
    
    
      75%
      3.000000
      0
      6.664330e+17
      6.626031e+17
      5.472453e+08
      4.000000
      0
      0
    
    
      max
      116.000000
      False
      6.920826e+17
      6.920673e+17
      4.265436e+09
      18479.000000
      False
      False



In [52]:

    
# Name the raw export after the current local timestamp, then write the
# frame tab-separated and UTF-8 encoded.
# NOTE(review): the ':' characters in the timestamp are not valid in Windows
# filenames, and the file is tab-separated despite the .csv extension.
stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
filename = "./data/raw-twt{date}.csv".format(date=stamp)
df.to_csv(filename, sep='\t', encoding='utf-8')



In [ ]:

	created_at	favorite_count	favorited	hashtags	id	in_reply_to_screen_name	in_reply_to_status_id	in_reply_to_user_id	lang	media	place	retweet_count	retweeted	retweeted_status	source	text	truncated	urls	user	user_mentions
0	Tue Jan 26 20:32:15 +0000 2016	6	False	[SanFrancisco]	692082643022680064	NaN	NaN	NaN	en	[{u'expanded_url': u'http://twitter.com/Caltra...	NaN	7	False	NaN	<a href="https://about.twitter.com/products/tw...	NOTICE: Ped & Bike detours in place for Ma...	False	{u'https://t.co/hcYGYF5L5S': u'https://www.sfm...	{u'id': 456808166, u'verified': True, u'profil...	NaN
1	Tue Jan 26 19:41:32 +0000 2016	NaN	False	NaN	692069881559134208	therealwall	6.920673e+17	46136761	en	NaN	NaN	NaN	False	NaN	<a href="https://about.twitter.com/products/tw...	@therealwall After the end of the concert we w...	False	{u'https://t.co/3f9VEAaGTY': u'http://www.calt...	{u'id': 456808166, u'verified': True, u'profil...	[{u'screen_name': u'therealwall', u'id': 46136...
2	Tue Jan 26 19:28:52 +0000 2016	NaN	False	[SB50]	692066695838498816	AemalTheAFGHAN	6.920578e+17	291505788	en	NaN	NaN	NaN	False	NaN	<a href="https://about.twitter.com/products/tw...	@AemalTheAFGHAN @BKDenverSports We're glad to ...	False	{u'https://t.co/fgMOSXplzZ': u'http://www.calt...	{u'id': 456808166, u'verified': True, u'profil...	[{u'screen_name': u'AemalTheAFGHAN', u'id': 29...

	favorite_count	favorited	id	in_reply_to_status_id	in_reply_to_user_id	retweet_count	retweeted	truncated
count	1509.000000	3199	3.199000e+03	1.270000e+03	1.300000e+03	1946.000000	3199	3199
mean	3.513585	0	6.484877e+17	6.474381e+17	6.174353e+08	18.703494	0	0
std	6.693903	0	2.243800e+16	2.323574e+16	1.026750e+09	467.130516	0	0
min	1.000000	False	6.069953e+17	6.069949e+17	3.632000e+03	1.000000	False	False
25%	1.000000	0	6.308106e+17	6.265296e+17	2.001160e+07	1.000000	0	0
50%	2.000000	0	6.497296e+17	6.494642e+17	1.069600e+08	3.000000	0	0
75%	3.000000	0	6.664330e+17	6.626031e+17	5.472453e+08	4.000000	0	0
max	116.000000	False	6.920826e+17	6.920673e+17	4.265436e+09	18479.000000	False	False