notebook.community

Edit and run



In [2]:

    
import twitter

def oauth_login():
    # XXX: Go to http://twitter.com/apps/new to create an app and get values
    # for these credentials that you'll need to provide in place of these
    # empty string values that are defined as placeholders.
    # See https://dev.twitter.com/docs/auth/oauth for more information 
    # on Twitter's OAuth implementation.
    
    CONSUMER_KEY = 'mEnkTsYmvFGgzjV73SPOz084K'
    CONSUMER_SECRET = 'YnQxTyFhTCG5KSGBeRq1qeVwUkxOhZ99amm6uauy8ett51UE3t'
    OAUTH_TOKEN = '301689344-MG8rknSLPC8dUXAjWE6Eo4DQTeS4JJGjNuTJ6i41'
    OAUTH_TOKEN_SECRET = 'vcjYSSekdT0O8qwMVhh9e6flVC1LaP5OlssIsU4nGWewh'
    
    
    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)
    
    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api

# Sample usage
twitter_api = oauth_login()    

# Nothing to see by displaying twitter_api except that it's now a
# defined variable

print twitter_api









    



<twitter.api.Twitter object at 0x7f96a2fe2710>



In [5]:

    
def analyze_tweet_content(statuses):
    
    if len(statuses) == 0:
        print "No statuses to analyze"
        return
    
    # A nested helper function for computing lexical diversity
    def lexical_diversity(tokens):
        return 1.0*len(set(tokens))/len(tokens) 
    
    # A nested helper function for computing the average number of words per tweet
    def average_words(statuses):
        total_words = sum([ len(s.split()) for s in statuses ]) 
        return 1.0*total_words/len(statuses)

    status_texts = [ status['text'] for status in statuses ]
    screen_names, hashtags, urls, media, _ = extract_tweet_entities(statuses)
    
    # Compute a collection of all words from all tweets
    words = [ w 
          for t in status_texts 
              for w in t.split() ]
    
    print "Lexical diversity (words):", lexical_diversity(words)
    print "Lexical diversity (screen names):", lexical_diversity(screen_names)
    print "Lexical diversity (hashtags):", lexical_diversity(hashtags)
    print "Averge words per tweet:", average_words(status_texts)

    
# Sample usage

q = 'CrossFit'
twitter_api = oauth_login()
search_results = twitter_search(twitter_api, q)

analyze_tweet_content(search_results)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-af78bac1e789> in <module>()
     34 search_results = twitter_search(twitter_api, q)
     35 
---> 36 analyze_tweet_content(search_results)

<ipython-input-5-af78bac1e789> in analyze_tweet_content(statuses)
     15 
     16     status_texts = [ status['text'] for status in statuses ]
---> 17     screen_names, hashtags, urls, media, _ = extract_tweet_entities(statuses)
     18 
     19     # Compute a collection of all words from all tweets

NameError: global name 'extract_tweet_entities' is not defined



In [6]:

    
import nltk



In [7]:

    
import twitter

# Go to http://twitter.com/apps/new to create an app and get these items
# See https://dev.twitter.com/docs/auth/oauth for more information on Twitter's OAuth implementation

CONSUMER_KEY = 'mEnkTsYmvFGgzjV73SPOz084K'
CONSUMER_SECRET = 'YnQxTyFhTCG5KSGBeRq1qeVwUkxOhZ99amm6uauy8ett51UE3t'
OAUTH_TOKEN = '301689344-MG8rknSLPC8dUXAjWE6Eo4DQTeS4JJGjNuTJ6i41'
OAUTH_TOKEN_SECRET = 'vcjYSSekdT0O8qwMVhh9e6flVC1LaP5OlssIsU4nGWewh'

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)

twitter_api = twitter.Twitter(domain='api.twitter.com', 
                              api_version='1.1',
                              auth=auth
                             )



In [9]:

    
# The Yahoo! Where On Earth ID for the entire world is 1
WORLD_WOE_ID = 1 

# Prefix id with the underscore for query string parameterization.
# Without the underscore, it's appended to the URL itself
world_trends = twitter_api.trends.place(_id=WORLD_WOE_ID)
print world_trends
import json
print json.dumps(world_trends, indent=1)









    



[{u'created_at': u'2015-03-23T20:16:17Z', u'trends': [{u'url': u'http://twitter.com/search?q=%23LIAMHASNOCHILL', u'query': u'%23LIAMHASNOCHILL', u'name': u'#LIAMHASNOCHILL', u'promoted_content': None}, {u'url': u'http://twitter.com/search?q=%23HaramKPSSsorusu%C3%87almad%C4%B1k', u'query': u'%23HaramKPSSsorusu%C3%87almad%C4%B1k', u'name': u'#HaramKPSSsorusu\xc7almad\u0131k', u'promoted_content': None}, {u'url': u'http://twitter.com/search?q=%23ReisvarFitneYok', u'query': u'%23ReisvarFitneYok', u'name': u'#ReisvarFitneYok', u'promoted_content': None}, {u'url': u'http://twitter.com/search?q=%23NationalPuppyDay', u'query': u'%23NationalPuppyDay', u'name': u'#NationalPuppyDay', u'promoted_content': None}, {u'url': u'http://twitter.com/search?q=%23MirandoCombate', u'query': u'%23MirandoCombate', u'name': u'#MirandoCombate', u'promoted_content': None}, {u'url': u'http://twitter.com/search?q=%22Mauro+Matos%22', u'query': u'%22Mauro+Matos%22', u'name': u'Mauro Matos', u'promoted_content': None}, {u'url': u'http://twitter.com/search?q=%22Sassy+Liam%22', u'query': u'%22Sassy+Liam%22', u'name': u'Sassy Liam', u'promoted_content': None}, {u'url': u'http://twitter.com/search?q=G%C3%B6k%C3%A7ek', u'query': u'G%C3%B6k%C3%A7ek', u'name': u'G\xf6k\xe7ek', u'promoted_content': None}, {u'url': u'http://twitter.com/search?q=%22Guerreros+ROJOS%22', u'query': u'%22Guerreros+ROJOS%22', u'name': u'Guerreros ROJOS', u'promoted_content': None}, {u'url': u'http://twitter.com/search?q=%22%D9%85%D8%AD%D9%85%D8%AF+%D8%B4%D8%A7%D9%87%D9%8A%D9%86%22', u'query': u'%22%D9%85%D8%AD%D9%85%D8%AF+%D8%B4%D8%A7%D9%87%D9%8A%D9%86%22', u'name': u'\u0645\u062d\u0645\u062f \u0634\u0627\u0647\u064a\u0646', u'promoted_content': None}], u'as_of': u'2015-03-23T20:18:24Z', u'locations': [{u'woeid': 1, u'name': u'D\xfcnya'}]}]
[
 {
  "created_at": "2015-03-23T20:16:17Z", 
  "trends": [
   {
    "url": "http://twitter.com/search?q=%23LIAMHASNOCHILL", 
    "query": "%23LIAMHASNOCHILL", 
    "name": "#LIAMHASNOCHILL", 
    "promoted_content": null
   }, 
   {
    "url": "http://twitter.com/search?q=%23HaramKPSSsorusu%C3%87almad%C4%B1k", 
    "query": "%23HaramKPSSsorusu%C3%87almad%C4%B1k", 
    "name": "#HaramKPSSsorusu\u00c7almad\u0131k", 
    "promoted_content": null
   }, 
   {
    "url": "http://twitter.com/search?q=%23ReisvarFitneYok", 
    "query": "%23ReisvarFitneYok", 
    "name": "#ReisvarFitneYok", 
    "promoted_content": null
   }, 
   {
    "url": "http://twitter.com/search?q=%23NationalPuppyDay", 
    "query": "%23NationalPuppyDay", 
    "name": "#NationalPuppyDay", 
    "promoted_content": null
   }, 
   {
    "url": "http://twitter.com/search?q=%23MirandoCombate", 
    "query": "%23MirandoCombate", 
    "name": "#MirandoCombate", 
    "promoted_content": null
   }, 
   {
    "url": "http://twitter.com/search?q=%22Mauro+Matos%22", 
    "query": "%22Mauro+Matos%22", 
    "name": "Mauro Matos", 
    "promoted_content": null
   }, 
   {
    "url": "http://twitter.com/search?q=%22Sassy+Liam%22", 
    "query": "%22Sassy+Liam%22", 
    "name": "Sassy Liam", 
    "promoted_content": null
   }, 
   {
    "url": "http://twitter.com/search?q=G%C3%B6k%C3%A7ek", 
    "query": "G%C3%B6k%C3%A7ek", 
    "name": "G\u00f6k\u00e7ek", 
    "promoted_content": null
   }, 
   {
    "url": "http://twitter.com/search?q=%22Guerreros+ROJOS%22", 
    "query": "%22Guerreros+ROJOS%22", 
    "name": "Guerreros ROJOS", 
    "promoted_content": null
   }, 
   {
    "url": "http://twitter.com/search?q=%22%D9%85%D8%AD%D9%85%D8%AF+%D8%B4%D8%A7%D9%87%D9%8A%D9%86%22", 
    "query": "%22%D9%85%D8%AD%D9%85%D8%AF+%D8%B4%D8%A7%D9%87%D9%8A%D9%86%22", 
    "name": "\u0645\u062d\u0645\u062f \u0634\u0627\u0647\u064a\u0646", 
    "promoted_content": null
   }
  ], 
  "as_of": "2015-03-23T20:18:24Z", 
  "locations": [
   {
    "woeid": 1, 
    "name": "D\u00fcnya"
   }
  ]
 }
]



In [11]:

    
# Like all other APIs, search requests now require authentication and have a slightly different request and
# response format. See https://dev.twitter.com/docs/api/1.1/get/search/tweets

q = "SNL"
count = 5

search_results = twitter_api.search.tweets(q=q, count=count)
statuses = search_results['statuses']

# v1.1 of Twitter's API provides a value in the response for the next batch of results that needs to be parsed out
# and passed back in as keyword args if you want to retrieve more than one page. It appears in the 'search_metadata'
# field of the response object and has the following form:'?max_id=313519052523986943&q=NCAA&include_entities=1'
# The tweets themselves are encoded in the 'statuses' field of the response


# Here's how you would grab five more batches of results and collect the statuses as a list
for _ in range(5): 
    try:
        next_results = search_results['search_metadata']['next_results']
    except KeyError, e: # No more results when next_results doesn't exist
        break

    kwargs = dict([ kv.split('=') for kv in next_results[1:].split("&") ]) # Create a dictionary from the query string params
    search_results = twitter_api.search.tweets(**kwargs)
    statuses += search_results['statuses']



In [12]:

    
import json
print json.dumps(statuses[0:2], indent=1) # Only print a couple of tweets here in IPython Notebook









    



[
 {
  "contributors": null, 
  "truncated": false, 
  "text": "RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7", 
  "in_reply_to_status_id": null, 
  "id": 580101417278447616, 
  "favorite_count": 0, 
  "source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>", 
  "retweeted": false, 
  "coordinates": null, 
  "entities": {
   "symbols": [], 
   "user_mentions": [
    {
     "id": 250831586, 
     "indices": [
      3, 
      11
     ], 
     "id_str": "250831586", 
     "screen_name": "TheRock", 
     "name": "Dwayne Johnson"
    }
   ], 
   "hashtags": [
    {
     "indices": [
      107, 
      111
     ], 
     "text": "SNL"
    }
   ], 
   "urls": [
    {
     "url": "https://t.co/6RuaAEXAy7", 
     "indices": [
      113, 
      136
     ], 
     "expanded_url": "https://instagram.com/p/0lThN1oh6N/", 
     "display_url": "instagram.com/p/0lThN1oh6N/"
    }
   ]
  }, 
  "in_reply_to_screen_name": null, 
  "in_reply_to_user_id": null, 
  "retweet_count": 89, 
  "id_str": "580101417278447616", 
  "favorited": false, 
  "retweeted_status": {
   "contributors": null, 
   "truncated": false, 
   "text": "In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7", 
   "in_reply_to_status_id": null, 
   "id": 580099557746450433, 
   "favorite_count": 235, 
   "source": "<a href=\"http://instagram.com\" rel=\"nofollow\">Instagram</a>", 
   "retweeted": false, 
   "coordinates": null, 
   "entities": {
    "symbols": [], 
    "user_mentions": [], 
    "hashtags": [
     {
      "indices": [
       94, 
       98
      ], 
      "text": "SNL"
     }
    ], 
    "urls": [
     {
      "url": "https://t.co/6RuaAEXAy7", 
      "indices": [
       100, 
       123
      ], 
      "expanded_url": "https://instagram.com/p/0lThN1oh6N/", 
      "display_url": "instagram.com/p/0lThN1oh6N/"
     }
    ]
   }, 
   "in_reply_to_screen_name": null, 
   "in_reply_to_user_id": null, 
   "retweet_count": 89, 
   "id_str": "580099557746450433", 
   "favorited": false, 
   "user": {
    "follow_request_sent": false, 
    "profile_use_background_image": false, 
    "profile_text_color": "333333", 
    "default_profile_image": false, 
    "id": 250831586, 
    "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/378800000088014848/dedc235cc458261485ddc9989b6f968b.jpeg", 
    "verified": true, 
    "profile_location": null, 
    "profile_image_url_https": "https://pbs.twimg.com/profile_images/3478244961/01ebfc40ecc194a2abc81e82ab877af4_normal.jpeg", 
    "profile_sidebar_fill_color": "C7C7C7", 
    "entities": {
     "url": {
      "urls": [
       {
        "url": "http://t.co/oa15jibQAk", 
        "indices": [
         0, 
         22
        ], 
        "expanded_url": "http://www.facebook.com/DwayneJohnson", 
        "display_url": "facebook.com/DwayneJohnson"
       }
      ]
     }, 
     "description": {
      "urls": [
       {
        "url": "http://t.co/spOmCwIGrX", 
        "indices": [
         56, 
         78
        ], 
        "expanded_url": "http://instagram.com/TheRock", 
        "display_url": "instagram.com/TheRock"
       }
      ]
     }
    }, 
    "followers_count": 8308121, 
    "profile_sidebar_border_color": "000000", 
    "id_str": "250831586", 
    "profile_background_color": "131516", 
    "listed_count": 27815, 
    "is_translation_enabled": true, 
    "utc_offset": -25200, 
    "statuses_count": 9370, 
    "description": "Official Twitter account for Dwayne 'The Rock' Johnson. http://t.co/spOmCwIGrX", 
    "friends_count": 1, 
    "location": "", 
    "profile_link_color": "2B3856", 
    "profile_image_url": "http://pbs.twimg.com/profile_images/3478244961/01ebfc40ecc194a2abc81e82ab877af4_normal.jpeg", 
    "following": false, 
    "geo_enabled": true, 
    "profile_banner_url": "https://pbs.twimg.com/profile_banners/250831586/1425938242", 
    "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/378800000088014848/dedc235cc458261485ddc9989b6f968b.jpeg", 
    "name": "Dwayne Johnson", 
    "lang": "en", 
    "profile_background_tile": true, 
    "favourites_count": 100, 
    "screen_name": "TheRock", 
    "notifications": false, 
    "url": "http://t.co/oa15jibQAk", 
    "created_at": "Fri Feb 11 22:14:10 +0000 2011", 
    "contributors_enabled": false, 
    "time_zone": "Pacific Time (US & Canada)", 
    "protected": false, 
    "default_profile": false, 
    "is_translator": false
   }, 
   "geo": null, 
   "in_reply_to_user_id_str": null, 
   "possibly_sensitive": false, 
   "lang": "en", 
   "created_at": "Mon Mar 23 20:11:26 +0000 2015", 
   "in_reply_to_status_id_str": null, 
   "place": null, 
   "metadata": {
    "iso_language_code": "en", 
    "result_type": "recent"
   }
  }, 
  "user": {
   "follow_request_sent": false, 
   "profile_use_background_image": true, 
   "profile_text_color": "666666", 
   "default_profile_image": false, 
   "id": 565516255, 
   "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme5/bg.gif", 
   "verified": false, 
   "profile_location": null, 
   "profile_image_url_https": "https://pbs.twimg.com/profile_images/539195650639167490/lazJNA-Y_normal.jpeg", 
   "profile_sidebar_fill_color": "252429", 
   "entities": {
    "description": {
     "urls": []
    }
   }, 
   "followers_count": 763, 
   "profile_sidebar_border_color": "FFFFFF", 
   "id_str": "565516255", 
   "profile_background_color": "352726", 
   "listed_count": 102, 
   "is_translation_enabled": false, 
   "utc_offset": 3600, 
   "statuses_count": 96367, 
   "description": "in search of Wonder.", 
   "friends_count": 2000, 
   "location": "Brussels", 
   "profile_link_color": "0909E0", 
   "profile_image_url": "http://pbs.twimg.com/profile_images/539195650639167490/lazJNA-Y_normal.jpeg", 
   "following": false, 
   "geo_enabled": true, 
   "profile_banner_url": "https://pbs.twimg.com/profile_banners/565516255/1404908197", 
   "profile_background_image_url": "http://abs.twimg.com/images/themes/theme5/bg.gif", 
   "name": "Shakkha Spirit", 
   "lang": "fr", 
   "profile_background_tile": false, 
   "favourites_count": 163, 
   "screen_name": "ShakkhaEsprit", 
   "notifications": false, 
   "url": null, 
   "created_at": "Sat Apr 28 15:13:52 +0000 2012", 
   "contributors_enabled": false, 
   "time_zone": "Brussels", 
   "protected": false, 
   "default_profile": false, 
   "is_translator": false
  }, 
  "geo": null, 
  "in_reply_to_user_id_str": null, 
  "possibly_sensitive": true, 
  "lang": "en", 
  "created_at": "Mon Mar 23 20:18:50 +0000 2015", 
  "in_reply_to_status_id_str": null, 
  "place": null, 
  "metadata": {
   "iso_language_code": "en", 
   "result_type": "recent"
  }
 }, 
 {
  "contributors": null, 
  "truncated": false, 
  "text": "RT @Real_Liam_Payne: That was so much fun thank you SNL", 
  "in_reply_to_status_id": null, 
  "id": 580101404762718209, 
  "favorite_count": 0, 
  "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>", 
  "retweeted": false, 
  "coordinates": null, 
  "entities": {
   "symbols": [], 
   "user_mentions": [
    {
     "id": 158314798, 
     "indices": [
      3, 
      19
     ], 
     "id_str": "158314798", 
     "screen_name": "Real_Liam_Payne", 
     "name": "Liam"
    }
   ], 
   "hashtags": [], 
   "urls": []
  }, 
  "in_reply_to_screen_name": null, 
  "in_reply_to_user_id": null, 
  "retweet_count": 75691, 
  "id_str": "580101404762718209", 
  "favorited": false, 
  "retweeted_status": {
   "contributors": null, 
   "truncated": false, 
   "text": "That was so much fun thank you SNL", 
   "in_reply_to_status_id": null, 
   "id": 546558827949486080, 
   "favorite_count": 138391, 
   "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>", 
   "retweeted": false, 
   "coordinates": null, 
   "entities": {
    "symbols": [], 
    "user_mentions": [], 
    "hashtags": [], 
    "urls": []
   }, 
   "in_reply_to_screen_name": null, 
   "in_reply_to_user_id": null, 
   "retweet_count": 75691, 
   "id_str": "546558827949486080", 
   "favorited": false, 
   "user": {
    "follow_request_sent": false, 
    "profile_use_background_image": true, 
    "profile_text_color": "333333", 
    "default_profile_image": false, 
    "id": 158314798, 
    "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", 
    "verified": true, 
    "profile_location": null, 
    "profile_image_url_https": "https://pbs.twimg.com/profile_images/561711433847554049/6QAa1YuW_normal.jpeg", 
    "profile_sidebar_fill_color": "DDEEF6", 
    "entities": {
     "url": {
      "urls": [
       {
        "url": "http://t.co/zUsqCh2RnX", 
        "indices": [
         0, 
         22
        ], 
        "expanded_url": "http://www.onedirectionmusic.com", 
        "display_url": "onedirectionmusic.com"
       }
      ]
     }, 
     "description": {
      "urls": []
     }
    }, 
    "followers_count": 19600299, 
    "profile_sidebar_border_color": "C0DEED", 
    "id_str": "158314798", 
    "profile_background_color": "C0DEED", 
    "listed_count": 106249, 
    "is_translation_enabled": true, 
    "utc_offset": -36000, 
    "statuses_count": 8908, 
    "description": "99% of the time i have no idea whats going on", 
    "friends_count": 17322, 
    "location": "UK", 
    "profile_link_color": "0084B4", 
    "profile_image_url": "http://pbs.twimg.com/profile_images/561711433847554049/6QAa1YuW_normal.jpeg", 
    "following": false, 
    "geo_enabled": true, 
    "profile_banner_url": "https://pbs.twimg.com/profile_banners/158314798/1422757417", 
    "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", 
    "name": "Liam", 
    "lang": "en", 
    "profile_background_tile": false, 
    "favourites_count": 110, 
    "screen_name": "Real_Liam_Payne", 
    "notifications": false, 
    "url": "http://t.co/zUsqCh2RnX", 
    "created_at": "Tue Jun 22 10:19:08 +0000 2010", 
    "contributors_enabled": false, 
    "time_zone": "Hawaii", 
    "protected": false, 
    "default_profile": true, 
    "is_translator": false
   }, 
   "geo": null, 
   "in_reply_to_user_id_str": null, 
   "lang": "en", 
   "created_at": "Sun Dec 21 06:52:33 +0000 2014", 
   "in_reply_to_status_id_str": null, 
   "place": null, 
   "metadata": {
    "iso_language_code": "en", 
    "result_type": "recent"
   }
  }, 
  "user": {
   "follow_request_sent": false, 
   "profile_use_background_image": true, 
   "profile_text_color": "333333", 
   "default_profile_image": false, 
   "id": 2330377609, 
   "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", 
   "verified": false, 
   "profile_location": null, 
   "profile_image_url_https": "https://pbs.twimg.com/profile_images/577182515976400896/aSKZxfQi_normal.jpeg", 
   "profile_sidebar_fill_color": "DDEEF6", 
   "entities": {
    "description": {
     "urls": []
    }
   }, 
   "followers_count": 53, 
   "profile_sidebar_border_color": "C0DEED", 
   "id_str": "2330377609", 
   "profile_background_color": "C0DEED", 
   "listed_count": 0, 
   "is_translation_enabled": false, 
   "utc_offset": null, 
   "statuses_count": 1190, 
   "description": "ZNLLH, 1D, you're the reason for my smile so keep on being you/thank you for everything you've done so far and for everything you will do//15//#germanywants1D", 
   "friends_count": 32, 
   "location": "", 
   "profile_link_color": "0084B4", 
   "profile_image_url": "http://pbs.twimg.com/profile_images/577182515976400896/aSKZxfQi_normal.jpeg", 
   "following": false, 
   "geo_enabled": false, 
   "profile_banner_url": "https://pbs.twimg.com/profile_banners/2330377609/1397321026", 
   "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", 
   "name": "PLS ZIALL", 
   "lang": "de", 
   "profile_background_tile": false, 
   "favourites_count": 407, 
   "screen_name": "BrgerJulia", 
   "notifications": false, 
   "url": null, 
   "created_at": "Thu Feb 06 14:27:05 +0000 2014", 
   "contributors_enabled": false, 
   "time_zone": null, 
   "protected": false, 
   "default_profile": true, 
   "is_translator": false
  }, 
  "geo": null, 
  "in_reply_to_user_id_str": null, 
  "lang": "en", 
  "created_at": "Mon Mar 23 20:18:47 +0000 2015", 
  "in_reply_to_status_id_str": null, 
  "place": null, 
  "metadata": {
   "iso_language_code": "en", 
   "result_type": "recent"
  }
 }
]



In [13]:

    
tweets = [ status['text'] for status in statuses ]

print tweets[0]









    



RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL… https://t.co/6RuaAEXAy7



In [61]:

    
len(tweets)
print type(tweets)
print tweets









    



<type 'list'>
[u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'RT @Real_Liam_Payne: That was so much fun thank you SNL', u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u"SNL has new topic now --&gt; Sen. Ted Cruz announces presidential bid, vows to 'stand for liberty' | http://t.co/mFZvhMhVD8 @nbcsnl", u'IsahNaara28 \u2018SNL\u2019 Parodies Justiin Biieber\u2019s Calvin Klein Campaign http://t.co/BvR7JCBEDF http://t.co/skj3u492kR #SWAG', u'AnaLauraZimmer \u2018SNL\u2019 Parodies Justiin Biieber\u2019s Calvin Klein Campaign http://t.co/BvR7JCBEDF http://t.co/skj3u492kR #SWAG', u'The Rock @therock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this\u2026 http://t.co/IUQFYDTlxl', u'kidrauhlandemix \u2018SNL\u2019 Parodies Justiin Biieber\u2019s Calvin Klein Campaign http://t.co/BvR7JCBEDF http://t.co/skj3u492kR #SWAG', u'teenbiebervouge \u2018SNL\u2019 Parodies Justiin Biieber\u2019s Calvin Klein Campaign http://t.co/BvR7JCBEDF http://t.co/skj3u492kR #SWAG', u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'justinbieberpg_ \u2018SNL\u2019 Parodies Justiin Biieber\u2019s Calvin Klein Campaign http://t.co/BvR7JCBEDF http://t.co/skj3u492kR #SWAG', u'Celebrilizer \u2018SNL\u2019 Parodies Justiin Biieber\u2019s Calvin Klein Campaign http://t.co/BvR7JCBEDF http://t.co/skj3u492kR #SWAG', u'BelieverfromJB \u2018SNL\u2019 Parodies Justiin Biieber\u2019s Calvin Klein Campaign http://t.co/BvR7JCBEDF http://t.co/skj3u492kR #SWAG', u'Interwinesus \u2018SNL\u2019 Parodies Justiin Biieber\u2019s Calvin Klein Campaign http://t.co/BvR7JCBEDF http://t.co/skj3u492kR #SWAG', u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'#SNL', u"RT @albz: Ted Cruz already looks, talks and acts like someone who's playing Ted Cruz on SNL.", u"RT @albz: Ted Cruz already looks, talks and acts like someone who's playing Ted Cruz on SNL.", u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'RT @SouthernNetwork: Anchor Profile: Alex @cwiokk Sophomore Broadcast major with a minor in English. Hopes to work for SNL post grad \U0001f3a5 http\u2026', u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'RT @TheRock: In 5 days I take the stage and host one more time. Had a blast w/ my bud Will Ferrell in this #SNL\u2026 https://t.co/6RuaAEXAy7', u'@SNL_football yeah I like him. Took my next te instead', u'@ZacWassink http://t.co/tOwCQBGgTm']



In [15]:

    
words = []
for t in tweets:
    words += [ w for w in t.split() ]

# total words
print len(words) 

# unique words
print len(set(words)) 

# lexical diversity
print 1.0*len(set(words))/len(words) 

# avg words per tweet
print 1.0*sum([ len(t.split()) for t in tweets ])/len(tweets)









    



552
115
0.208333333333
18.4



In [17]:

    
import nltk

freq_dist = nltk.FreqDist(words)
print json.dumps(freq_dist.keys()[:50], indent=1) # 50 most frequent tokens
print "--------"
print json.dumps(freq_dist.keys()[-50:], indent=1) # 50 least frequent tokens









    



[
 "Had", 
 "liberty'", 
 "talks", 
 "looks,", 
 "RT", 
 "blast", 
 "\ud83c\udfa5", 
 "thank", 
 "Klein", 
 "time.", 
 "Broadcast", 
 "to", 
 "has", 
 "someone", 
 "Sophomore", 
 "who's", 
 "Calvin", 
 "English.", 
 "now", 
 "presidential", 
 "like", 
 "Campaign", 
 "BelieverfromJB", 
 "SNL", 
 "https://t.co/6RuaAEXAy7", 
 "@SouthernNetwork:", 
 "grad", 
 "|", 
 "http://t.co/tOwCQBGgTm", 
 "Profile:", 
 "w/", 
 "kidrauhlandemix", 
 "yeah", 
 "Justiin", 
 "for", 
 "#SNL\u2026", 
 "announces", 
 "Took", 
 "him.", 
 "Cruz", 
 "new", 
 "IsahNaara28", 
 "Sen.", 
 "vows", 
 "host", 
 "Rock", 
 "post", 
 "Anchor", 
 "stage", 
 "@therock:"
]
--------
[
 "bid,", 
 "was", 
 "minor", 
 "more", 
 "on", 
 "teenbiebervouge", 
 "SNL.", 
 "--&gt;", 
 "with", 
 "bud", 
 "@ZacWassink", 
 "http\u2026", 
 "this\u2026", 
 "this", 
 "#SWAG", 
 "work", 
 "Ted", 
 "Ferrell", 
 "te", 
 "fun", 
 "my", 
 "and", 
 "Parodies", 
 "@SNL_football", 
 "topic", 
 "http://t.co/mFZvhMhVD8", 
 "in", 
 "Interwinesus", 
 "@TheRock:", 
 "acts", 
 "http://t.co/skj3u492kR", 
 "5", 
 "take", 
 "@cwiokk", 
 "instead", 
 "you", 
 "Alex", 
 "I", 
 "http://t.co/BvR7JCBEDF", 
 "Will", 
 "justinbieberpg_", 
 "Celebrilizer", 
 "The", 
 "a", 
 "That", 
 "@nbcsnl", 
 "so", 
 "In", 
 "the", 
 "playing"
]



In [19]:

    
import networkx as nx
import re
g = nx.DiGraph()

def get_rt_sources(tweet):
    rt_patterns = re.compile(r'(RT|via)((?:\b\W*@\w+)+)', re.IGNORECASE)
    return [ source.strip()
             for tuple in rt_patterns.findall(tweet)
                 for source in tuple
                     if source not in ("RT", "via") ]

for status in statuses:
    rt_sources = get_rt_sources(status['text'])
    if not rt_sources: continue
    for rt_source in rt_sources:
        g.add_edge(rt_source, status['user']['screen_name'], {'tweet_id' : status['id']})
            
print nx.info(g)
print g.edges(data=True)[0]
print len(nx.connected_components(g.to_undirected()))
print sorted(nx.degree(g).values())









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-19-c34419f27ff2> in <module>()
     18 print nx.info(g)
     19 print g.edges(data=True)[0]
---> 20 print len(nx.connected_components(g.to_undirected()))
     21 print sorted(nx.degree(g).values())

TypeError: object of type 'generator' has no len()





    



Couldn't import dot_parser, loading of dot files will not be possible.
Name: 
Type: DiGraph
Number of nodes: 21
Number of edges: 17
Average in degree:   0.8095
Average out degree:   0.8095
(u'@SouthernNetwork', u'liltea23', {'tweet_id': 580100856676253697})



In [20]:

    
from nltk.book import *









    



*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908



In [28]:

    
def lexical_diversity(text):
    return len(text)/(1.0*len(set(text)))

def percentage(count, total):
    return 100*count/total



In [30]:

    
print lexical_diversity(text3)









    



16.0501972033



In [31]:

    
fdist1 = FreqDist(text1)



In [32]:

    
vocabulary1 = fdist1.keys()



In [33]:

    
fdist1['whale']









    Out[33]:





906



In [34]:

    
fdist1.plot(50, cumulative=True)



In [40]:

    
thursday_sents = nltk.corpus.gutenberg.sents('chesterton-thursday.txt')
sent22 = thursday_sents[22]
' '.join(sent22)









    Out[40]:





u'THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .'



In [42]:

    
a = nltk.bigrams(w for w in sent22 if w.isalpha())



In [45]:

    
for i in a :
    print i









    



(u'THE', u'suburb')
(u'suburb', u'of')
(u'of', u'Saffron')
(u'Saffron', u'Park')
(u'Park', u'lay')
(u'lay', u'on')
(u'on', u'the')
(u'the', u'sunset')
(u'sunset', u'side')
(u'side', u'of')
(u'of', u'London')
(u'London', u'as')
(u'as', u'red')
(u'red', u'and')
(u'and', u'ragged')
(u'ragged', u'as')
(u'as', u'a')
(u'a', u'cloud')
(u'cloud', u'of')
(u'of', u'sunset')



In [50]:

    
import networkx as nx
G = nx.Graph()
begin_sent = 22
end_sent = 24
sents = thursday_sents[begin_sent:end_sent+1]

for sent in sents:
    G.add_edges_from(nltk.bigrams(w for w in sent if w.isalpha()))
nx.draw(G)



In [56]:

    
import codecs, nltk, pprint

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')



In [58]:

    
sent_tokenizer.tokenize(text1)









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-58-b51f84e75ee0> in <module>()
----> 1 sent_tokenizer.tokenize(text1)

/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.pyc in tokenize(self, text, realign_boundaries)
   1268         Given a text, returns a list of the sentences in that text.
   1269         """
-> 1270         return list(self.sentences_from_text(text, realign_boundaries))
   1271 
   1272     def debug_decisions(self, text):

/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.pyc in sentences_from_text(self, text, realign_boundaries)
   1316         follows the period.
   1317         """
-> 1318         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1319 
   1320     def _slices_from_text(self, text):

/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.pyc in span_tokenize(self, text, realign_boundaries)
   1307         if realign_boundaries:
   1308             slices = self._realign_boundaries(text, slices)
-> 1309         return [(sl.start, sl.stop) for sl in slices]
   1310 
   1311     def sentences_from_text(self, text, realign_boundaries=True):

/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.pyc in _realign_boundaries(self, text, slices)
   1346         """
   1347         realign = 0
-> 1348         for sl1, sl2 in _pair_iter(slices):
   1349             sl1 = slice(sl1.start + realign, sl1.stop)
   1350             if not sl2:

/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.pyc in _pair_iter(it)
    352     """
    353     it = iter(it)
--> 354     prev = next(it)
    355     for el in it:
    356         yield (prev, el)

/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.pyc in _slices_from_text(self, text)
   1320     def _slices_from_text(self, text):
   1321         last_break = 0
-> 1322         for match in self._lang_vars.period_context_re().finditer(text):
   1323             context = match.group() + match.group('after_tok')
   1324             if self.text_contains_sentbreak(context):

TypeError: expected string or buffer



In [ ]: