Analyze 2 days worth of twitter tweets and determine
The stream-processing problems are derived from http://adilmoujahid.com/posts/2014/07/twitter-analytics/ and https://github.com/adilmoujahid/Twitter_Analytics
In order to access Twitter Streaming API, we need to get 4 pieces of information from Twitter:
1) API key, 2) API secret, 3) Access token and 4) Access token secret.
Follow the steps below to get all 4 elements:
Using twython - pip install twython or follow instructions on https://github.com/ryanmcgrath/twython
Copy the following code into your own file, twitter_streaming_INITIALS.py. Replace the placeholders with the tokens and keys that you copied in the previous step. Run the code before writing it to your local code directory. Use the kernel's Interrupt or Restart operations in Python to stop the stream.
In [37]:
%%writefile code/twython_streaming.py
import twython
from twython import TwythonStreamer
# Twitter API credentials passed to MyStreamer below.
# Replace each placeholder string with your own token/key before running.
OAUTH_TOKEN = "YOUR_ACCESS_TOKEN"
OAUTH_TOKEN_SECRET = "YOUR_TOKEN_SECRET"
APP_KEY = "YOUR_CONSUMER_KEY"
APP_SECRET = "YOUR_CONSUMER_SECRET"
class MyStreamer(TwythonStreamer):
    """Streamer that prints the text of every received tweet to stdout."""

    def on_success(self, data):
        """Print the tweet text carried by ``data``.

        Messages that have no 'text' key are ignored.
        """
        if 'text' not in data:
            return
        text = data['text']
        # Python 2 presumably hands back ``unicode`` here, which is encoded to
        # UTF-8 bytes before printing (as the original code did). A native
        # ``str`` is printed as-is, because printing encoded bytes on Python 3
        # would show a ``b'...'`` repr instead of the text.
        if not isinstance(text, str):
            text = text.encode('utf-8')
        print(text)

    def on_error(self, status_code, data):
        """Print the error status code, then disconnect the stream."""
        print(status_code)
        # Stop trying to get data after an error.
        # Comment out the next line to keep the stream connected instead.
        self.disconnect()
if __name__ == "__main__":
    # Build the streamer from the four credentials defined above, then
    # start filtering the public stream on the tracked keywords.
    credentials = (APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
    stream = MyStreamer(*credentials)
    stream.statuses.filter(track=['python', 'javascript', 'ruby'])
In [35]:
%%sh
python code/twython_streaming_PS.py > twython_twits.txt
In [ ]:
# %load twython_twits.txt
RT @CSGOEmpire: ❤️🗡️10x Ruby Gut Knife Giveaway with @anomalyxd
-RT
-Follow
-Visit https://t.co/jKqCwU6Tjn
1 winner picked daily!…
RT @CSGOEmpire: The 2nd winner of our 10x Gut Knife | Ruby Giveaway will be picked in 24 hours!
Enter here:
-RT
https://t.co/n8oAM1iAe3
RT @CSGOEmpire: ❤️🗡️10x Ruby Gut Knife Giveaway with @anomalyxd
-RT
-Follow
-Visit https://t.co/jKqCwU6Tjn
1 winner picked daily!…
RT @Ruben_Amon: Permitid que vuelva a decirlo: Puigdemonty Python
In [ ]:
Using tweepy - pip install tweepy or follow instructions on https://github.com/tweepy/tweepy
Copy the following code into your own file, tweepy_streaming_INITIALS.py. Replace the placeholders with the tokens and keys that you copied in the previous step.
In [38]:
%%writefile code/twitter_streaming.py
# %load code/twitter_streaming.py
#Import the necessary methods from tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
# Variables that contain the user credentials used to access the Twitter API.
# Replace each placeholder string with your own token/key before running.
access_token = "YOUR_ACCESS_TOKEN"
access_token_secret = "YOUR_TOKEN_SECRET"
consumer_key = "YOUR_CONSUMER_KEY"
consumer_secret = "YOUR_CONSUMER_SECRET"
# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
    """Listener that echoes everything the stream delivers to stdout."""

    def on_data(self, data):
        """Print the raw ``data`` payload and return True.

        NOTE(review): returning True presumably tells tweepy to keep the
        stream open - confirm against the tweepy version in use.
        """
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print(data)
        return True

    def on_error(self, status):
        """Print the error ``status`` reported by the stream."""
        print(status)
if __name__ == '__main__':
    # Handle Twitter authentication and the connection to the Twitter
    # Streaming API.
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    listener = StdOutListener()
    stream = Stream(auth, listener)

    # Filter the Twitter stream to capture data by the keywords:
    # 'python', 'javascript', 'ruby'
    stream.filter(track=['python', 'javascript', 'ruby'])
In [45]:
%%sh
python code/Twitter_streaming_PS.py > tweepy_twits.txt
In [ ]:
# %load tweepy_twits.txt
{"created_at":"Mon Oct 30 18:02:00 +0000 2017","id":925060231297724417,"id_str":"925060231297724417","text":"botlib 35: Framework to program bots https:\/\/t.co\/CfNGl6MjiN","source":"\u003ca href=\"http:\/\/github.com\/tell-k\" rel=\"nofollow\"\u003epypi_updates2\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":739422728168476673,"id_str":"739422728168476673","name":"PyPI Recent Updates","screen_name":"pypi_updates2","location":null,"url":"https:\/\/github.com\/tell-k\/pypi-updates","description":"Unofficial bot to flow PyPI recent updates.","translator_type":"none","protected":false,"verified":false,"followers_count":480,"friends_count":0,"listed_count":112,"favourites_count":1,"statuses_count":332804,"created_at":"Sun Jun 05 11:44:51 +0000 2016","utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/739434327839100930\/pW4Eyp0y.jpg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/739434327839100930\/pW4Eyp0y.jpg","profile_background_tile":false,"profile_link_color":"000000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/739434790688956417\/KjhAltTQ_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/739434790688956417\/KjhAltTQ_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/739422728168476673\/1465129858","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":nu
ll,"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/CfNGl6MjiN","expanded_url":"http:\/\/pypi.python.org\/pypi\/botlib\/35","display_url":"pypi.python.org\/pypi\/botlib\/35","indices":[37,60]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1509386520586"}
{"created_at":"Mon Oct 30 18:02:01 +0000 2017","id":925060236213448705,"id_str":"925060236213448705","text":"RT @Ruben_Amon: Permitid que vuelva a decirlo: Puigdemonty Python","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1970822406,"id_str":"1970822406","name":"politocracia","screen_name":"politocracia","location":null,"url":"http:\/\/www.politocracia.es","description":"Escribimos sobre pol\u00edtica \/\/Dise\u00f1amos\/\/Cuestionamos\/\/Cultura pop\/\/ \u00a1\u00a1Es tiempo de utop\u00edas!!","translator_type":"none","protected":false,"verified":false,"followers_count":1024,"friends_count":1309,"listed_count":38,"favourites_count":1989,"statuses_count":5675,"created_at":"Sat Oct 19 08:24:32 +0000 2013","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/445571872445300736\/2H5bsrZs.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/445571872445300736\/2H5bsrZs.jpeg","profile_background_tile":true,"profile_link_color":"F0586F","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/797390986750099456\/mgQhB5j2_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/797390986750099456\/mgQhB5j2_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1970822406\/1508582822","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":n
ull,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Mon Oct 30 17:06:34 +0000 2017","id":925046281004863490,"id_str":"925046281004863490","text":"Permitid que vuelva a decirlo: Puigdemonty Python","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":306414975,"id_str":"306414975","name":"Rub\u00e9n Am\u00f3n","screen_name":"Ruben_Amon","location":null,"url":null,"description":"Muramos por las ideas, pero de muerte lenta. Lo dec\u00eda Brassens. Escribo en El Pa\u00eds, predico en Antena3 y Ondacero http:\/\/elpais.com\/autor\/ruben_amon_delgado\/a","translator_type":"none","protected":false,"verified":false,"followers_count":95252,"friends_count":825,"listed_count":1376,"favourites_count":1577,"statuses_count":8800,"created_at":"Fri May 27 20:58:47 +0000 
2011","utc_offset":-10800,"time_zone":"Greenland","geo_enabled":true,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/878325240094261250\/MmmZT3l1_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/878325240094261250\/MmmZT3l1_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/306414975\/1494277914","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":6,"reply_count":33,"retweet_count":141,"favorite_count":322,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"und"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Ruben_Amon","name":"Rub\u00e9n Am\u00f3n","id":306414975,"id_str":"306414975","indices":[3,14]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"und","timestamp_ms":"1509386521758"}
{"created_at":"Mon Oct 30 18:02:01 +0000 2017","id":925060235810738176,"id_str":"925060235810738176","text":"RT @CSGOEmpire: The 2nd winner of our 10x Gut Knife | Ruby Giveaway will be picked in 24 hours!\n\nEnter here:\n\n-RT\n\nhttps:\/\/t.co\/n8oAM1iAe3","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Lite\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":920673086504042501,"id_str":"920673086504042501","name":"Julien Pillaert","screen_name":"pillaert_julien","location":"NY,Am\u00e9rique ","url":null,"description":null,"translator_type":"none","protected":false,"verified":false,"followers_count":3,"friends_count":19,"listed_count":0,"favourites_count":6,"statuses_count":78,"created_at":"Wed Oct 18 15:29:03 +0000 2017","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/923431800940257281\/jjoGtoQ7_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/923431800940257281\/jjoGtoQ7_normal.jpg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Mon Oct 30 16:39:25 +0000 2017","id":925039449125347333,"id_str":"925039449125347333","text":"The 2nd winner of our 10x Gut Knife | Ruby Giveaway will be picked in 24 hours!\n\nEnter 
here:\n\n-RT\n\nhttps:\/\/t.co\/n8oAM1iAe3","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":3635140215,"id_str":"3635140215","name":"CSGOEmpire","screen_name":"CSGOEmpire","location":"contact@csgoempire.com","url":"https:\/\/csgoempire.com\/r\/twitter\/","description":"The premier CS:GO gambling platform. Claim your free $0.50 coins to get started!","translator_type":"none","protected":false,"verified":false,"followers_count":408589,"friends_count":863,"listed_count":155,"favourites_count":260,"statuses_count":1854,"created_at":"Sat Sep 12 21:21:38 +0000 2015","utc_offset":3600,"time_zone":"Copenhagen","geo_enabled":false,"lang":"en-gb","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"FAB81E","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/692344830312210432\/-QVZw69__normal.png","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/692344830312210432\/-QVZw69__normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/3635140215\/1506568105","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":924672162635579392,"quoted_status_id_str":"924672162635579392","quoted_status":{"created_at":"Sun Oct 29 16:19:57 +0000 
2017","id":924672162635579392,"id_str":"924672162635579392","text":"\u2764\ufe0f\ud83d\udde1\ufe0f10x Ruby Gut Knife Giveaway with @anomalyxd\n\n-RT\n-Follow\n-Visit https:\/\/t.co\/jKqCwU6Tjn\n\n1 winner picked daily!\u2026 https:\/\/t.co\/Qqy6wHiPbd","display_text_range":[0,140],"source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":3635140215,"id_str":"3635140215","name":"CSGOEmpire","screen_name":"CSGOEmpire","location":"contact@csgoempire.com","url":"https:\/\/csgoempire.com\/r\/twitter\/","description":"The premier CS:GO gambling platform. Claim your free $0.50 coins to get started!","translator_type":"none","protected":false,"verified":false,"followers_count":408589,"friends_count":863,"listed_count":155,"favourites_count":260,"statuses_count":1854,"created_at":"Sat Sep 12 21:21:38 +0000 
2015","utc_offset":3600,"time_zone":"Copenhagen","geo_enabled":false,"lang":"en-gb","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"FAB81E","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/692344830312210432\/-QVZw69__normal.png","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/692344830312210432\/-QVZw69__normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/3635140215\/1506568105","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"\u2764\ufe0f\ud83d\udde1\ufe0f10x Ruby Gut Knife Giveaway with @anomalyxd\n\n-RT\n-Follow\n-Visit https:\/\/t.co\/jKqCwU6Tjn\n\n1 winner picked daily! GL! 
https:\/\/t.co\/LkI7DxL5Qe","display_text_range":[0,119],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/jKqCwU6Tjn","expanded_url":"http:\/\/goo.gl\/GpXVYf","display_url":"goo.gl\/GpXVYf","indices":[68,91]}],"user_mentions":[{"screen_name":"anomalyxd","name":"Anomaly","id":3238641808,"id_str":"3238641808","indices":[37,47]}],"symbols":[],"media":[{"id":924671557137588224,"id_str":"924671557137588224","indices":[120,143],"media_url":"http:\/\/pbs.twimg.com\/media\/DNUX4dEX4AA6FLh.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DNUX4dEX4AA6FLh.jpg","url":"https:\/\/t.co\/LkI7DxL5Qe","display_url":"pic.twitter.com\/LkI7DxL5Qe","expanded_url":"https:\/\/twitter.com\/CSGOEmpire\/status\/924672162635579392\/photo\/1","type":"photo","sizes":{"small":{"w":680,"h":383,"resize":"fit"},"large":{"w":1920,"h":1080,"resize":"fit"},"medium":{"w":1200,"h":675,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"}}}]},"extended_entities":{"media":[{"id":924671557137588224,"id_str":"924671557137588224","indices":[120,143],"media_url":"http:\/\/pbs.twimg.com\/media\/DNUX4dEX4AA6FLh.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DNUX4dEX4AA6FLh.jpg","url":"https:\/\/t.co\/LkI7DxL5Qe","display_url":"pic.twitter.com\/LkI7DxL5Qe","expanded_url":"https:\/\/twitter.com\/CSGOEmpire\/status\/924672162635579392\/photo\/1","type":"photo","sizes":{"small":{"w":680,"h":383,"resize":"fit"},"large":{"w":1920,"h":1080,"resize":"fit"},"medium":{"w":1200,"h":675,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"}}}]}},"quote_count":685,"reply_count":1574,"retweet_count":24167,"favorite_count":13214,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/jKqCwU6Tjn","expanded_url":"http:\/\/goo.gl\/GpXVYf","display_url":"goo.gl\/GpXVYf","indices":[68,91]},{"url":"https:\/\/t.co\/Qqy6wHiPbd","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/924672162635579392","display_url":"twitter.com\/i\/web\/status\/9\u2026","indices":[117,140]}],"user_mentions":[{"screen_
name":"anomalyxd","name":"Anomaly","id":3238641808,"id_str":"3238641808","indices":[37,47]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"quote_count":16,"reply_count":68,"retweet_count":1981,"favorite_count":805,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/n8oAM1iAe3","expanded_url":"https:\/\/twitter.com\/CSGOEmpire\/status\/924672162635579392","display_url":"twitter.com\/CSGOEmpire\/sta\u2026","indices":[99,122]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"quoted_status_id":924672162635579392,"quoted_status_id_str":"924672162635579392","quoted_status":{"created_at":"Sun Oct 29 16:19:57 +0000 2017","id":924672162635579392,"id_str":"924672162635579392","text":"\u2764\ufe0f\ud83d\udde1\ufe0f10x Ruby Gut Knife Giveaway with @anomalyxd\n\n-RT\n-Follow\n-Visit https:\/\/t.co\/jKqCwU6Tjn\n\n1 winner picked daily!\u2026 https:\/\/t.co\/Qqy6wHiPbd","display_text_range":[0,140],"source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":3635140215,"id_str":"3635140215","name":"CSGOEmpire","screen_name":"CSGOEmpire","location":"contact@csgoempire.com","url":"https:\/\/csgoempire.com\/r\/twitter\/","description":"The premier CS:GO gambling platform. 
Claim your free $0.50 coins to get started!","translator_type":"none","protected":false,"verified":false,"followers_count":408589,"friends_count":863,"listed_count":155,"favourites_count":260,"statuses_count":1854,"created_at":"Sat Sep 12 21:21:38 +0000 2015","utc_offset":3600,"time_zone":"Copenhagen","geo_enabled":false,"lang":"en-gb","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"FAB81E","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/692344830312210432\/-QVZw69__normal.png","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/692344830312210432\/-QVZw69__normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/3635140215\/1506568105","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"\u2764\ufe0f\ud83d\udde1\ufe0f10x Ruby Gut Knife Giveaway with @anomalyxd\n\n-RT\n-Follow\n-Visit https:\/\/t.co\/jKqCwU6Tjn\n\n1 winner picked daily! GL! 
https:\/\/t.co\/LkI7DxL5Qe","display_text_range":[0,119],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/jKqCwU6Tjn","expanded_url":"http:\/\/goo.gl\/GpXVYf","display_url":"goo.gl\/GpXVYf","indices":[68,91]}],"user_mentions":[{"screen_name":"anomalyxd","name":"Anomaly","id":3238641808,"id_str":"3238641808","indices":[37,47]}],"symbols":[],"media":[{"id":924671557137588224,"id_str":"924671557137588224","indices":[120,143],"media_url":"http:\/\/pbs.twimg.com\/media\/DNUX4dEX4AA6FLh.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DNUX4dEX4AA6FLh.jpg","url":"https:\/\/t.co\/LkI7DxL5Qe","display_url":"pic.twitter.com\/LkI7DxL5Qe","expanded_url":"https:\/\/twitter.com\/CSGOEmpire\/status\/924672162635579392\/photo\/1","type":"photo","sizes":{"small":{"w":680,"h":383,"resize":"fit"},"large":{"w":1920,"h":1080,"resize":"fit"},"medium":{"w":1200,"h":675,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"}}}]},"extended_entities":{"media":[{"id":924671557137588224,"id_str":"924671557137588224","indices":[120,143],"media_url":"http:\/\/pbs.twimg.com\/media\/DNUX4dEX4AA6FLh.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DNUX4dEX4AA6FLh.jpg","url":"https:\/\/t.co\/LkI7DxL5Qe","display_url":"pic.twitter.com\/LkI7DxL5Qe","expanded_url":"https:\/\/twitter.com\/CSGOEmpire\/status\/924672162635579392\/photo\/1","type":"photo","sizes":{"small":{"w":680,"h":383,"resize":"fit"},"large":{"w":1920,"h":1080,"resize":"fit"},"medium":{"w":1200,"h":675,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"}}}]}},"quote_count":685,"reply_count":1574,"retweet_count":24167,"favorite_count":13214,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/jKqCwU6Tjn","expanded_url":"http:\/\/goo.gl\/GpXVYf","display_url":"goo.gl\/GpXVYf","indices":[68,91]},{"url":"https:\/\/t.co\/Qqy6wHiPbd","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/924672162635579392","display_url":"twitter.com\/i\/web\/status\/9\u2026","indices":[117,140]}],"user_mentions":[{"screen_
name":"anomalyxd","name":"Anomaly","id":3238641808,"id_str":"3238641808","indices":[37,47]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/n8oAM1iAe3","expanded_url":"https:\/\/twitter.com\/CSGOEmpire\/status\/924672162635579392","display_url":"twitter.com\/CSGOEmpire\/sta\u2026","indices":[115,138]}],"user_mentions":[{"screen_name":"CSGOEmpire","name":"CSGOEmpire","id":3635140215,"id_str":"3635140215","indices":[3,14]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1509386521662"}
{"created_at":"Mon Oct 30 18:02:02 +0000 2017","id":925060240843866112,"id_str":"925060240843866112","text":"RT @LiLMiniJ: @AJKreisberg @jessicaqueller @GBerlanti @SupergirlStaff\n\nJust an FYI. Nobody likes Ruby. So whatev you're thinking\u2026 ","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":869815419565207552,"id_str":"869815419565207552","name":"Danny","screen_name":"_danfes","location":"Ruhrgebiet, Germany","url":null,"description":"Wife \ud83c\udff3\ufe0f\u200d\ud83c\udf08 | Football (Soccer) \ud83d\udda4\ud83d\udc9b| Traveling \ud83d\uddfa |","translator_type":"none","protected":false,"verified":false,"followers_count":27,"friends_count":95,"listed_count":0,"favourites_count":3301,"statuses_count":896,"created_at":"Wed May 31 07:18:51 +0000 
2017","utc_offset":3600,"time_zone":"Berlin","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"FAB81E","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/915998042641227776\/OWab8Off_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/915998042641227776\/OWab8Off_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/869815419565207552\/1496656622","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Mon Oct 30 17:37:50 +0000 2017","id":925054150404132864,"id_str":"925054150404132864","text":"@AJKreisberg @jessicaqueller @GBerlanti @SupergirlStaff\n\nJust an FYI. Nobody likes Ruby. So whatev you're thinking\u2026 https:\/\/t.co\/OZ3C8FzOLt","display_text_range":[0,140],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":633354279,"in_reply_to_user_id_str":"633354279","in_reply_to_screen_name":"AJKreisberg","user":{"id":751943697512574976,"id_str":"751943697512574976","name":"J - #\u2764\ufe0f","screen_name":"LiLMiniJ","location":"#PatriotsNation","url":null,"description":"Blah Blah Feminism Blah. \u2640\ufe0f\u2640\ufe0f Fangirling. Wannabe Cyclist. Tunes aficionado. Rabid sports fan. Animal lover. 
#MaggieMatters #DontLetMaggieGo #BringMaggieBack","translator_type":"none","protected":false,"verified":false,"followers_count":387,"friends_count":955,"listed_count":7,"favourites_count":16254,"statuses_count":12726,"created_at":"Sun Jul 10 00:58:42 +0000 2016","utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/918123746673565697\/avrCTZGH_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/918123746673565697\/avrCTZGH_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/751943697512574976\/1501111415","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"@AJKreisberg @jessicaqueller @GBerlanti @SupergirlStaff\n\nJust an FYI. Nobody likes Ruby. 
So whatev you're thinking - STOP!\n\n#BringMaggieBack https:\/\/t.co\/s2dB3efXBc","display_text_range":[0,140],"entities":{"hashtags":[{"text":"BringMaggieBack","indices":[124,140]}],"urls":[],"user_mentions":[{"screen_name":"AJKreisberg","name":"Andrew Kreisberg","id":633354279,"id_str":"633354279","indices":[0,12]},{"screen_name":"jessicaqueller","name":"jessica queller","id":2337336336,"id_str":"2337336336","indices":[13,28]},{"screen_name":"GBerlanti","name":"Greg Berlanti","id":635028535,"id_str":"635028535","indices":[29,39]},{"screen_name":"SupergirlStaff","name":"SupergirlWritersRoom","id":3504052692,"id_str":"3504052692","indices":[40,55]}],"symbols":[],"media":[{"id":925054135484993537,"id_str":"925054135484993537","indices":[141,164],"media_url":"http:\/\/pbs.twimg.com\/tweet_video_thumb\/DNZz1ceW0AE1Z9Z.jpg","media_url_https":"https:\/\/pbs.twimg.com\/tweet_video_thumb\/DNZz1ceW0AE1Z9Z.jpg","url":"https:\/\/t.co\/s2dB3efXBc","display_url":"pic.twitter.com\/s2dB3efXBc","expanded_url":"https:\/\/twitter.com\/LiLMiniJ\/status\/925054150404132864\/photo\/1","type":"animated_gif","sizes":{"large":{"w":600,"h":450,"resize":"fit"},"medium":{"w":600,"h":450,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":600,"h":450,"resize":"fit"}},"video_info":{"aspect_ratio":[4,3],"variants":[{"bitrate":0,"content_type":"video\/mp4","url":"https:\/\/video.twimg.com\/tweet_video\/DNZz1ceW0AE1Z9Z.mp4"}]}}]},"extended_entities":{"media":[{"id":925054135484993537,"id_str":"925054135484993537","indices":[141,164],"media_url":"http:\/\/pbs.twimg.com\/tweet_video_thumb\/DNZz1ceW0AE1Z9Z.jpg","media_url_https":"https:\/\/pbs.twimg.com\/tweet_video_thumb\/DNZz1ceW0AE1Z9Z.jpg","url":"https:\/\/t.co\/s2dB3efXBc","display_url":"pic.twitter.com\/s2dB3efXBc","expanded_url":"https:\/\/twitter.com\/LiLMiniJ\/status\/925054150404132864\/photo\/1","type":"animated_gif","sizes":{"large":{"w":600,"h":450,"resize":"fit"},"medium":{"w":600,"h":450,"resize":"fit"},"thumb":{"w"
:150,"h":150,"resize":"crop"},"small":{"w":600,"h":450,"resize":"fit"}},"video_info":{"aspect_ratio":[4,3],"variants":[{"bitrate":0,"content_type":"video\/mp4","url":"https:\/\/video.twimg.com\/tweet_video\/DNZz1ceW0AE1Z9Z.mp4"}]}}]}},"quote_count":0,"reply_count":0,"retweet_count":2,"favorite_count":2,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/OZ3C8FzOLt","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/925054150404132864","display_url":"twitter.com\/i\/web\/status\/9\u2026","indices":[116,139]}],"user_mentions":[{"screen_name":"AJKreisberg","name":"Andrew Kreisberg","id":633354279,"id_str":"633354279","indices":[0,12]},{"screen_name":"jessicaqueller","name":"jessica queller","id":2337336336,"id_str":"2337336336","indices":[13,28]},{"screen_name":"GBerlanti","name":"Greg Berlanti","id":635028535,"id_str":"635028535","indices":[29,39]},{"screen_name":"SupergirlStaff","name":"SupergirlWritersRoom","id":3504052692,"id_str":"3504052692","indices":[40,55]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"LiLMiniJ","name":"J - #\u2764\ufe0f","id":751943697512574976,"id_str":"751943697512574976","indices":[3,12]},{"screen_name":"AJKreisberg","name":"Andrew Kreisberg","id":633354279,"id_str":"633354279","indices":[14,26]},{"screen_name":"jessicaqueller","name":"jessica queller","id":2337336336,"id_str":"2337336336","indices":[27,42]},{"screen_name":"GBerlanti","name":"Greg Berlanti","id":635028535,"id_str":"635028535","indices":[43,53]},{"screen_name":"SupergirlStaff","name":"SupergirlWritersRoom","id":3504052692,"id_str":"3504052692","indices":[54,69]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1509386522862"}
You can stop the program by interrupting the flow or pressing Ctrl-C.
We want to capture this data into a file that we will use later for the analysis. You can do so by piping the output to a file using the following command: python twitter_streaming.py > twitter_data.txt.
I ran the program for 2 days (from 2014/07/15 till 2014/07/17) to get a meaningful data sample. This file size is 242 MB.
The data that we stored in twitter_data.txt is in JSON format. JSON stands for JavaScript Object Notation. This format makes it easy for humans to read the data and for machines to parse it. Below is an example of one tweet in JSON format. You can see that the tweet contains additional information besides the main text, which in this example is: "Yaayyy I learned some JavaScript today! #thatwasntsohard #yesitwas #stoptalkingtoyourself #hashbrown #hashtag".
{"created_at":"Mon Sep 28 00:59:26 +0000 2015","id":648300950789165056,"id_str":"648300950789165056","text":"Ok. CAMELOT. PLEASE DEAR GOD LET THERE BE A MONTY PYTHON REFERENCE. @OnceABC #itIsASillyPlace","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":930666258,"id_str":"930666258","name":"BJ CLAY","screen_name":"BJCLAY2779","location":"Huntington, WV","url":null,"description":"Easy going guy. Dad to 5 wonderful boys. Surving Heart Attack victim, at the age of 33. other then that you want to know something ask me.","protected":false,"verified":false,"followers_count":50,"friends_count":313,"listed_count":1,"favourites_count":45,"statuses_count":156,"created_at":"Tue Nov 06 22:34:29 +0000 2012","utc_offset":-14400,"time_zone":"Eastern Time (US & 
Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/631349246332043264\/Y-V-UZg6_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/631349246332043264\/Y-V-UZg6_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/930666258\/1397610071","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"itIsASillyPlace","indices":[78,94]}],"urls":[],"user_mentions":[{"screen_name":"OnceABC","name":"Once Upon A Time","id":287858728,"id_str":"287858728","indices":[68,76]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1443401966174"}
The output appears to be JSON-formatted data. We will need pandas, json and re to parse this information. I created a small 5-minute sample.
In [25]:
import json
import pandas as pd
tweets_data_path = 'data/twitterdata.txt'

# Parse the capture file one line at a time; each parsed record is a dict.
tweets_data = []
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except ValueError:
            # json.loads raises ValueError (JSONDecodeError on Python 3) for
            # lines that are not valid JSON; skip those and nothing else.
            continue
        # Records lacking a 'text' key would raise KeyError when the 'text'
        # column is built below, so only keep records that carry one.
        if isinstance(tweet, dict) and 'text' in tweet:
            tweets_data.append(tweet)

print(len(tweets_data))

# Build the working DataFrame. List comprehensions are used instead of map()
# because map() returns a lazy iterator on Python 3.
tweets = pd.DataFrame()
tweets['text'] = [tweet['text'] for tweet in tweets_data]
tweets['lang'] = [tweet.get('lang') for tweet in tweets_data]
tweets['time_zone'] = [(tweet.get('user') or {}).get('time_zone') for tweet in tweets_data]
tweets_by_lang = tweets['lang'].value_counts()
tweets
Out[25]:
In [28]:
%matplotlib inline
import matplotlib.pyplot as plt
# Bar chart of the first five entries of tweets_by_lang (tweet counts per language).
fig, ax = plt.subplots()
for axis_name, label_size in (('x', 15), ('y', 10)):
    ax.tick_params(axis=axis_name, labelsize=label_size)
ax.set_title('Top 5 languages', fontsize=15, fontweight='bold')
ax.set_xlabel('Languages', fontsize=15)
ax.set_ylabel('Number of tweets', fontsize=15)
tweets_by_lang[:5].plot(ax=ax, kind='bar', color='red')
Out[28]:
In [29]:
# Despite its name, tweets_by_country holds tweet counts per value of the
# 'time_zone' column; the chart shows its first five entries.
tweets_by_country = tweets['time_zone'].value_counts()
fig, ax = plt.subplots()
for axis_name, label_size in (('x', 15), ('y', 10)):
    ax.tick_params(axis=axis_name, labelsize=label_size)
ax.set_title('Top 5 Time Zones', fontsize=15, fontweight='bold')
ax.set_xlabel('Time Zone', fontsize=15)
ax.set_ylabel('Number of tweets', fontsize=15)
tweets_by_country[:5].plot(ax=ax, kind='bar', color='blue')
Out[29]:
Our main goals in these text mining tasks are to compare the popularity of the Python, Ruby and JavaScript programming languages and to retrieve programming tutorial links. We will do this in 3 steps:
We will add tags to our tweets DataFrame in order to be able to manipulate the data easily. Target tweets that have the "programming" or "tutorial" keywords. Extract links from the relevant tweets.
In [30]:
import re
def word_in_text(word, text):
    """Report whether ``word`` occurs in ``text``, ignoring case.

    The term is matched literally as a substring rather than compiled as a
    regular expression, so terms containing regex metacharacters (for
    example ``c++``) neither raise nor match unintended text.

    Args:
        word: Term to look for.
        text: Text to search.

    Returns:
        True if the lower-cased ``word`` appears anywhere in the lower-cased
        ``text``, otherwise False.
    """
    return word.lower() in text.lower()
# Flag each tweet with whether its text mentions each language (case-insensitive).
tweets['python'] = tweets['text'].apply(lambda tweet: word_in_text('python', tweet))
tweets['javascript'] = tweets['text'].apply(lambda tweet: word_in_text('javascript', tweet))
tweets['ruby'] = tweets['text'].apply(lambda tweet: word_in_text('ruby', tweet))

# Summing a boolean column counts its True values and gives 0 when there are
# none, whereas value_counts()[True] raises KeyError in that case.
print(tweets['python'].sum())
print(tweets['javascript'].sum())
print(tweets['ruby'].sum())
In [6]:
prg_langs = ['python', 'javascript', 'ruby']
# Summing each boolean flag column counts the tweets mentioning that language
# (0 when none do, where value_counts()[True] would raise KeyError).
tweets_by_prg_lang = [tweets[prg_lang].sum() for prg_lang in prg_langs]
x_pos = list(range(len(prg_langs)))
width = 0.8
fig, ax = plt.subplots()
# Centre every bar on its x position explicitly so the tick labels below line
# up with the bars whatever the installed matplotlib's default alignment is.
ax.bar(x_pos, tweets_by_prg_lang, width, alpha=1, color='g', align='center')
# Setting axis labels and ticks
ax.set_ylabel('Number of tweets', fontsize=15)
ax.set_title('Ranking: python vs. javascript vs. ruby (Raw data)', fontsize=10, fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels(prg_langs)
ax.grid()
Now that we have extracted the relevant tweets, we want to retrieve links to programming tutorials. We will start by creating a function that uses a regular expression to retrieve a link that starts with "http://" or "https://" (or "www.") from a text. This function will return the URL if one is found; otherwise it returns an empty string.
In [31]:
def extract_link(text):
    """Return the first link found in ``text``, or '' when there is none.

    A link is a run of characters starting with ``http://``, ``https://`` or
    ``www.`` and continuing up to the next whitespace, ``<``, ``>`` or ``"``.
    """
    found = re.search(r'https?://[^\s<>"]+|www\.[^\s<>"]+', text)
    return found.group() if found else ''
# add a column called link to our tweets DataFrame. This column will contain the urls information.
tweets['link'] = tweets['text'].apply(extract_link)
tweets['programming'] = tweets['text'].apply(lambda tweet: word_in_text('programming', tweet))
tweets['tutorial'] = tweets['text'].apply(lambda tweet: word_in_text('tutorial', tweet))
# A tweet is relevant when it mentions either keyword; combine the two flag
# columns computed above instead of scanning every tweet text twice more.
tweets['relevant'] = tweets['programming'] | tweets['tutorial']

# Summing a boolean column counts its True values and gives 0 when there are
# none, whereas value_counts()[True] raises KeyError in that case.
print('Tweets about programming ->' + str(tweets['programming'].sum()))
# print(tweets['tutorial'].sum())
print("Relevant tweets -> " + str(tweets['relevant'].sum()))

tweets_relevant = tweets[tweets['relevant']]
tweets_relevant_with_link = tweets_relevant[tweets_relevant['link'] != '']

# For each language, list the links from relevant tweets that mention it.
# The 'python', 'javascript' and 'ruby' flag columns must already exist.
for prg_lang in ('python', 'javascript', 'ruby'):
    print('\n' + prg_lang + ':\n')
    print(tweets_relevant_with_link[tweets_relevant_with_link[prg_lang]]['link'])
In [ ]: