In [1]:
from collections import Counter
import json
import os
import sys
import time
from urllib.parse import urlparse
In [2]:
%matplotlib inline
from IPython.display import display, Image
import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import seaborn as sns
In [3]:
%load_ext autoreload
%autoreload 1
%autoreload
is indeed magic. Read more about it in the IPython docs. Here it means that anytime a twarc
function (which we'll import with %aimport
next) is called, the kernel should reload twarc
first. This allows me to change the library in the background and see the changes immediately here.
In [4]:
%aimport twarc
import tweepy
We start with establishing an API connection using twarc. This requires establishing our API key credentials, which I've included in my shell environment, so let's assign them first.
In [5]:
# Twitter API credentials come from the shell environment -- never hardcoded.
consumer_key, consumer_secret, access_token, access_token_secret = (
    os.environ.get(name) for name in
    ('CONSUMER_KEY', 'CONSUMER_SECRET', 'ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET'))
# Twarc client used for all search calls below.
t = twarc.Twarc(consumer_key=consumer_key, consumer_secret=consumer_secret,
                access_token=access_token, access_token_secret=access_token_secret)
In [6]:
def get_tweets (count=5):
    """Return up to `count` English tweets matching 'lahoreblast'.

    Iterates the twarc search generator and stops once `count` tweets
    have been collected (fewer may come back if the API runs dry).
    """
    tweets = []
    for seen, tweet in enumerate(t.search('lahoreblast', lang='en')):
        if seen >= count:
            break
        tweets.append(tweet)
    return tweets
In [7]:
# Quick sanity check: fetch a handful of tweets and show id + text.
tweets = get_tweets(5)
for hit in tweets:
    print('%s - %s' % (hit['id'], hit["text"]))
Just to be thorough, let's try that again:
In [45]:
# Repeat the identical call to see which tweets come back again.
tweets = get_tweets(5)
for repeat_hit in tweets:
    print('%s - %s' % (repeat_hit['id'], repeat_hit["text"]))
Do you notice how one of the tweets (#714488353794580480, the "@NicoleGLeier" RT) repeats? This is due to the default behavior of the search API call result_type
option. By default, it uses mixed
, which includes a few popular tweets and then recent tweets.
If we change the twarc library (which I've now done under the hood) to accept the result_type
option, we can explore this more directly. And let's just look at the IDs to keep it simple.
In [8]:
def get_tweets (count=5, result_type='mixed', q='lahoreblast'):
    """Collect up to `count` English tweets for a search query.

    Parameters
    ----------
    count : int
        Maximum number of tweets to return (fewer if the API yields less).
    result_type : str
        Forwarded to the search API: 'mixed' (default), 'recent' or
        'popular'.
    q : str
        Search query. Parameterized (backward-compatible default) so the
        helper is reusable beyond the #lahoreblast tag.

    Returns
    -------
    list of tweet dicts as produced by twarc.
    """
    tweets = []
    for seen, tweet in enumerate(t.search(q, lang='en', result_type=result_type)):
        # enumerate replaces the manual counter; stop after `count` tweets.
        if seen >= count:
            break
        tweets.append(tweet)
    return tweets
In [9]:
# Ask explicitly for the 'popular' bucket only.
tweets = get_tweets(5, result_type='popular')
for pop in tweets:
    print('%s - %s' % (pop['id'], pop["text"]))
And repeating the same thing to see if the same popular tweets come through:
In [48]:
# Same 'popular' request again -- expecting identical results back.
tweets = get_tweets(5, result_type='popular')
for pop in tweets:
    print('%s - %s' % (pop['id'], pop["text"]))
Exactly the same! Let's try a little more substantial set.
In [10]:
# Two back-to-back 'popular' pulls should overlap almost entirely.
popular1 = {tw['id'] for tw in get_tweets(20, result_type='popular')}
popular2 = {tw['id'] for tw in get_tweets(20, result_type='popular')}
print('intersection: %s' % len(popular1 & popular2))
print('difference: %s' % len(popular1 - popular2))
Okay then, let's try again with recent
. We need to "sleep" a little between requests to ensure there's a small time lag between calls; otherwise we'll mostly get the same tweets again the second time. Thirty seconds later, the most recent tweets should be new for a big trending tag.
In [11]:
# For 'recent', two pulls separated by a pause should mostly differ
# on a fast-moving tag.
recent1 = {tw['id'] for tw in get_tweets(20, result_type='recent')}
# a little delay to allow some new tweets through
time.sleep(30)
recent2 = {tw['id'] for tw in get_tweets(20, result_type='recent')}
print('intersection: %s' % len(recent1 & recent2))
print('difference: %s' % len(recent1 - recent2))
Works! Pull request sent. :)
In [12]:
# Spot-check what the 'recent' bucket returns.
tweets = get_tweets(5, result_type='recent')
for rec in tweets:
    print('%s - %s' % (rec['id'], rec["text"]))
In [13]:
# Large pull of recent tweets for the entity analysis below; the search API
# limits how far back it reaches, so fewer than 5000 may come back.
recent_tweets = get_tweets(5000, result_type='recent')
In [14]:
# How many tweets the big pull actually returned.
len(recent_tweets)
Out[14]:
In [15]:
# Tally hashtags that co-occur with the search tag, skipping the tag itself.
counter_hashtags = Counter(
    hashtag['text']
    for tweet in recent_tweets
    for hashtag in tweet['entities']['hashtags']
    if hashtag['text'].lower() != 'lahoreblast')
In [16]:
# Top 10 companion hashtags.
counter_hashtags.most_common(10)
Out[16]:
In [17]:
# Bar chart of the 50 most frequent companion hashtags.
x, y = zip(*counter_hashtags.most_common(50))
f, ax = plt.subplots(figsize=(14, 3))
sns.barplot(x=x, y=y, ax=ax)
ax.tick_params(axis='y', labelsize=13)
plt.xticks(rotation=85, fontsize=12)
Out[17]:
In [19]:
# Count fully-expanded URLs shared in the sample.
counter_urls = Counter(
    url['expanded_url']
    for tweet in recent_tweets
    for url in tweet['entities']['urls'])
In [20]:
# Most-shared links.
counter_urls.most_common(25)
Out[20]:
In [22]:
# Collapse the shared links down to their hosting domain.
counter_domains = Counter(
    urlparse(url['expanded_url']).netloc
    for tweet in recent_tweets
    for url in tweet['entities']['urls'])
In [23]:
# Domains behind the shared links, most frequent first.
x, y = zip(*counter_domains.most_common(50))
f, ax = plt.subplots(figsize=(14, 3))
sns.barplot(x=x, y=y, ax=ax)
ax.tick_params(axis='y', labelsize=13)
plt.xticks(rotation=85, fontsize=13)
Out[23]:
In [24]:
# Tally @-mentions by screen name.
counter_mentions = Counter(
    mention['screen_name']
    for tweet in recent_tweets
    for mention in tweet['entities']['user_mentions'])
In [25]:
# Most-mentioned accounts.
counter_mentions.most_common(25)
Out[25]:
In [26]:
# The 75 most-mentioned accounts.
x, y = zip(*counter_mentions.most_common(75))
f, ax = plt.subplots(figsize=(14, 3))
sns.barplot(x=x, y=y, ax=ax)
ax.tick_params(axis='y', labelsize=13)
plt.xticks(rotation=90, fontsize=12)
Out[26]:
In [27]:
# Count attached media by HTTPS URL; tweets with no media contribute nothing
# (hence .get with an empty-list default).
counter_media = Counter(
    media['media_url_https']
    for tweet in recent_tweets
    for media in tweet['entities'].get('media', []))
In [28]:
# Most-shared media URLs.
counter_media.most_common(25)
Out[28]:
In [29]:
# Displayable Image objects for the ten most-shared media URLs
# (the count is unused here, hence the underscore name).
images = [Image(url) for url, _count in counter_media.most_common(10)]
In [31]:
# Show each of the top-ten media items along with its reference count.
for media_url, refs in counter_media.most_common(10):
    print('%s references to %s' % (refs, media_url))
    display(Image(media_url))
In [32]:
# A larger pull of the 'popular' bucket for the author analysis below.
popular = get_tweets(25, result_type='popular')
In [33]:
# Peek at the text of the popular tweets. (The comprehension's `t` does not
# leak in Python 3, so the twarc client `t` is unaffected here.)
[t['text'] for t in popular]
Out[33]:
In [186]:
# Print screen name vs. display name for each popular-tweet author.
# BUG FIX: the original loop used `t` as its variable, which clobbered the
# module-level twarc client `t` -- any later `t.search(...)` call would then
# fail on re-run. Renamed to `tweet`.
for tweet in popular:
    u = tweet['user']
    print('Screen name "%s", name "%s"' % (u['screen_name'], u['name']))
In [34]:
# Authors of popular tweets vs. the 1000 most-mentioned screen names.
popular_screen_names = {tw['user']['screen_name'] for tw in popular}
screen_names, counts = zip(*counter_mentions.most_common(1000))
common_mentions = set(screen_names)
All of the following screen names are both authors of popular tweets and mentions of many other users.
In [35]:
# Popular-tweet authors who are also heavily mentioned.
popular_screen_names.intersection(common_mentions)
Out[35]:
But these remaining few are authors of popular tweets but not as commonly mentioned.
In [36]:
# Popular-tweet authors who are NOT among the commonly mentioned accounts.
popular_screen_names.difference(common_mentions)
Out[36]:
Let's take a closer look at the follow/follower counts of these two sets of users. We should be able to pull the key data out from the tweets we already have.
In [37]:
# Follower/following counts pulled straight from the tweet payloads.
# BUG FIX: renamed the loop variable from `t` (which shadowed and then
# permanently replaced the twarc client `t`) to `tweet`, so later cells can
# still call the API after this one runs.
for tweet in popular:
    u = tweet['user']
    print('%s (%s): %s followers, following %s' % (u['screen_name'], u['name'],
                                                   u['followers_count'], u['friends_count']))
Tweepy has handy methods for collecting friend/follower info for given users. Let's pull that in and construct a network of who these popular twitterers are following.
In [38]:
import tweepy
# OAuth 1a handshake for tweepy, reusing the same credentials as twarc.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# tweepy API handle used below for friends/ids lookups.
api = tweepy.API(auth)
In [40]:
# Seed the follow network: for each author of a popular tweet, fetch the full
# list of account ids they follow ("friends" in Twitter API terms).
follow_net = []   # list of (user_id, [friend_id, ...]) pairs
user_set = set()  # union of every friend id seen
for user in [t['user'] for t in popular]:
    print('Fetching users for %s' % user['screen_name'])
    # friends/ids is rate-limited to 15 calls / 15 minutes, so pause between
    # calls. NOTE(review): this also sleeps before the very first call --
    # wasteful but harmless; consider sleeping after the request instead.
    time.sleep(120)
    user_id = user['id']
    follows = api.friends_ids(user_id)
    follow_net.append((user_id, follows))
    user_set.update(follows)
Unfortunately the rate limiting on that function call is severe: 15 calls every 15 minutes. When debugging, you run into that quickly!
Oh well, we got enough data to move forward some. First, let's see how much overlap there might be by checking the overall set against the complete count of each following list.
In [ ]:
# The seed user ids we actually managed to fetch before hitting rate limits.
[x for x, y in follow_net]
In [325]:
# Distinct accounts followed by at least one seed user.
len(user_set)
Out[325]:
In [326]:
# Total follow edges (with multiplicity) across all seed users; the gap
# between this and len(user_set) measures overlap in who they follow.
sum([len(ids) for u, ids in follow_net])
Out[326]:
That's already a significant difference. The graph should show this clearly.
In [371]:
# Undirected graph with one edge per (seed user, followed account) pair;
# node ids are stringified to keep a uniform node type.
g = nx.Graph()
for seed_id, friend_list in follow_net:
    g.add_edges_from((str(seed_id), str(fid)) for fid in friend_list)
In [381]:
# Reduced graph: drop the seed users themselves and keep only accounts
# followed by more than 3 seeds.
sg = nx.Graph()
popular_ids = [t['user']['id'] for t in popular]
# PERF FIX: the original rebuilt `[str(i) for i in popular_ids]` inside the
# loop and did a list membership test, making the pass quadratic in the node
# count. Build the string-id set once and test against it.
popular_id_strs = {str(i) for i in popular_ids}
for node in sorted(g.nodes()):
    if node in popular_id_strs:
        continue
    if len(g.edges(node)) > 3:
        sg.add_edges_from(g.edges(node))
In [382]:
# Map seed user ids (stringified, to match graph node names) to screen names
# for optional labeling of the plot below.
labels = {}
follow_net_users = [seed for seed, _friends in follow_net]
for user in (tw['user'] for tw in popular):
    if user['id'] in follow_net_users:
        labels[str(user['id'])] = user['screen_name']
In [386]:
# Draw the reduced follow graph; node labels stay disabled to keep the
# picture readable.
plt.figure(figsize=(12, 12))
pos = nx.fruchterman_reingold_layout(sg)
nx.draw_networkx_nodes(sg, pos, node_color='#888888', node_size=100)
nx.draw_networkx_edges(sg, pos, alpha=0.5, arrows=False,
                       edge_color='#bbbbbb')
#nx.draw_networkx_labels(sg, pos, labels, fontsize=18)
plt.axis('off')
plt.show()