The purpose of this exercise is to perform some data analysis on Tweets related to the Shellshock vulnerability.
The data was downloaded using Twitter's REST API and the Tweepy Python module and placed in a simple text file.
Each line in the input file is a JSON-encoded Tweet related to the Shellshock vulnerability.
See Download Tweets for Shellshock.ipynb for the download part.
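For reference, the download step would have looked roughly like the sketch below. This is a hypothetical reconstruction, not the code from Download Tweets for Shellshock.ipynb: the search query, the output file name and the credential placeholders are all assumptions.

import gzip
import json
import tweepy

# Hypothetical credentials; the real values live in the download notebook.
auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
api = tweepy.API(auth)

# Write one JSON-encoded tweet per line, gzip compressed.
with gzip.open('data/shellshockTweets.txt.gz', 'w') as out:
    for status in tweepy.Cursor(api.search, q='shellshock').items():
        out.write(json.dumps(status._json) + '\n')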
In [1]:
%matplotlib inline
In [2]:
import pandas as pd
import sys
import json
from datetime import datetime
import gzip
import time
from ggplot import *
import requests as r
from httpcache import CachingHTTPAdapter
from IPython.display import HTML, display_json, \
display_html, display_javascript, JSON, Javascript
import re
import numpy as np
# Some Pandas options TODO revisit this later
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth',2000)
# Cache http requests TODO confirm that this works
s = r.Session()
s.mount('http://', CachingHTTPAdapter())
s.mount('https://', CachingHTTPAdapter())
In [3]:
fName = "data/shellshockTweets.txt.gz"
tweets = []
st = time.time()
with gzip.GzipFile(fName, 'r') as fp:
    for tweet in fp:
        t = json.loads(tweet)
        # Parse 'created_at' into a proper datetime.
        t['created_at'] = \
            datetime.strptime(t['created_at'],
                              '%a %b %d %H:%M:%S +0000 %Y')
        tweets.append(t)
et = time.time()
print ('Loaded {0} tweets in {1:.2f} secs'.format(len(tweets),(et-st)))
In [4]:
# TODO pep8ify this code block
st = time.time()
tweetsDF = pd.DataFrame(tweets)
tweetsDF['date'] = tweetsDF['created_at'].map(lambda x: x.date()) # Just the date
tweetsDF['timestamp_f'] = tweetsDF['created_at'].map(lambda x: time.mktime(x.timetuple())) # As float for density plot
tweetsDF['user_info'] = tweetsDF['user'].map(lambda u:u['name']+'(@'+u['screen_name']+')')
tweetsDF['user_loc'] = tweetsDF['user'].map(lambda u: u['location'])
tweetsDF['tweeted_to'] = tweetsDF['user'].map(lambda u: u['followers_count'])
tweetsDF['screen_name'] = tweetsDF['user'].map(lambda u: u['screen_name'])
tweetsDF['status'] = pd.isnull(tweetsDF['retweeted_status']).map(lambda s: 'original' if s else 'retweet') # Is it a retweet?
# Extract the link text (the client name) from the 'source' HTML anchor.
regexp_link = r'''<a .+>(.+)</a>'''
pattern = re.compile(regexp_link)
tweetsDF['source'] = tweetsDF['source'].map(lambda x: pattern.findall(x)[0])
# Hashtags and URLs are buried deep in the JSON,
# extract them and convert them to a space-separated string.
tweetsDF['hashtags'] = tweetsDF['entities'].map(
lambda x: ' '.join(([ht['text'].lower().strip()
for ht in x['hashtags']])))
tweetsDF['urls'] = tweetsDF['entities'].map(
lambda x: ' '.join(([ht['expanded_url'].strip()
for ht in x['urls']])))
et = time.time()
# A dedicated DF for unique Tweets.
uniqtweetsDF = tweetsDF[tweetsDF['status'] == 'original']
uniqtweetsDF.reset_index(inplace=True)
print ('Parsed tweets in {0:.2f} secs and found {1} unique tweets'.format((et-st),len(uniqtweetsDF)))
In [ ]:
# Sample a few Tweets
from IPython.display import display
sample=tweetsDF.loc[[27,446,9898]]
HTML(sample.T.to_html())
In [50]:
# All Tweets per day, stacked by status (original vs retweet), using geom_histogram
ggplot(tweetsDF,
aes(x='factor(date)', fill='factor(status)', color='factor(status)')) + \
geom_histogram(alpha=0.8) + scale_y_continuous(labels='comma') + \
theme(axis_text_x=element_text(angle=45, hjust=1)) + \
xlab('Date') + ylab('Tweets (Blues are retweets)') + ggtitle('All Tweets related to Shellshock per day\n')
Out[50]:
Some Notes & Observations
Notes
Observations
In [25]:
# Same as above but unstacked.
ggplot(tweetsDF,
aes(x='factor(date)', fill='factor(status)', color='factor(status)')) + \
geom_histogram(alpha=0.8) + scale_y_continuous(labels='comma') + \
theme(axis_text_x=element_text(angle=45, hjust=1)) + \
xlab('Date') + ylab('Tweets') + \
ggtitle('Distribution of Original Tweets vs Retweets related to Shellshock per day')+ \
facet_grid('factor(status)')
Out[25]:
In [51]:
# Same as above but as a Density Plot
ggplot(tweetsDF,
aes(x='timestamp_f', color='factor(status)', fill='factor(status)')) + \
geom_density(alpha=0.2) + \
theme(axis_text_x=element_text(angle=45, hjust=1)) + \
xlab('Time') + ylab('Density (Blue is retweets)') + \
ggtitle('Density plot of Timestamp of all Tweets related to Shellshock\n')
Out[51]:
In [27]:
# Tweets and Retweets by Top 6 languages
ggplot(tweetsDF[tweetsDF['lang'].isin(
pd.value_counts(tweetsDF['lang'])[:6].index)],
aes(x='factor(date)', fill='factor(lang)')) + \
geom_histogram(alpha=0.7) + scale_y_continuous(labels='comma') + \
theme(axis_text_x=element_text(angle=45, hjust=1)) + \
xlab('Date') + ylab('Tweets') + \
ggtitle('Daily Distribution of All Tweets related to Shellshock by Top 6 Languages') + \
facet_wrap('factor(lang)')
Out[27]:
Before diving into the charts, let's look at what the top 20 retweeted tweets were.
Looking at the data below, none of the tweets (bar the first one) was retweeted more than a few hundred times.
Infosec tweets don't attract the same kind of attention as what the flavor-of-the-month celebrity had for breakfast.
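Before rendering the embedded tweets, here is a quick way to put a number on that claim (a small sketch using the retweet_count column; the exact quantiles will of course depend on the dataset):

# How rare is a heavily retweeted tweet? A few quantiles of the retweet
# counts across the original tweets make the point numerically.
for q in (0.50, 0.90, 0.99):
    print('{0:.0%} of original tweets have <= {1:.0f} retweets'.format(
        q, uniqtweetsDF['retweet_count'].quantile(q)))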
In [35]:
# Top 20 retweeted tweets
top20Retweeted = uniqtweetsDF.sort(columns='retweet_count',ascending=False)[:20].reset_index()
# Display Top 20 Retweeted tweets using Twitter's oembed API.
HTML('<div><h3>Top 20 Retweeted Tweets</h3><br/>' + \
"".join(['<div> Retweeted '+"{:,}".format(row['retweet_count'])+' times:' + \
r.get('https://api.twitter.com/1/statuses/oembed.json?id='+ \
row['id_str']+'&align=center&omit_script=true').json()['html']+ \
'</div>'
for row in top20Retweeted[['id_str','retweet_count']].T.to_dict().values()]) + \
'</div>')
Out[35]:
In [52]:
# Plot the owners of these top 20 retweeted tweets against retweet count.
ggplot(top20Retweeted,
aes(x='factor(screen_name)', y='retweet_count') )+ \
geom_histogram(stat='identity',alpha=0.8) + scale_y_continuous(labels='comma') + \
theme(axis_text_x=element_text(angle=45, hjust=1)) + \
xlab('User') + ylab('Number of Retweets') + \
ggtitle('Owners of Top 20 Retweeted tweets related to Shellshock\n')
Out[52]:
In [5]:
# Helper functions to plot hashtags, urls.
def getAttrFreq(df, listKey, attrKey):
    all_attrs = pd.Series([attr for attrs in df[listKey]
                           for attr in attrs.split(" ")])
    all_attrs = all_attrs[all_attrs.map(len) > 0] # Discard blank ones
    all_attrs.reset_index(drop=True, inplace=True)
    freqDF = pd.value_counts(all_attrs).reset_index()
    freqDF.columns = [attrKey, 'count']
    return freqDF

def plotTopNAttrs(df, listKey, attrKey, howMany):
    top_N = getAttrFreq(df, listKey, attrKey)
    return ggplot(top_N[:howMany], aes(x='factor('+attrKey+')', y='count')) + \
        geom_histogram(stat='identity', alpha=0.8) + scale_y_continuous(labels='comma') + \
        theme(axis_text_x=element_text(angle=45, hjust=1)) + xlab(attrKey.capitalize()) + \
        ylab('Count') + ggtitle('Top {0} {1} in Tweets related to Shellshock\n'.format(howMany, listKey))
In [54]:
# Plot Top 20 hashtags in Tweets (including retweets)
plotTopNAttrs(tweetsDF, 'hashtags', 'hashtag', 20)
Out[54]:
In [55]:
# Plot Top 20 hashtags in unique Tweets.
plotTopNAttrs(uniqtweetsDF, 'hashtags', 'hashtag', 20)
Out[55]:
In [56]:
# Tweets and Retweets by Source
top_N_sources = pd.value_counts(tweetsDF['source'])[:10].reset_index()
top_N_sources.columns=['source','count']
ggplot(top_N_sources, aes(x='factor(source)', y='count')) + \
geom_histogram(stat='identity',alpha=0.75) + scale_y_continuous(labels='comma') + \
theme(axis_text_x=element_text(angle=45, hjust=1)) + xlab('Source') + \
ylab('Count') + ggtitle('Top 10 Sources of Tweets related to Shellshock\n')
Out[56]:
Looking at the chart above
Next we look at the top 20 user locations.
Note: a user location is the location a user puts in their profile; this is not the same as a tweet's geo location. So if someone puts 'London, UK' as their location but actually tweets from New York, NY, it still counts as a 'London, UK' tweet for this graph.
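To make the distinction concrete, here is a minimal illustration of the two fields involved; the first row is just an arbitrary example and the printed values are placeholders:

# The profile location is free text set by the user; the tweet's geo field is
# only populated if the user enabled location sharing for that tweet.
first = tweetsDF.iloc[0]
print(first['user']['location'])  # e.g. 'London, UK' -- whatever the profile says
print(first['geo'])               # usually None, otherwise {'type': 'Point', 'coordinates': [...]}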
In [66]:
# Tweets and Retweets by User Locations
# Note these are locations set in a User Profile, not Geo Locations associated with a Tweet.
top_N_userLocs = pd.value_counts(tweetsDF['user_loc'])[1:21].reset_index() # 1st entry is <blank> so ignore it.
top_N_userLocs.columns=['user_location','count']
ggplot(top_N_userLocs, aes(x='factor(user_location)', y='count')) + \
geom_histogram(stat='identity',alpha=0.8) + scale_y_continuous(labels='comma') + \
theme(axis_text_x=element_text(angle=45, hjust=1)) + xlab('User Location') + \
ylab('Count') + ggtitle('Top 20 User Locations of Tweets related to Shellshock\n')
# TODO Figure out if we can combine buckets like 'NY', 'New York', 'New York, NY' into a single bucket.
Out[66]:
Frankly this was a bit of a surprise; the UK, and especially London, leads the pack.
There is some data clean-up required here: entries like 'London' and 'London, UK', or 'New York' and 'New York, NY', should be combined. But regardless, London, UK is still ahead of the other locations by a fair margin.
I had no idea London was such a hotbed for the infosec crowd.
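As an aside, the clean-up mentioned above could start out roughly like the sketch below; the alias table is a hypothetical, hand-picked example, not an exhaustive normalisation.

# Hypothetical alias table folding a few obvious location variants together
# before counting; a real clean-up would need a much longer list (or a geocoder).
loc_aliases = {
    'london': 'London, UK',
    'london, uk': 'London, UK',
    'new york': 'New York, NY',
    'new york, ny': 'New York, NY',
    'nyc': 'New York, NY',
}

def normalize_loc(loc):
    loc = (loc or '').strip()
    return loc_aliases.get(loc.lower(), loc)

tweetsDF['user_loc_clean'] = tweetsDF['user_loc'].map(normalize_loc)
pd.value_counts(tweetsDF['user_loc_clean'])[1:21]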
In [58]:
# Plot Top 20 URLs in all Tweets (including retweets).
plotTopNAttrs(tweetsDF, 'urls', 'url', 20)
Out[58]:
I would not have expected cnet.com to be a leading cited source for infosec/vulnerability data. But there is a reason for this: that cnet URL is the one mentioned in the most retweeted tweet (the one that got retweeted ~11K times), hence it ends up as the leading URL across all the tweets.
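A quick, hypothetical sanity check of that explanation, using the columns built earlier (the deprecated sort call matches the pandas version used elsewhere in this notebook):

# The single most retweeted tweet and the URLs it carries; if the reasoning
# above holds, the cnet.com link should show up here.
top1 = uniqtweetsDF.sort(columns='retweet_count', ascending=False).iloc[0]
print('{0:,} retweets -> {1}'.format(top1['retweet_count'], top1['urls']))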
Next we look at URLs in only the original tweets for comparison.
In [59]:
# Plot Top 20 URLS in unique Tweets.
plotTopNAttrs(uniqtweetsDF, 'urls', 'url', 20)
Out[59]:
As you can now see, the cnet URL is nowhere to be found if you take only the original tweets into account. Here the more traditional infosec-focused websites take the lead, with the notable exceptions of the BBC and Wired.
In [19]:
users = pd.value_counts(tweetsDF['screen_name']).reset_index()
users.columns = ['screen_name','tweets']
# Plot the top 20 tweeters against tweet count.
ggplot(users[0:20],
aes(x='factor(screen_name)', y='tweets' )) + \
geom_histogram(stat='identity',alpha=0.8) + scale_y_continuous(labels='comma') + \
theme(axis_text_x=element_text(angle=90)) + \
xlab('User') + ylab('Number of Tweets') + \
ggtitle('Top 20 tweeters of tweets related to Shellshock\n')
Out[19]:
This needs more drilling into. Apparently a lot of accounts were repeatedly tweeting about 'shellshock'; I need to find out whether these were unique tweets or the same content being tweeted over and over again.
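One way to start that drill-down is sketched below, under the assumption that exact duplicate text is a good enough proxy for "the same stuff":

# For each of the top 20 tweeters, compare how many tweets they posted with
# how many distinct texts they posted; a large gap suggests the same content
# being tweeted over and over.
top_tweeters = users[0:20]['screen_name']
subset = tweetsDF[tweetsDF['screen_name'].isin(top_tweeters)]
summary = pd.DataFrame({
    'tweets': subset.groupby('screen_name').size(),
    'distinct_texts': subset.groupby('screen_name')['text'].agg(
        lambda texts: len(texts.unique())),
})
print(summary.sort(columns='tweets', ascending=False))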
In [70]:
import nltk
from nltk.corpus import cmudict
ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words('english'))
NON_ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words()) - ENGLISH_STOPWORDS
# for now only unique tweets in English language
uniqtexts = uniqtweetsDF.query('lang == "en"')['text']
t = nltk.tokenize.WhitespaceTokenizer()
p=re.compile(r'[^-#A-Za-z0-9]') # get rid of unwanted characters
tokenLists = uniqtexts.map(lambda text: [p.sub('', word.strip().lower())
for word in t.tokenize(text) if
(not word.lower() in ENGLISH_STOPWORDS) and
( len(word.strip()) > 1)])
all_tokens = pd.Series([token for tokenList in tokenLists for token in tokenList if
(token is not None) and
(token != '')])
In [71]:
# Plot the top 20 tokens.
top_N_words = pd.value_counts(all_tokens)[:20].reset_index()
top_N_words.columns=['token','count']
ggplot(top_N_words,
aes(x='factor(token)', y='count') )+ \
geom_histogram(stat='identity',alpha=0.8) + scale_y_continuous(labels='comma') + \
theme(axis_text_x=element_text(angle=45, hjust=1)) + \
xlab('Word') + ylab('Count') + \
ggtitle('Top 20 words in Original tweets in "English" related to Shellshock\n')
Out[71]:
Enough of bar charts; let's look at some other interesting ways to visualize our data.
This plot shows the density of potential viewers of each tweet.
NOTE: this is based only on the immediate number of followers of a user, and does not include followers of followers etc.
As can be seen, most tweets were seen by fewer than 500 or so users, again confirming that the infosec crowd is not as popular as celebrities.
In [8]:
ggplot(tweetsDF.query('(tweeted_to > 10)').reset_index(),
aes(x='tweeted_to', color='factor(status)', fill='factor(status)')) + \
geom_density(alpha=0.2) + scale_x_log10() + \
theme(axis_text_x=element_text(angle=45, hjust=1)) + \
xlab('Potential Viewers') + ylab('Density (Blue is retweets)') + \
ggtitle('Density Plot of Potential Viewers of Each Tweet')
Out[8]:
In [9]:
ggplot(uniqtweetsDF.query('(retweet_count >0)').reset_index(),
aes(x='retweet_count', color='factor(status)', fill='factor(status)')) + \
geom_density(alpha=0.2) + scale_x_log10() + \
theme(axis_text_x=element_text(angle=45, hjust=1)) + \
xlim(low=1) + xlab('Times Retweeted') + ylab('Density') + \
ggtitle('Density Plot of Retweet Count of Each Original Tweet')
Out[9]:
My assumption here was that there should be some positive correlation between the number of followers you have and the number of retweets you'll get.
Surprisingly, what I found was a bit of a negative correlation, i.e. the lower the number of followers, the higher the number of retweets. And the only way this can happen is if you have a low number of followers but they themselves have a high number of followers.
It was still interesting to see this graphed out.
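For what it's worth, that eyeballed relationship can be summarised in a single number; the choice of Spearman rank correlation is my assumption (it copes better with the heavily skewed counts than Pearson would):

# Rank correlation between follower count and retweet count for original
# tweets that were retweeted at least once; a negative value backs up the
# observation above.
rt = uniqtweetsDF.query('retweet_count > 0')
print(rt['tweeted_to'].corr(rt['retweet_count'], method='spearman'))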
In [15]:
ggplot(uniqtweetsDF.query('(retweet_count>0) & (tweeted_to<500000)').reset_index(),
aes(x='tweeted_to', y='retweet_count') )+ \
geom_point() + scale_y_continuous(labels='comma') + \
theme(axis_text_x=element_text(angle=45, hjust=1)) + \
xlab('Potential Viewers') + ylab('Number of Retweets') + \
scale_y_log10() + ylim(1,2000) + xlim(0,500000) +\
ggtitle('Potential Viewers v/s Retweeted tweets related to Shellshock')
Out[15]:
In [85]:
tweetsWithLoc=tweetsDF[pd.notnull(tweetsDF['geo'])].reset_index()
# Get rid of geo points with coords [0.0,0.0]
tweetsWithLoc = tweetsWithLoc[tweetsWithLoc['geo'].map(
    lambda g: (g['coordinates'][0] != 0) or (g['coordinates'][1] != 0))].reset_index()
print '\n{0} out of {1} tweets i.e. {2:.2f}% have location data\n'.format(
    len(tweetsWithLoc), len(tweetsDF), (100.0*len(tweetsWithLoc)/len(tweetsDF)))
In [ ]:
js_loader = """
function verifyJSLoaded(){
    /*var jsapiLoaded = (typeof google === 'object' && typeof google.maps === 'object');
    console.log("Google API Loaded: " + jsapiLoaded);
    return jsapiLoaded;*/
    return false;
}
function loadScript() {
    if (!verifyJSLoaded()) {
        console.log('Loading Google API.');
        var script = document.createElement("script");
        script.type = "text/javascript";
        script.src = "https://maps.googleapis.com/maps/api/js?sensor=false&libraries=visualization&callback=console.log";
        document.body.appendChild(script);
    }
}
loadScript();
"""
Javascript(js_loader)
In [76]:
html_template = '<div id="{0}" style="width: 1024px; height: 768px"></div>'
#This is to make sure the JS gets loaded before we try to load the maps
time.sleep(1)
In [77]:
def gen_javascript(gJSONs, div_id):
    """
    Generates javascript to draw a heatmap with Google Maps API.
    """
    # Creates Javascript objects which will comprise geoData.
    coords = ',\n '.join(["new google.maps.LatLng(%s, %s)" % tuple(pair)
                          for pair in gJSONs])
    template_jscript = """
    var geoData = [
        %s
    ];
    var map, heatmap;
    function hmap_initialize() {
        var mapOptions = {
            zoom: 1,
            center: new google.maps.LatLng(30.5171, 0.1062),
            mapTypeId: google.maps.MapTypeId.SATELLITE
        };
        map = new google.maps.Map(document.getElementById('%s'),
                                  mapOptions);
        var pointArray = new google.maps.MVCArray(geoData);
        heatmap = new google.maps.visualization.HeatmapLayer({
            data: pointArray
        });
        heatmap.setMap(map);
    }
    hmap_initialize();
    """
    return template_jscript % (coords, div_id)
In [ ]:
HTML(html_template.format('map_001'))
In [ ]:
jscript = gen_javascript(tweetsWithLoc['geo'].map(lambda x: x['coordinates']),'map_001')
Javascript(jscript)
In [86]:
# All of the above works in a locally running notebook but not in nbviewer,
# so here's a static image of what the above code generated.
from IPython.display import Image
Image('tweetsOnMap.png')
Out[86]: