In [ ]:
%matplotlib inline
import time
import calendar
import codecs
import datetime
import json
import sys
import gzip
import string
import glob
import requests
import os
import numpy as np
In [ ]:
crisisInfo = {
"boston": {
"name": "Boston Marathon Bombing",
"time": 1366051740, # Timestamp in seconds since 1/1/1970, UTC
# 15 April 2013, 14:49 EDT -> 18:49 UTC
"directory": "boston",
"keywords": ["boston", "exploision", "bomb", "marathon"],
"box": { # Bounding box for geographic limits
"lowerLeftLon": -124.848974,
"lowerLeftLat": 24.396308,
"upperRightLon": -66.885444,
"upperRightLat": 49.384358,
}
},
"paris_hebdo": {
"name": "Charlie Hebdo Attack",
"time": 1420626600, # Timestamp in seconds since 1/1/1970, UTC
# 7 January 2015, 11:30 CET -> 10:30 UTC
"directory": "paris_hebdo",
"keywords": ["paris", "hebdo"],
"box": {
"lowerLeftLon": -5.1406,
"lowerLeftLat": 41.33374,
"upperRightLon": 9.55932,
"upperRightLat": 51.089062,
}
},
"nepal": {
"name": "Nepal Earthquake",
"time": 1429942286, # Timestamp in seconds since 1/1/1970, UTC
# 25 April 2015, 6:11:26 UTC
"directory": "nepal",
"keywords": ["nepal", "earthquake", "quake", "nsgs"],
"box": {
"lowerLeftLon": 80.0562,
"lowerLeftLat": 26.3565,
"upperRightLon": 88.1993,
"upperRightLat": 30.4330,
}
},
"paris_nov": {
"name": "Paris November Attacks",
"time": 1447446000, # Timestamp in seconds since 1/1/1970, UTC
# 13 November 2015, 20:20 UTC to 23:58 UTC
"directory": "paris_nov",
"keywords": ["paris", "shots", "explosion"],
"box": {
"lowerLeftLon": -5.1406,
"lowerLeftLat": 41.33374,
"upperRightLon": 9.55932,
"upperRightLat": 51.089062,
}
},
"brussels": {
"name": "Brussels Transit Attacks",
"time": 1458629880, # Timestamp in seconds since 1/1/1970, UTC
# 22 March 2016, 6:58 UTC to 08:11 UTC
"directory": "brussels",
"keywords": ["brussels", "bomb", "belgium", "explosion"],
"box": {
"lowerLeftLon": 2.54563,
"lowerLeftLat": 49.496899,
"upperRightLon": 6.40791,
"upperRightLat": 51.5050810,
}
},
}
In [ ]:
print ("Available Crisis Names:")
for k in sorted(crisisInfo.keys()):
print ("\t", k)
In [ ]:
# Replace the name below with your selected crisis
selectedCrisis = "nepal"
The first thing we do is read in tweets from a directory of compressed files. Our collection of compressed tweets is in the 00_data directory, so we'll use pattern matching (called "globbing") to find all the tweet files in the given directory.
Then, for each file, we'll open it, read each line (which is a tweet in JSON form), and build an object out of it. As part of this process, we will extract each tweet's post time and create a map from minute timestamps to the tweets posted during that minute.
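Twitter's "created_at" timestamps look like "Sat Apr 25 06:11:26 +0000 2015". As a quick illustration (using a made-up example value, not one of our tweets), the cell below parses a single timestamp and truncates it to its minute; the full loop in the following cell does exactly this for every tweet.
In [ ]:
# Illustrative only: parse one hypothetical "created_at" value and truncate to the minute
exampleCreatedAt = "Sat Apr 25 06:11:26 +0000 2015"
exampleTime = datetime.datetime.strptime(exampleCreatedAt, "%a %b %d %H:%M:%S +0000 %Y")
print(exampleTime.replace(second=0))  # -> 2015-04-25 06:11:00, the minute bucket key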
In [ ]:
# Determine host-specific location of data
tweetDirectory = crisisInfo[selectedCrisis]["directory"]
tweetGlobPath = os.path.join("..", "00_data", tweetDirectory, "statuses.log.*.gz")
print ("Reading files from:", tweetGlobPath)
# Dictionary for mapping dates to data
frequencyMap = {}
# For counting tweets
globalTweetCounter = 0
# Twitter's time format, for parsing the created_at date
timeFormat = "%a %b %d %H:%M:%S +0000 %Y"
# Each line read from the gzip files is raw bytes, so we decode it to UTF-8 before JSON parsing
reader = codecs.getreader("utf-8")
for tweetFilePath in glob.glob(tweetGlobPath):
print ("Reading File:", tweetFilePath)
for line in gzip.open(tweetFilePath, 'rb'):
# Try to read tweet JSON into object
tweetObj = None
try:
tweetObj = json.loads(reader.decode(line)[0])
except Exception as e:
continue
# Deleted and withheld status messages must be skipped
if ( "delete" in tweetObj.keys() or "status_withheld" in tweetObj.keys() ):
continue
# Try to extract the time of the tweet
try:
currentTime = datetime.datetime.strptime(tweetObj['created_at'], timeFormat)
except:
print (line)
raise
currentTime = currentTime.replace(second=0)
# Increment tweet count
globalTweetCounter += 1
# If our frequency map already has this time, use it, otherwise add
if ( currentTime in frequencyMap.keys() ):
timeMap = frequencyMap[currentTime]
timeMap["count"] += 1
timeMap["list"].append(tweetObj)
else:
frequencyMap[currentTime] = {"count":1, "list":[tweetObj]}
# Fill in any gaps
times = sorted(frequencyMap.keys())
firstTime = times[0]
lastTime = times[-1]
thisTime = firstTime
# We want to look at per-minute data, so we fill in any missing minutes
timeIntervalStep = datetime.timedelta(seconds=60) # One-minute time step
while ( thisTime <= lastTime ):
if ( thisTime not in frequencyMap.keys() ):
frequencyMap[thisTime] = {"count":0, "list":[]}
thisTime = thisTime + timeIntervalStep
print ("Processed Tweet Count:", globalTweetCounter)
In [ ]:
import matplotlib.pyplot as plt
crisisMoment = crisisInfo[selectedCrisis]["time"]
crisisTime = datetime.datetime.utcfromtimestamp(crisisMoment)
crisisTime = crisisTime.replace(second=0)
print ("Crisis Time:", crisisTime)
fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)
plt.title("Tweet Frequency")
# Sort the times into an array for future use
sortedTimes = sorted(frequencyMap.keys())
# What time span do these tweets cover?
print ("Time Frame:", sortedTimes[0], sortedTimes[-1])
# Get a count of tweets per minute
postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]
# We'll put a tick every ten minutes (more would clutter the graph)
smallerXTicks = range(0, len(sortedTimes), 10)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)
# Plot the post frequency
yData = [x if x > 0 else 0 for x in postFreqList]
ax.plot(range(len(frequencyMap)), yData, color="blue", label="Posts")
crisisXCoord = sortedTimes.index(crisisTime)
ax.scatter([crisisXCoord], [np.mean(yData)], c="r", marker="x", s=100, label="Crisis")
ax.grid(b=True, which=u'major')
ax.legend()
plt.show()
In [ ]:
# Create maps for holding counts and tweets for each user
globalUserCounter = {}
globalUserMap = {}
# Iterate through the time stamps
for t in sortedTimes:
timeObj = frequencyMap[t]
# For each tweet, pull the screen name and add it to the list
for tweet in timeObj["list"]:
user = tweet["user"]["screen_name"]
if ( user not in globalUserCounter ):
globalUserCounter[user] = 1
globalUserMap[user] = [tweet]
else:
globalUserCounter[user] += 1
globalUserMap[user].append(tweet)
print ("Unique Users:", len(globalUserCounter.keys()))
In [ ]:
sortedUsers = sorted(globalUserCounter, key=globalUserCounter.get, reverse=True)
print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
print (u, globalUserCounter[u],
"\n\t", "Random Tweet:", globalUserMap[u][0]["text"], "\n----------")
Many of these tweets are not relevant to the event at hand. Twitter is a very noisy place.
Hashtags, however, are high-signal keywords. Maybe the most common hashtags will be more informative.
In [ ]:
# A map for hashtag counts
hashtagCounter = {}
# For each minute, pull the list of hashtags and add to the counter
for t in sortedTimes:
timeObj = frequencyMap[t]
for tweet in timeObj["list"]:
hashtagList = tweet["entities"]["hashtags"]
for hashtagObj in hashtagList:
# We lowercase the hashtag to avoid duplicates (e.g., #MikeBrown vs. #mikebrown)
hashtagString = hashtagObj["text"].lower()
if ( hashtagString not in hashtagCounter ):
hashtagCounter[hashtagString] = 1
else:
hashtagCounter[hashtagString] += 1
print ("Unique Hashtags:", len(hashtagCounter.keys()))
sortedHashtags = sorted(hashtagCounter, key=hashtagCounter.get, reverse=True)
print ("Top Twenty Hashtags:")
for ht in sortedHashtags[:20]:
print ("\t", "#" + ht, hashtagCounter[ht])
We can do the same with URLs to find the most-shared links.
In [ ]:
# A map for URL counts
urlCounter = {}
# For each minute, pull the list of URLs and add to the counter
for t in sortedTimes:
timeObj = frequencyMap[t]
for tweet in timeObj["list"]:
urlList = tweet["entities"]["urls"]
for url in urlList:
urlStr = url["url"]
if ( urlStr not in urlCounter ):
urlCounter[urlStr] = 1
else:
urlCounter[urlStr] += 1
print ("Unique URLs:", len(urlCounter.keys()))
sortedUrls = sorted(urlCounter, key=urlCounter.get, reverse=True)
print ("Top Twenty URLs:")
for url in sortedUrls[:20]:
print ("\t", url, urlCounter[url])
Note how each URL is shortened using Twitter's link shortener. To get a better idea of the content, we should expand each URL.
In [ ]:
print ("Top Expanded URLs:")
for url in sortedUrls[:10]:
try:
r = requests.get(url)
realUrl = r.url
print ("\t", url, urlCounter[url], "->", realUrl)
except:
print ("\t", url, urlCounter[url], "->", "UNKNOWN Failure")
Since URLs and hashtags are both tweet entities, we can do the same for other entities, like user mentions and media.
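All of these entity counters follow the same pattern, so the loop can also be factored into a small helper. The sketch below is an illustrative addition (the countEntities helper is not part of the original notebook); the cells that follow keep the explicit per-entity loops for clarity.
In [ ]:
from collections import Counter

def countEntities(entityKey, fieldKey):
    # Count occurrences of one entity type across every tweet in frequencyMap.
    # entityKey is the key under tweet["entities"] (e.g., "hashtags", "urls", "user_mentions"),
    # fieldKey is the field to pull from each entity object (e.g., "text", "url", "screen_name").
    counter = Counter()
    for t in sortedTimes:
        for tweet in frequencyMap[t]["list"]:
            for entity in tweet["entities"].get(entityKey, []):
                counter[entity[fieldKey]] += 1
    return counter

# Example usage: countEntities("user_mentions", "screen_name").most_common(20)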
In [ ]:
# A map for mention counts
mentionCounter = {}
# For each minute, pull the list of mentions and add to the counter
for t in sortedTimes:
timeObj = frequencyMap[t]
for tweet in timeObj["list"]:
mentions = tweet["entities"]["user_mentions"]
for mention in mentions:
mentionStr = mention["screen_name"]
if ( mentionStr not in mentionCounter ):
mentionCounter[mentionStr] = 1
else:
mentionCounter[mentionStr] += 1
print ("Unique Mentions:", len(mentionCounter.keys()))
sortedMentions = sorted(mentionCounter, key=mentionCounter.get, reverse=True)
print ("Top Twenty Mentions:")
for mention in sortedMentions[:20]:
print ("\t", mention, mentionCounter[mention])
In [ ]:
# A map for media counts
mediaCounter = {}
# For each minute, pull the list of media and add to the counter
for t in sortedTimes:
timeObj = frequencyMap[t]
for tweet in timeObj["list"]:
if ( "media" not in tweet["entities"] ):
continue
mediaList = tweet["entities"]["media"]
for media in mediaList:
mediaStr = media["media_url"]
if ( mediaStr not in mediaCounter ):
mediaCounter[mediaStr] = 1
else:
mediaCounter[mediaStr] += 1
print ("Unique Media:", len(mediaCounter.keys()))
sortedMedia = sorted(mediaCounter, key=mediaCounter.get, reverse=True)
print ("Top Twenty Media:")
for media in sortedMedia[:20]:
print ("\t", media, mediaCounter[media])
We can see that some of this data is relevant, in pictures as well as in hashtags and URLs. Are the most retweeted tweets also useful? Or are they expressing condolences? Or are they completely unrelated?
In [ ]:
# A map from tweet ID to retweet count
tweetRetweetCountMap = {}
rtList = []
# For each minute, pull each tweet's retweet count and record it
for t in sortedTimes:
timeObj = frequencyMap[t]
for tweet in timeObj["list"]:
tweetId = tweet["id_str"]
rtCount = tweet["retweet_count"]
if ( "retweeted_status" in tweet ):
tweetId = tweet["retweeted_status"]["id_str"]
rtCount = tweet["retweeted_status"]["retweet_count"]
tweetRetweetCountMap[tweetId] = rtCount
rtList.append(rtCount)
sortedRetweets = sorted(tweetRetweetCountMap, key=tweetRetweetCountMap.get, reverse=True)
print ("Top Ten Retweets:")
for tweetId in sortedRetweets[:10]:
thisTweet = None
for t in reversed(sortedTimes):
for tweet in frequencyMap[t]["list"]:
if ( tweet["id_str"] == tweetId ):
thisTweet = tweet
break
if ( "retweeted_status" in tweet and tweet["retweeted_status"]["id_str"] == tweetId ):
thisTweet = tweet["retweeted_status"]
break
if ( thisTweet is not None ):
break
print ("\t", tweetId, tweetRetweetCountMap[tweetId], thisTweet["text"])
The most retweeted tweets are dominated by popular content that predates the event. To correct for this, we skip retweets whose original tweet was created before the event.
In [ ]:
print ("Top Ten RECENT Retweets:")
foundTweets = 0
for tweetId in sortedRetweets:
thisTweet = None
# Find the most recent copy of the tweet
for t in reversed(sortedTimes):
for tweet in frequencyMap[t]["list"]:
if ( tweet["id_str"] == tweetId ):
thisTweet = tweet
break
if ( "retweeted_status" in tweet and tweet["retweeted_status"]["id_str"] == tweetId ):
thisTweet = tweet["retweeted_status"]
break
if ( thisTweet is not None ):
break
createdTime = datetime.datetime.strptime(thisTweet['created_at'], timeFormat)
# If tweet creation time is before the crisis, assume irrelevant
if ( createdTime < crisisTime ):
continue
print ("\t", tweetId, tweetRetweetCountMap[tweetId], thisTweet["text"])
foundTweets += 1
if ( foundTweets >= 10 ):
break
In [ ]:
# What keywords are we interested in?
targetKeywords = crisisInfo[selectedCrisis]["keywords"]
# Build an empty map for each keyword we are searching for
targetCounts = {x:[] for x in targetKeywords}
totalCount = []
# For each minute, pull the tweet text and search for the keywords we want
for t in sortedTimes:
timeObj = frequencyMap[t]
# Temporary counter for this minute
localTargetCounts = {x:0 for x in targetKeywords}
localTotalCount = 0
for tweetObj in timeObj["list"]:
tweetString = tweetObj["text"].lower()
localTotalCount += 1
# Add to the counter if the target keyword is in this tweet
for keyword in targetKeywords:
if ( keyword in tweetString ):
localTargetCounts[keyword] += 1
# Add the counts for this minute to the main counter
totalCount.append(localTotalCount)
for keyword in targetKeywords:
targetCounts[keyword].append(localTargetCounts[keyword])
# Now plot the total frequency and frequency of each keyword
fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)
plt.title("Tweet Frequency")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)
ax.semilogy(range(len(frequencyMap)), totalCount, label="Total")
ax.scatter([crisisXCoord], [100], c="r", marker="x", s=100, label="Crisis")
for keyword in targetKeywords:
ax.semilogy(range(len(frequencyMap)), targetCounts[keyword], label=keyword)
ax.legend()
ax.grid(b=True, which=u'major')
plt.show()
Data in social media can be relevant to an event in three ways: temporally relevant, geographically relevant, or topically relevant. So far, we've looked at temporally relevant data, or data that was posted at about the same time as the target event. Now we'll explore geographically relevant data, or data posted near the event.
Twitter allows users to share their GPS locations when tweeting, but only about 2% of tweets have this information. We can extract this geospatial data to look at patterns in different locations.
Each tweet has a field called "coordinates" describing where the tweet was posted from. The field is null if the tweet contains no GPS data; otherwise it holds a point whose coordinates are given in (longitude, latitude) order. We want tweets with this GPS data.
For more information on tweet JSON formats, check out https://dev.twitter.com/overview/api/tweets
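For reference, the "coordinates" field on a GPS-tagged tweet is a GeoJSON point. The values below are made up, but the structure follows Twitter's documentation and shows the (longitude, latitude) ordering we rely on later.
In [ ]:
# A hypothetical example of the "coordinates" field on a GPS-tagged tweet (values are made up)
exampleCoordinates = {"type": "Point", "coordinates": [85.3240, 27.7172]}  # [longitude, latitude]
exampleLon, exampleLat = exampleCoordinates["coordinates"]
print("Longitude:", exampleLon, "Latitude:", exampleLat)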
In [ ]:
# A frequency map for timestamps to geo-coded tweets
geoFrequencyMap = {}
geoCount = 0
# Keep only those tweets with a tweet['coordinates']['coordinates'] entry
for t in sortedTimes:
geos = list(filter(lambda tweet: tweet["coordinates"] != None and
"coordinates" in tweet["coordinates"],
frequencyMap[t]["list"]))
geoCount += len(geos)
# Add to the timestamp map
geoFrequencyMap[t] = {"count": len(geos), "list": geos}
print ("Number of Geo Tweets:", geoCount)
In [ ]:
fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)
plt.title("Geo Tweet Frequency")
gpsFreqList = [geoFrequencyMap[x]["count"] for x in sortedTimes]
postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)
xData = range(len(geoFrequencyMap))
gpsYData = [x if x > 0 else 0 for x in gpsFreqList]
freqYData = [x if x > 0 else 0 for x in postFreqList]
ax.semilogy(xData, freqYData, color="blue", label="Posts")
ax.semilogy(xData, gpsYData, color="green", label="GPS Posts")
ax.scatter([crisisXCoord], [100], c="r", marker="x", s=100, label="Crisis")
ax.grid(b=True, which=u'major')
ax.legend()
plt.show()
In [ ]:
import matplotlib
import functools
from mpl_toolkits.basemap import Basemap
# Create a list of all geo-coded tweets
tmpGeoList = [geoFrequencyMap[t]["list"] for t in sortedTimes]
geoTweets = functools.reduce(lambda x, y: x + y, tmpGeoList)
# For each geo-coded tweet, extract its GPS coordinates
geoCoord = [x["coordinates"]["coordinates"] for x in geoTweets]
# Now we build a map of the world using Basemap
land_color = 'lightgray'
water_color = 'lightblue'
fig, ax = plt.subplots(figsize=(24,24))
worldMap = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80,
llcrnrlon=-180, urcrnrlon=180, resolution='l')
worldMap.fillcontinents(color=land_color, lake_color=water_color, zorder=1)
worldMap.drawcoastlines()
worldMap.drawparallels(np.arange(-90.,120.,30.))
worldMap.drawmeridians(np.arange(0.,420.,60.))
worldMap.drawmapboundary(fill_color=water_color, zorder=0)
ax.set_title('World Tweets')
# Convert points from GPS coordinates to (x,y) coordinates
convPoints = [worldMap(p[0], p[1]) for p in geoCoord]
x = [p[0] for p in convPoints]
y = [p[1] for p in convPoints]
worldMap.scatter(x, y, s=100, marker='x', color="red", zorder=2)
plt.show()
We can use existing Geographic Information System (GIS) tools to determine from where a tweet was posted. For example, we could ask whether a particular tweet was posted from the United States. This filtering is often performed using shape files. For our purposes though, we established a bounding box along with the crisis data, so we'll use that as our filter for simplicity.
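Since our bounding box is axis-aligned in longitude/latitude, the test can also be written directly against the raw coordinates. The helper below is a minimal sketch of that idea (an illustrative addition, not part of the original notebook); the following cell performs the equivalent check in projected map coordinates using a matplotlib Polygon, which is what we actually use.
In [ ]:
def inBoundingBox(lon, lat, box):
    # Simple axis-aligned containment check in raw lon/lat space
    return (box["lowerLeftLon"] <= lon <= box["upperRightLon"] and
            box["lowerLeftLat"] <= lat <= box["upperRightLat"])

# Example usage: inBoundingBox(85.3240, 27.7172, crisisInfo[selectedCrisis]["box"])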
In [ ]:
# Get the bounding box for our crisis
bBox = crisisInfo[selectedCrisis]["box"]
fig, ax = plt.subplots(figsize=(11,8.5))
# Create a new map to hold the shape file data
targetMap = Basemap(llcrnrlon=bBox["lowerLeftLon"],
llcrnrlat=bBox["lowerLeftLat"],
urcrnrlon=bBox["upperRightLon"],
urcrnrlat=bBox["upperRightLat"],
projection='merc',
resolution='i', area_thresh=10000)
targetMap.fillcontinents(color=land_color, lake_color=water_color,
zorder=1)
targetMap.drawcoastlines()
targetMap.drawparallels(np.arange(-90.,120.,30.))
targetMap.drawmeridians(np.arange(0.,420.,60.))
targetMap.drawmapboundary(fill_color=water_color, zorder=0)
targetMap.drawcountries()
# Now we build the polygon for filtering
# Convert from lon, lat of lower-left to x,y coordinates
llcCoord = targetMap(bBox["lowerLeftLon"], bBox["lowerLeftLat"])
# Same for upper-right corner
urcCoord = targetMap(bBox["upperRightLon"], bBox["upperRightLat"])
# Now make the polygon we'll use for filtering
boxPoints = np.array([[llcCoord[0], llcCoord[1]],
[llcCoord[0], urcCoord[1]],
[urcCoord[0], urcCoord[1]],
[urcCoord[0], llcCoord[1]]])
boundingBox = matplotlib.patches.Polygon(boxPoints)
# Map of timestamps to tweets falling inside the target bounding box
inTargetFreqMap = {}
plottablePointsX = []
plottablePointsY = []
# For each geo-coded tweet, extract coordinates and convert
# them to the Basemap space
for t in sortedTimes:
geos = geoFrequencyMap[t]["list"]
convPoints = [(targetMap(tw["coordinates"]["coordinates"][0], tw["coordinates"]["coordinates"][1]), tw) for tw in geos]
# Local counters for this time
inTargetFreqMap[t] = {"count": 0, "list": []}
# For each point, check if it is within the bounding box or not
for point in convPoints:
x = point[0][0]
y = point[0][1]
if ( boundingBox.contains_point((x, y))):
inTargetFreqMap[t]["list"].append(point[1])
plottablePointsX.append(x)
plottablePointsY.append(y)
# Plot points in our target
targetMap.scatter(plottablePointsX, plottablePointsY, s=100, marker='x', color="red", zorder=2)
# Count the number of tweets that fall in the area
targetTweetCount = np.sum([len(inTargetFreqMap[t]["list"]) for t in sortedTimes])
print ("Tweets in Target Area:", targetTweetCount)
print ("Tweets outside:", (geoCount - targetTweetCount))
plt.show()
In [ ]:
# Merge our list of relevant tweets
geoRelevantTweets = [tw for x in sortedTimes for tw in inTargetFreqMap[x]["list"]]
print("Time of Crisis:", crisisTime)
# Print the first few tweets
for tweet in geoRelevantTweets[:10]:
print("Tweet By:", tweet["user"]["screen_name"])
print("\t", "Tweet Text:", tweet["text"])
print("\t", "Tweet Time:", tweet["created_at"])
print("\t", "Source:", tweet["source"])
print("\t", "Retweets:", tweet["retweet_count"])
print("\t", "Favorited:", tweet["favorite_count"])
print("\t", "Twitter's Guessed Language:", tweet["lang"])
if ( "place" in tweet ):
print("\t", "Tweet Location:", tweet["place"]["full_name"])
print("-----")
In [ ]:
from IPython.display import display
from IPython.display import Image
geoTweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], geoRelevantTweets))
print ("Tweets with Media:", len(geoTweetsWithMedia))
if ( len(geoTweetsWithMedia) == 0 ):
print ("Sorry, not tweets with media...")
for tweet in geoTweetsWithMedia:
imgUrl = tweet["entities"]["media"][0]["media_url"]
print (tweet["text"])
display(Image(url=imgUrl))
Another popular type of analysis people do on social networks is "sentiment analysis," which is used to figure out how people feel about a specific topic. Some tools also provide measurements like the subjectivity/objectivity of the text content.
We'll cover two approaches: TextBlob's polarity and subjectivity scores and NLTK's VADER sentiment analyzer. First, though, we narrow the data down to topically relevant tweets by filtering on our crisis keywords.
In [ ]:
# What keywords are we interested in?
targetKeywords = crisisInfo[selectedCrisis]["keywords"]
# Map for storing topically relevant data
topicRelevantMap = {}
# For each minute, pull the tweet text and search for the keywords we want
for t in sortedTimes:
timeObj = frequencyMap[t]
topicRelevantMap[t] = {"count": 0, "list": []}
for tweetObj in timeObj["list"]:
tweetString = tweetObj["text"].lower()
# Add to the counter if the target keyword is in this tweet
for keyword in targetKeywords:
if ( keyword.lower() in tweetString ):
topicRelevantMap[t]["list"].append(tweetObj)
topicRelevantMap[t]["count"] += 1
break
# Now plot the total frequency and frequency of each keyword
fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)
plt.title("Tweet Frequency")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)
ax.semilogy(range(len(frequencyMap)), totalCount, label="Total")
ax.scatter([crisisXCoord], [100], c="r", marker="x", s=100, label="Crisis")
relYData = [topicRelevantMap[t]["count"] for t in sortedTimes]
ax.semilogy(range(len(relYData)), relYData, label="Relevant")
ax.legend()
ax.grid(b=True, which=u'major')
plt.show()
In [ ]:
allTweets = [x for t in sortedTimes for x in topicRelevantMap[t]["list"]]
# Get the top retweeted tweets (materialize the filter so we can reuse it below)
onlyRetweets = list(filter(lambda x: "retweeted_status" in x, allTweets))
topTweets = sorted(onlyRetweets, key=lambda x: x["retweeted_status"]["retweet_count"],
                   reverse=True)[:10]
print("Top Retweets:")
for x in topTweets:
    print(x["id"], x["user"]["screen_name"], x["retweeted_status"]["retweet_count"], x["text"])
# Get tweets from users with the most followers
topTweets = sorted(allTweets, key=lambda x: x["user"]["followers_count"], reverse=True)[:10]
print()
print("Top Accounts:")
for x in topTweets:
    print(x["id"], x["user"]["screen_name"], x["user"]["followers_count"], x["text"])
# Get the top retweeted tweets, but only from verified accounts
verifiedTweets = list(filter(lambda x: x["retweeted_status"]["user"]["verified"], onlyRetweets))
topTweets = sorted(verifiedTweets, key=lambda x: x["retweeted_status"]["retweet_count"],
                   reverse=True)[:10]
print()
print("Top Retweets from Verified Accounts:")
for x in topTweets:
    print(x["id"], x["user"]["screen_name"], x["retweeted_status"]["retweet_count"], x["text"])
In [ ]:
# A frequency map for timestamps to geo-coded tweets
relGeoFreqMap = {}
relGeoCount = 0
# Keep only those tweets with a tweet['coordinates']['coordinates'] entry
for t in sortedTimes:
geos = list(filter(lambda tweet: tweet["coordinates"] != None and
"coordinates" in tweet["coordinates"],
topicRelevantMap[t]["list"]))
relGeoCount += len(geos)
# Add to the timestamp map
relGeoFreqMap[t] = {"count": len(geos), "list": geos}
print ("Number of Relevant Geo Tweets:", relGeoCount)
# Create a list of all geo-coded tweets
tmpGeoList = [relGeoFreqMap[t]["list"] for t in sortedTimes]
relGeoTweets = functools.reduce(lambda x, y: x + y, tmpGeoList)
# For each geo-coded tweet, extract its GPS coordinates
relGeoCoord = [x["coordinates"]["coordinates"] for x in relGeoTweets]
fig, ax = plt.subplots(figsize=(24,24))
worldMap = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80,
llcrnrlon=-180, urcrnrlon=180, resolution='l')
worldMap.fillcontinents(color=land_color, lake_color=water_color, zorder=1)
worldMap.drawcoastlines()
worldMap.drawparallels(np.arange(-90.,120.,30.))
worldMap.drawmeridians(np.arange(0.,420.,60.))
worldMap.drawmapboundary(fill_color=water_color, zorder=0)
worldMap.drawcountries()
ax.set_title('Global Relevant Tweets')
# Convert points from GPS coordinates to (x,y) coordinates
allConvPoints = [worldMap(p[0], p[1]) for p in geoCoord]
x = [p[0] for p in allConvPoints]
y = [p[1] for p in allConvPoints]
worldMap.scatter(x, y, s=100, marker='x', color="blue", zorder=2)
# Convert points from GPS coordinates to (x,y) coordinates
relConvPoints = [worldMap(p[0], p[1]) for p in relGeoCoord]
x = [p[0] for p in relConvPoints]
y = [p[1] for p in relConvPoints]
worldMap.scatter(x, y, s=100, marker='x', color="red", zorder=2)
plt.show()
Observation: Most topically relevant tweets are not geotagged.
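Before applying TextBlob to every minute of tweets below, here is a quick illustration of the two scores it produces (the example sentence is made up): polarity ranges from -1 (negative) to 1 (positive), and subjectivity from 0 (objective) to 1 (subjective).
In [ ]:
from textblob import TextBlob
# Illustrative only: score one made-up sentence
exampleBlob = TextBlob("thoughts and prayers with everyone affected by the terrible earthquake")
print("Polarity:", exampleBlob.sentiment.polarity)
print("Subjectivity:", exampleBlob.sentiment.subjectivity)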
In [ ]:
from textblob import TextBlob
# Sentiment values
polarVals = []
objVals = []
# For each minute, pull the tweet text and search for the keywords we want
for t in sortedTimes:
timeObj = topicRelevantMap[t]
# For calculating averages
localPolarVals = []
localObjVals = []
for tweetObj in timeObj["list"]:
tweetString = tweetObj["text"].lower()
blob = TextBlob(tweetString)
polarity = blob.sentiment.polarity
objectivity = blob.sentiment.subjectivity
localPolarVals.append(polarity)
localObjVals.append(objectivity)
# Add data to the polarity and objectivity measure arrays
if ( len(timeObj["list"]) > 10 ):
polarVals.append(np.mean(localPolarVals))
objVals.append(np.mean(localObjVals))
else:
polarVals.append(0.0)
objVals.append(0.0)
# Now plot this sentiment data
fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)
plt.title("Sentiment")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)
xData = range(len(sortedTimes))
ax.scatter([crisisXCoord], [0], c="r", marker="x", s=100, label="Crisis")
# Polarity is scaled [-1, 1], for negative and positive polarity
ax.plot(xData, polarVals, label="Polarity")
# Subjectivity is scaled [0, 1], with 0 = objective, 1 = subjective
ax.plot(xData, objVals, label="Subjectivity")
ax.legend()
ax.grid(b=True, which=u'major')
plt.show()
In [ ]:
import nltk
nltk.download("vader_lexicon")
import nltk.sentiment.util
import nltk.sentiment.vader
In [ ]:
vader = nltk.sentiment.vader.SentimentIntensityAnalyzer()
In [ ]:
# Sentiment values
polarVals = []
# For each minute, pull the tweet text and search for the keywords we want
for t in sortedTimes:
timeObj = topicRelevantMap[t]
# For calculating averages
localPolarVals = []
for tweetObj in timeObj["list"]:
tweetString = tweetObj["text"].lower()
polarity = vader.polarity_scores(tweetString)["compound"]
localPolarVals.append(polarity)
# Add data to the polarity measure array
if ( len(timeObj["list"]) > 10 ):
polarVals.append(np.mean(localPolarVals))
else:
polarVals.append(0.0)
# Now plot this sentiment data
fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)
plt.title("Sentiment")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)
xData = range(len(sortedTimes))
ax.scatter([crisisXCoord], [0], c="r", marker="x", s=100, label="Crisis")
# Polarity is scaled [-1, 1], for negative and positive polarity
ax.plot(xData, polarVals, label="Polarity")
ax.legend()
ax.grid(b=True, which=u'major')
plt.ylim((-0.3, 0.55))
plt.show()
Along with sentiment analysis, a question often asked of social networks is "What are people talking about?" We can answer this question using tools from topic modeling and natural language processing. During crises, people respond in many ways, from sharing specific information about the event, to expressing condolences, to opening their homes to those in need.
To generate these topic models, we will use the Gensim package's implementation of Latent Dirichlet Allocation (LDA), which constructs a set of topics, where each topic is described as a probability distribution over the words in our tweets. Several other topic modeling methods exist as well.
In [ ]:
# Gotta pull in a bunch of packages for this
import gensim.models.ldamulticore
import gensim.matutils
import sklearn.cluster
import sklearn.feature_extraction
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.preprocessing
In [ ]:
nltk.download("stopwords")
from nltk.corpus import stopwords
We first extract all relevant tweets' text for building our models.
In [ ]:
# Get all relevant tweet text and convert it to lowercase
allTweetText = [x["text"].lower() for t in sortedTimes for x in topicRelevantMap[t]["list"]]
print ("All Tweet Count:", len(allTweetText))
Now we build a list of stop words (words we don't care about) and a feature generator (the vectorizer) that assigns an integer key to each token and counts how often each token appears.
In [ ]:
enStop = stopwords.words('english')
esStop = stopwords.words('spanish')
# Skip stop words, retweet signs, @ symbols, and URL headers
stopList = enStop + esStop + ["http", "https", "rt", "@", ":", "co"]
vectorizer = sklearn.feature_extraction.text.CountVectorizer(strip_accents='unicode',
tokenizer=None,
token_pattern='(?u)#?\\b\\w+[\'-]?\\w+\\b',
stop_words=stopList)
# Analyzer
analyze = vectorizer.build_analyzer()
# Create a vectorizer for all our content
vectorizer.fit(allTweetText)
# Get all the words in our text
names = vectorizer.get_feature_names()
# Create a map for vectorizer IDs to words
id2WordDict = dict(zip(range(len(vectorizer.get_feature_names())), names))
We then use the vectorizer to transform our tweet text into a feature set, which is essentially a table with a row for each tweet and a column for each token, where each cell holds the number of times that token appears in that tweet.
We then convert that table into a form the Gensim package can handle, apply LDA, grab the top 10 topics along with the 10 words that best describe each, and print them.
In [ ]:
# Create a corpus from our tweet text for Gensim
corpus = vectorizer.transform(allTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
lda = gensim.models.LdaMulticore(gsCorpus,
id2word=id2WordDict,
num_topics=20,
passes=2) # ++ passes for better results
ldaTopics = lda.show_topics(num_topics=10,
num_words=10,
formatted=False)
for (i, tokenList) in ldaTopics:
print ("Topic %d:" % i, ' '.join([pair[0] for pair in tokenList]))
We can also be a little more strict and get rid of some noise by looking only at words with more than four characters. Stop words are often short, so by putting a floor on the length of a token, we can theoretically get higher-signal data.
In [ ]:
docArrays = filter(lambda x: len(x) > 4, [y for x in allTweetText for y in analyze(x)])
fd = nltk.FreqDist(docArrays)
print ("Most common from analyzer:")
for x in fd.most_common(20):
print (x[0], x[1])
Information flows and social networks are important considerations during crises, when people are trying to get updates on safe spaces, loved ones, places of shelter, etc. Twitter is noisy though, and a lot of the data may be irrelevant, condolences/thoughts expressed by celebrities, or otherwise uninformative. Using network analysis, we can get some idea about who the most important Twitter users were during this time, and how people split into groups online.
For this analysis, we'll use the NetworkX package to construct a social graph of how people interact. Each person in our Twitter data will be a node in our graph, and edges in the graph will represent mentions during this timeframe. Then we will explore a few simple analytical methods in network analysis, including node centrality, the PageRank algorithm, and a visualization of the mention network.
To limit the amount of data we're looking at, we'll only build the network for people who tweeted about a relevant keyword and the people they mention. We build this network simply by iterating through all the tweets in our relevant list and extract the "user_mentions" list from the "entities" section of the tweet object. For each mention a user makes, we will add an edge from that user to the user he/she mentioned.
In [ ]:
import networkx as nx
# We'll use a directed graph since mentions/retweets are directional
graph = nx.DiGraph()
for tweet in [x for t in sortedTimes for x in topicRelevantMap[t]["list"]]:
userName = tweet["user"]["screen_name"]
graph.add_node(userName)
mentionList = tweet["entities"]["user_mentions"]
for otherUser in mentionList:
otherUserName = otherUser["screen_name"]
if ( graph.has_node(otherUserName) == False ):
graph.add_node(otherUserName)
graph.add_edge(userName, otherUserName)
print ("Number of Users:", len(graph.node))
In network analysis, "centrality" is used to measure the importance of a given node. Many different types of centrality are used to describe various kinds of importance, though. Examples include "closeness centrality," which measures how close a node is to all other nodes in the network, versus "betweenness centrality," which measures how many shortest paths run through the given node. Nodes with high closeness centrality are important for rapidly disseminating information or spreading disease, whereas nodes with high betweenness centrality are more important for keeping the network connected.
PageRank is another algorithm for measuring importance; it was proposed by Sergey Brin and Larry Page for the early version of Google's search engine. NetworkX has an implementation of the PageRank algorithm that we can use to find the most important/authoritative users on Twitter based on their connections to other users.
In [ ]:
# Now we prune for performance reasons
# remove all nodes with few edges
nodeList = [n for n,d in graph.degree_iter() if d<2]
graph.remove_nodes_from(nodeList)
print ("Number of Remaining Users:", len(graph.node))
In [ ]:
# This may take a while
pageRankList = nx.pagerank_numpy(graph)
In [ ]:
highRankNodes = sorted(pageRankList.keys(), key=pageRankList.get, reverse=True)
for x in highRankNodes[:20]:
print (x, pageRankList[x])
In [ ]:
plt.figure(figsize=(8,8))
pos = nx.spring_layout(graph, scale=100, iterations=100, k=0.2)
nx.draw(graph,
pos,
node_color='#A0CBE2',
width=1,
with_labels=False,
node_size=50)
hrNames = highRankNodes[:20]
hrDict = dict(zip(hrNames, hrNames))
hrValues = [pageRankList[x] for x in hrNames]
nx.draw_networkx_nodes(graph,pos,nodelist=hrNames,
node_size=200,
node_color=hrValues,
cmap=plt.cm.Reds_r)
nx.draw_networkx_labels(graph,
pos,
labels=hrDict,
font_size=36,
font_color="g")
plt.axis('off')
plt.show()