In [4]:
%matplotlib inline
import time
import calendar
import codecs
import datetime
import json
import sys
import gzip
import string
import glob
import os
import numpy as np
In [5]:
if ( sys.version_info.major == 3 ):
from functools import reduce
The first thing we do is read in tweets from a directory of compressed files. Our collection of compressed tweets is in the data_files/twitter directory, so we'll use pattern matching (called "globbing") to find all the tweet files in that directory.
Then, for each file, we'll open it, read each line (a tweet in JSON form), and build an object from it. As part of this process, we'll extract each tweet's post time and build a map from minute-resolution timestamps to the tweets posted during that minute.
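Before running the real loader, here is a minimal sketch (with two made-up tweets) of the structure we are building: keys are minute-resolution datetimes, and values hold a count and the list of tweet objects posted in that minute.
In [ ]:
# Minimal sketch with hypothetical tweets, just to show the target frequencyMap structure
import datetime

exampleTweets = [
    {"created_at": "Wed Aug 13 21:15:42 +0000 2014", "text": "first example tweet"},
    {"created_at": "Wed Aug 13 21:15:55 +0000 2014", "text": "second example tweet"},
]
exampleFormat = "%a %b %d %H:%M:%S +0000 %Y"
exampleMap = {}
for tw in exampleTweets:
    minute = datetime.datetime.strptime(tw["created_at"], exampleFormat).replace(second=0)
    exampleMap.setdefault(minute, {"count": 0, "list": []})
    exampleMap[minute]["count"] += 1
    exampleMap[minute]["list"].append(tw)
print(exampleMap[datetime.datetime(2014, 8, 13, 21, 15)]["count"])  # 2 tweets in that minute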
In [3]:
tweetPath = os.path.join("data_files", "twitter")
tweetFiles = {
"time01": os.path.join(tweetPath, "statuses.*.gz")
}
frequencyMap = {}
globalTweetCounter = 0
timeFormat = "%a %b %d %H:%M:%S +0000 %Y"
reader = codecs.getreader("utf-8")
for (key, path) in tweetFiles.items():
localTweetList = []
for filePath in glob.glob(path):
print ("Reading File:", filePath)
for line in gzip.open(filePath, 'rb'):
# Try to read tweet JSON into object
tweetObj = None
try:
tweetObj = json.loads(reader.decode(line)[0])
except Exception as e:
continue
# Skip deleted and withheld status messages
if ( "delete" in tweetObj.keys() or "status_withheld" in tweetObj.keys() ):
continue
# Try to extract the time of the tweet
try:
currentTime = datetime.datetime.strptime(tweetObj['created_at'], timeFormat)
except:
print (line)
raise
currentTime = currentTime.replace(second=0)
# Increment tweet count
globalTweetCounter += 1
# If our frequency map already has this time, use it, otherwise add
if ( currentTime in frequencyMap.keys() ):
timeMap = frequencyMap[currentTime]
timeMap["count"] += 1
timeMap["list"].append(tweetObj)
else:
frequencyMap[currentTime] = {"count":1, "list":[tweetObj]}
# Fill in any gaps
times = sorted(frequencyMap.keys())
firstTime = times[0]
lastTime = times[-1]
thisTime = firstTime
timeIntervalStep = datetime.timedelta(0, 60) # Time step in seconds
while ( thisTime <= lastTime ):
if ( thisTime not in frequencyMap.keys() ):
frequencyMap[thisTime] = {"count":0, "list":[]}
thisTime = thisTime + timeIntervalStep
print ("Processed Tweet Count:", globalTweetCounter)
In [6]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)
plt.title("Tweet Frequency")
# Sort the times into an array for future use
sortedTimes = sorted(frequencyMap.keys())
# What time span do these tweets cover?
print ("Time Frame:", sortedTimes[0], sortedTimes[-1])
# Get a count of tweets per minute
postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]
# Put a tick every thirty minutes (any more clutters the graph)
smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)
# Plot the post frequency
ax.plot(range(len(frequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts")
ax.grid(b=True, which=u'major')
ax.legend()
plt.show()
In [7]:
# Create maps for holding counts and tweets for each user
globalUserCounter = {}
globalUserMap = {}
# Iterate through the time stamps
for t in sortedTimes:
timeObj = frequencyMap[t]
# For each tweet, pull the screen name and add it to the list
for tweet in timeObj["list"]:
user = tweet["user"]["screen_name"]
if ( user not in globalUserCounter ):
globalUserCounter[user] = 1
globalUserMap[user] = [tweet]
else:
globalUserCounter[user] += 1
globalUserMap[user].append(tweet)
print ("Unique Users:", len(globalUserCounter.keys()))
In [8]:
sortedUsers = sorted(globalUserCounter, key=globalUserCounter.get, reverse=True)
print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
print (u, globalUserCounter[u], "\n\t", "First Tweet:", globalUserMap[u][0]["text"], "\n----------")
In [9]:
import tweepy
# Replace these placeholders with your own Twitter API credentials
consumer_key = "YOUR_CONSUMER_KEY"
consumer_secret = "YOUR_CONSUMER_SECRET"
access_token = "YOUR_ACCESS_TOKEN"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"
# Set up the authorization mechanisms for Tweepy to access Twitter's API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.secure = True
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
In [10]:
print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
print (u, globalUserCounter[u])
# Get user info
try:
user = api.get_user(u)
print ("\tDescription:", user.description)
except Exception as te:
print ("\tDescription Error:", te)
print ("----------")
In [13]:
plt.figure(figsize=(16,8))
# the histogram of the data
plt.hist(
[globalUserCounter[x] for x in globalUserCounter],
bins=100,
normed=0,
alpha=0.75,
label="Counts",
log=True)
plt.xlabel('Number of Tweets')
plt.ylabel('Counts')
plt.title("Histogram of Frequency")
plt.grid(True)
plt.legend()
plt.show()
In [14]:
avgPostCount = np.mean([globalUserCounter[x] for x in globalUserCounter])
print("Average Number of Posts:", avgPostCount)
In [15]:
# A map for hashtag counts
hashtagCounter = {}
# For each minute, pull the list of hashtags and add to the counter
for t in sortedTimes:
timeObj = frequencyMap[t]
for tweet in timeObj["list"]:
hashtagList = tweet["entities"]["hashtags"]
for hashtagObj in hashtagList:
# We lowercase the hashtag to avoid duplicates (e.g., #MikeBrown vs. #mikebrown)
hashtagString = hashtagObj["text"].lower()
if ( hashtagString not in hashtagCounter ):
hashtagCounter[hashtagString] = 1
else:
hashtagCounter[hashtagString] += 1
print ("Unique Hashtags:", len(hashtagCounter.keys()))
sortedHashtags = sorted(hashtagCounter, key=hashtagCounter.get, reverse=True)
print ("Top Twenty Hashtags:")
for ht in sortedHashtags[:20]:
print ("\t", "#" + ht, hashtagCounter[ht])
In [18]:
# What keywords are we interested in?
targetKeywords = ["obama", "tear gas"]
# targetKeywords.append("lowery")
# targetKeywords.append("reilly")
targetKeywords.append("iraq")
# Build a map from each keyword we are searching for to its list of per-minute counts
targetCounts = {x:[] for x in targetKeywords}
totalCount = []
# For each minute, pull the tweet text and search for the keywords we want
for t in sortedTimes:
timeObj = frequencyMap[t]
# Temporary counter for this minute
localTargetCounts = {x:0 for x in targetKeywords}
localTotalCount = 0
for tweetObj in timeObj["list"]:
tweetString = tweetObj["text"].lower()
localTotalCount += 1
# Add to the counter if the target keyword is in this tweet
for keyword in targetKeywords:
if ( keyword in tweetString ):
localTargetCounts[keyword] += 1
# Add the counts for this minute to the main counter
totalCount.append(localTotalCount)
for keyword in targetKeywords:
targetCounts[keyword].append(localTargetCounts[keyword])
# Now plot the total frequency and frequency of each keyword
fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)
plt.title("Tweet Frequency")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)
ax.plot(range(len(frequencyMap)), totalCount, label="Total")
for keyword in targetKeywords:
ax.plot(range(len(frequencyMap)), targetCounts[keyword], label=keyword)
ax.legend()
ax.grid(b=True, which=u'major')
plt.show()
In [19]:
# A map for counting each language
languageCounter = {}
for t in sortedTimes:
timeObj = frequencyMap[t]
for tweet in timeObj["list"]:
lang = tweet["lang"]
if ( lang not in languageCounter ):
languageCounter[lang] = 1
else:
languageCounter[lang] += 1
In [20]:
languages = sorted(languageCounter.keys(), key=languageCounter.get, reverse=True)
for l in languages:
print (l, languageCounter[l])
In [21]:
plt.figure(figsize=(16,8))
# Bar chart of tweet counts per language (log-scaled y-axis)
plt.bar(
np.arange(len(languages)),
[languageCounter[x] for x in languages],
log=True)
plt.xticks(np.arange(len(languages)) + 0.5, languages)
plt.xlabel('Languages')
plt.ylabel('Counts (Log)')
plt.title("Language Frequency")
plt.grid(True)
plt.show()
Twitter allows users to share their GPS locations when tweeting, but only about 2% of tweets have this information. We can extract this geospatial data to look at patterns in different locations.
In this part of the notebook, we extract that geospatial data and look at where people were tweeting from.
Each tweet has a field called "coordinates" describing where the tweet was posted from. The field may be null if the tweet contains no location data, or it may contain bounding box information, place information, or GPS coordinates in the form (longitude, latitude). We want tweets with this GPS data.
For more information on tweet JSON formats, check out https://dev.twitter.com/overview/api/tweets
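As a quick, made-up illustration of that field: the GeoJSON-style "coordinates" object stores the point as [longitude, latitude], so a check-and-unpack looks like this (the coordinate values below are invented).
In [ ]:
# Hypothetical tweet fragment showing the (longitude, latitude) ordering of GPS points
exampleGeoTweet = {"coordinates": {"type": "Point", "coordinates": [-90.27, 38.74]}}

if exampleGeoTweet["coordinates"] is not None and "coordinates" in exampleGeoTweet["coordinates"]:
    lon, lat = exampleGeoTweet["coordinates"]["coordinates"]
    print("Longitude:", lon, "Latitude:", lat)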
In [22]:
# A frequency map for timestamps to geo-coded tweets
geoFrequencyMap = {}
geoCount = 0
# Keep only those tweets with a tweet['coordinates']['coordinates'] entry
for t in sortedTimes:
geos = list(filter(lambda tweet: tweet["coordinates"] != None and "coordinates" in tweet["coordinates"], frequencyMap[t]["list"]))
geoCount += len(geos)
# Add to the timestamp map
geoFrequencyMap[t] = {"count": len(geos), "list": geos}
print ("Number of Geo Tweets:", geoCount)
In [23]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)
plt.title("Geo Tweet Frequency")
postFreqList = [geoFrequencyMap[x]["count"] for x in sortedTimes]
smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=45)
ax.plot(range(len(geoFrequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts")
ax.grid(b=True, which=u'major')
ax.legend()
plt.show()
In [24]:
import matplotlib
from mpl_toolkits.basemap import Basemap
# Create a list of all geo-coded tweets
tmpGeoList = [geoFrequencyMap[t]["list"] for t in sortedTimes]
geoTweets = reduce(lambda x, y: x + y, tmpGeoList)
# For each geo-coded tweet, extract its GPS coordinates
geoCoord = [x["coordinates"]["coordinates"] for x in geoTweets]
# Now we build a map of the world using Basemap
land_color = 'lightgray'
water_color = 'lightblue'
fig, ax = plt.subplots(figsize=(24,24))
worldMap = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80,
llcrnrlon=-180, urcrnrlon=180, resolution='l')
worldMap.fillcontinents(color=land_color, lake_color=water_color, zorder=1)
worldMap.drawcoastlines()
worldMap.drawparallels(np.arange(-90.,120.,30.))
worldMap.drawmeridians(np.arange(0.,420.,60.))
worldMap.drawmapboundary(fill_color=water_color, zorder=0)
ax.set_title('World Tweets')
# Convert points from GPS coordinates to (x,y) coordinates
convPoints = [worldMap(p[0], p[1]) for p in geoCoord]
x = [p[0] for p in convPoints]
y = [p[1] for p in convPoints]
worldMap.scatter(x, y, s=100, marker='x', color="red", zorder=2)
plt.show()
We can even use existing Geographic Information System (GIS) tools to determine from where a tweet was posted. For example, we could ask whether a particular tweet was posted from the United States.
To make this determination, we can use geocoding services like Google Maps, or we can use GIS data files called shapefiles, which contain geometric information for a variety of geographic entities (e.g., lakes, roads, county lines, states, countries).
For our purposes, we pulled a shapefile containing the county borders for the state of Missouri, sourced from the US Census Bureau (http://www.census.gov/cgi-bin/geo/shapefiles2010/layers.cgi).
The first step, then, is to read in this shapefile. To divide the Twitter data into tweets posted inside Ferguson, MO and tweets posted outside, we find the county containing Ferguson (St. Louis County) and extract the shapes for that county.
In [25]:
# Create a new map to hold the shape file data
stLouisMap = Basemap(llcrnrlon=-130, llcrnrlat=22, urcrnrlon=-64,
urcrnrlat=52, projection='merc', lat_1=33, lat_2=45,
lon_0=-95, resolution='i', area_thresh=10000)
# Read in the shape file
moStateShapeFile = os.path.join("data_files", "moCountyShapes", "tl_2010_29_county10")
shp_info = stLouisMap.readshapefile(moStateShapeFile, 'states', drawbounds=True)
# Find only those polygons that describe St. Louis county
stLouisCountyPolygons = []
for (shapeDict, shape) in zip(stLouisMap.states_info, stLouisMap.states):
if (shapeDict["NAME10"] == "St. Louis"):
stLouisCountyPolygons.append(matplotlib.patches.Polygon(shape))
print ("Shape Count:", len(stLouisCountyPolygons))
For each tweet, we can check whether its GPS coordinates came from St. Louis county or not.
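The containment test below relies on matplotlib's Polygon.contains_point, the same call used on the county shapes. Here is a tiny sketch of that call on a toy unit square (not part of the Ferguson data), just to show what it returns.
In [ ]:
# Toy containment check: a unit square with one point inside and one outside
import matplotlib.patches

square = matplotlib.patches.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
print(square.contains_point((0.5, 0.5)))  # True: inside the square
print(square.contains_point((2.0, 2.0)))  # False: outside the square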
In [26]:
# Maps of timestamps to tweets for inside/outside Ferguson
inStLouisFreqMap = {}
outStLouisFreqMap = {}
# For each geo-coded tweet, extract its coordinates and convert them to the Basemap projection space
for t in sortedTimes:
geos = geoFrequencyMap[t]["list"]
convPoints = [(stLouisMap(tw["coordinates"]["coordinates"][0], tw["coordinates"]["coordinates"][1]), tw) for tw in geos]
# Local counters for this time
inStLouisFreqMap[t] = {"count": 0, "list": []}
outStLouisFreqMap[t] = {"count": 0, "list": []}
# For each point, check if it is within St. Louis county or not
for point in convPoints:
x = point[0][0]
y = point[0][1]
inStLouisFlag = False
for polygon in stLouisCountyPolygons:
if ( polygon.contains_point((x, y)) ):
inStLouisFreqMap[t]["list"].append(point[1])
inStLouisFlag = True
break
if ( inStLouisFlag == False ):
outStLouisFreqMap[t]["list"].append(point[1])
print ("Tweets in St. Louis:", np.sum([len(inStLouisFreqMap[t]["list"]) for t in sortedTimes]))
print ("Tweets outside St. Louis:", np.sum([len(outStLouisFreqMap[t]["list"]) for t in sortedTimes]))
In [27]:
inStLouisTweets = reduce(lambda x, y: x + y, [inStLouisFreqMap[t]["list"] for t in sortedTimes])
userCounter = {}
userMap = {}
for tweet in inStLouisTweets:
user = tweet["user"]["screen_name"]
if ( user not in userCounter ):
userCounter[user] = 1
userMap[user] = [tweet]
else:
userCounter[user] += 1
userMap[user].append(tweet)
print ("Unique Users in St. Louis:", len(userCounter.keys()))
sortedUsers = sorted(userCounter, key=userCounter.get, reverse=True)
In [28]:
print("Top Users in Ferguson:")
for u in sortedUsers[:10]:
print (u, userCounter[u])
# Get user info
try:
user = api.get_user(u)
print ("\t", user.description)
except Exception as te:
print ("\t", te)
print ("\t", userMap[u][0]["text"], "\n----------")
In [29]:
outStLouisTweets = reduce(lambda x, y: x + y, [outStLouisFreqMap[t]["list"] for t in sortedTimes])
userCounter = {}
userMap = {}
for tweet in outStLouisTweets:
user = tweet["user"]["screen_name"]
if ( user not in userCounter ):
userCounter[user] = 1
userMap[user] = [tweet]
else:
userCounter[user] += 1
userMap[user].append(tweet)
print ("Unique Users outside St. Louis:", len(userCounter.keys()))
sortedUsers = sorted(userCounter, key=userCounter.get, reverse=True)
In [30]:
print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
print (u, userCounter[u])
# Get user info
try:
user = api.get_user(u)
print ("\t", user.description)
except Exception as te:
print ("\t", te)
print ("\t", userMap[u][0]["text"], "\n----------")
In [31]:
inStlHashtagCounter = {}
for tweet in inStLouisTweets:
hashtagList = tweet["entities"]["hashtags"]
for hashtagObj in hashtagList:
hashtagString = hashtagObj["text"].lower()
if ( hashtagString not in inStlHashtagCounter ):
inStlHashtagCounter[hashtagString] = 1
else:
inStlHashtagCounter[hashtagString] += 1
print ("Unique Hashtags in Ferguson:", len(inStlHashtagCounter.keys()))
sortedInStlHashtags = sorted(inStlHashtagCounter, key=inStlHashtagCounter.get, reverse=True)
print ("Top Twenty Hashtags in Ferguson:")
for ht in sortedInStlHashtags[:20]:
print ("\t", "#" + ht, inStlHashtagCounter[ht])
In [32]:
outStlHashtagCounter = {}
for tweet in outStLouisTweets:
hashtagList = tweet["entities"]["hashtags"]
for hashtagObj in hashtagList:
hashtagString = hashtagObj["text"].lower()
if ( hashtagString not in outStlHashtagCounter ):
outStlHashtagCounter[hashtagString] = 1
else:
outStlHashtagCounter[hashtagString] += 1
print ("Unique Hashtags Outside Ferguson:", len(outStlHashtagCounter.keys()))
sortedOutStlHashtags = sorted(outStlHashtagCounter, key=outStlHashtagCounter.get, reverse=True)
print ("Top Twenty Hashtags Outside Ferguson:")
for ht in sortedOutStlHashtags[:20]:
print ("\t", "#" + ht, outStlHashtagCounter[ht])
Twitter is also widely used for sharing media, whether photographs, videos, or links to websites. When users share pictures, Twitter stores them and links to them directly. We can use this data to sample some random pictures from each hour of data we have.
We start by binning the tweets into hourly intervals.
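For reference, here is a hedged sketch of where an attached image's URL lives in a tweet object; the fragment below is made up, but it follows the "entities" → "media" → "media_url" path used in the cells that follow.
In [ ]:
# Hypothetical tweet fragment showing the path to an attached image's URL
exampleMediaTweet = {
    "text": "example post with a photo",
    "entities": {"media": [{"type": "photo", "media_url": "http://pbs.twimg.com/media/example.jpg"}]},
}
if "media" in exampleMediaTweet["entities"]:
    print(exampleMediaTweet["entities"]["media"][0]["media_url"])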
In [33]:
hourlyInterval = {}
for t in sortedTimes:
newTime = t.replace(second=0, minute=0)
currentTimeObject = frequencyMap[t]
if ( newTime not in hourlyInterval ):
hourlyInterval[newTime] = {
"count": currentTimeObject["count"],
"list": currentTimeObject["list"]
}
else:
hourlyInterval[newTime]["count"] += currentTimeObject["count"]
hourlyInterval[newTime]["list"] = hourlyInterval[newTime]["list"] + currentTimeObject["list"]
We then filter out retweets and keep only those tweets with a media listing in the "entities" section. For each hour, we select a random image from that hour's list of pictures and display it.
In [34]:
from IPython.display import display
from IPython.display import Image
for h in sorted(hourlyInterval.keys()):
noRetweets = list(filter(lambda tweet: not tweet["text"].lower().startswith("rt"), hourlyInterval[h]["list"]))
tweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], noRetweets))
print (h, hourlyInterval[h]["count"], len(tweetsWithMedia), )
# Skip hours with no media, then pick one media tweet at random
if ( len(tweetsWithMedia) == 0 ):
continue
randIndex = np.random.randint(0, len(tweetsWithMedia))
imgUrl = tweetsWithMedia[randIndex]["entities"]["media"][0]["media_url"]
display(Image(url=imgUrl))
In [35]:
stlTweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], inStLouisTweets))
print ("Tweets with Media:", len(stlTweetsWithMedia))
for tweet in stlTweetsWithMedia:
imgUrl = tweet["entities"]["media"][0]["media_url"]
display(Image(url=imgUrl))
In [37]:
outStlTweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], outStLouisTweets))
print ("Tweets outside St. Louis with Media:", len(outStlTweetsWithMedia))
np.random.shuffle(outStlTweetsWithMedia)
for tweet in outStlTweetsWithMedia[:10]:
imgUrl = tweet["entities"]["media"][0]["media_url"]
display(Image(url=imgUrl))
Another popular type of analysis people do on social networks is "sentiment analysis," which is used to figure out how people feel about a specific topic.
One way to explore sentiment is to use a list of keywords with tagged sentiment information (e.g., "happy" or "awesome" might have high sentiment whereas "terrible" or "awful" might have very low sentiment). Then, we can count the occurrence of these tagged keywords to get a sense of how people feel about the topic at hand.
For our keyword list, we use the SentiStrength emotion and emoticon lookup tables; a similar resource is the AFINN sentiment dictionary: http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010
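Before loading the real lookup tables, here is a minimal sketch of the scoring idea on a single made-up sentence, using a tiny hand-built valence list in place of the files we read below.
In [ ]:
# Tiny, hand-made (regex, score) valence list; the real tables are loaded in the next cell
import re

toyValenceList = [(re.compile("happy"), 3), (re.compile("awful"), -4), (re.compile("terrible"), -3)]
exampleText = "this is awful and terrible news"
matches = [score for (regex, score) in toyValenceList if regex.search(exampleText) is not None]
avgValence = float(sum(matches)) / len(matches) if matches else 0.0
print("Average valence:", avgValence)  # (-4 + -3) / 2 = -3.5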
In [38]:
import re
# Read in the sentiment/valence files
dataFilePath = os.path.join("data_files", "SentiStrength")
valenceFile = os.path.join(dataFilePath, "EmotionLookupTable.txt")
emoticonFile = os.path.join(dataFilePath, "EmoticonLookupTable.txt")
valenceList = []
# Open the valence file and read in each word/valence pair
for line in open(valenceFile, "r"):
# Split the line based on tabs and select the first two elements
(word, valence) = line.split("\t")[:2]
wordRegex = re.compile(word)
valencePair = (wordRegex, int(valence))
valenceList.append(valencePair)
# Open the emoticon file and read in the valence for each emoticon
for line in codecs.open(emoticonFile, "r", "utf-8"):
# Split the line based on tabs and select the first two elements
(emoticon, valence) = line.split("\t")[:2]
emoticonRegex = re.compile(re.escape(emoticon))
valencePair = (emoticonRegex, int(valence))
valenceList.append(valencePair)
print ("Number of Sentiment Keywords:", len(valenceList))
In [39]:
# Examples of sentiment pairs
for i in np.random.random_integers(0, len(valenceList)-1, 10):
print(valenceList[i][0].pattern, "\t", valenceList[i][1])
In [40]:
# Generate sentiment measures for each time
timeSentiments = {}
for t in sortedTimes:
tweetList = frequencyMap[t]["list"]
sentimentList = []
thisMinuteSentiment = None
for tweet in tweetList:
# Calculate the average sentiment for this tweet
tweetText = tweet["text"].lower()
# skip retweets
if ( tweetText.startswith("rt ") ):
continue
valCount = 0
valSum = 0.0
valAvg = 0.0
for valencePair in valenceList:
if ( valencePair[0].search(tweetText) is not None ):
valCount += 1
valSum += valencePair[1]
if ( valCount > 0 ):
valAvg = valSum / valCount
sentimentList.append(valAvg)
if ( len(sentimentList) > 0 ):
thisMinuteSentiment = np.array(sentimentList).mean()
else:
thisMinuteSentiment = 0.0
timeSentiments[t] = thisMinuteSentiment
In [41]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)
plt.title("Sentiment Over Time")
postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]
sentList = [timeSentiments[x] for x in sortedTimes]
smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)
ax.plot(range(len(frequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts")
ax2 = ax.twinx()
ax2.plot([0], [0], color="blue", label="Posts")
ax2.plot(range(len(frequencyMap)), sentList, color="green", label="Sentiment")
ax2.set_ylim(-6,6)
ax.grid(b=True, which=u'major')
ax2.legend()
plt.show()
Based on this data, we can see that most people are pretty unhappy with the events in Ferguson, MO. This result is not all that unexpected.
In [42]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)
plt.title("Sentiment Histrogram")
for (loc, (tweetList, color)) in {"Inside": (inStLouisTweets, "green"), "Outside": (outStLouisTweets, "blue")}.items():
localSentimentList = []
for tweet in tweetList:
# Calculate the average sentiment for this tweet
tweetText = tweet["text"].lower()
# skip retweets
if ( tweetText.startswith("rt ") ):
continue
valCount = 0
valSum = 0.0
valAvg = 0.0
for valencePair in valenceList:
if ( valencePair[0].search(tweetText) is not None ):
valCount += 1
valSum += valencePair[1]
if ( valCount > 0 ):
valAvg = valSum / valCount
localSentimentList.append(valAvg)
print("Number of Sentiment Tweets:", len(localSentimentList))
ax.hist(localSentimentList, range=(-5, 5), normed=True, alpha=0.5, color=color, label=loc)
ax.grid(b=True, which=u'major')
ax.legend()
plt.show()
Along with sentiment analysis, a question often asked of social networks is "What are people talking about?" We can answer this question using tools from topic modeling and natural language processing, and we can even divide this data to see what people in Ferguson are talking about versus those outside.
To generate these topic models, we will use the Gensim package's implementation of Latent Dirichlet Allocation (LDA), which constructs a set of topics, each described as a probability distribution over the words in our tweets. Several other topic-modeling methods exist as well.
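As a rough, self-contained sketch of what an LDA model expects and returns, here is a toy run on four tiny, made-up documents; with a corpus this small the topics are noisy, and the exact output format of show_topics varies across Gensim versions.
In [ ]:
# Toy LDA run on a handful of short, made-up documents
import gensim.corpora
import gensim.models.ldamodel

toyDocs = [["police", "protest", "street"],
           ["protest", "march", "street"],
           ["game", "team", "score"],
           ["team", "score", "win"]]
toyDict = gensim.corpora.Dictionary(toyDocs)
toyCorpus = [toyDict.doc2bow(doc) for doc in toyDocs]
toyLda = gensim.models.ldamodel.LdaModel(toyCorpus, id2word=toyDict, num_topics=2, passes=10)
for topic in toyLda.show_topics(num_topics=2, num_words=3, formatted=True):
    print(topic)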
In [43]:
import gensim.models.ldamodel
import gensim.matutils
import sklearn.cluster
import sklearn.feature_extraction
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.preprocessing
from nltk.corpus import stopwords
We first extract the text of all English tweets that are not retweets and make the text lowercase.
In [44]:
enFilter = lambda x: True if x["lang"] == "en" else False
# Get all tweets, filter out retweets, keep only those in English, and convert the text to lowercase
allTweetList = reduce(lambda x, y: x + y, [frequencyMap[t]["list"] for t in sortedTimes])
noRetweetsList = list(filter(lambda x: not x["text"].lower().startswith("rt"), allTweetList))
onlyEnglishTweets = list(filter(enFilter, noRetweetsList))
lowerTweetText = [x["text"].lower() for x in onlyEnglishTweets]
print ("All Tweet Count:", len(allTweetList))
print ("Reduced Tweet Count:", len(lowerTweetText))
Now we build a list of stop words (words we don't care about) and a feature generator (the vectorizer) that assigns an integer ID to each token and records token occurrences.
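To make the vectorizer's role concrete, here is a small sketch on two made-up sentences showing the document-term table it produces; note that get_feature_names() was the API at the time of this notebook (newer scikit-learn uses get_feature_names_out()).
In [ ]:
# Tiny CountVectorizer example: one row per document, one column per vocabulary term
import sklearn.feature_extraction.text

toyTexts = ["police line on the street", "people march down the street"]
toyVectorizer = sklearn.feature_extraction.text.CountVectorizer(stop_words=["the", "on", "down"])
toyMatrix = toyVectorizer.fit_transform(toyTexts)
print(toyVectorizer.get_feature_names())  # the learned vocabulary
print(toyMatrix.toarray())                # counts of each term in each document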
In [45]:
enStop = stopwords.words('english')
# Skip stop words, retweet signs, @ symbols, and URL headers
stopList = enStop + ["http", "https", "rt", "@", ":"]
vectorizer = sklearn.feature_extraction.text.CountVectorizer(strip_accents='unicode',
tokenizer=None,
token_pattern='(?u)#?\\b\\w+[\'-]?\\w+\\b',
stop_words=stopList,
binary=True)
# Create a vectorizer for all our content
vectorizer.fit(lowerTweetText)
# Get all the words in our text
names = vectorizer.get_feature_names()
# Create a map for vectorizer IDs to words
id2WordDict = dict(zip(range(len(vectorizer.get_feature_names())), names))
We then use the vectorizer to transform our tweet text into a feature set: essentially a table with a row for each tweet and a column for each keyword, where each cell records whether that keyword appears in the tweet (we built the vectorizer with binary=True).
We then convert that table into a form the Gensim package can handle, apply LDA, and print the top 10 topics along with the 10 words that best describe each.
In [46]:
# Create a corpus from the lowercased tweet text
corpus = vectorizer.transform(lowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
# lda = gensim.models.ldamodel.LdaModel(gsCorpus, id2word=id2WordDict, num_topics=10)
lda = gensim.models.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=100, passes=2)
ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_,token) in topic] for topic in ldaTopics]
for i in range(len(topicTokens)):
print ("Topic:", i)
for token in topicTokens[i]:
print ("\t", token)
In [47]:
inStlLowerTweetText = [x["text"].lower() for x in filter(enFilter, inStLouisTweets)]
corpus = vectorizer.transform(inStlLowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
lda = gensim.models.ldamulticore.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=10, passes=10)
ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_,token) in topic] for topic in ldaTopics]
for i in range(len(topicTokens)):
print ("Topic:", i)
for token in topicTokens[i]:
print ("\t", token)
In [48]:
outStlLowerTweetText = [x["text"].lower() for x in filter(enFilter, outStLouisTweets)]
corpus = vectorizer.transform(outStlLowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
lda = gensim.models.ldamulticore.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=50, passes=10)
ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_,token) in topic] for topic in ldaTopics]
for i in range(len(topicTokens)):
print ("Topic:", i)
for token in topicTokens[i]:
print ("\t", token)
Issues of race, class, poverty, and police militarization all came out during the protests and clashes with law enforcement, and it didn't take much to find people on either side of each issue on Twitter. At the same time, people were turning to Twitter for news about the events on the ground since many perceived that mainstream media wasn't giving the events adequate or fair coverage. Using network analysis, we can get some idea about who the most important Twitter users were during this time, and how people split into groups online.
For this analysis, we'll use the NetworkX package to construct a social graph of how people interact. Each person in our Twitter data will be a node in the graph, and edges will represent mentions during this timeframe. We will then explore a few simple analytical methods from network analysis, including measures of node importance such as centrality and PageRank.
To limit the amount of data we're looking at, we'll only build the network for people who have GPS locations in their tweets and the people they mention. We build this network by iterating through all the tweets in our GPS list and extracting the "user_mentions" list from the "entities" section of each tweet object (a made-up example of this structure is sketched below). For each mention a user makes, we add an edge from that user to the user they mentioned.
In addition, we attach a location attribute to each user based on whether we saw them tweeting inside or outside of Ferguson.
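Here is the made-up tweet fragment mentioned above: each entry in "user_mentions" gives a screen name that becomes the target of an edge from the tweet's author.
In [ ]:
# Hypothetical tweet fragment: the author mentions two other accounts
exampleMentionTweet = {
    "user": {"screen_name": "author_account"},
    "entities": {"user_mentions": [{"screen_name": "friend_one"}, {"screen_name": "friend_two"}]},
}
source = exampleMentionTweet["user"]["screen_name"]
targets = [m["screen_name"] for m in exampleMentionTweet["entities"]["user_mentions"]]
print(source, "->", targets)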
In [49]:
import networkx as nx
graph = nx.DiGraph()
geoCodedMap = {1: inStLouisTweets, 0: outStLouisTweets}
for (location, locationList) in geoCodedMap.items():
print (location, len(locationList))
for tweet in locationList:
userName = tweet["user"]["screen_name"]
graph.add_node(userName, loc=location)
mentionList = tweet["entities"]["user_mentions"]
for otherUser in mentionList:
otherUserName = otherUser["screen_name"]
if ( graph.has_node(otherUserName) == False ):
graph.add_node(otherUserName, loc=-1)
graph.add_edge(userName, otherUserName)
print ("Number of Users:", len(graph.node))
In network analysis, "centrality" measures the importance of a given node, and there are many different centrality measures, each capturing a different kind of importance. Examples include "closeness centrality," which measures how close a node is to all other nodes in the network, and "betweenness centrality," which measures how many shortest paths run through the given node. Nodes with high closeness centrality are important for rapidly disseminating information or spreading disease, whereas nodes with high betweenness centrality are more important for keeping the network connected.
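A quick toy comparison of the two measures on a small chain graph (not our Twitter data): the middle node scores highest on both closeness and betweenness.
In [ ]:
# Closeness vs. betweenness on a toy undirected chain a-b-c-d-e
import networkx as nx

toyGraph = nx.path_graph(["a", "b", "c", "d", "e"])
print("Closeness:  ", nx.closeness_centrality(toyGraph))
print("Betweenness:", nx.betweenness_centrality(toyGraph))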
PageRank is another algorithm for measuring importance; it was proposed by Sergey Brin and Larry Page for the early version of Google's search engine. NetworkX includes an implementation of PageRank that we can use to find the most important/authoritative users on Twitter based on their connections to other users.
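In our mention network, an edge points from the author to the account they mention, so accounts that many different users mention accumulate rank. A toy sketch with invented screen names:
In [ ]:
# Toy directed mention network: three users all mention "newsdesk"
import networkx as nx

toyMentions = nx.DiGraph()
toyMentions.add_edges_from([("alice", "newsdesk"), ("bob", "newsdesk"),
                            ("carol", "newsdesk"), ("carol", "alice")])
toyRanks = nx.pagerank(toyMentions)
print(sorted(toyRanks.items(), key=lambda kv: kv[1], reverse=True))  # "newsdesk" ranks highest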
In [50]:
pageRankList = nx.pagerank_numpy(graph)
highRankNodes = sorted(pageRankList.keys(), key=pageRankList.get, reverse=True)
for x in highRankNodes[:20]:
user = api.get_user(x)
print (x, pageRankList[x], "\n\t", user.description, "\n----------")
In [51]:
print (len(graph.nodes(data=True)))
colors = [0.9 if x[1]["loc"] == 1 else 0.1 for x in graph.nodes(data=True)]
pos = {x:(np.random.rand(2) * 10) for x in graph.nodes()}
nx.draw_networkx_nodes(graph, pos, node_color=colors)
nx.draw_networkx_edges(graph, pos)
Out[51]:
This graph is relatively uninformative, so we will turn to other tools for better visualization.
We first save this graph to a file, so we can import into other tools.
In [52]:
nx.write_graphml(graph, "inVsOutNetwork.graphml", encoding='utf-8', prettyprint=False)
In [ ]:
# If you want to play with the full graph,
# here is code that will build it up for you.
# Be careful. It's large.
fullGraph = nx.DiGraph()
inStlUsers = set(map(lambda x: x["user"]["screen_name"], inStLouisTweets))
outStlUsers = set(map(lambda x: x["user"]["screen_name"], outStLouisTweets))
for (userName, tweetList) in globalUserMap.items():
location = -1
if ( userName in inStlUsers ):
location = 1
elif (userName in outStlUsers ):
location = 0
fullGraph.add_node(userName, loc=location)
for tweet in tweetList:
mentionList = tweet["entities"]["user_mentions"]
for otherUser in mentionList:
otherUserName = otherUser["screen_name"]
if ( fullGraph.has_node(otherUserName) == False ):
fullGraph.add_node(otherUserName, loc=-1)
fullGraph.add_edge(userName, otherUserName)
print ("Number of Users:", len(fullGraph.node))
nx.write_graphml(fullGraph, "fullNetwork.graphml", encoding='utf-8', prettyprint=False)